+#if __arm64__
+ // _tlv_get_addr: return the address of a thread-local variable (TLV) for
+ // the calling thread, allocating the thread's storage on first use.
+ //
+ // Parameters: X0 = descriptor (key read from [X0, #8], offset from [X0, #16])
+ // Result: X0 = address of TLV
+ // Note: callers expect all registers except X0, x16, and x17 to be preserved.
+ //       The fast path writes only those three registers. The slow path calls
+ //       a C function and explicitly saves/restores x1-x15, q0-q7, fp and lr
+ //       around it.
+ // NOTE(review): q16-q31 (and the upper halves of q8-q15) are caller-saved
+ //       under AAPCS64 and are NOT saved on the slow path. Confirm against the
+ //       compiler's TLV calling convention whether they must be preserved too.
+ .align 2
+ .globl _tlv_get_addr
+ .private_extern _tlv_get_addr
+_tlv_get_addr:
+ ldr x16, [x0, #8] // x16 = key from descriptor
+ mrs x17, TPIDRRO_EL0 // x17 = read-only thread ID register
+ and x17, x17, #-8 // clear low 3 bits to get an 8-byte-aligned table base (presumably they hold non-pointer data such as a CPU number - TODO confirm)
+ ldr x17, [x17, x16, lsl #3] // x17 = base[key]: thread allocation address for this key
+ cbz x17, LlazyAllocate // if NULL, lazily allocate
+ ldr x16, [x0, #16] // x16 = offset from descriptor
+ add x0, x17, x16 // return allocation+offset
+ ret lr
+
+ // Slow path: no allocation exists yet for this key on this thread.
+ // On entry: x0 = descriptor, x16 = key, x17 = 0.
+ // Frame layout: 16-byte frame record + 272 bytes of pushed registers
+ // (8 GPR pairs * 16 + 4 Q pairs * 32 + 16 for the descriptor slot), so sp
+ // remains 16-byte aligned at the call. All saves are pre-indexed pushes, so
+ // no separate stack reservation is made.
+LlazyAllocate:
+ stp fp, lr, [sp, #-16]! // push frame record
+ mov fp, sp // fp = top of save area; used to reset sp on exit
+ stp x1, x2, [sp, #-16]! // save all registers that C function might trash
+ stp x3, x4, [sp, #-16]!
+ stp x5, x6, [sp, #-16]!
+ stp x7, x8, [sp, #-16]!
+ stp x9, x10, [sp, #-16]!
+ stp x11, x12, [sp, #-16]!
+ stp x13, x14, [sp, #-16]!
+ stp x15, x16, [sp, #-16]! // x16 still holds the key here
+ stp q0, q1, [sp, #-32]!
+ stp q2, q3, [sp, #-32]!
+ stp q4, q5, [sp, #-32]!
+ stp q6, q7, [sp, #-32]!
+ stp x0, x17, [sp, #-16]! // save descriptor (x17 is NULL; it only pads the slot to 16 bytes)
+
+ mov x0, x16 // use key from descriptor as parameter
+ bl _tlv_allocate_and_initialize_for_key // returns allocation address in x0
+ ldp x16, x17, [sp], #16 // pop descriptor into x16
+ ldr x16, [x16, #16] // get offset from descriptor
+ add x0, x0, x16 // return allocation+offset
+
+ ldp q6, q7, [sp], #32 // restore in exact reverse order of the pushes
+ ldp q4, q5, [sp], #32
+ ldp q2, q3, [sp], #32
+ ldp q0, q1, [sp], #32
+ ldp x15, x16, [sp], #16
+ ldp x13, x14, [sp], #16
+ ldp x11, x12, [sp], #16
+ ldp x9, x10, [sp], #16
+ ldp x7, x8, [sp], #16
+ ldp x5, x6, [sp], #16
+ ldp x3, x4, [sp], #16
+ ldp x1, x2, [sp], #16
+
+ mov sp, fp // reset sp to the frame record (already equal when pushes/pops balance)
+ ldp fp, lr, [sp], #16 // pop frame record
+ ret lr
+
+#endif