// bool save_xmm = (*((uint32_t*)_COMM_PAGE_CPU_CAPABILITIES) & kHasAVX1_0) != 0;
#if __x86_64__
- // returns address of TLV in %rax, all other registers preserved
- #define FP_SAVE -192
- #define VECTOR_SAVE -704
- #define STACK_SIZE 704
+#define RDI_SAVE_RBP -8
+#define RSI_SAVE_RBP -16
+#define RDX_SAVE_RBP -24
+#define RCX_SAVE_RBP -32
+#define RBX_SAVE_RBP -40
+#define R8_SAVE_RBP -48
+#define R9_SAVE_RBP -56
+#define R10_SAVE_RBP -64
+#define R11_SAVE_RBP -72
+#define STATIC_STACK_SIZE 256	// register save area plus padding so the stack can be 64-byte aligned
+
+#define XMM0_SAVE_RSP 0x00
+#define XMM1_SAVE_RSP 0x10
+#define XMM2_SAVE_RSP 0x20
+#define XMM3_SAVE_RSP 0x30
+#define XMM4_SAVE_RSP 0x40
+#define XMM5_SAVE_RSP 0x50
+#define XMM6_SAVE_RSP 0x60
+#define XMM7_SAVE_RSP 0x70
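+
+// 16-byte %rsp-relative slots used by the SSE fallback path (Lsse) to save
+// the xmm argument registers when xsave is unavailable; the xsave path
+// instead sizes its buffer from cpuid at runtime.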
+
+
+ // returns address of TLV in %rax, all other registers preserved
.globl _tlv_get_addr
.private_extern _tlv_get_addr
_tlv_get_addr:
	movq	8(%rdi),%rax		// get key from descriptor
	movq	%gs:0x0(,%rax,8),%rax	// get thread value for key
	testq	%rax,%rax		// if NULL, lazily allocate
	je	LlazyAllocate
	addq	16(%rdi),%rax		// add offset from descriptor
ret
LlazyAllocate:
- pushq %rbp
- movq %rsp, %rbp
- subq $STACK_SIZE,%rsp // fxsave uses 512 bytes of store, xsave uses
- movq %rdi,-8(%rbp)
- movq %rsi,-16(%rbp)
- movq %rdx,-24(%rbp)
- movq %rcx,-32(%rbp)
- movq %r8,-40(%rbp)
- movq %r9,-48(%rbp)
- movq %r10,-56(%rbp)
- movq %r11,-64(%rbp)
- fnsave FP_SAVE(%rbp)
- movq $(_COMM_PAGE_CPU_CAPABILITIES), %rcx
- movl (%rcx), %ecx
- testl $kHasAVX1_0, %ecx
- jne L2
- movdqa %xmm0, VECTOR_SAVE+0x00(%rbp)
- movdqa %xmm1, VECTOR_SAVE+0x10(%rbp)
- movdqa %xmm2, VECTOR_SAVE+0x20(%rbp)
- movdqa %xmm3, VECTOR_SAVE+0x30(%rbp)
- movdqa %xmm4, VECTOR_SAVE+0x40(%rbp)
- movdqa %xmm5, VECTOR_SAVE+0x50(%rbp)
- movdqa %xmm6, VECTOR_SAVE+0x60(%rbp)
- movdqa %xmm7, VECTOR_SAVE+0x70(%rbp)
- movdqa %xmm8, VECTOR_SAVE+0x80(%rbp)
- movdqa %xmm9, VECTOR_SAVE+0x90(%rbp)
- movdqa %xmm10,VECTOR_SAVE+0xA0(%rbp)
- movdqa %xmm11,VECTOR_SAVE+0xB0(%rbp)
- movdqa %xmm12,VECTOR_SAVE+0xC0(%rbp)
- movdqa %xmm13,VECTOR_SAVE+0xD0(%rbp)
- movdqa %xmm14,VECTOR_SAVE+0xE0(%rbp)
- movdqa %xmm15,VECTOR_SAVE+0xF0(%rbp)
- jmp L3
-L2: vmovdqu %ymm0, VECTOR_SAVE+0x00(%rbp)
- vmovdqu %ymm1, VECTOR_SAVE+0x20(%rbp)
- vmovdqu %ymm2, VECTOR_SAVE+0x40(%rbp)
- vmovdqu %ymm3, VECTOR_SAVE+0x60(%rbp)
- vmovdqu %ymm4, VECTOR_SAVE+0x80(%rbp)
- vmovdqu %ymm5, VECTOR_SAVE+0xA0(%rbp)
- vmovdqu %ymm6, VECTOR_SAVE+0xC0(%rbp)
- vmovdqu %ymm7, VECTOR_SAVE+0xE0(%rbp)
- vmovdqu %ymm8, VECTOR_SAVE+0x100(%rbp)
- vmovdqu %ymm9, VECTOR_SAVE+0x120(%rbp)
- vmovdqu %ymm10,VECTOR_SAVE+0x140(%rbp)
- vmovdqu %ymm11,VECTOR_SAVE+0x160(%rbp)
- vmovdqu %ymm12,VECTOR_SAVE+0x180(%rbp)
- vmovdqu %ymm13,VECTOR_SAVE+0x1A0(%rbp)
- vmovdqu %ymm14,VECTOR_SAVE+0x1C0(%rbp)
- vmovdqu %ymm15,VECTOR_SAVE+0x1E0(%rbp)
-L3: movq -32(%rbp),%rcx
- movq 8(%rdi),%rdi // get key from descriptor
- call _tlv_allocate_and_initialize_for_key
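+	# Slow path, roughly in C (a sketch; "desc" is the descriptor in %rdi):
+	#   save rdi..r11 and rbx;
+	#   if (hasXSave) xsave(zeroed, 64-byte-aligned stack buffer);
+	#   else          save xmm0..xmm7;
+	#   p = tlv_allocate_and_initialize_for_key(desc->key);
+	#   restore vector state (xrstor or movdqa) and saved gprs;
+	#   return p + desc->offset;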
+ pushq %rbp
+ movq %rsp,%rbp
+ subq $STATIC_STACK_SIZE,%rsp
+ movq %rdi,RDI_SAVE_RBP(%rbp) # save registers that might be used as parameters
+ movq %rsi,RSI_SAVE_RBP(%rbp)
+ movq %rdx,RDX_SAVE_RBP(%rbp)
+ movq %rcx,RCX_SAVE_RBP(%rbp)
+ movq %rbx,RBX_SAVE_RBP(%rbp)
+ movq %r8, R8_SAVE_RBP(%rbp)
+ movq %r9, R9_SAVE_RBP(%rbp)
+ movq %r10,R10_SAVE_RBP(%rbp)
+ movq %r11,R11_SAVE_RBP(%rbp)
+
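+	# One-time probe: cache cpuid results in the .data cells defined at the
+	# end of this file.  Roughly, in C (a sketch; cpuid()/cpuid_count() stand
+	# in for the inline cpuid sequences below):
+	#   if (!inited) {
+	#       hasXSave = cpuid(1).ecx & 0x08000000;     // OSXSAVE bit
+	#       if (hasXSave) {
+	#           r = cpuid_count(0x0D, 0);             // xsave parameter leaf
+	#           features   = ((uint64_t)r.edx << 32) | r.eax;
+	#           bufferSize = r.ecx;
+	#       }
+	#       inited = 1;
+	#   }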
+ cmpl $0, _inited(%rip)
+ jne Linited
+ movl $0x01,%eax
+	cpuid				# get cpu feature bits to check for xsave support
+	andl	$0x08000000,%ecx	# isolate OSXSAVE bit (OS has enabled xsave/xrstor)
+ movl %ecx,_hasXSave(%rip)
+ cmpl $0, %ecx
+ jne LxsaveInfo
+ movl $1, _inited(%rip)
+ jmp Lsse
+
+LxsaveInfo:
+ movl $0x0D,%eax
+ movl $0x00,%ecx
+ cpuid # get xsave parameter info
+	movl	%eax,_features_lo32(%rip)	# eax:edx = XCR0 feature bits the cpu supports
+	movl	%edx,_features_hi32(%rip)
+	movl	%ecx,_bufferSize32(%rip)	# ecx = save area size for all supported features
+ movl $1, _inited(%rip)
+
+Linited:
+ cmpl $0, _hasXSave(%rip)
+ jne Lxsave
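+	# no xsave support: fall through to Lsse and save xmm0-xmm7 by hand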
- frstor FP_SAVE(%rbp)
- movq $(_COMM_PAGE_CPU_CAPABILITIES), %rcx
- movl (%rcx), %ecx
- testl $kHasAVX1_0, %ecx
- jne L4
- movdqa VECTOR_SAVE+0x00(%rbp), %xmm0
- movdqa VECTOR_SAVE+0x10(%rbp), %xmm1
- movdqa VECTOR_SAVE+0x20(%rbp), %xmm2
- movdqa VECTOR_SAVE+0x30(%rbp), %xmm3
- movdqa VECTOR_SAVE+0x40(%rbp), %xmm4
- movdqa VECTOR_SAVE+0x50(%rbp), %xmm5
- movdqa VECTOR_SAVE+0x60(%rbp), %xmm6
- movdqa VECTOR_SAVE+0x70(%rbp), %xmm7
- movdqa VECTOR_SAVE+0x80(%rbp), %xmm8
- movdqa VECTOR_SAVE+0x90(%rbp), %xmm9
- movdqa VECTOR_SAVE+0xA0(%rbp), %xmm10
- movdqa VECTOR_SAVE+0xB0(%rbp), %xmm11
- movdqa VECTOR_SAVE+0xC0(%rbp), %xmm12
- movdqa VECTOR_SAVE+0xD0(%rbp), %xmm13
- movdqa VECTOR_SAVE+0xE0(%rbp), %xmm14
- movdqa VECTOR_SAVE+0xF0(%rbp), %xmm15
- jmp L5
-L4: vmovdqu VECTOR_SAVE+0x00(%rbp), %ymm0
- vmovdqu VECTOR_SAVE+0x20(%rbp), %ymm1
- vmovdqu VECTOR_SAVE+0x40(%rbp), %ymm2
- vmovdqu VECTOR_SAVE+0x60(%rbp), %ymm3
- vmovdqu VECTOR_SAVE+0x80(%rbp), %ymm4
- vmovdqu VECTOR_SAVE+0xA0(%rbp), %ymm5
- vmovdqu VECTOR_SAVE+0xC0(%rbp), %ymm6
- vmovdqu VECTOR_SAVE+0xE0(%rbp), %ymm7
- vmovdqu VECTOR_SAVE+0x100(%rbp), %ymm8
- vmovdqu VECTOR_SAVE+0x120(%rbp), %ymm9
- vmovdqu VECTOR_SAVE+0x140(%rbp), %ymm10
- vmovdqu VECTOR_SAVE+0x160(%rbp), %ymm11
- vmovdqu VECTOR_SAVE+0x180(%rbp), %ymm12
- vmovdqu VECTOR_SAVE+0x1A0(%rbp), %ymm13
- vmovdqu VECTOR_SAVE+0x1C0(%rbp), %ymm14
- vmovdqu VECTOR_SAVE+0x1E0(%rbp), %ymm15
-L5: movq -64(%rbp),%r11
- movq -56(%rbp),%r10
- movq -48(%rbp),%r9
- movq -40(%rbp),%r8
- movq -32(%rbp),%rcx
- movq -24(%rbp),%rdx
- movq -16(%rbp),%rsi
- movq -8(%rbp),%rdi
- addq 16(%rdi),%rax // result = buffer + offset
- addq $STACK_SIZE,%rsp
- popq %rbp
+Lsse:
+ subq $128, %rsp
+ movdqa %xmm0, XMM0_SAVE_RSP(%rsp)
+ movdqa %xmm1, XMM1_SAVE_RSP(%rsp)
+ movdqa %xmm2, XMM2_SAVE_RSP(%rsp)
+ movdqa %xmm3, XMM3_SAVE_RSP(%rsp)
+ movdqa %xmm4, XMM4_SAVE_RSP(%rsp)
+ movdqa %xmm5, XMM5_SAVE_RSP(%rsp)
+ movdqa %xmm6, XMM6_SAVE_RSP(%rsp)
+ movdqa %xmm7, XMM7_SAVE_RSP(%rsp)
+ jmp Lalloc
+
+Lxsave:
+ movl _bufferSize32(%rip),%eax
+ movq %rsp, %rdi
+ subq %rax, %rdi # stack alloc buffer
+ andq $-64, %rdi # 64-byte align stack
+ movq %rdi, %rsp
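+	# (xsave requires its save area to be 64-byte aligned)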
+	# xsave requires the buffer to be zeroed out (reserved bits in the
+	# xsave header must be 0, or a later xrstor can fault)
+ movq $0, %rcx
+ movq %rdi, %r8
+ movq %rdi, %r9
+ addq %rax, %r9
+Lz: movq %rcx, (%r8)
+ addq $8, %r8
+ cmpq %r8,%r9
+ ja Lz
+
+ movl _features_lo32(%rip),%eax
+ movl _features_hi32(%rip),%edx
+	# call xsave with buffer on stack and eax:edx flag bits
+	# note: do not use xsaveopt; it assumes the same buffer as previous
+	# xsaves and that this thread is still on the same cpu
+ xsave (%rsp)
+
+Lalloc:
+ movq RDI_SAVE_RBP(%rbp),%rdi
+ movq 8(%rdi),%rdi // get key from descriptor
+ call _tlv_allocate_and_initialize_for_key
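+	# %rax = address of this thread's storage for the key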
+
+ cmpl $0, _hasXSave(%rip)
+ jne Lxrstror
+
+ movdqa XMM0_SAVE_RSP(%rsp),%xmm0
+ movdqa XMM1_SAVE_RSP(%rsp),%xmm1
+ movdqa XMM2_SAVE_RSP(%rsp),%xmm2
+ movdqa XMM3_SAVE_RSP(%rsp),%xmm3
+ movdqa XMM4_SAVE_RSP(%rsp),%xmm4
+ movdqa XMM5_SAVE_RSP(%rsp),%xmm5
+ movdqa XMM6_SAVE_RSP(%rsp),%xmm6
+ movdqa XMM7_SAVE_RSP(%rsp),%xmm7
+ jmp Ldone
+
+Lxrstror:
+	movq	%rax,%r11		# save result; eax:edx are needed for the mask
+	movl	_features_lo32(%rip),%eax
+	movl	_features_hi32(%rip),%edx
+	# call xrstor with buffer on stack and eax:edx flag bits
+ xrstor (%rsp)
+ movq %r11,%rax
+
+Ldone:
+ movq RDI_SAVE_RBP(%rbp),%rdi
+ movq RSI_SAVE_RBP(%rbp),%rsi
+ movq RDX_SAVE_RBP(%rbp),%rdx
+ movq RCX_SAVE_RBP(%rbp),%rcx
+ movq RBX_SAVE_RBP(%rbp),%rbx
+ movq R8_SAVE_RBP(%rbp),%r8
+ movq R9_SAVE_RBP(%rbp),%r9
+ movq R10_SAVE_RBP(%rbp),%r10
+ movq R11_SAVE_RBP(%rbp),%r11
+ movq %rbp,%rsp
+ popq %rbp
+ addq 16(%rdi),%rax // result = buffer + offset
ret
+
+ .data
+# Cached info from cpuid.
+_inited: .long 0
+_features_lo32: .long 0
+_features_hi32: .long 0
+_bufferSize32: .long 0
+_hasXSave: .long 0
+
#endif
+#if __arm64__
+ // Parameters: X0 = descriptor
+ // Result: X0 = address of TLV
+	// Note: all registers except X0, X16, and X17 are preserved
+ .align 2
+ .globl _tlv_get_addr
+ .private_extern _tlv_get_addr
+_tlv_get_addr:
+ ldr x16, [x0, #8] // get key from descriptor
+ mrs x17, TPIDRRO_EL0
+	and	x17, x17, #-8		// clear low 3 bits (the kernel stores the cpu number there)
+ ldr x17, [x17, x16, lsl #3] // get thread allocation address for this key
+ cbz x17, LlazyAllocate // if NULL, lazily allocate
+ ldr x16, [x0, #16] // get offset from descriptor
+ add x0, x17, x16 // return allocation+offset
+ ret lr
+
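+	// Slow path: call into C to allocate this thread's storage, saving the
+	// registers the fast path promises to preserve.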
+LlazyAllocate:
+ stp fp, lr, [sp, #-16]!
+ mov fp, sp
+ sub sp, sp, #288
+ stp x1, x2, [sp, #-16]! // save all registers that C function might trash
+ stp x3, x4, [sp, #-16]!
+ stp x5, x6, [sp, #-16]!
+ stp x7, x8, [sp, #-16]!
+ stp x9, x10, [sp, #-16]!
+ stp x11, x12, [sp, #-16]!
+ stp x13, x14, [sp, #-16]!
+ stp x15, x16, [sp, #-16]!
+ stp q0, q1, [sp, #-32]!
+ stp q2, q3, [sp, #-32]!
+ stp q4, q5, [sp, #-32]!
+ stp q6, q7, [sp, #-32]!
+ stp x0, x17, [sp, #-16]! // save descriptor
+
+ mov x0, x16 // use key from descriptor as parameter
+ bl _tlv_allocate_and_initialize_for_key
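+	// x0 = address of this thread's storage for the key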
+ ldp x16, x17, [sp], #16 // pop descriptor
+ ldr x16, [x16, #16] // get offset from descriptor
+ add x0, x0, x16 // return allocation+offset
+
+ ldp q6, q7, [sp], #32
+ ldp q4, q5, [sp], #32
+ ldp q2, q3, [sp], #32
+ ldp q0, q1, [sp], #32
+ ldp x15, x16, [sp], #16
+ ldp x13, x14, [sp], #16
+ ldp x11, x12, [sp], #16
+ ldp x9, x10, [sp], #16
+ ldp x7, x8, [sp], #16
+ ldp x5, x6, [sp], #16
+ ldp x3, x4, [sp], #16
+ ldp x1, x2, [sp], #16
+
+ mov sp, fp
+ ldp fp, lr, [sp], #16
+ ret lr
+
+#endif
-#if 0
#if __arm__
// returns address of TLV in r0, all other registers preserved
.globl _tlv_get_addr
.private_extern _tlv_get_addr
_tlv_get_addr:
- push {r1,r2,r3,r7,lr}
- mov r7,r0 // save descriptor in r7
+ push {r1,r2,r3,r7,lr}
+#if __ARM_ARCH_7K__
+ sub sp, sp, #12 // align stack to 16 bytes
+#endif
+ mov r7, r0 // save descriptor in r7
ldr r0, [r7, #4] // get key from descriptor
bl _pthread_getspecific // get thread value
cmp r0, #0
	bne L2 // if not NULL, skip allocation
+#if __ARM_ARCH_7K__
+	vpush	{d0, d1, d2, d3, d4, d5, d6, d7}	// armv7k hard-float ABI passes FP args in d0-d7
+#endif
ldr r0, [r7, #4] // get key from descriptor
bl _tlv_allocate_and_initialize_for_key
+#if __ARM_ARCH_7K__
+ vpop {d0, d1, d2, d3, d4, d5, d6, d7}
+#endif
L2: ldr r1, [r7, #8] // get offset from descriptor
add r0, r1, r0 // add offset into allocation block
- pop {r1,r2,r3,r7,pc}
+#if __ARM_ARCH_7K__
+ add sp, sp, #12
+#endif
+ pop {r1,r2,r3,r7,pc}
#endif
-#endif
.subsections_via_symbols