/*
	This file provides the x86_64/i386 hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	The code first probes __cpu_capabilities to detect whether ssse3 is supported. If not, it branches to
	SHA256_Transform_nossse3 (in a separate source file sha256nossse3.s) that was cloned from this file
	with all ssse3 instructions replaced with sse3 or below instructions.

	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
				W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) are stored in GPRs or m32 (all in GPRs for x86_64, and some in m32 for i386)

	the implementation per block looks like

	----------------------------------------------------------------------------
	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->state += digests a-h;
	----------------------------------------------------------------------------

	our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------
	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0)	jmp	L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->state += digests a-h;

	jmp	L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->state += digests a-h;
	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
	cclee 8-3-10
*/

#if defined KERNEL
#include <i386/cpu_capabilities.h>			// __cpu_capabilities / kHasSupplementalSSE3
#else
#include <System/i386/cpu_capabilities.h>	// _COMM_PAGE_CPU_CAPABILITIES / kHasSupplementalSSE3
#endif

// associate variables with registers or memory

#if defined (__x86_64__)
	#define	sp			%rsp
	#define	ctx			%rdi
	#define	data		%rsi
	#define	num_blocks	%rdx

	#define	a			%r8d
	#define	b			%r9d
	#define	c			%r10d
	#define	d			%r11d
	#define	e			%r12d
	#define	f			%r13d
	#define	g			%r14d
	#define	h			%r15d

	#define	K			%rbx				// pointer to K256[] table

	#define	stack_size	(8+16*8+16+64)		// 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)

	#define	L_aligned_bswap	64(sp)			// bswap : big-endian loading of 4-byte words
	#define	xmm_save	80(sp)				// starting address for xmm save/restore
#else
	#define	sp			%esp
	#define	stack_size	(12+16*8+16+16+64)	// 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)

	#define	ctx_addr	20+stack_size(sp)	// ret_addr + 4 saved registers = 20, 1st caller argument
	#define	data_addr	24+stack_size(sp)	// 2nd caller argument
	#define	num_blocks	28+stack_size(sp)	// 3rd caller argument

	#define	a			%ebx
	#define	b			%edx
	#define	c			64(sp)
	#define	d			%ebp
	#define	e			%esi
	#define	f			68(sp)
	#define	g			%edi
	#define	h			72(sp)

	#define	K			76(sp)				// pointer to K256[] table

	#define	L_aligned_bswap	80(sp)			// bswap : big-endian loading of 4-byte words
	#define	xmm_save	96(sp)				// starting address for xmm save/restore
#endif

// 2 local variables
#define	t	%eax
#define	s	%ecx

// a window (16 words) of the message schedule
#define	W0	%xmm0
#define	W1	%xmm1
#define	W2	%xmm2
#define	W3	%xmm3

// circular buffer for WK[(r:r+15)%16]
#define	WK(x)	(x&15)*4(sp)
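/*
	For reference, a minimal C sketch of the per-round digest update that the round macro
	below implements (names here are illustrative, not necessarily those used in sha2.c);
	wk is the precomputed W[r]+K[r] fetched from the WK circular buffer:

	#include <stdint.h>

	#define ROTR32(b,x)	(((x) >> (b)) | ((x) << (32 - (b))))
	#define CH(x,y,z)	(((x) & (y)) ^ ((~(x)) & (z)))
	#define MAJ(x,y,z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
	#define SIGMA0(x)	(ROTR32(2,(x)) ^ ROTR32(13,(x)) ^ ROTR32(22,(x)))
	#define SIGMA1(x)	(ROTR32(6,(x)) ^ ROTR32(11,(x)) ^ ROTR32(25,(x)))

	// s[0..7] = a,b,c,d,e,f,g,h
	static void sha256_round(uint32_t s[8], uint32_t wk)
	{
		uint32_t T1 = s[7] + SIGMA1(s[4]) + CH(s[4], s[5], s[6]) + wk;
		uint32_t T2 = SIGMA0(s[0]) + MAJ(s[0], s[1], s[2]);
		s[3] += T1;			// d += T1;
		s[7]  = T1 + T2;	// h = T1 + Sigma0(a) + Maj(a,b,c);
		// the caller then permutes a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
	}
*/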
// #define Ch(x,y,z)	(((x) & (y)) ^ ((~(x)) & (z)))

	.macro	Ch
	mov		$0, t			// x
	mov		$0, s			// x
	not		t				// ~x
	and		$1, s			// x & y
	and		$2, t			// ~x & z
	xor		s, t			// t = ((x) & (y)) ^ ((~(x)) & (z));
	.endm

// #define Maj(x,y,z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

	.macro	Maj
	mov		$0, t			// x
	mov		$1, s			// y
	and		s, t			// x&y
	and		$2, s			// y&z
	xor		s, t			// (x&y) ^ (y&z)
	mov		$2, s			// z
	and		$0, s			// x&z
	xor		s, t			// t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
	.endm

/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
// #define R(b,x)		((x) >> (b))
/* 32-bit Rotate-right (used in SHA-256): */
// #define S32(b,x)	(((x) >> (b)) | ((x) << (32 - (b))))

// #define sigma0_256(x)	(S32(7, (x)) ^ S32(18, (x)) ^ R(3, (x)))

// performs sigma0_256 on 4 words in an xmm register
// uses xmm6/xmm7 as intermediate registers
	.macro	sigma0
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$3, $0			// SHR3(x)
	psrld	$$7, %xmm6		// part of ROTR7
	pslld	$$14, %xmm7		// part of ROTR18
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	psrld	$$11, %xmm6		// part of ROTR18
	pslld	$$11, %xmm7		// part of ROTR7
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	.endm

// #define sigma1_256(x)	(S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))

// performs sigma1_256 on 4 words in an xmm register
// uses xmm6/xmm7 as intermediate registers
	.macro	sigma1
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$10, $0		// SHR10(x)
	psrld	$$17, %xmm6		// part of ROTR17
	pxor	%xmm6, $0
	pslld	$$13, %xmm7		// part of ROTR19
	pxor	%xmm7, $0
	psrld	$$2, %xmm6		// part of ROTR19
	pxor	%xmm6, $0
	pslld	$$2, %xmm7		// part of ROTR17
	pxor	%xmm7, $0
	.endm

// #define Sigma0_256(x)	(S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))

	.macro	Sigma0
	mov		$0, t			// x
	mov		$0, s			// x
	ror		$$2, t			// S32(2, (x))
	ror		$$13, s			// S32(13, (x))
	xor		s, t			// S32(2, (x)) ^ S32(13, (x))
	ror		$$9, s			// S32(22, (x))
	xor		s, t			// t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
	.endm

// #define Sigma1_256(x)	(S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))

	.macro	Sigma1
	mov		$0, s			// x
	ror		$$6, s			// S32(6, (x))
	mov		s, t			// S32(6, (x))
	ror		$$5, s			// S32(11, (x))
	xor		s, t			// S32(6, (x)) ^ S32(11, (x))
	ror		$$14, s			// S32(25, (x))
	xor		s, t			// t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
	.endm

// per round digests update
	.macro	round
	Sigma1	$4				// t = Sigma1(e);
	add		t, $7			// use h to store h + Sigma1(e)
	Ch		$4, $5, $6		// t = Ch(e, f, g);
	add		$7, t			// t = h + Sigma1(e) + Ch(e,f,g);
	add		WK($8), t		// t = T1 = h + Sigma1(e) + Ch(e,f,g) + WK
	add		t, $3			// d += T1;
	mov		t, $7			// h = T1
	Sigma0	$0				// t = Sigma0(a);
	add		t, $7			// h = T1 + Sigma0(a);
	Maj		$0, $1, $2		// t = Maj(a,b,c)
	add		t, $7			// h = T1 + Sigma0(a) + Maj(a,b,c);
	.endm

// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
	.macro	rounds
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8
	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8
	.endm
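/*
	SSE2 has no packed rotate instruction, so the sigma0/sigma1 macros above build each
	ROTR from a shift pair, ROTR(b,x) == ((x >> b) | (x << (32-b))), and share the shift
	registers between the two rotates. A scalar C sketch of sigma0_256 decomposed the
	same way (illustrative only; the macro does this on 4 words at once):

	#include <stdint.h>

	static uint32_t sigma0_256(uint32_t x)
	{
		uint32_t r  = x >> 3;	// SHR3(x)
		uint32_t lo = x >> 7;	// right part of ROTR7
		uint32_t hi = x << 14;	// left part of ROTR18
		r ^= lo;
		r ^= hi;
		lo >>= 11;				// now x >> 18 : right part of ROTR18
		hi <<= 11;				// now x << 25 : left part of ROTR7
		return r ^ lo ^ hi;		// SHR3(x) ^ ROTR7(x) ^ ROTR18(x)
	}
*/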
// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
	.macro	message_schedule

	// 4 32-bit K256 words in xmm5
#if defined (__x86_64__)
	movdqu	(K), %xmm5
#else
	mov		K, t
	movdqu	(t), %xmm5
#endif
	add		$$16, K			// K points to the next 4 K256 words for the next iteration
	movdqa	$1, %xmm4		// W7:W4
	palignr	$$4, $0, %xmm4	// W4:W1
	sigma0	%xmm4			// sigma0(W4:W1)
	movdqa	$3, %xmm6		// W15:W12
	paddd	%xmm4, $0		// $0 = W3:W0 + sigma0(W4:W1)
	palignr	$$4, $2, %xmm6	// W12:W9
	paddd	%xmm6, $0		// $0 = W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$3, %xmm4		// W15:W12
	psrldq	$$8, %xmm4		// 0,0,W15,W14
	sigma1	%xmm4			// sigma1(0,0,W15,W14)
	paddd	%xmm4, $0		// sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$0, %xmm4		// W19-sigma1(W17), W18-sigma1(W16), W17, W16
	pslldq	$$8, %xmm4		// W17, W16, 0, 0
	sigma1	%xmm4			// sigma1(W17,W16,0,0)
	paddd	%xmm4, $0		// W19:W16
	paddd	$0, %xmm5		// WK
	movdqa	%xmm5, WK($4)
	.endm

// this macro is used in the last 16 rounds of the current block
// it reads the next message (16 4-byte words), loads it into 4 words W[r:r+3], computes WK[r:r+3]
// and saves it into the stack to prepare for the next block
	.macro	update_W_WK
#if defined (__x86_64__)
	movdqu	$0*16(data), $1			// read 4 4-byte words
	pshufb	L_aligned_bswap, $1		// big-endian of each 4-byte word, W[r:r+3]
	movdqu	$0*16(K), %xmm4			// K[r:r+3]
#else
	mov		data_addr, t
	movdqu	$0*16(t), $1			// read 4 4-byte words
	pshufb	L_aligned_bswap, $1		// big-endian of each 4-byte word, W[r:r+3]
	mov		K, t
	movdqu	$0*16(t), %xmm4			// K[r:r+3]
#endif
	paddd	$1, %xmm4				// WK[r:r+3]
	movdqa	%xmm4, WK($0*4)			// save WK[r:r+3] into the stack circular buffer
	.endm
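/*
	The recurrence that message_schedule evaluates, four schedule words per invocation,
	is (scalar C, for 16 <= r < 64; sigma0_256/sigma1_256 as in the comments above):

	#include <stdint.h>

	static void update_schedule(uint32_t W[64], int r)
	{
		W[r] = W[r-16] + sigma0_256(W[r-15]) + W[r-7] + sigma1_256(W[r-2]);
	}

	The vector version cannot apply sigma1 to all 4 lanes at once: when computing
	W[r:r+3], the sigma1 inputs for the upper two lanes are W[r] and W[r+1], which do
	not exist yet. Hence the two-pass trick above: first add sigma1(0,0,W[r-1],W[r-2])
	to finish W[r] and W[r+1], then add sigma1(W[r+1],W[r],0,0) to finish W[r+2] and
	W[r+3]. The result is also added to the next 4 K256 words and stored in the WK
	stack circular buffer, so each round pays only one memory-operand add for W+K.
*/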
	.text

#if defined (__x86_64__) || defined (__i386__)

	.globl	_SHA256_Transform

_SHA256_Transform:

	// detect SSSE3 and dispatch to the appropriate code branch
#if defined __x86_64__
	movq	__cpu_capabilities@GOTPCREL(%rip), %rax		// %rax -> __cpu_capabilities
	mov		(%rax), %eax								// %eax = __cpu_capabilities
#else	// i386
#if defined KERNEL
	leal	__cpu_capabilities, %eax					// %eax -> __cpu_capabilities
	mov		(%eax), %eax								// %eax = __cpu_capabilities
#else
	mov		_COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
	test	$(kHasSupplementalSSE3), %eax
	je		_SHA256_Transform_nossse3					// branch to no-ssse3 code

	// push callee-saved registers
#if defined (__x86_64__)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
#else
	push	%ebp
	push	%ebx
	push	%esi
	push	%edi
#endif

	// allocate stack space
	sub		$stack_size, sp

	// if kernel code, save used xmm registers
#if KERNEL
	movdqa	%xmm0, 0*16+xmm_save
	movdqa	%xmm1, 1*16+xmm_save
	movdqa	%xmm2, 2*16+xmm_save
	movdqa	%xmm3, 3*16+xmm_save
	movdqa	%xmm4, 4*16+xmm_save
	movdqa	%xmm5, 5*16+xmm_save
	movdqa	%xmm6, 6*16+xmm_save
	movdqa	%xmm7, 7*16+xmm_save
#endif

	// set up the bswap parameter in the aligned stack space and the pointer to the K256[] table
#if defined (__x86_64__)
	lea		_K256(%rip), K
	lea		L_bswap(%rip), %rax
	movdqa	(%rax), %xmm0
#else
	lea		_K256, t
	mov		t, K
	lea		L_bswap, %eax
	movdqa	(%eax), %xmm0
#endif
	movdqa	%xmm0, L_aligned_bswap

	// load W[0:15] into xmm0-xmm3
#if defined (__x86_64__)
	movdqu	0*16(data), W0
	movdqu	1*16(data), W1
	movdqu	2*16(data), W2
	movdqu	3*16(data), W3
	add		$64, data
#else
	mov		data_addr, t
	movdqu	0*16(t), W0
	movdqu	1*16(t), W1
	movdqu	2*16(t), W2
	movdqu	3*16(t), W3
	add		$64, data_addr
#endif
	pshufb	L_aligned_bswap, W0
	pshufb	L_aligned_bswap, W1
	pshufb	L_aligned_bswap, W2
	pshufb	L_aligned_bswap, W3

	// compute WK[0:15] and save in stack
#if defined (__x86_64__)
	movdqu	0*16(K), %xmm4
	movdqu	1*16(K), %xmm5
	movdqu	2*16(K), %xmm6
	movdqu	3*16(K), %xmm7
#else
	mov		K, t
	movdqu	0*16(t), %xmm4
	movdqu	1*16(t), %xmm5
	movdqu	2*16(t), %xmm6
	movdqu	3*16(t), %xmm7
#endif
	add		$64, K
	paddd	%xmm0, %xmm4
	paddd	%xmm1, %xmm5
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm4, WK(0)
	movdqa	%xmm5, WK(4)
	movdqa	%xmm6, WK(8)
	movdqa	%xmm7, WK(12)

L_loop:

	// digests a-h = ctx->state;
#if defined (__x86_64__)
	mov		0*4(ctx), a
	mov		1*4(ctx), b
	mov		2*4(ctx), c
	mov		3*4(ctx), d
	mov		4*4(ctx), e
	mov		5*4(ctx), f
	mov		6*4(ctx), g
	mov		7*4(ctx), h
#else
	mov		ctx_addr, t
	mov		0*4(t), a
	mov		1*4(t), b
	mov		2*4(t), s
	mov		s, c
	mov		3*4(t), d
	mov		4*4(t), e
	mov		5*4(t), s
	mov		s, f
	mov		6*4(t), g
	mov		7*4(t), s
	mov		s, h
#endif

	// rounds 0:47 interleaved with W/WK updates for rounds 16:63
	rounds	a, b, c, d, e, f, g, h, 0
	message_schedule	W0,W1,W2,W3,16
	rounds	e, f, g, h, a, b, c, d, 4
	message_schedule	W1,W2,W3,W0,20
	rounds	a, b, c, d, e, f, g, h, 8
	message_schedule	W2,W3,W0,W1,24
	rounds	e, f, g, h, a, b, c, d, 12
	message_schedule	W3,W0,W1,W2,28
	rounds	a, b, c, d, e, f, g, h, 16
	message_schedule	W0,W1,W2,W3,32
	rounds	e, f, g, h, a, b, c, d, 20
	message_schedule	W1,W2,W3,W0,36
	rounds	a, b, c, d, e, f, g, h, 24
	message_schedule	W2,W3,W0,W1,40
	rounds	e, f, g, h, a, b, c, d, 28
	message_schedule	W3,W0,W1,W2,44
	rounds	a, b, c, d, e, f, g, h, 32
	message_schedule	W0,W1,W2,W3,48
	rounds	e, f, g, h, a, b, c, d, 36
	message_schedule	W1,W2,W3,W0,52
	rounds	a, b, c, d, e, f, g, h, 40
	message_schedule	W2,W3,W0,W1,56
	rounds	e, f, g, h, a, b, c, d, 44
	message_schedule	W3,W0,W1,W2,60

	// revert K to the beginning of K256[]
#if defined __x86_64__
	sub		$256, K
#else
	subl	$256, K
#endif

	sub		$1, num_blocks				// num_blocks--
	je		L_final_block				// if final block, wrap up final rounds

	// rounds 48:63 interleaved with W/WK initialization for the next block's rounds 0:15
	rounds	a, b, c, d, e, f, g, h, 48
	update_W_WK	0, W0
	rounds	e, f, g, h, a, b, c, d, 52
	update_W_WK	1, W1
	rounds	a, b, c, d, e, f, g, h, 56
	update_W_WK	2, W2
	rounds	e, f, g, h, a, b, c, d, 60
	update_W_WK	3, W3

	add		$64, K
#if defined (__x86_64__)
	add		$64, data
#else
	add		$64, data_addr
#endif

	// ctx->state += digests a-h
#if defined (__x86_64__)
	add		a, 0*4(ctx)
	add		b, 1*4(ctx)
	add		c, 2*4(ctx)
	add		d, 3*4(ctx)
	add		e, 4*4(ctx)
	add		f, 5*4(ctx)
	add		g, 6*4(ctx)
	add		h, 7*4(ctx)
#else
	mov		ctx_addr, t
	add		a, 0*4(t)
	add		b, 1*4(t)
	mov		c, s
	add		s, 2*4(t)
	add		d, 3*4(t)
	add		e, 4*4(t)
	mov		f, s
	add		s, 5*4(t)
	add		g, 6*4(t)
	mov		h, s
	add		s, 7*4(t)
#endif

	jmp		L_loop						// branch for next block

	// wrap up digest update rounds 48:63 for the final block
L_final_block:
	rounds	a, b, c, d, e, f, g, h, 48
	rounds	e, f, g, h, a, b, c, d, 52
	rounds	a, b, c, d, e, f, g, h, 56
	rounds	e, f, g, h, a, b, c, d, 60

	// ctx->state += digests a-h
#if defined (__x86_64__)
	add		a, 0*4(ctx)
	add		b, 1*4(ctx)
	add		c, 2*4(ctx)
	add		d, 3*4(ctx)
	add		e, 4*4(ctx)
	add		f, 5*4(ctx)
	add		g, 6*4(ctx)
	add		h, 7*4(ctx)
#else
	mov		ctx_addr, t
	add		a, 0*4(t)
	add		b, 1*4(t)
	mov		c, s
	add		s, 2*4(t)
	add		d, 3*4(t)
	add		e, 4*4(t)
	mov		f, s
	add		s, 5*4(t)
	add		g, 6*4(t)
	mov		h, s
	add		s, 7*4(t)
#endif

	// if kernel, restore xmm0-xmm7
#if KERNEL
	movdqa	0*16+xmm_save, %xmm0
	movdqa	1*16+xmm_save, %xmm1
	movdqa	2*16+xmm_save, %xmm2
	movdqa	3*16+xmm_save, %xmm3
	movdqa	4*16+xmm_save, %xmm4
	movdqa	5*16+xmm_save, %xmm5
	movdqa	6*16+xmm_save, %xmm6
	movdqa	7*16+xmm_save, %xmm7
#endif

	// free allocated stack memory
	add		$stack_size, sp

	// restore callee-saved registers
#if defined (__x86_64__)
	pop		%r15
	pop		%r14
	pop		%r13
	pop		%r12
	pop		%rbx
	pop		%rbp
#else
	pop		%edi
	pop		%esi
	pop		%ebx
	pop		%ebp
#endif

	// return
	ret

	.const
	.align	4, 0x90

L_bswap:
	.long	0x00010203
	.long	0x04050607
	.long	0x08090a0b
	.long	0x0c0d0e0f
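/*
	L_bswap above is a pshufb control mask: destination byte i is taken from source
	byte mask[i]. Stored little-endian, .long 0x00010203 puts bytes 03,02,01,00 in
	lanes 0-3, so each 4-byte word is byte-reversed, i.e. loaded big-endian. The
	scalar C equivalent per word (illustrative):

	#include <stdint.h>

	static uint32_t load_be32(const uint8_t *p)
	{
		return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
		       ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
	}
*/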
#endif	// x86_64/i386
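/*
	Usage sketch from C, per the prototype at the top of this file. The SHA256_ctx
	layout shown here is an assumption (only the leading state[8] words are read and
	written by this routine, per the loads/stores from offset 0 of ctx above); data
	must hold num_blocks complete 64-byte blocks:

	#include <stdint.h>

	typedef struct {
		uint32_t state[8];		// a-h digests; assumed to be the first field
		// ... remaining fields per sha2.c ...
	} SHA256_ctx;

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	void hash_blocks(SHA256_ctx *ctx, char *buf, unsigned int n)
	{
		SHA256_Transform(ctx, buf, n);	// processes n 64-byte blocks in one call
	}
*/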