--- /dev/null
+/*
+	This file provides a hand-coded x86_64/i386 implementation of the following function
+
+	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
+
+	which is a C function in sha2.c (from xnu).
+
+	The code first probes cpu_capabilities to detect whether ssse3 is supported. If not, it branches to
+	SHA256_Transform_nossse3 (in a separate source file, sha256nossse3.s), which was cloned from this file
+	with all ssse3 instructions replaced by sse3-or-below equivalents.
+
+ sha256 algorithm per block description:
+
+	1. W(0:15) = big-endian (per 4-byte word) loading of input data (64 bytes)
+ 2. load 8 digests a-h from ctx->state
+ 3. for r = 0:15
+ T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
+ d += T1;
+ h = T1 + Sigma0(a) + Maj(a,b,c)
+ permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
+ 4. for r = 16:63
+ W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
+ T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
+ d += T1;
+ h = T1 + Sigma0(a) + Maj(a,b,c)
+ permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
+
+ In the assembly implementation:
+	- a circular window of the message schedule W(r:r+15) is updated and stored in xmm0-xmm3
+	- the corresponding W+K(r:r+15) values are updated and stored in a circular buffer on the stack
+	- the 8 digests (a-h) are stored in GPRs or in 32-bit stack slots (all in GPRs for x86_64; c, f, and h in memory for i386)
+
+	the per-block implementation looks like
+
+ ----------------------------------------------------------------------------
+
+ load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
+ pre_calculate and store W+K(0:15) in stack
+
+ load digests a-h from ctx->state;
+
+ for (r=0;r<48;r+=4) {
+ digests a-h update and permute round r:r+3
+		update W([r:r+3]%16) and WK([r:r+3]%16) for use 16 rounds later
+ }
+
+ for (r=48;r<64;r+=4) {
+ digests a-h update and permute round r:r+3
+ }
+
+	ctx->state += digests a-h;
+
+ ----------------------------------------------------------------------------
+
+	our implementation (which allows multiple blocks per call) pipelines the loading of W/WK for the next block
+	into the last 16 rounds of the previous block:
+
+ ----------------------------------------------------------------------------
+
+ load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
+ pre_calculate and store W+K(0:15) in stack
+
+L_loop:
+
+ load digests a-h from ctx->state;
+
+ for (r=0;r<48;r+=4) {
+ digests a-h update and permute round r:r+3
+		update W([r:r+3]%16) and WK([r:r+3]%16) for use 16 rounds later
+ }
+
+ num_block--;
+ if (num_block==0) jmp L_last_block;
+
+ for (r=48;r<64;r+=4) {
+ digests a-h update and permute round r:r+3
+ load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
+ pre_calculate and store W+K([r:r+3]%16) in stack
+ }
+
+	ctx->state += digests a-h;
+
+ jmp L_loop;
+
+L_last_block:
+
+ for (r=48;r<64;r+=4) {
+ digests a-h update and permute round r:r+3
+ }
+
+	ctx->state += digests a-h;
+
+ ------------------------------------------------------------------------
+
+ Apple CoreOS vector & numerics
+ cclee 8-3-10
+*/
+
+#if defined KERNEL
+#include <i386/cpu_capabilities.h>
+#else
+#include <System/i386/cpu_capabilities.h>
+#endif
+
+ // associate variables with registers or memory
+
+#if defined (__x86_64__)
+ #define sp %rsp
+ #define ctx %rdi
+ #define data %rsi
+ #define num_blocks %rdx
+
+ #define a %r8d
+ #define b %r9d
+ #define c %r10d
+ #define d %r11d
+ #define e %r12d
+ #define f %r13d
+ #define g %r14d
+ #define h %r15d
+
+ #define K %rbx
+ #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)
+
+ #define L_aligned_bswap 64(sp) // bswap : big-endian loading of 4-byte words
+ #define xmm_save 80(sp) // starting address for xmm save/restore
+#else
+ #define sp %esp
+ #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
+ #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument
+ #define data_addr 24+stack_size(sp) // 2nd caller argument
+ #define num_blocks 28+stack_size(sp) // 3rd caller argument
+
+ #define a %ebx
+ #define b %edx
+ #define c 64(sp)
+ #define d %ebp
+ #define e %esi
+ #define f 68(sp)
+ #define g %edi
+ #define h 72(sp)
+
+ #define K 76(sp) // pointer to K256[] table
+ #define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words
+ #define xmm_save 96(sp) // starting address for xmm save/restore
+#endif
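+
+	// resulting stack layout: the WK(0:15) circular buffer sits at 0(sp); i386 additionally spills
+	// c/f/h and the K pointer just above it; then comes the 16-byte bswap mask (L_aligned_bswap),
+	// then the xmm save/restore area (xmm_save), plus padding that keeps sp 16-byte aligned so the
+	// movdqa accesses to these slots are legal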
+
+ // 2 local variables
+ #define t %eax
+ #define s %ecx
+
+	// a window (16 words) of the message schedule
+ #define W0 %xmm0
+ #define W1 %xmm1
+ #define W2 %xmm2
+ #define W3 %xmm3
+
+ // circular buffer for WK[(r:r+15)%16]
+ #define WK(x) (x&15)*4(sp)
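+	// e.g. WK(18) expands to (18&15)*4(sp) = 8(sp); slots are refilled (by message_schedule
+	// or update_W_WK) 16 rounds before the round that consumes them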
+
+// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
+
+ .macro Ch
+ mov $0, t // x
+ mov $0, s // x
+ not t // ~x
+ and $1, s // x & y
+ and $2, t // ~x & z
+ xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
+ .endm
+
+// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+ .macro Maj
+ mov $0, t // x
+ mov $1, s // y
+ and s, t // x&y
+ and $2, s // y&z
+ xor s, t // (x&y) ^ (y&z)
+ mov $2, s // z
+ and $0, s // (x&z)
+ xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+ .endm
+
+/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
+// #define R(b,x) ((x) >> (b))
+/* 32-bit Rotate-right (used in SHA-256): */
+// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b))))
+
+// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
+
+	// performs sigma0_256 on the 4 words of an xmm register
+ // use xmm6/xmm7 as intermediate registers
+ .macro sigma0
+ movdqa $0, %xmm6
+ movdqa $0, %xmm7
+ psrld $$3, $0 // SHR3(x)
+ psrld $$7, %xmm6 // part of ROTR7
+ pslld $$14, %xmm7 // part of ROTR18
+ pxor %xmm6, $0
+ pxor %xmm7, $0
+ psrld $$11, %xmm6 // part of ROTR18
+ pslld $$11, %xmm7 // part of ROTR7
+ pxor %xmm6, $0
+ pxor %xmm7, $0
+ .endm
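+
+	// rationale: ROTRn(x) = ((x) >> n) | ((x) << (32 - n)), and the two shifted halves share
+	// no bits, so the OR can be folded into the pxor chain; the sequence above thus computes
+	// sigma0(x) = (x>>3) ^ ((x>>7) ^ (x<<25)) ^ ((x>>18) ^ (x<<14))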
+
+// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
+
+	// performs sigma1_256 on the 4 words of an xmm register
+ // use xmm6/xmm7 as intermediate registers
+ .macro sigma1
+ movdqa $0, %xmm6
+ movdqa $0, %xmm7
+ psrld $$10, $0 // SHR10(x)
+ psrld $$17, %xmm6 // part of ROTR17
+ pxor %xmm6, $0
+ pslld $$13, %xmm7 // part of ROTR19
+ pxor %xmm7, $0
+ psrld $$2, %xmm6 // part of ROTR19
+ pxor %xmm6, $0
+ pslld $$2, %xmm7 // part of ROTR17
+ pxor %xmm7, $0
+ .endm
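+
+	// same shift/xor decomposition as sigma0:
+	// sigma1(x) = (x>>10) ^ ((x>>17) ^ (x<<15)) ^ ((x>>19) ^ (x<<13))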
+
+// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
+
+ .macro Sigma0
+ mov $0, t // x
+ mov $0, s // x
+ ror $$2, t // S32(2, (x))
+ ror $$13, s // S32(13, (x))
+ xor s, t // S32(2, (x)) ^ S32(13, (x))
+ ror $$9, s // S32(22, (x))
+ xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
+ .endm
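+
+	// note s is rotated incrementally: 13, then 9 more for a total of 22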
+
+// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
+
+ .macro Sigma1
+ mov $0, s // x
+ ror $$6, s // S32(6, (x))
+ mov s, t // S32(6, (x))
+ ror $$5, s // S32(11, (x))
+ xor s, t // S32(6, (x)) ^ S32(11, (x))
+ ror $$14, s // S32(25, (x))
+ xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
+ .endm
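+
+	// likewise incremental rotates of s: 6, then 5 more (11), then 14 more (25)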
+
+ // per round digests update
+ .macro round
+		Sigma1	$4					// t = Sigma1(e);
+ add t, $7 // use h to store h+Sigma1(e)
+ Ch $4, $5, $6 // t = Ch (e, f, g);
+ add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
+		add	WK($8), t			// t = T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
+ add t, $3 // d += T1;
+ mov t, $7 // h = T1
+ Sigma0 $0 // t = Sigma0(a);
+ add t, $7 // h = T1 + Sigma0(a);
+ Maj $0, $1, $2 // t = Maj(a,b,c)
+ add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c);
+ .endm
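+
+	// for reference, one round in C, with WK[r] = K[r] + W[r] precomputed on the stack:
+	//	T1 = h + Sigma1(e) + Ch(e,f,g) + WK[r];
+	//	d += T1;
+	//	h  = T1 + Sigma0(a) + Maj(a,b,c);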
+
+ // per 4 rounds digests update and permutation
+ // permutation is absorbed by rotating the roles of digests a-h
+ .macro rounds
+ round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
+ round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
+ round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
+ round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
+ .endm
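+
+	// the argument rotation advances by 4 names per invocation, returning to the identity
+	// every 8 rounds; the round loops below therefore simply alternate the two orders
+	// (a,b,c,d,e,f,g,h) and (e,f,g,h,a,b,c,d)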
+
+	// update 4 words of the message schedule W and W+K, for use 16 rounds ahead
+ .macro message_schedule
+
+ // 4 32-bit K256 words in xmm5
+#if defined (__x86_64__)
+	movdqu	(K), %xmm5
+	add	$$16, K				// K points to the next 4 K256 words for the next iteration
+#else
+	mov	K, t
+	movdqu	(t), %xmm5
+	addl	$$16, K				// K points to the next 4 K256 words for the next iteration
+#endif
+ movdqa $1, %xmm4 // W7:W4
+ palignr $$4, $0, %xmm4 // W4:W1
+ sigma0 %xmm4 // sigma0(W4:W1)
+ movdqa $3, %xmm6 // W15:W12
+ paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1)
+ palignr $$4, $2, %xmm6 // W12:W9
+ paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
+ movdqa $3, %xmm4 // W15:W12
+ psrldq $$8, %xmm4 // 0,0,W15,W14
+ sigma1 %xmm4 // sigma1(0,0,W15,W14)
+ paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
+ movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16
+ pslldq $$8, %xmm4 // W17, W16, 0, 0
+ sigma1 %xmm4 // sigma1(W17,W16,0,0)
+ paddd %xmm4, $0 // W19:W16
+ paddd $0, %xmm5 // WK
+ movdqa %xmm5, WK($4)
+ .endm
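+
+	// a rough C sketch of one invocation (r = 16, 20, ..., 60), with W[] and WK[] as
+	// 16-entry circular buffers:
+	//	for (i = 0; i < 4; i++) {
+	//		W[(r+i)&15] += sigma1(W[(r+i-2)&15]) + W[(r+i-7)&15] + sigma0(W[(r+i-15)&15]);
+	//		WK[(r+i)&15] = W[(r+i)&15] + K256[r+i];
+	//	}
+	// W[r+2]/W[r+3] depend on the freshly computed W[r]/W[r+1], which is why the sigma1
+	// term is applied in two halves above (low 2 words first, then high 2 words)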
+
+	// this macro is used in the last 16 rounds of the current block
+	// each invocation reads 4 4-byte words of the next message block, byte-swaps them into W[r:r+3],
+	// computes WK[r:r+3], and saves the result to the stack to prepare for the next block
+
+ .macro update_W_WK
+#if defined (__x86_64__)
+ movdqu $0*16(data), $1 // read 4 4-byte words
+ pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
+ movdqu $0*16(K), %xmm4 // K[r:r+3]
+#else
+ mov data_addr, t
+ movdqu $0*16(t), $1 // read 4 4-byte words
+ pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
+ mov K, t
+ movdqu $0*16(t), %xmm4 // K[r:r+3]
+#endif
+ paddd $1, %xmm4 // WK[r:r+3]
+ movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer
+ .endm
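+
+	// a rough C sketch, with i = $0 in 0..3 and data pointing at the next message block
+	// (load_bigendian32 is an illustrative helper, not a real function in this codebase):
+	//	for (j = 0; j < 4; j++) {
+	//		W[4*i+j]  = load_bigendian32(data + 16*i + 4*j);	// done by pshufb
+	//		WK[4*i+j] = W[4*i+j] + K256[4*i+j];
+	//	}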
+
+ .text
+
+#if defined (__x86_64__) || defined (__i386__)
+
+ .globl _SHA256_Transform
+
+_SHA256_Transform:
+
+
+ // detect SSSE3 and dispatch appropriate code branch
+ #if defined __x86_64__
+ movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
+ mov (%rax), %eax // %eax = __cpu_capabilities
+ #else // i386
+ #if defined KERNEL
+ leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
+ mov (%eax), %eax // %eax = __cpu_capabilities
+ #else
+ mov _COMM_PAGE_CPU_CAPABILITIES, %eax
+ #endif
+ #endif
+ test $(kHasSupplementalSSE3), %eax
+ je _SHA256_Transform_nossse3 // branch to no-ssse3 code
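+	// i.e. if ((__cpu_capabilities & kHasSupplementalSSE3) == 0) use the no-ssse3 variant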
+
+ // push callee-saved registers
+#if defined (__x86_64__)
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+#else
+ push %ebp
+ push %ebx
+ push %esi
+ push %edi
+#endif
+
+ // allocate stack space
+ sub $stack_size, sp
+
+ // if kernel code, save used xmm registers
+#if KERNEL
+ movdqa %xmm0, 0*16+xmm_save
+ movdqa %xmm1, 1*16+xmm_save
+ movdqa %xmm2, 2*16+xmm_save
+ movdqa %xmm3, 3*16+xmm_save
+ movdqa %xmm4, 4*16+xmm_save
+ movdqa %xmm5, 5*16+xmm_save
+ movdqa %xmm6, 6*16+xmm_save
+ movdqa %xmm7, 7*16+xmm_save
+#endif
+
+ // set up bswap parameters in the aligned stack space and pointer to table K256[]
+#if defined (__x86_64__)
+ lea _K256(%rip), K
+ lea L_bswap(%rip), %rax
+ movdqa (%rax), %xmm0
+#else
+ lea _K256, t
+ mov t, K
+ lea L_bswap, %eax
+ movdqa (%eax), %xmm0
+#endif
+ movdqa %xmm0, L_aligned_bswap
+
+ // load W[0:15] into xmm0-xmm3
+#if defined (__x86_64__)
+ movdqu 0*16(data), W0
+ movdqu 1*16(data), W1
+ movdqu 2*16(data), W2
+ movdqu 3*16(data), W3
+ add $64, data
+#else
+ mov data_addr, t
+ movdqu 0*16(t), W0
+ movdqu 1*16(t), W1
+ movdqu 2*16(t), W2
+ movdqu 3*16(t), W3
+	addl	$64, data_addr
+#endif
+ pshufb L_aligned_bswap, W0
+ pshufb L_aligned_bswap, W1
+ pshufb L_aligned_bswap, W2
+ pshufb L_aligned_bswap, W3
+
+ // compute WK[0:15] and save in stack
+#if defined (__x86_64__)
+ movdqu 0*16(K), %xmm4
+ movdqu 1*16(K), %xmm5
+ movdqu 2*16(K), %xmm6
+ movdqu 3*16(K), %xmm7
+#else
+ mov K, t
+ movdqu 0*16(t), %xmm4
+ movdqu 1*16(t), %xmm5
+ movdqu 2*16(t), %xmm6
+ movdqu 3*16(t), %xmm7
+#endif
+#if defined (__x86_64__)
+	add	$64, K
+#else
+	addl	$64, K
+#endif
+ paddd %xmm0, %xmm4
+ paddd %xmm1, %xmm5
+ paddd %xmm2, %xmm6
+ paddd %xmm3, %xmm7
+ movdqa %xmm4, WK(0)
+ movdqa %xmm5, WK(4)
+ movdqa %xmm6, WK(8)
+ movdqa %xmm7, WK(12)
+
+L_loop:
+
+	// digests a-h = ctx->state;
+#if defined (__x86_64__)
+ mov 0*4(ctx), a
+ mov 1*4(ctx), b
+ mov 2*4(ctx), c
+ mov 3*4(ctx), d
+ mov 4*4(ctx), e
+ mov 5*4(ctx), f
+ mov 6*4(ctx), g
+ mov 7*4(ctx), h
+#else
+ mov ctx_addr, t
+ mov 0*4(t), a
+ mov 1*4(t), b
+ mov 2*4(t), s
+ mov s, c
+ mov 3*4(t), d
+ mov 4*4(t), e
+ mov 5*4(t), s
+ mov s, f
+ mov 6*4(t), g
+ mov 7*4(t), s
+ mov s, h
+#endif
+
+ // rounds 0:47 interleaved with W/WK update for rounds 16:63
+ rounds a, b, c, d, e, f, g, h, 0
+ message_schedule W0,W1,W2,W3,16
+ rounds e, f, g, h, a, b, c, d, 4
+ message_schedule W1,W2,W3,W0,20
+ rounds a, b, c, d, e, f, g, h, 8
+ message_schedule W2,W3,W0,W1,24
+ rounds e, f, g, h, a, b, c, d, 12
+ message_schedule W3,W0,W1,W2,28
+ rounds a, b, c, d, e, f, g, h, 16
+ message_schedule W0,W1,W2,W3,32
+ rounds e, f, g, h, a, b, c, d, 20
+ message_schedule W1,W2,W3,W0,36
+ rounds a, b, c, d, e, f, g, h, 24
+ message_schedule W2,W3,W0,W1,40
+ rounds e, f, g, h, a, b, c, d, 28
+ message_schedule W3,W0,W1,W2,44
+ rounds a, b, c, d, e, f, g, h, 32
+ message_schedule W0,W1,W2,W3,48
+ rounds e, f, g, h, a, b, c, d, 36
+ message_schedule W1,W2,W3,W0,52
+ rounds a, b, c, d, e, f, g, h, 40
+ message_schedule W2,W3,W0,W1,56
+ rounds e, f, g, h, a, b, c, d, 44
+ message_schedule W3,W0,W1,W2,60
+
+ // revert K to the beginning of K256[]
+#if defined __x86_64__
+ sub $256, K
+#else
+ subl $256, K
+#endif
+
+#if defined (__x86_64__)
+	sub	$1, num_blocks				// num_blocks--
+#else
+	subl	$1, num_blocks				// num_blocks--
+#endif
+ je L_final_block // if final block, wrap up final rounds
+
+ // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
+ rounds a, b, c, d, e, f, g, h, 48
+ update_W_WK 0, W0
+ rounds e, f, g, h, a, b, c, d, 52
+ update_W_WK 1, W1
+ rounds a, b, c, d, e, f, g, h, 56
+ update_W_WK 2, W2
+ rounds e, f, g, h, a, b, c, d, 60
+ update_W_WK 3, W3
+
+#if defined (__x86_64__)
+	add	$64, K
+	add	$64, data
+#else
+	addl	$64, K
+	addl	$64, data_addr
+#endif
+
+	// ctx->state += digests a-h
+#if defined (__x86_64__)
+ add a, 0*4(ctx)
+ add b, 1*4(ctx)
+ add c, 2*4(ctx)
+ add d, 3*4(ctx)
+ add e, 4*4(ctx)
+ add f, 5*4(ctx)
+ add g, 6*4(ctx)
+ add h, 7*4(ctx)
+#else
+ mov ctx_addr, t
+ add a, 0*4(t)
+ add b, 1*4(t)
+ mov c, s
+ add s, 2*4(t)
+ add d, 3*4(t)
+ add e, 4*4(t)
+ mov f, s
+ add s, 5*4(t)
+ add g, 6*4(t)
+ mov h, s
+ add s, 7*4(t)
+#endif
+
+ jmp L_loop // branch for next block
+
+	// wrap up digest update for rounds 48:63 of the final block
+L_final_block:
+ rounds a, b, c, d, e, f, g, h, 48
+ rounds e, f, g, h, a, b, c, d, 52
+ rounds a, b, c, d, e, f, g, h, 56
+ rounds e, f, g, h, a, b, c, d, 60
+
+	// ctx->state += digests a-h
+#if defined (__x86_64__)
+ add a, 0*4(ctx)
+ add b, 1*4(ctx)
+ add c, 2*4(ctx)
+ add d, 3*4(ctx)
+ add e, 4*4(ctx)
+ add f, 5*4(ctx)
+ add g, 6*4(ctx)
+ add h, 7*4(ctx)
+#else
+ mov ctx_addr, t
+ add a, 0*4(t)
+ add b, 1*4(t)
+ mov c, s
+ add s, 2*4(t)
+ add d, 3*4(t)
+ add e, 4*4(t)
+ mov f, s
+ add s, 5*4(t)
+ add g, 6*4(t)
+ mov h, s
+ add s, 7*4(t)
+#endif
+
+ // if kernel, restore xmm0-xmm7
+#if KERNEL
+ movdqa 0*16+xmm_save, %xmm0
+ movdqa 1*16+xmm_save, %xmm1
+ movdqa 2*16+xmm_save, %xmm2
+ movdqa 3*16+xmm_save, %xmm3
+ movdqa 4*16+xmm_save, %xmm4
+ movdqa 5*16+xmm_save, %xmm5
+ movdqa 6*16+xmm_save, %xmm6
+ movdqa 7*16+xmm_save, %xmm7
+#endif
+
+ // free allocated stack memory
+ add $stack_size, sp
+
+ // restore callee-saved registers
+#if defined (__x86_64__)
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+#else
+ pop %edi
+ pop %esi
+ pop %ebx
+ pop %ebp
+#endif
+
+ // return
+ ret
+
+
+ .const
+ .align 4, 0x90
+
+L_bswap:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
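+
+	// pshufb control mask: within each 4-byte lane the source bytes are picked in order 3,2,1,0,
+	// converting each little-endian 32-bit load into the big-endian word value SHA-256 expects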
+
+#endif // x86_64/i386
+