2 This file provides a hand-written x86_64/i386 implementation of the following function
4 void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
6 which is a C function in sha2.c (from xnu).
8 The code SHA256_Transform_nossse3 is a clone of SHA256_Transform
9 with all ssse3 instructions replaced with sse3 or below instructions.
11 For performance reasons, this function should not be called directly. This file should work
12 together with the one that implements SHA256_Transform. There, cpu_capabilities is probed to detect
13 ssse3. If ssse3 is not supported, execution branches to this no-ssse3-specific function.
15 sha256 algorithm per block description:
17 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
18 2. load 8 digests a-h from ctx->state
20 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
22 h = T1 + Sigma0(a) + Maj(a,b,c)
23 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
25 W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
26 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
28 h = T1 + Sigma0(a) + Maj(a,b,c)
29 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
31 In the assembly implementation:
32 - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
33 - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
34 - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)
36 the implementation per block looks like
38 ----------------------------------------------------------------------------
40 load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
41 pre_calculate and store W+K(0:15) in stack
43 load digests a-h from ctx->state;
46 digests a-h update and permute round r:r+3
47 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
50 for (r=48;r<64;r+=4) {
51 digests a-h update and permute round r:r+3
54 ctx->state += digests a-h;
56 ----------------------------------------------------------------------------
58 our implementation (which allows multiple blocks per call) pipelines the loading of W/WK for a future block
59 into the last 16 rounds of the previous block:
61 ----------------------------------------------------------------------------
63 load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
64 pre_calculate and store W+K(0:15) in stack
68 load digests a-h from ctx->state;
71 digests a-h update and permute round r:r+3
72 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
76 if (num_block==0) jmp L_last_block;
78 for (r=48;r<64;r+=4) {
79 digests a-h update and permute round r:r+3
80 load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
81 pre_calculate and store W+K([r:r+3]%16) in stack
84 ctx->state += digests a-h;
90 for (r=48;r<64;r+=4) {
91 digests a-h update and permute round r:r+3
94 ctx->state += digests a-h;
96 ------------------------------------------------------------------------
98 Apple CoreOS vector & numerics
103 #include <i386/cpu_capabilities.h>
105 #include <System/i386/cpu_capabilities.h>
108 // associate variables with registers or memory
110 #if defined (__x86_64__)
114 #define num_blocks %rdx
126 #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)
128 #define xmm_save 80(sp) // starting address for xmm save/restore
131 #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
132 #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument
133 #define data_addr 24+stack_size(sp) // 2nd caller argument
134 #define num_blocks 28+stack_size(sp) // 3rd caller argument
145 #define K 76(sp) // pointer to K256[] table
146 #define xmm_save 96(sp) // starting address for xmm save/restore
153 // a window (16 words) of message schedule
159 // circular buffer for WK[(r:r+15)%16]
160 #define WK(x) (x&15)*4(sp)
162 // #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
170 xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
173 // #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
180 xor s, t // (x&y) ^ (y&z)
183 xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
186 /* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
187 // #define R(b,x) ((x) >> (b))
188 /* 32-bit Rotate-right (used in SHA-256): */
189 // #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b))))
191 // #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
193 // performs sigma0_256 on 4 words in an xmm register
194 // use xmm6/xmm7 as intermediate registers
198 psrld $$3, $0 // SHR3(x)
199 psrld $$7, %xmm6 // part of ROTR7
200 pslld $$14, %xmm7 // part of ROTR18
203 psrld $$11, %xmm6 // part of ROTR18
204 pslld $$11, %xmm7 // part of ROTR7
209 // #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
211 // performs sigma1_256 on 4 words in an xmm register
212 // use xmm6/xmm7 as intermediate registers
216 psrld $$10, $0 // SHR10(x)
217 psrld $$17, %xmm6 // part of ROTR17
219 pslld $$13, %xmm7 // part of ROTR19
221 psrld $$2, %xmm6 // part of ROTR19
223 pslld $$2, %xmm7 // part of ROTR17
227 // #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
232 ror $$2, t // S32(2, (x))
233 ror $$13, s // S32(13, (x))
234 xor s, t // S32(2, (x)) ^ S32(13, (x))
235 ror $$9, s // S32(22, (x))
236 xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
239 // #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
243 ror $$6, s // S32(6, (x))
244 mov s, t // S32(6, (x))
245 ror $$5, s // S32(11, (x))
246 xor s, t // S32(6, (x)) ^ S32(11, (x))
247 ror $$14, s // S32(25, (x))
248 xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
251 // per round digests update
254 add t, $7 // use h to store h+Sigma1(e)
255 Ch $4, $5, $6 // t = Ch (e, f, g);
256 add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
257 add WK($8), t // h = T1
258 add t, $3 // d += T1;
260 Sigma0 $0 // t = Sigma0(a);
261 add t, $7 // h = T1 + Sigma0(a);
262 Maj $0, $1, $2 // t = Maj(a,b,c)
263 add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c);
266 // per 4 rounds digests update and permutation
267 // permutation is absorbed by rotating the roles of digests a-h
269 round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
270 round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
271 round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
272 round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
275 // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
276 .macro message_schedule
278 // 4 32-bit K256 words in xmm5
279 #if defined (__x86_64__)
285 add $$16, K // K points to next K256 word for next iteration
286 movdqa $1, %xmm4 // W7:W4
288 palignr $$4, $0, %xmm4 // W4:W1
289 #else // no-ssse3 implementation of palignr
295 sigma0 %xmm4 // sigma0(W4:W1)
296 movdqa $3, %xmm6 // W15:W12
297 paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1)
299 palignr $$4, $2, %xmm6 // W12:W9
300 #else // no-ssse3 implementation of palignr
306 paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
307 movdqa $3, %xmm4 // W15:W12
308 psrldq $$8, %xmm4 // 0,0,W15,W14
309 sigma1 %xmm4 // sigma1(0,0,W15,W14)
310 paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
311 movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16
312 pslldq $$8, %xmm4 // W17, W16, 0, 0
313 sigma1 %xmm4 // sigma1(W17,W16,0,0)
314 paddd %xmm4, $0 // W19:W16
315 paddd $0, %xmm5 // WK
319 // this macro is used in the last 16 rounds of a current block
320 // it reads part of the next message (4 4-byte words per invocation), loads it into W[r:r+3], computes WK[r:r+3],
321 // and saves the result into the stack to prepare for the next block
324 #if defined (__x86_64__)
326 movdqu $0*16(data), $1 // read 4 4-byte words
327 pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
328 #else // no-ssse3 implementation
338 mov 12+$0*16(data), s
343 movdqu $0*16(K), %xmm4 // K[r:r+3]
347 movdqu $0*16(t), $1 // read 4 4-byte words
348 pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
349 #else // no-ssse3 implementation
365 movdqu $0*16(t), %xmm4 // K[r:r+3]
367 paddd $1, %xmm4 // WK[r:r+3]
368 movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer
373 #if defined (__x86_64__) || defined (__i386__)
375 .globl _SHA256_Transform_nossse3
377 _SHA256_Transform_nossse3:
379 // push callee-saved registers
380 #if defined (__x86_64__)
394 // allocate stack space
397 // if kernel code, save used xmm registers
399 movdqa %xmm0, 0*16+xmm_save
400 movdqa %xmm1, 1*16+xmm_save
401 movdqa %xmm2, 2*16+xmm_save
402 movdqa %xmm3, 3*16+xmm_save
403 movdqa %xmm4, 4*16+xmm_save
404 movdqa %xmm5, 5*16+xmm_save
405 movdqa %xmm6, 6*16+xmm_save
406 movdqa %xmm7, 7*16+xmm_save
409 // set up pointer to table K256[]
410 #if defined (__x86_64__)
417 // load W[0:15] into xmm0-xmm3
433 #if defined (__x86_64__)
452 // compute WK[0:15] and save in stack
453 #if defined (__x86_64__)
454 movdqu 0*16(K), %xmm4
455 movdqu 1*16(K), %xmm5
456 movdqu 2*16(K), %xmm6
457 movdqu 3*16(K), %xmm7
460 movdqu 0*16(t), %xmm4
461 movdqu 1*16(t), %xmm5
462 movdqu 2*16(t), %xmm6
463 movdqu 3*16(t), %xmm7
477 // digests a-h = ctx->states;
478 #if defined (__x86_64__)
502 // rounds 0:47 interleaved with W/WK update for rounds 16:63
503 rounds a, b, c, d, e, f, g, h, 0
504 message_schedule W0,W1,W2,W3,16
505 rounds e, f, g, h, a, b, c, d, 4
506 message_schedule W1,W2,W3,W0,20
507 rounds a, b, c, d, e, f, g, h, 8
508 message_schedule W2,W3,W0,W1,24
509 rounds e, f, g, h, a, b, c, d, 12
510 message_schedule W3,W0,W1,W2,28
511 rounds a, b, c, d, e, f, g, h, 16
512 message_schedule W0,W1,W2,W3,32
513 rounds e, f, g, h, a, b, c, d, 20
514 message_schedule W1,W2,W3,W0,36
515 rounds a, b, c, d, e, f, g, h, 24
516 message_schedule W2,W3,W0,W1,40
517 rounds e, f, g, h, a, b, c, d, 28
518 message_schedule W3,W0,W1,W2,44
519 rounds a, b, c, d, e, f, g, h, 32
520 message_schedule W0,W1,W2,W3,48
521 rounds e, f, g, h, a, b, c, d, 36
522 message_schedule W1,W2,W3,W0,52
523 rounds a, b, c, d, e, f, g, h, 40
524 message_schedule W2,W3,W0,W1,56
525 rounds e, f, g, h, a, b, c, d, 44
526 message_schedule W3,W0,W1,W2,60
528 // revert K to the beginning of K256[]
529 #if defined __x86_64__
535 sub $1, num_blocks // num_blocks--
536 je L_final_block // if final block, wrap up final rounds
538 // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
539 rounds a, b, c, d, e, f, g, h, 48
541 rounds e, f, g, h, a, b, c, d, 52
543 rounds a, b, c, d, e, f, g, h, 56
545 rounds e, f, g, h, a, b, c, d, 60
549 #if defined (__x86_64__)
555 // ctx->state += digests a-h
556 #if defined (__x86_64__)
580 jmp L_loop // branch for next block
582 // wrap up digest update round 48:63 for final block
584 rounds a, b, c, d, e, f, g, h, 48
585 rounds e, f, g, h, a, b, c, d, 52
586 rounds a, b, c, d, e, f, g, h, 56
587 rounds e, f, g, h, a, b, c, d, 60
589 // ctx->state += digests a-h
590 #if defined (__x86_64__)
614 // if kernel, restore xmm0-xmm7
616 movdqa 0*16+xmm_save, %xmm0
617 movdqa 1*16+xmm_save, %xmm1
618 movdqa 2*16+xmm_save, %xmm2
619 movdqa 3*16+xmm_save, %xmm3
620 movdqa 4*16+xmm_save, %xmm4
621 movdqa 5*16+xmm_save, %xmm5
622 movdqa 6*16+xmm_save, %xmm6
623 movdqa 7*16+xmm_save, %xmm7
626 // free allocated stack memory
629 // restore callee-saved registers
630 #if defined (__x86_64__)
648 #endif // x86_64/i386