2 This file provides a hand-optimized x86_64/i386 assembly implementation of the following function
4 void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
6 which is a C function in sha2.c (from xnu).
8 The code first probes cpu_capabilities to detect whether SSSE3 is supported. If not, it branches to
9 SHA256_Transform_nossse3 (in a separate source file sha256nossse3.s) that was cloned from this file
10 with all ssse3 instructions replaced with sse3 or below instructions.
12 sha256 algorithm per block description:
14 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
15 2. load 8 digests a-h from ctx->state
17 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
19 h = T1 + Sigma0(a) + Maj(a,b,c)
20 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
22 W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
23 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
25 h = T1 + Sigma0(a) + Maj(a,b,c)
26 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
28 In the assembly implementation:
29 - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
30 - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
31 - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)
33 the implementation per block looks like
35 ----------------------------------------------------------------------------
37 load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
38 pre_calculate and store W+K(0:15) in stack
40 load digests a-h from ctx->state;
43 digests a-h update and permute round r:r+3
44 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
47 for (r=48;r<64;r+=4) {
48 digests a-h update and permute round r:r+3
51 ctx->state += digests a-h;
53 ----------------------------------------------------------------------------
55 our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
56 into the last 16 rounds of its previous block:
58 ----------------------------------------------------------------------------
60 load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
61 pre_calculate and store W+K(0:15) in stack
65 load digests a-h from ctx->state;
68 digests a-h update and permute round r:r+3
69 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
73 if (num_block==0) jmp L_last_block;
75 for (r=48;r<64;r+=4) {
76 digests a-h update and permute round r:r+3
77 load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
78 pre_calculate and store W+K([r:r+3]%16) in stack
81 ctx->state += digests a-h;
87 for (r=48;r<64;r+=4) {
88 digests a-h update and permute round r:r+3
91 ctx->state += digests a-h;
93 ------------------------------------------------------------------------
95 Apple CoreOS vector & numerics
100 #include <i386/cpu_capabilities.h>
102 #include <System/i386/cpu_capabilities.h>
105 // associate variables with registers or memory
107 #if defined (__x86_64__)
111 #define num_blocks %rdx
123 #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)
125 #define L_aligned_bswap 64(sp) // bswap : big-endian loading of 4-byte words
126 #define xmm_save 80(sp) // starting address for xmm save/restore
129 #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
130 #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument
131 #define data_addr 24+stack_size(sp) // 2nd caller argument
132 #define num_blocks 28+stack_size(sp) // 3rd caller argument
143 #define K 76(sp) // pointer to K256[] table
144 #define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words
145 #define xmm_save 96(sp) // starting address for xmm save/restore
152 // a window (16 words) of message schedule
158 // circular buffer for WK[(r:r+15)%16]
159 #define WK(x) (x&15)*4(sp)
161 // #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
169 xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
172 // #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
179 xor s, t // (x&y) ^ (y&z)
182 xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
185 /* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
186 // #define R(b,x) ((x) >> (b))
187 /* 32-bit Rotate-right (used in SHA-256): */
188 // #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b))))
190 // #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
192 // performs sigma0_256 on 4 words in an xmm register
193 // use xmm6/xmm7 as intermediate registers
197 psrld $$3, $0 // SHR3(x)
198 psrld $$7, %xmm6 // part of ROTR7
199 pslld $$14, %xmm7 // part of ROTR18
202 psrld $$11, %xmm6 // part of ROTR18
203 pslld $$11, %xmm7 // part of ROTR7
208 // #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
210 // performs sigma1_256 on 4 words in an xmm register
211 // use xmm6/xmm7 as intermediate registers
215 psrld $$10, $0 // SHR10(x)
216 psrld $$17, %xmm6 // part of ROTR17
218 pslld $$13, %xmm7 // part of ROTR19
220 psrld $$2, %xmm6 // part of ROTR19
222 pslld $$2, %xmm7 // part of ROTR17
226 // #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
231 ror $$2, t // S32(2, (x))
232 ror $$13, s // S32(13, (x))
233 xor s, t // S32(2, (x)) ^ S32(13, (x))
234 ror $$9, s // S32(22, (x))
235 xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
238 // #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
242 ror $$6, s // S32(6, (x))
243 mov s, t // S32(6, (x))
244 ror $$5, s // S32(11, (x))
245 xor s, t // S32(6, (x)) ^ S32(11, (x))
246 ror $$14, s // S32(25, (x))
247 xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
250 // per round digests update
253 add t, $7 // use h to store h+Sigma1(e)
254 Ch $4, $5, $6 // t = Ch (e, f, g);
255 add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
256 add WK($8), t // h = T1
257 add t, $3 // d += T1;
259 Sigma0 $0 // t = Sigma0(a);
260 add t, $7 // h = T1 + Sigma0(a);
261 Maj $0, $1, $2 // t = Maj(a,b,c)
262 add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c);
265 // per 4 rounds digests update and permutation
266 // permutation is absorbed by rotating the roles of digests a-h
268 round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
269 round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
270 round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
271 round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
274 // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
275 .macro message_schedule
277 // 4 32-bit K256 words in xmm5
278 #if defined (__x86_64__)
284 add $$16, K // K points to next K256 word for next iteration
285 movdqa $1, %xmm4 // W7:W4
286 palignr $$4, $0, %xmm4 // W4:W1
287 sigma0 %xmm4 // sigma0(W4:W1)
288 movdqa $3, %xmm6 // W15:W12
289 paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1)
290 palignr $$4, $2, %xmm6 // W12:W9
291 paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
292 movdqa $3, %xmm4 // W15:W12
293 psrldq $$8, %xmm4 // 0,0,W15,W14
294 sigma1 %xmm4 // sigma1(0,0,W15,W14)
295 paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
296 movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16
297 pslldq $$8, %xmm4 // W17, W16, 0, 0
298 sigma1 %xmm4 // sigma1(W17,W16,0,0)
299 paddd %xmm4, $0 // W19:W16
300 paddd $0, %xmm5 // WK
304 // this macro is used in the last 16 rounds of a current block
305 // it reads the next message (16 4-byte words), loads them into W[r:r+3] 4 words at a time, computes WK[r:r+3]
306 // and save into stack to prepare for next block
309 #if defined (__x86_64__)
310 movdqu $0*16(data), $1 // read 4 4-byte words
311 pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
312 movdqu $0*16(K), %xmm4 // K[r:r+3]
315 movdqu $0*16(t), $1 // read 4 4-byte words
316 pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
318 movdqu $0*16(t), %xmm4 // K[r:r+3]
320 paddd $1, %xmm4 // WK[r:r+3]
321 movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer
326 #if defined (__x86_64__) || defined (__i386__)
328 .globl _SHA256_Transform
333 // detect SSSE3 and dispatch appropriate code branch
334 #if defined __x86_64__
335 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
336 mov (%rax), %eax // %eax = __cpu_capabilities
339 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
340 mov (%eax), %eax // %eax = __cpu_capabilities
342 mov _COMM_PAGE_CPU_CAPABILITIES, %eax
345 test $(kHasSupplementalSSE3), %eax
346 je _SHA256_Transform_nossse3 // branch to no-ssse3 code
348 // push callee-saved registers
349 #if defined (__x86_64__)
363 // allocate stack space
366 // if kernel code, save used xmm registers
368 movdqa %xmm0, 0*16+xmm_save
369 movdqa %xmm1, 1*16+xmm_save
370 movdqa %xmm2, 2*16+xmm_save
371 movdqa %xmm3, 3*16+xmm_save
372 movdqa %xmm4, 4*16+xmm_save
373 movdqa %xmm5, 5*16+xmm_save
374 movdqa %xmm6, 6*16+xmm_save
375 movdqa %xmm7, 7*16+xmm_save
378 // set up bswap parameters in the aligned stack space and pointer to table K256[]
379 #if defined (__x86_64__)
381 lea L_bswap(%rip), %rax
389 movdqa %xmm0, L_aligned_bswap
391 // load W[0:15] into xmm0-xmm3
392 #if defined (__x86_64__)
393 movdqu 0*16(data), W0
394 movdqu 1*16(data), W1
395 movdqu 2*16(data), W2
396 movdqu 3*16(data), W3
406 pshufb L_aligned_bswap, W0
407 pshufb L_aligned_bswap, W1
408 pshufb L_aligned_bswap, W2
409 pshufb L_aligned_bswap, W3
411 // compute WK[0:15] and save in stack
412 #if defined (__x86_64__)
413 movdqu 0*16(K), %xmm4
414 movdqu 1*16(K), %xmm5
415 movdqu 2*16(K), %xmm6
416 movdqu 3*16(K), %xmm7
419 movdqu 0*16(t), %xmm4
420 movdqu 1*16(t), %xmm5
421 movdqu 2*16(t), %xmm6
422 movdqu 3*16(t), %xmm7
436 // digests a-h = ctx->states;
437 #if defined (__x86_64__)
461 // rounds 0:47 interleaved with W/WK update for rounds 16:63
462 rounds a, b, c, d, e, f, g, h, 0
463 message_schedule W0,W1,W2,W3,16
464 rounds e, f, g, h, a, b, c, d, 4
465 message_schedule W1,W2,W3,W0,20
466 rounds a, b, c, d, e, f, g, h, 8
467 message_schedule W2,W3,W0,W1,24
468 rounds e, f, g, h, a, b, c, d, 12
469 message_schedule W3,W0,W1,W2,28
470 rounds a, b, c, d, e, f, g, h, 16
471 message_schedule W0,W1,W2,W3,32
472 rounds e, f, g, h, a, b, c, d, 20
473 message_schedule W1,W2,W3,W0,36
474 rounds a, b, c, d, e, f, g, h, 24
475 message_schedule W2,W3,W0,W1,40
476 rounds e, f, g, h, a, b, c, d, 28
477 message_schedule W3,W0,W1,W2,44
478 rounds a, b, c, d, e, f, g, h, 32
479 message_schedule W0,W1,W2,W3,48
480 rounds e, f, g, h, a, b, c, d, 36
481 message_schedule W1,W2,W3,W0,52
482 rounds a, b, c, d, e, f, g, h, 40
483 message_schedule W2,W3,W0,W1,56
484 rounds e, f, g, h, a, b, c, d, 44
485 message_schedule W3,W0,W1,W2,60
487 // revert K to the beginning of K256[]
488 #if defined __x86_64__
494 sub $1, num_blocks // num_blocks--
495 je L_final_block // if final block, wrap up final rounds
497 // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
498 rounds a, b, c, d, e, f, g, h, 48
500 rounds e, f, g, h, a, b, c, d, 52
502 rounds a, b, c, d, e, f, g, h, 56
504 rounds e, f, g, h, a, b, c, d, 60
508 #if defined (__x86_64__)
514 // ctx->states += digests a-h
515 #if defined (__x86_64__)
539 jmp L_loop // branch for next block
541 // wrap up digest update round 48:63 for final block
543 rounds a, b, c, d, e, f, g, h, 48
544 rounds e, f, g, h, a, b, c, d, 52
545 rounds a, b, c, d, e, f, g, h, 56
546 rounds e, f, g, h, a, b, c, d, 60
548 // ctx->states += digests a-h
549 #if defined (__x86_64__)
573 // if kernel, restore xmm0-xmm7
575 movdqa 0*16+xmm_save, %xmm0
576 movdqa 1*16+xmm_save, %xmm1
577 movdqa 2*16+xmm_save, %xmm2
578 movdqa 3*16+xmm_save, %xmm3
579 movdqa 4*16+xmm_save, %xmm4
580 movdqa 5*16+xmm_save, %xmm5
581 movdqa 6*16+xmm_save, %xmm6
582 movdqa 7*16+xmm_save, %xmm7
585 // free allocated stack memory
588 // restore callee-saved registers
589 #if defined (__x86_64__)
616 #endif // x86_64/i386