/*
	This file provides the x86_64/i386 hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	The code first probes cpu_capabilities to detect whether SSSE3 is supported. If not, it branches to
	SHA256_Transform_nossse3 (in a separate source file, sha256nossse3.s), which was cloned from this file
	with all SSSE3 instructions replaced by SSE3-or-below instructions.

	sha256 algorithm per-block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c)
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
			W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c)
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

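	For reference, a scalar C sketch of one round, built from the macro
	definitions quoted in the comments further below:

		#define R(b,x)        ((x) >> (b))
		#define S32(b,x)      (((x) >> (b)) | ((x) << (32 - (b))))
		#define Ch(x,y,z)     (((x) & (y)) ^ ((~(x)) & (z)))
		#define Maj(x,y,z)    (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
		#define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
		#define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))

		T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[r] + W[r];
		d += T1;
		h  = T1 + Sigma0_256(a) + Maj(a, b, c);
		// then rotate the roles: (a,b,c,d,e,f,g,h) <- (h,a,b,c,d,e,f,g)
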
	In the assembly implementation:
		- a circular window of the message schedule W(r:r+15) is updated and stored in xmm0-xmm3
		- its corresponding W+K(r:r+15) is updated and stored in a stack-space circular buffer
		- the 8 digests (a-h) are stored in GPRs or memory (all in GPRs for x86_64, some in m32 for i386)

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for rounds r+16:r+19
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->state += digests a-h;

	----------------------------------------------------------------------------

	our implementation (which allows multiple blocks per call) pipelines the loading of W/WK for the next
	block into the last 16 rounds of the previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for rounds r+16:r+19
	}

	num_blocks--;
	if (num_blocks==0) jmp L_final_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->state += digests a-h;

	jmp L_loop;

L_final_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->state += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
	cclee 8-3-10
*/

#if defined KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

// associate variables with registers or memory

#if defined (__x86_64__)
#define sp          %rsp
#define ctx         %rdi
#define data        %rsi
#define num_blocks  %rdx

#define a   %r8d
#define b   %r9d
#define c   %r10d
#define d   %r11d
#define e   %r12d
#define f   %r13d
#define g   %r14d
#define h   %r15d

#define K   %rbx
#define stack_size  (8+16*8+16+64)      // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)

#define L_aligned_bswap 64(sp)          // bswap : big-endian loading of 4-byte words
#define xmm_save        80(sp)          // starting address for xmm save/restore
#else
#define sp          %esp
#define stack_size  (12+16*8+16+16+64)  // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
#define ctx_addr    20+stack_size(sp)   // ret_addr + 4 registers = 20, 1st caller argument
#define data_addr   24+stack_size(sp)   // 2nd caller argument
#define num_blocks  28+stack_size(sp)   // 3rd caller argument

#define a   %ebx
#define b   %edx
#define c   64(sp)
#define d   %ebp
#define e   %esi
#define f   68(sp)
#define g   %edi
#define h   72(sp)

#define K               76(sp)          // pointer to K256[] table
#define L_aligned_bswap 80(sp)          // bswap : big-endian loading of 4-byte words
#define xmm_save        96(sp)          // starting address for xmm save/restore
#endif
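
// For reference, the stack layout these defines imply (offsets from sp after
// the "sub $stack_size, sp" below):
//
//   x86_64: 0..63 WK[0:15], 64..79 L_aligned_bswap, 80..207 xmm_save,
//           208..215 alignment pad
//   i386:   0..63 WK[0:15], 64 c, 68 f, 72 h, 76 K, 80..95 L_aligned_bswap,
//           96..223 xmm_save, 224..235 alignment pad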

// 2 local variables
#define t   %eax
#define s   %ecx

// a window (16 words) of the message schedule
#define W0  %xmm0
#define W1  %xmm1
#define W2  %xmm2
#define W3  %xmm3

// circular buffer for WK[(r:r+15)%16]
#define WK(x)   (x&15)*4(sp)

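// For example, WK(18) expands to (18&15)*4(sp) = 8(sp): round 18 reuses the
// slot of round 2, giving a 16-entry circular buffer of W+K values.
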
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))

.macro Ch
        mov     $0, t           // x
        mov     $0, s           // x
        not     t               // ~x
        and     $1, s           // x & y
        and     $2, t           // ~x & z
        xor     s, t            // t = ((x) & (y)) ^ ((~(x)) & (z));
.endm

// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

.macro Maj
        mov     $0, t           // x
        mov     $1, s           // y
        and     s, t            // x&y
        and     $2, s           // y&z
        xor     s, t            // (x&y) ^ (y&z)
        mov     $2, s           // z
        and     $0, s           // (x&z)
        xor     s, t            // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.endm

/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
// #define R(b,x) ((x) >> (b))
/* 32-bit Rotate-right (used in SHA-256): */
// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b))))

// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))

// performs sigma0_256 on 4 words in an xmm register
// uses xmm6/xmm7 as intermediate registers
.macro sigma0
        movdqa  $0, %xmm6
        movdqa  $0, %xmm7
        psrld   $$3, $0         // SHR3(x)
        psrld   $$7, %xmm6      // part of ROTR7
        pslld   $$14, %xmm7     // part of ROTR18
        pxor    %xmm6, $0
        pxor    %xmm7, $0
        psrld   $$11, %xmm6     // part of ROTR18
        pslld   $$11, %xmm7     // part of ROTR7
        pxor    %xmm6, $0
        pxor    %xmm7, $0
.endm

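/*
	A hedged C intrinsics sketch of the same trick (illustrative, not part of
	the build): two cumulative shift passes produce all five terms of
	sigma0_256(x) = ROTR7(x) ^ ROTR18(x) ^ SHR3(x), where
	ROTR7(x) = (x>>7)|(x<<25) and ROTR18(x) = (x>>18)|(x<<14):

	#include <emmintrin.h>  // SSE2

	static inline __m128i sigma0_256_vec(__m128i x)
	{
		__m128i r  = _mm_srli_epi32(x, 3);   // SHR3(x)
		__m128i lo = _mm_srli_epi32(x, 7);   // x >> 7   (part of ROTR7)
		__m128i hi = _mm_slli_epi32(x, 14);  // x << 14  (part of ROTR18)
		r  = _mm_xor_si128(r, lo);
		r  = _mm_xor_si128(r, hi);
		lo = _mm_srli_epi32(lo, 11);         // x >> 18  (part of ROTR18)
		hi = _mm_slli_epi32(hi, 11);         // x << 25  (part of ROTR7)
		r  = _mm_xor_si128(r, lo);
		r  = _mm_xor_si128(r, hi);
		return r;                            // sigma0_256 of 4 words at once
	}

	The sigma1 macro below follows the same pattern for
	sigma1_256(x) = ROTR17(x) ^ ROTR19(x) ^ SHR10(x).
*/
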
// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))

// performs sigma1_256 on 4 words in an xmm register
// uses xmm6/xmm7 as intermediate registers
.macro sigma1
        movdqa  $0, %xmm6
        movdqa  $0, %xmm7
        psrld   $$10, $0        // SHR10(x)
        psrld   $$17, %xmm6     // part of ROTR17
        pxor    %xmm6, $0
        pslld   $$13, %xmm7     // part of ROTR19
        pxor    %xmm7, $0
        psrld   $$2, %xmm6      // part of ROTR19
        pxor    %xmm6, $0
        pslld   $$2, %xmm7      // part of ROTR17
        pxor    %xmm7, $0
.endm

// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))

.macro Sigma0
        mov     $0, t           // x
        mov     $0, s           // x
        ror     $$2, t          // S32(2, (x))
        ror     $$13, s         // S32(13, (x))
        xor     s, t            // S32(2, (x)) ^ S32(13, (x))
        ror     $$9, s          // S32(22, (x))
        xor     s, t            // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm

// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))

.macro Sigma1
        mov     $0, s           // x
        ror     $$6, s          // S32(6, (x))
        mov     s, t            // S32(6, (x))
        ror     $$5, s          // S32(11, (x))
        xor     s, t            // S32(6, (x)) ^ S32(11, (x))
        ror     $$14, s         // S32(25, (x))
        xor     s, t            // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm

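// Note that the rotation counts above are cumulative: the same register is
// rotated by 6, then 5 more (= 11), then 14 more (= 25). A scalar C sketch
// (rotr32 is an assumed rotate-right helper, not part of this file):
//
//      uint32_t s = rotr32(x, 6);      // S32(6, x)
//      uint32_t t = s;
//      s = rotr32(s, 5);               // S32(11, x)
//      t ^= s;
//      s = rotr32(s, 14);              // S32(25, x)
//      t ^= s;                         // t = Sigma1_256(x)
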
// per-round digest update
.macro round
        Sigma1  $4              // t = Sigma1(e)
        add     t, $7           // use h to store h+Sigma1(e)
        Ch      $4, $5, $6      // t = Ch (e, f, g);
        add     $7, t           // t = h+Sigma1(e)+Ch(e,f,g);
        add     WK($8), t       // t = T1 = h+Sigma1(e)+Ch(e,f,g)+K[r]+W[r];
        add     t, $3           // d += T1;
        mov     t, $7           // h = T1
        Sigma0  $0              // t = Sigma0(a);
        add     t, $7           // h = T1 + Sigma0(a);
        Maj     $0, $1, $2      // t = Maj(a,b,c)
        add     t, $7           // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm

// per-4-rounds digest update and permutation
// the permutation is absorbed by rotating the roles of digests a-h
.macro rounds
        round   $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
        round   $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
        round   $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
        round   $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
.endm

// update the message schedule W and W+K (4 words each), 16 rounds ahead
.macro message_schedule

        // 4 32-bit K256 words in xmm5
#if defined (__x86_64__)
        movdqu  (K), %xmm5
#else
        mov     K, t
        movdqu  (t), %xmm5
#endif
        add     $$16, K         // K points to the next 4 K256 words for the next iteration
        movdqa  $1, %xmm4       // W7:W4
        palignr $$4, $0, %xmm4  // W4:W1
        sigma0  %xmm4           // sigma0(W4:W1)
        movdqa  $3, %xmm6       // W15:W12
        paddd   %xmm4, $0       // $0 = W3:W0 + sigma0(W4:W1)
        palignr $$4, $2, %xmm6  // W12:W9
        paddd   %xmm6, $0       // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
        movdqa  $3, %xmm4       // W15:W12
        psrldq  $$8, %xmm4      // 0,0,W15,W14
        sigma1  %xmm4           // sigma1(0,0,W15,W14)
        paddd   %xmm4, $0       // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
        movdqa  $0, %xmm4       // W19-sigma1(W17), W18-sigma1(W16), W17, W16
        pslldq  $$8, %xmm4      // W17, W16, 0, 0
        sigma1  %xmm4           // sigma1(W17,W16,0,0)
        paddd   %xmm4, $0       // W19:W16
        paddd   $0, %xmm5       // WK
        movdqa  %xmm5, WK($4)
.endm

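/*
	A hedged C intrinsics sketch of one message_schedule step (illustrative,
	not part of the build). sigma0_256_vec/sigma1_256_vec are the sketches
	shown earlier, and _mm_alignr_epi8 is the SSSE3 palignr instruction. On
	entry, w0 = W[r-16:r-13], w1 = W[r-12:r-9], w2 = W[r-8:r-5],
	w3 = W[r-4:r-1] (low lane first):

	#include <tmmintrin.h>  // SSSE3

	static inline __m128i next4W(__m128i w0, __m128i w1, __m128i w2, __m128i w3)
	{
		__m128i t;
		t  = _mm_alignr_epi8(w1, w0, 4);            // W[r-15:r-12]
		w0 = _mm_add_epi32(w0, sigma0_256_vec(t));  // + sigma0(W[r-15])
		t  = _mm_alignr_epi8(w3, w2, 4);            // W[r-7:r-4]
		w0 = _mm_add_epi32(w0, t);                  // + W[r-7]
		t  = _mm_srli_si128(w3, 8);                 // 0,0,W[r-1],W[r-2]
		w0 = _mm_add_epi32(w0, sigma1_256_vec(t));  // low 2 lanes done: W[r], W[r+1]
		t  = _mm_slli_si128(w0, 8);                 // W[r+1],W[r],0,0
		w0 = _mm_add_epi32(w0, sigma1_256_vec(t));  // all 4 lanes done: W[r:r+3]
		return w0;
	}

	The macro then adds the 4 corresponding K256 words (in %xmm5) and stores
	the result into the WK stack circular buffer.
*/
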
// this macro is used in the last 16 rounds of the current block
// each invocation reads 4 4-byte words of the next message block, loads them (big-endian)
// into W[r:r+3], computes WK[r:r+3], and saves it on the stack to prepare for the next block

.macro update_W_WK
#if defined (__x86_64__)
        movdqu  $0*16(data), $1         // read 4 4-byte words
        pshufb  L_aligned_bswap, $1     // big-endian of each 4-byte word, W[r:r+3]
        movdqu  $0*16(K), %xmm4         // K[r:r+3]
#else
        mov     data_addr, t
        movdqu  $0*16(t), $1            // read 4 4-byte words
        pshufb  L_aligned_bswap, $1     // big-endian of each 4-byte word, W[r:r+3]
        mov     K, t
        movdqu  $0*16(t), %xmm4         // K[r:r+3]
#endif
        paddd   $1, %xmm4               // WK[r:r+3]
        movdqa  %xmm4, WK($0*4)         // save WK[r:r+3] into the stack circular buffer
.endm

.text

#if defined (__x86_64__) || defined (__i386__)

.globl _SHA256_Transform

_SHA256_Transform:

        // detect SSSE3 and dispatch to the appropriate code branch
#if defined __x86_64__
        movq    __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
        mov     (%rax), %eax            // %eax = __cpu_capabilities
#else // i386
#if defined KERNEL
        leal    __cpu_capabilities, %eax // %eax -> __cpu_capabilities
        mov     (%eax), %eax            // %eax = __cpu_capabilities
#else
        mov     _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
        test    $(kHasSupplementalSSE3), %eax
        je      _SHA256_Transform_nossse3 // branch to the no-ssse3 code

        // push callee-saved registers
#if defined (__x86_64__)
        push    %rbp
        push    %rbx
        push    %r12
        push    %r13
        push    %r14
        push    %r15
#else
        push    %ebp
        push    %ebx
        push    %esi
        push    %edi
#endif

        // allocate stack space
        sub     $stack_size, sp

        // if kernel code, save used xmm registers
#if KERNEL
        movdqa  %xmm0, 0*16+xmm_save
        movdqa  %xmm1, 1*16+xmm_save
        movdqa  %xmm2, 2*16+xmm_save
        movdqa  %xmm3, 3*16+xmm_save
        movdqa  %xmm4, 4*16+xmm_save
        movdqa  %xmm5, 5*16+xmm_save
        movdqa  %xmm6, 6*16+xmm_save
        movdqa  %xmm7, 7*16+xmm_save
#endif

        // set up the bswap parameter in aligned stack space and the pointer to the K256[] table
#if defined (__x86_64__)
        lea     _K256(%rip), K
        lea     L_bswap(%rip), %rax
        movdqa  (%rax), %xmm0
#else
        lea     _K256, t
        mov     t, K
        lea     L_bswap, %eax
        movdqa  (%eax), %xmm0
#endif
        movdqa  %xmm0, L_aligned_bswap

        // load W[0:15] into xmm0-xmm3
#if defined (__x86_64__)
        movdqu  0*16(data), W0
        movdqu  1*16(data), W1
        movdqu  2*16(data), W2
        movdqu  3*16(data), W3
        add     $64, data
#else
        mov     data_addr, t
        movdqu  0*16(t), W0
        movdqu  1*16(t), W1
        movdqu  2*16(t), W2
        movdqu  3*16(t), W3
        add     $64, data_addr
#endif
        pshufb  L_aligned_bswap, W0
        pshufb  L_aligned_bswap, W1
        pshufb  L_aligned_bswap, W2
        pshufb  L_aligned_bswap, W3

        // compute WK[0:15] and save in stack
#if defined (__x86_64__)
        movdqu  0*16(K), %xmm4
        movdqu  1*16(K), %xmm5
        movdqu  2*16(K), %xmm6
        movdqu  3*16(K), %xmm7
#else
        mov     K, t
        movdqu  0*16(t), %xmm4
        movdqu  1*16(t), %xmm5
        movdqu  2*16(t), %xmm6
        movdqu  3*16(t), %xmm7
#endif
        add     $64, K
        paddd   %xmm0, %xmm4
        paddd   %xmm1, %xmm5
        paddd   %xmm2, %xmm6
        paddd   %xmm3, %xmm7
        movdqa  %xmm4, WK(0)
        movdqa  %xmm5, WK(4)
        movdqa  %xmm6, WK(8)
        movdqa  %xmm7, WK(12)

L_loop:

        // digests a-h = ctx->state;
#if defined (__x86_64__)
        mov     0*4(ctx), a
        mov     1*4(ctx), b
        mov     2*4(ctx), c
        mov     3*4(ctx), d
        mov     4*4(ctx), e
        mov     5*4(ctx), f
        mov     6*4(ctx), g
        mov     7*4(ctx), h
#else
        mov     ctx_addr, t
        mov     0*4(t), a
        mov     1*4(t), b
        mov     2*4(t), s
        mov     s, c
        mov     3*4(t), d
        mov     4*4(t), e
        mov     5*4(t), s
        mov     s, f
        mov     6*4(t), g
        mov     7*4(t), s
        mov     s, h
#endif

        // rounds 0:47 interleaved with W/WK updates for rounds 16:63
        rounds  a, b, c, d, e, f, g, h, 0
        message_schedule W0, W1, W2, W3, 16
        rounds  e, f, g, h, a, b, c, d, 4
        message_schedule W1, W2, W3, W0, 20
        rounds  a, b, c, d, e, f, g, h, 8
        message_schedule W2, W3, W0, W1, 24
        rounds  e, f, g, h, a, b, c, d, 12
        message_schedule W3, W0, W1, W2, 28
        rounds  a, b, c, d, e, f, g, h, 16
        message_schedule W0, W1, W2, W3, 32
        rounds  e, f, g, h, a, b, c, d, 20
        message_schedule W1, W2, W3, W0, 36
        rounds  a, b, c, d, e, f, g, h, 24
        message_schedule W2, W3, W0, W1, 40
        rounds  e, f, g, h, a, b, c, d, 28
        message_schedule W3, W0, W1, W2, 44
        rounds  a, b, c, d, e, f, g, h, 32
        message_schedule W0, W1, W2, W3, 48
        rounds  e, f, g, h, a, b, c, d, 36
        message_schedule W1, W2, W3, W0, 52
        rounds  a, b, c, d, e, f, g, h, 40
        message_schedule W2, W3, W0, W1, 56
        rounds  e, f, g, h, a, b, c, d, 44
        message_schedule W3, W0, W1, W2, 60

        // revert K to the beginning of K256[]
#if defined __x86_64__
        sub     $256, K
#else
        subl    $256, K
#endif

        sub     $1, num_blocks          // num_blocks--
        je      L_final_block           // if this is the final block, wrap up the final rounds

        // rounds 48:63 interleaved with W/WK initialization for the next block's rounds 0:15
        rounds  a, b, c, d, e, f, g, h, 48
        update_W_WK 0, W0
        rounds  e, f, g, h, a, b, c, d, 52
        update_W_WK 1, W1
        rounds  a, b, c, d, e, f, g, h, 56
        update_W_WK 2, W2
        rounds  e, f, g, h, a, b, c, d, 60
        update_W_WK 3, W3

        add     $64, K
#if defined (__x86_64__)
        add     $64, data
#else
        add     $64, data_addr
#endif

        // ctx->state += digests a-h
#if defined (__x86_64__)
        add     a, 0*4(ctx)
        add     b, 1*4(ctx)
        add     c, 2*4(ctx)
        add     d, 3*4(ctx)
        add     e, 4*4(ctx)
        add     f, 5*4(ctx)
        add     g, 6*4(ctx)
        add     h, 7*4(ctx)
#else
        mov     ctx_addr, t
        add     a, 0*4(t)
        add     b, 1*4(t)
        mov     c, s
        add     s, 2*4(t)
        add     d, 3*4(t)
        add     e, 4*4(t)
        mov     f, s
        add     s, 5*4(t)
        add     g, 6*4(t)
        mov     h, s
        add     s, 7*4(t)
#endif

        jmp     L_loop                  // branch for next block

        // wrap up digest update rounds 48:63 for the final block
L_final_block:
        rounds  a, b, c, d, e, f, g, h, 48
        rounds  e, f, g, h, a, b, c, d, 52
        rounds  a, b, c, d, e, f, g, h, 56
        rounds  e, f, g, h, a, b, c, d, 60

        // ctx->state += digests a-h
#if defined (__x86_64__)
        add     a, 0*4(ctx)
        add     b, 1*4(ctx)
        add     c, 2*4(ctx)
        add     d, 3*4(ctx)
        add     e, 4*4(ctx)
        add     f, 5*4(ctx)
        add     g, 6*4(ctx)
        add     h, 7*4(ctx)
#else
        mov     ctx_addr, t
        add     a, 0*4(t)
        add     b, 1*4(t)
        mov     c, s
        add     s, 2*4(t)
        add     d, 3*4(t)
        add     e, 4*4(t)
        mov     f, s
        add     s, 5*4(t)
        add     g, 6*4(t)
        mov     h, s
        add     s, 7*4(t)
#endif

        // if kernel, restore xmm0-xmm7
#if KERNEL
        movdqa  0*16+xmm_save, %xmm0
        movdqa  1*16+xmm_save, %xmm1
        movdqa  2*16+xmm_save, %xmm2
        movdqa  3*16+xmm_save, %xmm3
        movdqa  4*16+xmm_save, %xmm4
        movdqa  5*16+xmm_save, %xmm5
        movdqa  6*16+xmm_save, %xmm6
        movdqa  7*16+xmm_save, %xmm7
#endif

        // free allocated stack memory
        add     $stack_size, sp

        // restore callee-saved registers
#if defined (__x86_64__)
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbx
        pop     %rbp
#else
        pop     %edi
        pop     %esi
        pop     %ebx
        pop     %ebp
#endif

        // return
        ret


.const
.align 4, 0x90

L_bswap:
        .long   0x00010203
        .long   0x04050607
        .long   0x08090a0b
        .long   0x0c0d0e0f

#endif // x86_64/i386