]> git.saurik.com Git - apple/xnu.git/blame - bsd/crypto/sha2/intel/sha256nossse3.s
xnu-1699.22.81.tar.gz
[apple/xnu.git] / bsd / crypto / sha2 / intel / sha256nossse3.s
CommitLineData
6d2010ae
A
/*
	This file provides an x86_64/i386 hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	The code SHA256_Transform_nossse3 is a clone of SHA256_Transform
	with all ssse3 instructions replaced with sse3 or below instructions.

	For performance reasons, this function should not be called directly. This file should work
	together with the one that implements SHA256_Transform. There, cpu_capabilities is probed to detect
	ssse3. If ssse3 is not supported, execution is branched to this no-ssse3-specific function.

	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c)
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
			W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c)
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------

	our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0) jmp L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->states += digests a-h;

	jmp	L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
	cclee 8-3-10
*/
101
#if defined KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

	// associate variables with registers or memory

#if defined (__x86_64__)
	// x86_64: SysV AMD64 ABI — arguments arrive in rdi/rsi/rdx;
	// all 8 working digests fit in GPRs (r8d-r15d, saved in the prologue)
	#define	sp			%rsp
	#define	ctx			%rdi	// 1st argument: SHA256_ctx *ctx
	#define	data		%rsi	// 2nd argument: char *data
	#define	num_blocks	%rdx	// 3rd argument: unsigned int num_blocks

	#define	a	%r8d
	#define	b	%r9d
	#define	c	%r10d
	#define	d	%r11d
	#define	e	%r12d
	#define	f	%r13d
	#define	g	%r14d
	#define	h	%r15d

	#define	K	%rbx			// pointer into the K256[] round-constant table
	#define	stack_size	(8+16*8+16+64)	// 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)

	#define	xmm_save	80(sp)		// starting address for xmm save/restore
#else
	// i386: cdecl — arguments read from the stack above the frame;
	// too few GPRs, so digests c, f, h and the K pointer spill to the stack
	#define	sp	%esp
	#define	stack_size	(12+16*8+16+16+64)	// 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
	#define	ctx_addr	20+stack_size(sp)	// ret_addr + 4 registers = 20, 1st caller argument
	#define	data_addr	24+stack_size(sp)	// 2nd caller argument
	#define	num_blocks	28+stack_size(sp)	// 3rd caller argument

	#define	a	%ebx
	#define	b	%edx
	#define	c	64(sp)
	#define	d	%ebp
	#define	e	%esi
	#define	f	68(sp)
	#define	g	%edi
	#define	h	72(sp)

	#define	K			76(sp)	// pointer to K256[] table
	#define	xmm_save	96(sp)	// starting address for xmm save/restore
#endif

	// 2 local variables (scratch; clobbered by every helper macro below)
	#define	t	%eax
	#define	s	%ecx

	// a window (16 words) of message schedule
	#define	W0	%xmm0
	#define	W1	%xmm1
	#define	W2	%xmm2
	#define	W3	%xmm3

	// circular buffer for WK[(r:r+15)%16], held at the bottom of the stack frame
	#define	WK(x)	(x&15)*4(sp)
161
// #define Ch(x,y,z)	(((x) & (y)) ^ ((~(x)) & (z)))

	// t = Ch($0,$1,$2); clobbers s
	.macro Ch
	mov	$0, t		// x
	mov	$0, s		// x
	not	t		// ~x
	and	$1, s		// x & y
	and	$2, t		// ~x & z
	xor	s, t		// t = ((x) & (y)) ^ ((~(x)) & (z));
	.endm
172
// #define Maj(x,y,z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

	// t = Maj($0,$1,$2); clobbers s
	.macro Maj
	mov	$0, t	// x
	mov	$1, s	// y
	and	s, t	// x&y
	and	$2, s	// y&z
	xor	s, t	// (x&y) ^ (y&z)
	mov	$2, s	// z
	and	$0, s	// (x&z)
	xor	s, t	// t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
	.endm
185
/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
// #define R(b,x) 	((x) >> (b))
/* 32-bit Rotate-right (used in SHA-256): */
// #define S32(b,x)	(((x) >> (b)) | ((x) << (32 - (b))))

// #define sigma0_256(x)	(S32(7,  (x)) ^ S32(18, (x)) ^ R(3 ,   (x)))

	// performs sigma0_256 on 4 words in an xmm register, in place ($0)
	// each rotate is built from a shift pair: ROTR7  = (x>>7)  | (x<<25),
	// ROTR18 = (x>>18) | (x<<14); shift counts below are cumulative
	// uses xmm6/xmm7 as intermediate registers (clobbered)
	.macro sigma0
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$3, $0			// SHR3(x)
	psrld	$$7, %xmm6		// part of ROTR7 (x>>7)
	pslld	$$14, %xmm7		// part of ROTR18 (x<<14)
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	psrld	$$11, %xmm6		// part of ROTR18 (now x>>18)
	pslld	$$11, %xmm7		// part of ROTR7 (now x<<25)
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	.endm
208
// #define sigma1_256(x)	(S32(17, (x)) ^ S32(19, (x)) ^ R(10,   (x)))

	// performs sigma1_256 on 4 words in an xmm register, in place ($0)
	// ROTR17 = (x>>17) | (x<<15), ROTR19 = (x>>19) | (x<<13); shift counts cumulative
	// uses xmm6/xmm7 as intermediate registers (clobbered)
	.macro sigma1
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$10, $0		// SHR10(x)
	psrld	$$17, %xmm6		// part of ROTR17 (x>>17)
	pxor	%xmm6, $0
	pslld	$$13, %xmm7		// part of ROTR19 (x<<13)
	pxor	%xmm7, $0
	psrld	$$2, %xmm6		// part of ROTR19 (now x>>19)
	pxor	%xmm6, $0
	pslld	$$2, %xmm7		// part of ROTR17 (now x<<15)
	pxor	%xmm7, $0
	.endm
226
// #define Sigma0_256(x)	(S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))

	// t = Sigma0($0); clobbers s; rotate counts on s are cumulative (2, 13, 13+9=22)
	.macro Sigma0
	mov	$0, t		// x
	mov	$0, s		// x
	ror	$$2, t		// S32(2,  (x))
	ror	$$13, s		// S32(13, (x))
	xor	s, t		// S32(2,  (x)) ^ S32(13, (x))
	ror	$$9, s		// S32(22, (x))
	xor	s, t		// t = (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))
	.endm
238
// #define Sigma1_256(x)	(S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))

	// t = Sigma1($0); clobbers s; rotate counts on s are cumulative (6, 6+5=11, 11+14=25)
	.macro Sigma1
	mov	$0, s		// x
	ror	$$6, s		// S32(6,  (x))
	mov	s, t		// S32(6,  (x))
	ror	$$5, s		// S32(11, (x))
	xor	s, t		// S32(6,  (x)) ^ S32(11, (x))
	ror	$$14, s		// S32(25, (x))
	xor	s, t		// t = (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))
	.endm
250
	// per round digests update
	// round a,b,c,d,e,f,g,h,r : one SHA-256 round using the precomputed
	// W[r]+K[r] from the WK stack buffer; clobbers t and s
	.macro round
	Sigma1	$4			// t = Sigma1(e)
	add	t, $7			// use h to store h+Sigma1(e)
	Ch	$4, $5, $6		// t = Ch (e, f, g);
	add	$7, t			// t = h+Sigma1(e)+Ch(e,f,g);
	add	WK($8), t		// t = T1 = h+Sigma1(e)+Ch(e,f,g)+W[r]+K[r]
	add	t, $3			// d += T1;
	mov	t, $7			// h = T1
	Sigma0	$0			// t = Sigma0(a);
	add	t, $7			// h = T1 + Sigma0(a);
	Maj	$0, $1, $2		// t = Maj(a,b,c)
	add	t, $7			// h = T1 + Sigma0(a) + Maj(a,b,c);
	.endm
265
	// per 4 rounds digests update and permutation
	// permutation is absorbed by rotating the roles of digests a-h,
	// so no data actually moves between registers
	.macro rounds
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8
	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8
	.endm
274
	// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
	// message_schedule Wa,Wb,Wc,Wd,r : given the current 16-word window in
	// $0..$3 (oldest 4 words in $0), computes W[r:r+3] in place in $0 and
	// stores W+K into WK(r); advances K by 16 bytes
	// clobbers xmm4-xmm7, and t on i386
	.macro message_schedule

	// 4 32-bit K256 words in xmm5
#if defined (__x86_64__)
	movdqu	(K), %xmm5
#else
	mov	K, t
	movdqu	(t), %xmm5
#endif
	add	$$16, K			// K points to next K256 word for next iteration
	movdqa	$1, %xmm4		// W7:W4
#if 0
	palignr	$$4, $0, %xmm4		// W4:W1
#else	// no-ssse3 implementation of palignr: shift both halves and OR them together
	movdqa	$0, %xmm7
	pslldq	$$12, %xmm4		// keep only W4 in the top lane
	psrldq	$$4, %xmm7		// W3:W1 shifted down
	por	%xmm7, %xmm4		// = palignr 4 -> W4:W1
#endif
	sigma0	%xmm4			// sigma0(W4:W1)
	movdqa	$3, %xmm6		// W15:W12
	paddd	%xmm4, $0		// $0 = W3:W0 + sigma0(W4:W1)
#if 0
	palignr	$$4, $2, %xmm6		// W12:W9
#else	// no-ssse3 implementation of palignr
	movdqa	$2, %xmm7
	pslldq	$$12, %xmm6
	psrldq	$$4, %xmm7
	por	%xmm7, %xmm6		// = palignr 4 -> W12:W9
#endif
	paddd	%xmm6, $0		// $0 = W12:W9 + sigma0(W4:W1) + W3:W0
	// sigma1(W[r-2]) needs the two newest words, which depend on the two
	// just-computed ones, so it is applied in two half-register steps
	movdqa	$3, %xmm4		// W15:W12
	psrldq	$$8, %xmm4		// 0,0,W15,W14
	sigma1	%xmm4			// sigma1(0,0,W15,W14)
	paddd	%xmm4, $0		// sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$0, %xmm4		// W19-sigma1(W17), W18-sigma1(W16), W17, W16
	pslldq	$$8, %xmm4		// W17, W16, 0, 0
	sigma1	%xmm4			// sigma1(W17,W16,0,0)
	paddd	%xmm4, $0		// W19:W16
	paddd	$0, %xmm5		// WK = W + K
	movdqa	%xmm5, WK($4)		// save into the stack circular buffer
	.endm
318
	// this macro is used in the last 16 rounds of a current block
	// it reads the next message (16 4-byte words), loads it into 4 words W[r:r+3]
	// ($1, big-endian swapped via bswap since pshufb is ssse3), computes WK[r:r+3],
	// and saves into the stack to prepare for the next block
	// clobbers s, xmm4, and t on i386; the WK slot is used as bounce storage
	// for the byte-swapped words before being overwritten with W+K

	.macro update_W_WK
#if defined (__x86_64__)
#if 0
	movdqu	$0*16(data), $1			// read 4 4-byte words
	pshufb	L_aligned_bswap, $1		// big-endian of each 4-byte word, W[r:r+3]
#else	// no-ssse3 implementation: scalar bswap of each word through s
	mov	0+$0*16(data), s
	bswap	s
	mov	s, 0+WK($0*4)
	mov	4+$0*16(data), s
	bswap	s
	mov	s, 4+WK($0*4)
	mov	8+$0*16(data), s
	bswap	s
	mov	s, 8+WK($0*4)
	mov	12+$0*16(data), s
	bswap	s
	mov	s, 12+WK($0*4)
	movdqa	WK($0*4), $1			// $1 = W[r:r+3]
#endif
	movdqu	$0*16(K), %xmm4			// K[r:r+3]
#else
	mov	data_addr, t
#if 0
	movdqu	$0*16(t), $1			// read 4 4-byte words
	pshufb	L_aligned_bswap, $1		// big-endian of each 4-byte word, W[r:r+3]
#else	// no-ssse3 implementation
	mov	0+$0*16(t), s
	bswap	s
	mov	s, 0+WK($0*4)
	mov	4+$0*16(t), s
	bswap	s
	mov	s, 4+WK($0*4)
	mov	8+$0*16(t), s
	bswap	s
	mov	s, 8+WK($0*4)
	mov	12+$0*16(t), s
	bswap	s
	mov	s, 12+WK($0*4)
	movdqa	WK($0*4), $1			// $1 = W[r:r+3]
#endif
	mov	K, t
	movdqu	$0*16(t), %xmm4			// K[r:r+3]
#endif
	paddd	$1, %xmm4			// WK[r:r+3]
	movdqa	%xmm4, WK($0*4)			// save WK[r:r+3] into stack circular buffer
	.endm
370
	.text

#if defined (__x86_64__) || defined (__i386__)

	.globl	_SHA256_Transform_nossse3

//-----------------------------------------------------------------------
// void SHA256_Transform_nossse3(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
//
// Processes num_blocks 64-byte blocks from data into ctx->state.
// Intended to be reached only via the SHA256_Transform dispatcher when
// ssse3 is absent (see file header).
// x86_64: SysV ABI (ctx=rdi, data=rsi, num_blocks=rdx);
// i386:   cdecl, arguments on the stack.
// K256[] is the SHA-256 round-constant table, defined elsewhere
// (in the companion ssse3 implementation file).
// Callee-saved GPRs are pushed; xmm0-xmm7 are saved/restored only in
// kernel builds (user-space xmm regs are volatile).
//-----------------------------------------------------------------------
_SHA256_Transform_nossse3:

	// push callee-saved registers
#if defined (__x86_64__)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
#else
	push	%ebp
	push	%ebx
	push	%esi
	push	%edi
#endif

	// allocate stack space (WK buffer, xmm save area, spills; see stack_size)
	sub	$stack_size, sp

	// if kernel code, save used xmm registers
#if KERNEL
	movdqa	%xmm0, 0*16+xmm_save
	movdqa	%xmm1, 1*16+xmm_save
	movdqa	%xmm2, 2*16+xmm_save
	movdqa	%xmm3, 3*16+xmm_save
	movdqa	%xmm4, 4*16+xmm_save
	movdqa	%xmm5, 5*16+xmm_save
	movdqa	%xmm6, 6*16+xmm_save
	movdqa	%xmm7, 7*16+xmm_save
#endif

	// set up pointer to table K256[]
#if defined (__x86_64__)
	lea	_K256(%rip), K
#else
	lea	_K256, t
	mov	t, K
#endif

	// load W[0:15] into xmm0-xmm3
	// mybswap i, base : big-endian load of words 4i..4i+3 from (base) into
	// the stack staging area; clobbers a, b, e, d — safe because the
	// digests have not been loaded yet at this point
	.macro mybswap
	movl	0+$0*16($1), a
	movl	4+$0*16($1), b
	movl	8+$0*16($1), e
	movl	12+$0*16($1), d
	bswap	a
	bswap	b
	bswap	e
	bswap	d
	movl	a, $0*16(sp)
	movl	b, 4+$0*16(sp)
	movl	e, 8+$0*16(sp)
	movl	d, 12+$0*16(sp)
	.endm

#if defined (__x86_64__)
	mybswap	0, data
	mybswap	1, data
	mybswap	2, data
	mybswap	3, data
	add	$64, data		// data -> next block
#else
	mov	data_addr, t
	mybswap	0, t
	mybswap	1, t
	mybswap	2, t
	mybswap	3, t
	add	$64, data_addr		// data -> next block
#endif
	movdqa	0*16(sp), W0
	movdqa	1*16(sp), W1
	movdqa	2*16(sp), W2
	movdqa	3*16(sp), W3

	// compute WK[0:15] and save in stack
#if defined (__x86_64__)
	movdqu	0*16(K), %xmm4
	movdqu	1*16(K), %xmm5
	movdqu	2*16(K), %xmm6
	movdqu	3*16(K), %xmm7
#else
	mov	K, t
	movdqu	0*16(t), %xmm4
	movdqu	1*16(t), %xmm5
	movdqu	2*16(t), %xmm6
	movdqu	3*16(t), %xmm7
#endif
	add	$64, K			// K now points at K256[16]
	paddd	%xmm0, %xmm4
	paddd	%xmm1, %xmm5
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm4, WK(0)
	movdqa	%xmm5, WK(4)
	movdqa	%xmm6, WK(8)
	movdqa	%xmm7, WK(12)

L_loop:

	// digests a-h = ctx->states;
#if defined (__x86_64__)
	mov	0*4(ctx), a
	mov	1*4(ctx), b
	mov	2*4(ctx), c
	mov	3*4(ctx), d
	mov	4*4(ctx), e
	mov	5*4(ctx), f
	mov	6*4(ctx), g
	mov	7*4(ctx), h
#else
	// on i386, c/f/h live in memory: bounce through s (no mem-to-mem mov)
	mov	ctx_addr, t
	mov	0*4(t), a
	mov	1*4(t), b
	mov	2*4(t), s
	mov	s, c
	mov	3*4(t), d
	mov	4*4(t), e
	mov	5*4(t), s
	mov	s, f
	mov	6*4(t), g
	mov	7*4(t), s
	mov	s, h
#endif

	// rounds 0:47 interleaved with W/WK update for rounds 16:63
	// each message_schedule advances K by 16; W0-W3 roles rotate by one
	// register per 4 rounds to keep the window circular
	rounds	a, b, c, d, e, f, g, h, 0
	message_schedule W0,W1,W2,W3,16
	rounds	e, f, g, h, a, b, c, d, 4
	message_schedule W1,W2,W3,W0,20
	rounds	a, b, c, d, e, f, g, h, 8
	message_schedule W2,W3,W0,W1,24
	rounds	e, f, g, h, a, b, c, d, 12
	message_schedule W3,W0,W1,W2,28
	rounds	a, b, c, d, e, f, g, h, 16
	message_schedule W0,W1,W2,W3,32
	rounds	e, f, g, h, a, b, c, d, 20
	message_schedule W1,W2,W3,W0,36
	rounds	a, b, c, d, e, f, g, h, 24
	message_schedule W2,W3,W0,W1,40
	rounds	e, f, g, h, a, b, c, d, 28
	message_schedule W3,W0,W1,W2,44
	rounds	a, b, c, d, e, f, g, h, 32
	message_schedule W0,W1,W2,W3,48
	rounds	e, f, g, h, a, b, c, d, 36
	message_schedule W1,W2,W3,W0,52
	rounds	a, b, c, d, e, f, g, h, 40
	message_schedule W2,W3,W0,W1,56
	rounds	e, f, g, h, a, b, c, d, 44
	message_schedule W3,W0,W1,W2,60

	// revert K to the beginning of K256[]
	// (initial +64 plus 12 x +16 from message_schedule = 256 bytes consumed)
#if defined __x86_64__
	sub	$256, K
#else
	subl	$256, K
#endif

	sub	$1, num_blocks		// num_blocks--
	je	L_final_block		// if final block, wrap up final rounds

	// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
	// update_W_WK reads K[0:15] (K was just reset above) and the next data block
	rounds	a, b, c, d, e, f, g, h, 48
	update_W_WK	0, W0
	rounds	e, f, g, h, a, b, c, d, 52
	update_W_WK	1, W1
	rounds	a, b, c, d, e, f, g, h, 56
	update_W_WK	2, W2
	rounds	e, f, g, h, a, b, c, d, 60
	update_W_WK	3, W3

	add	$64, K			// K -> K256[16], matching the state expected at L_loop
#if defined (__x86_64__)
	add	$64, data		// data -> next block
#else
	add	$64, data_addr
#endif

	// ctx->states += digests a-h
#if defined (__x86_64__)
	add	a, 0*4(ctx)
	add	b, 1*4(ctx)
	add	c, 2*4(ctx)
	add	d, 3*4(ctx)
	add	e, 4*4(ctx)
	add	f, 5*4(ctx)
	add	g, 6*4(ctx)
	add	h, 7*4(ctx)
#else
	// memory-resident digests (c/f/h) bounce through s
	mov	ctx_addr, t
	add	a, 0*4(t)
	add	b, 1*4(t)
	mov	c, s
	add	s, 2*4(t)
	add	d, 3*4(t)
	add	e, 4*4(t)
	mov	f, s
	add	s, 5*4(t)
	add	g, 6*4(t)
	mov	h, s
	add	s, 7*4(t)
#endif

	jmp	L_loop			// branch for next block

	// wrap up digest update round 48:63 for final block (no W/WK prefetch needed)
L_final_block:
	rounds	a, b, c, d, e, f, g, h, 48
	rounds	e, f, g, h, a, b, c, d, 52
	rounds	a, b, c, d, e, f, g, h, 56
	rounds	e, f, g, h, a, b, c, d, 60

	// ctx->states += digests a-h
#if defined (__x86_64__)
	add	a, 0*4(ctx)
	add	b, 1*4(ctx)
	add	c, 2*4(ctx)
	add	d, 3*4(ctx)
	add	e, 4*4(ctx)
	add	f, 5*4(ctx)
	add	g, 6*4(ctx)
	add	h, 7*4(ctx)
#else
	mov	ctx_addr, t
	add	a, 0*4(t)
	add	b, 1*4(t)
	mov	c, s
	add	s, 2*4(t)
	add	d, 3*4(t)
	add	e, 4*4(t)
	mov	f, s
	add	s, 5*4(t)
	add	g, 6*4(t)
	mov	h, s
	add	s, 7*4(t)
#endif

	// if kernel, restore xmm0-xmm7
#if KERNEL
	movdqa	0*16+xmm_save, %xmm0
	movdqa	1*16+xmm_save, %xmm1
	movdqa	2*16+xmm_save, %xmm2
	movdqa	3*16+xmm_save, %xmm3
	movdqa	4*16+xmm_save, %xmm4
	movdqa	5*16+xmm_save, %xmm5
	movdqa	6*16+xmm_save, %xmm6
	movdqa	7*16+xmm_save, %xmm7
#endif

	// free allocated stack memory
	add	$stack_size, sp

	// restore callee-saved registers
#if defined (__x86_64__)
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
#else
	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
#endif

	// return
	ret


#endif	// x86_64/i386
649