/* NOTE(review): removed git-blame table residue ("Commit | Line | Data" header) left over from a web extraction. */
/*
	This file provides a x86_64/i386 hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	The code SHA256_Transform_nossse3 is a clone of SHA256_Transform
	with all ssse3 instructions replaced with sse3 or below instructions.

	For performance reasons, this function should not be called directly. This file should work
	together with the one that implements SHA256_Transform. There, cpu_capabilities is probed to detect
	ssse3. If ssse3 is not supported, execution is branched to this no-ssse3-specific function.

	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
				W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------

	our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0)	jmp L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->states += digests a-h;

	jmp	L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
	cclee 8-3-10
*/
101 | ||
#if defined KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

// associate variables with registers or memory

#if defined (__x86_64__)
// x86_64: arguments arrive in rdi/rsi/rdx (SysV) and all 8 digest words fit in GPRs.
#define sp          %rsp
#define ctx         %rdi                // 1st argument: SHA256_ctx *
#define data        %rsi                // 2nd argument: input data pointer
#define num_blocks  %rdx                // 3rd argument: number of 64-byte blocks

#define a           %r8d                // the 8 working digest words a-h
#define b           %r9d
#define c           %r10d
#define d           %r11d
#define e           %r12d
#define f           %r13d
#define g           %r14d
#define h           %r15d

#define K           %rbx                // pointer into the K256[] constant table
#define stack_size  (8+16*8+16+64)      // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)

#define xmm_save    80(sp)              // starting address for xmm save/restore
#else
// i386: too few registers, so c/f/h and the K pointer spill to stack slots,
// and the arguments are read from the caller's stack frame.
#define sp          %esp
#define stack_size  (12+16*8+16+16+64)  // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
#define ctx_addr    20+stack_size(sp)   // ret_addr + 4 saved registers = 20, 1st caller argument
#define data_addr   24+stack_size(sp)   // 2nd caller argument
#define num_blocks  28+stack_size(sp)   // 3rd caller argument

#define a           %ebx
#define b           %edx
#define c           64(sp)
#define d           %ebp
#define e           %esi
#define f           68(sp)
#define g           %edi
#define h           72(sp)

#define K           76(sp)              // pointer to K256[] table
#define xmm_save    96(sp)              // starting address for xmm save/restore
#endif

// 2 local scratch variables (volatile in both ABIs)
#define t %eax
#define s %ecx

// a window (16 words) of the message schedule
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3

// circular buffer for WK[(r:r+15)%16] at the bottom of the stack frame
#define WK(x)   (x&15)*4(sp)
// #define Ch(x,y,z)   (((x) & (y)) ^ ((~(x)) & (z)))
//
// t = Ch($0, $1, $2); clobbers s and flags.
.macro Ch
	mov	$0, t		// x
	mov	$0, s		// x
	not	t		// ~x
	and	$1, s		// x & y
	and	$2, t		// ~x & z
	xor	s, t		// t = ((x) & (y)) ^ ((~(x)) & (z));
.endm
172 | ||
// #define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
//
// t = Maj($0, $1, $2); clobbers s and flags.
.macro Maj
	mov	$0, t		// x
	mov	$1, s		// y
	and	s, t		// x&y
	and	$2, s		// y&z
	xor	s, t		// (x&y) ^ (y&z)
	mov	$2, s		// z
	and	$0, s		// (x&z)
	xor	s, t		// t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.endm
185 | ||
/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
// #define R(b,x)      ((x) >> (b))
/* 32-bit Rotate-right (used in SHA-256): */
// #define S32(b,x)    (((x) >> (b)) | ((x) << (32 - (b))))

// #define sigma0_256(x)   (S32(7, (x)) ^ S32(18, (x)) ^ R(3, (x)))

// performs sigma0_256 on the 4 words in an xmm register
// (each ROTR is synthesized from a shift pair since SSE has no vector rotate);
// uses xmm6/xmm7 as intermediate registers
.macro sigma0
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$3, $0		// SHR3(x)
	psrld	$$7, %xmm6	// part of ROTR7  (x >> 7)
	pslld	$$14, %xmm7	// part of ROTR18 (x << 14)
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	psrld	$$11, %xmm6	// part of ROTR18 (now x >> 18)
	pslld	$$11, %xmm7	// part of ROTR7  (now x << 25)
	pxor	%xmm6, $0
	pxor	%xmm7, $0
.endm
208 | ||
// #define sigma1_256(x)   (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))

// performs sigma1_256 on the 4 words in an xmm register
// uses xmm6/xmm7 as intermediate registers
.macro sigma1
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$10, $0	// SHR10(x)
	psrld	$$17, %xmm6	// part of ROTR17 (x >> 17)
	pxor	%xmm6, $0
	pslld	$$13, %xmm7	// part of ROTR19 (x << 13)
	pxor	%xmm7, $0
	psrld	$$2, %xmm6	// part of ROTR19 (now x >> 19)
	pxor	%xmm6, $0
	pslld	$$2, %xmm7	// part of ROTR17 (now x << 15)
	pxor	%xmm7, $0
.endm
226 | ||
// #define Sigma0_256(x)   (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
//
// t = Sigma0($0); clobbers s and flags.
.macro Sigma0
	mov	$0, t		// x
	mov	$0, s		// x
	ror	$$2, t		// S32(2, (x))
	ror	$$13, s		// S32(13, (x))
	xor	s, t		// S32(2, (x)) ^ S32(13, (x))
	ror	$$9, s		// S32(22, (x))  (13 + 9 more)
	xor	s, t		// t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm
238 | ||
// #define Sigma1_256(x)   (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
//
// t = Sigma1($0); clobbers s and flags.
.macro Sigma1
	mov	$0, s		// x
	ror	$$6, s		// S32(6, (x))
	mov	s, t		// S32(6, (x))
	ror	$$5, s		// S32(11, (x))  (6 + 5 more)
	xor	s, t		// S32(6, (x)) ^ S32(11, (x))
	ror	$$14, s		// S32(25, (x)) (11 + 14 more)
	xor	s, t		// t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm
250 | ||
// per round digests update
// $0..$7 = the rotated roles of a..h for this round, $8 = WK slot index.
// Note: WK[] already holds W[r]+K[r], so T1 = h + Sigma1(e) + Ch(e,f,g) + WK[r].
.macro round
	Sigma1	$4		// t = Sigma1(e);
	add	t, $7		// use h to store h + Sigma1(e)
	Ch	$4, $5, $6	// t = Ch(e,f,g);
	add	$7, t		// t = h + Sigma1(e) + Ch(e,f,g);
	add	WK($8), t	// t = T1 = h + Sigma1(e) + Ch(e,f,g) + W[r] + K[r];
	add	t, $3		// d += T1;
	mov	t, $7		// h = T1
	Sigma0	$0		// t = Sigma0(a);
	add	t, $7		// h = T1 + Sigma0(a);
	Maj	$0, $1, $2	// t = Maj(a,b,c)
	add	t, $7		// h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
265 | ||
// per 4 rounds digests update and permutation
// the permutation is absorbed by rotating the roles of digests a-h
// (so no data actually moves between registers between rounds)
.macro rounds
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8
	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8
.endm
274 | ||
// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
// $0..$3 = W0..W3 in rotated order (W[r-16:r-13] .. W[r-4:r-1]), $4 = WK slot index.
// The ssse3 palignr is kept under "#if 0" for reference; the live path emulates it
// with pslldq/psrldq/por (sse2 only).
.macro message_schedule

	// 4 32-bit K256 words in xmm5
#if defined (__x86_64__)
	movdqu	(K), %xmm5
#else
	mov	K, t
	movdqu	(t), %xmm5
#endif
	add	$$16, K		// K points to next K256 word for next iteration
	movdqa	$1, %xmm4	// W7:W4
#if 0
	palignr	$$4, $0, %xmm4	// W4:W1
#else	// no-ssse3 implementation of palignr
	movdqa	$0, %xmm7
	pslldq	$$12, %xmm4
	psrldq	$$4, %xmm7
	por	%xmm7, %xmm4
#endif
	sigma0	%xmm4		// sigma0(W4:W1)
	movdqa	$3, %xmm6	// W15:W12
	paddd	%xmm4, $0	// $0 = W3:W0 + sigma0(W4:W1)
#if 0
	palignr	$$4, $2, %xmm6	// W12:W9
#else	// no-ssse3 implementation of palignr
	movdqa	$2, %xmm7
	pslldq	$$12, %xmm6
	psrldq	$$4, %xmm7
	por	%xmm7, %xmm6
#endif
	paddd	%xmm6, $0	// $0 = W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$3, %xmm4	// W15:W12
	psrldq	$$8, %xmm4	// 0,0,W15,W14
	sigma1	%xmm4		// sigma1(0,0,W15,W14)
	paddd	%xmm4, $0	// sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$0, %xmm4	// W19-sigma1(W17), W18-sigma1(W16), W17, W16
	pslldq	$$8, %xmm4	// W17, W16, 0, 0
	sigma1	%xmm4		// sigma1(W17,W16,0,0)
	paddd	%xmm4, $0	// W19:W16
	paddd	$0, %xmm5	// WK = W + K
	movdqa	%xmm5, WK($4)	// save into the stack circular buffer
.endm
318 | ||
// this macro is used in the last 16 rounds of a current block
// it reads the next message (16 4-byte words), loads it into 4 words W[r:r+3],
// computes WK[r:r+3], and saves it into the stack to prepare for the next block
// $0 = 16-byte quadword index into the message (0..3), $1 = destination W register.
// The no-ssse3 path byte-swaps in a GPR and bounces through the WK stack slot
// (the slot is overwritten with the final WK value at the end, so this is safe).

.macro update_W_WK
#if defined (__x86_64__)
#if 0
	movdqu	$0*16(data), $1		// read 4 4-byte words
	pshufb	L_aligned_bswap, $1	// big-endian of each 4-byte word, W[r:r+3]
#else	// no-ssse3 implementation
	mov	0+$0*16(data), s
	bswap	s
	mov	s, 0+WK($0*4)
	mov	4+$0*16(data), s
	bswap	s
	mov	s, 4+WK($0*4)
	mov	8+$0*16(data), s
	bswap	s
	mov	s, 8+WK($0*4)
	mov	12+$0*16(data), s
	bswap	s
	mov	s, 12+WK($0*4)
	movdqa	WK($0*4), $1
#endif
	movdqu	$0*16(K), %xmm4		// K[r:r+3]
#else
	mov	data_addr, t
#if 0
	movdqu	$0*16(t), $1		// read 4 4-byte words
	pshufb	L_aligned_bswap, $1	// big-endian of each 4-byte word, W[r:r+3]
#else	// no-ssse3 implementation
	mov	0+$0*16(t), s
	bswap	s
	mov	s, 0+WK($0*4)
	mov	4+$0*16(t), s
	bswap	s
	mov	s, 4+WK($0*4)
	mov	8+$0*16(t), s
	bswap	s
	mov	s, 8+WK($0*4)
	mov	12+$0*16(t), s
	bswap	s
	mov	s, 12+WK($0*4)
	movdqa	WK($0*4), $1
#endif
	mov	K, t
	movdqu	$0*16(t), %xmm4		// K[r:r+3]
#endif
	paddd	$1, %xmm4		// WK[r:r+3]
	movdqa	%xmm4, WK($0*4)		// save WK[r:r+3] into stack circular buffer
.endm
370 | ||
371 | .text | |
372 | ||
373 | #if defined (__x86_64__) || defined (__i386__) | |
374 | ||
375 | .globl _SHA256_Transform_nossse3 | |
376 | ||
377 | _SHA256_Transform_nossse3: | |
378 | ||
379 | // push callee-saved registers | |
380 | #if defined (__x86_64__) | |
381 | push %rbp | |
382 | push %rbx | |
383 | push %r12 | |
384 | push %r13 | |
385 | push %r14 | |
386 | push %r15 | |
387 | #else | |
388 | push %ebp | |
389 | push %ebx | |
390 | push %esi | |
391 | push %edi | |
392 | #endif | |
393 | ||
394 | // allocate stack space | |
395 | sub $stack_size, sp | |
396 | ||
397 | // if kernel code, save used xmm registers | |
398 | #if KERNEL | |
399 | movdqa %xmm0, 0*16+xmm_save | |
400 | movdqa %xmm1, 1*16+xmm_save | |
401 | movdqa %xmm2, 2*16+xmm_save | |
402 | movdqa %xmm3, 3*16+xmm_save | |
403 | movdqa %xmm4, 4*16+xmm_save | |
404 | movdqa %xmm5, 5*16+xmm_save | |
405 | movdqa %xmm6, 6*16+xmm_save | |
406 | movdqa %xmm7, 7*16+xmm_save | |
407 | #endif | |
408 | ||
409 | // set up pointer to table K256[] | |
410 | #if defined (__x86_64__) | |
411 | lea _K256(%rip), K | |
412 | #else | |
413 | lea _K256, t | |
414 | mov t, K | |
415 | #endif | |
416 | ||
417 | // load W[0:15] into xmm0-xmm3 | |
418 | .macro mybswap | |
419 | movl 0+$0*16($1), a | |
420 | movl 4+$0*16($1), b | |
421 | movl 8+$0*16($1), e | |
422 | movl 12+$0*16($1), d | |
423 | bswap a | |
424 | bswap b | |
425 | bswap e | |
426 | bswap d | |
427 | movl a, $0*16(sp) | |
428 | movl b, 4+$0*16(sp) | |
429 | movl e, 8+$0*16(sp) | |
430 | movl d, 12+$0*16(sp) | |
431 | .endm | |
432 | ||
433 | #if defined (__x86_64__) | |
434 | mybswap 0, data | |
435 | mybswap 1, data | |
436 | mybswap 2, data | |
437 | mybswap 3, data | |
438 | add $64, data | |
439 | #else | |
440 | mov data_addr, t | |
441 | mybswap 0, t | |
442 | mybswap 1, t | |
443 | mybswap 2, t | |
444 | mybswap 3, t | |
445 | add $64, data_addr | |
446 | #endif | |
447 | movdqa 0*16(sp), W0 | |
448 | movdqa 1*16(sp), W1 | |
449 | movdqa 2*16(sp), W2 | |
450 | movdqa 3*16(sp), W3 | |
451 | ||
452 | // compute WK[0:15] and save in stack | |
453 | #if defined (__x86_64__) | |
454 | movdqu 0*16(K), %xmm4 | |
455 | movdqu 1*16(K), %xmm5 | |
456 | movdqu 2*16(K), %xmm6 | |
457 | movdqu 3*16(K), %xmm7 | |
458 | #else | |
459 | mov K, t | |
460 | movdqu 0*16(t), %xmm4 | |
461 | movdqu 1*16(t), %xmm5 | |
462 | movdqu 2*16(t), %xmm6 | |
463 | movdqu 3*16(t), %xmm7 | |
464 | #endif | |
465 | add $64, K | |
466 | paddd %xmm0, %xmm4 | |
467 | paddd %xmm1, %xmm5 | |
468 | paddd %xmm2, %xmm6 | |
469 | paddd %xmm3, %xmm7 | |
470 | movdqa %xmm4, WK(0) | |
471 | movdqa %xmm5, WK(4) | |
472 | movdqa %xmm6, WK(8) | |
473 | movdqa %xmm7, WK(12) | |
474 | ||
475 | L_loop: | |
476 | ||
477 | // digests a-h = ctx->states; | |
478 | #if defined (__x86_64__) | |
479 | mov 0*4(ctx), a | |
480 | mov 1*4(ctx), b | |
481 | mov 2*4(ctx), c | |
482 | mov 3*4(ctx), d | |
483 | mov 4*4(ctx), e | |
484 | mov 5*4(ctx), f | |
485 | mov 6*4(ctx), g | |
486 | mov 7*4(ctx), h | |
487 | #else | |
488 | mov ctx_addr, t | |
489 | mov 0*4(t), a | |
490 | mov 1*4(t), b | |
491 | mov 2*4(t), s | |
492 | mov s, c | |
493 | mov 3*4(t), d | |
494 | mov 4*4(t), e | |
495 | mov 5*4(t), s | |
496 | mov s, f | |
497 | mov 6*4(t), g | |
498 | mov 7*4(t), s | |
499 | mov s, h | |
500 | #endif | |
501 | ||
502 | // rounds 0:47 interleaved with W/WK update for rounds 16:63 | |
503 | rounds a, b, c, d, e, f, g, h, 0 | |
504 | message_schedule W0,W1,W2,W3,16 | |
505 | rounds e, f, g, h, a, b, c, d, 4 | |
506 | message_schedule W1,W2,W3,W0,20 | |
507 | rounds a, b, c, d, e, f, g, h, 8 | |
508 | message_schedule W2,W3,W0,W1,24 | |
509 | rounds e, f, g, h, a, b, c, d, 12 | |
510 | message_schedule W3,W0,W1,W2,28 | |
511 | rounds a, b, c, d, e, f, g, h, 16 | |
512 | message_schedule W0,W1,W2,W3,32 | |
513 | rounds e, f, g, h, a, b, c, d, 20 | |
514 | message_schedule W1,W2,W3,W0,36 | |
515 | rounds a, b, c, d, e, f, g, h, 24 | |
516 | message_schedule W2,W3,W0,W1,40 | |
517 | rounds e, f, g, h, a, b, c, d, 28 | |
518 | message_schedule W3,W0,W1,W2,44 | |
519 | rounds a, b, c, d, e, f, g, h, 32 | |
520 | message_schedule W0,W1,W2,W3,48 | |
521 | rounds e, f, g, h, a, b, c, d, 36 | |
522 | message_schedule W1,W2,W3,W0,52 | |
523 | rounds a, b, c, d, e, f, g, h, 40 | |
524 | message_schedule W2,W3,W0,W1,56 | |
525 | rounds e, f, g, h, a, b, c, d, 44 | |
526 | message_schedule W3,W0,W1,W2,60 | |
527 | ||
528 | // revert K to the beginning of K256[] | |
529 | #if defined __x86_64__ | |
530 | sub $256, K | |
531 | #else | |
532 | subl $256, K | |
533 | #endif | |
534 | ||
535 | sub $1, num_blocks // num_blocks-- | |
536 | je L_final_block // if final block, wrap up final rounds | |
537 | ||
538 | // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15 | |
539 | rounds a, b, c, d, e, f, g, h, 48 | |
540 | update_W_WK 0, W0 | |
541 | rounds e, f, g, h, a, b, c, d, 52 | |
542 | update_W_WK 1, W1 | |
543 | rounds a, b, c, d, e, f, g, h, 56 | |
544 | update_W_WK 2, W2 | |
545 | rounds e, f, g, h, a, b, c, d, 60 | |
546 | update_W_WK 3, W3 | |
547 | ||
548 | add $64, K | |
549 | #if defined (__x86_64__) | |
550 | add $64, data | |
551 | #else | |
552 | add $64, data_addr | |
553 | #endif | |
554 | ||
555 | // ctx->states += digests a-h | |
556 | #if defined (__x86_64__) | |
557 | add a, 0*4(ctx) | |
558 | add b, 1*4(ctx) | |
559 | add c, 2*4(ctx) | |
560 | add d, 3*4(ctx) | |
561 | add e, 4*4(ctx) | |
562 | add f, 5*4(ctx) | |
563 | add g, 6*4(ctx) | |
564 | add h, 7*4(ctx) | |
565 | #else | |
566 | mov ctx_addr, t | |
567 | add a, 0*4(t) | |
568 | add b, 1*4(t) | |
569 | mov c, s | |
570 | add s, 2*4(t) | |
571 | add d, 3*4(t) | |
572 | add e, 4*4(t) | |
573 | mov f, s | |
574 | add s, 5*4(t) | |
575 | add g, 6*4(t) | |
576 | mov h, s | |
577 | add s, 7*4(t) | |
578 | #endif | |
579 | ||
580 | jmp L_loop // branch for next block | |
581 | ||
582 | // wrap up digest update round 48:63 for final block | |
583 | L_final_block: | |
584 | rounds a, b, c, d, e, f, g, h, 48 | |
585 | rounds e, f, g, h, a, b, c, d, 52 | |
586 | rounds a, b, c, d, e, f, g, h, 56 | |
587 | rounds e, f, g, h, a, b, c, d, 60 | |
588 | ||
589 | // ctx->states += digests a-h | |
590 | #if defined (__x86_64__) | |
591 | add a, 0*4(ctx) | |
592 | add b, 1*4(ctx) | |
593 | add c, 2*4(ctx) | |
594 | add d, 3*4(ctx) | |
595 | add e, 4*4(ctx) | |
596 | add f, 5*4(ctx) | |
597 | add g, 6*4(ctx) | |
598 | add h, 7*4(ctx) | |
599 | #else | |
600 | mov ctx_addr, t | |
601 | add a, 0*4(t) | |
602 | add b, 1*4(t) | |
603 | mov c, s | |
604 | add s, 2*4(t) | |
605 | add d, 3*4(t) | |
606 | add e, 4*4(t) | |
607 | mov f, s | |
608 | add s, 5*4(t) | |
609 | add g, 6*4(t) | |
610 | mov h, s | |
611 | add s, 7*4(t) | |
612 | #endif | |
613 | ||
614 | // if kernel, restore xmm0-xmm7 | |
615 | #if KERNEL | |
616 | movdqa 0*16+xmm_save, %xmm0 | |
617 | movdqa 1*16+xmm_save, %xmm1 | |
618 | movdqa 2*16+xmm_save, %xmm2 | |
619 | movdqa 3*16+xmm_save, %xmm3 | |
620 | movdqa 4*16+xmm_save, %xmm4 | |
621 | movdqa 5*16+xmm_save, %xmm5 | |
622 | movdqa 6*16+xmm_save, %xmm6 | |
623 | movdqa 7*16+xmm_save, %xmm7 | |
624 | #endif | |
625 | ||
626 | // free allocated stack memory | |
627 | add $stack_size, sp | |
628 | ||
629 | // restore callee-saved registers | |
630 | #if defined (__x86_64__) | |
631 | pop %r15 | |
632 | pop %r14 | |
633 | pop %r13 | |
634 | pop %r12 | |
635 | pop %rbx | |
636 | pop %rbp | |
637 | #else | |
638 | pop %edi | |
639 | pop %esi | |
640 | pop %ebx | |
641 | pop %ebp | |
642 | #endif | |
643 | ||
644 | // return | |
645 | ret | |
646 | ||
647 | ||
648 | #endif // x86_64/i386 | |
649 |