/* sha1edp.s : this file provides optimized x86_64 and i386 implementations of the sha1 function
   CoreOS - vector and numerics group

   The implementation is based on the principle described in an Intel online article
   "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/

Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function
	void SHA1( int HASH[], int MESSAGE[] )
	{
		int A[81], B[81], C[81], D[81], E[81], W[80], i, FN;

		A[0] = HASH[0]; B[0] = HASH[1]; C[0] = HASH[2]; D[0] = HASH[3]; E[0] = HASH[4];

		for ( i=0; i<80; ++i )
		{
			if ( i < 16 )
				W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
			else
				W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

			FN = F( i, B[i], C[i], D[i] );

			A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
			B[i+1] = A[i]; C[i+1] = ROTATE_LEFT( B[i], 30 );
			D[i+1] = C[i]; E[i+1] = D[i];
		}

		HASH[0] += A[80]; HASH[1] += B[80]; HASH[2] += C[80]; HASH[3] += D[80]; HASH[4] += E[80];
	}
For i=0:15, W[i] is simply the big-endian load of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

The following approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79 (a scalar sketch of one quadruple follows this list):

1. the computation is done on 4 consecutive W[i] values in a single XMM register

	W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
	W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
	W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
	W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1

2. the last value W[i+3] then needs an additional XOR with W[i] rol 1, which unfortunately requires several additional operations
3. once we have 4 W[i] values in an XMM register, we can also add four K values with one instruction
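
As a scalar sketch (plain C, with ROTATE_LEFT as in the pseudo-code above; illustration only, not the assembly below), one quadruple of this update is:

	for ( i=16; i<32; i+=4 ) {
		W[i  ] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
		W[i+1] = ROTATE_LEFT( W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15], 1 );
		W[i+2] = ROTATE_LEFT( W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14], 1 );
		W[i+3] = ROTATE_LEFT(    0   ^ W[i-5] ^ W[i-11] ^ W[i-13], 1 );	// W[i] is not ready yet
		W[i+3] ^= ROTATE_LEFT( W[i], 1 );	// patch in the missing term: (x^y) rol 1 == (x rol 1) ^ (y rol 1)
	}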
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on.
The Dean Gaudet approach can be expressed as

1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16, 1);
2. W[i+3] ^= W[i] rol 1

For i>=32, the Intel online article suggests that (using the basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to (see the scalar form below)

1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
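
In scalar terms this is the recurrence (a sketch; it follows from unrolling the vector equation above)

	for ( i=32; i<80; ++i )
		W[i] = ROTATE_LEFT( W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2 );

in which every term is at distance >= 4, so a full XMM vector of 4 W values can be computed without the W[i-3] dependency problem.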
1. In total, we need 8 16-byte registers or memory locations for W0,W4,...,W28; W0 and W32 can share the same register or memory location.
2. The registers are used in a circular-buffer fashion. For example, we start with W28,W24,...,W0 (with W0 holding the most recent 16 bytes).
3. Two ssse3 instructions are used in the Intel article: pshufb and palignr.
	a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
	b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
4. We probe __cpu_capabilities to detect ssse3 support and dispatch the ssse3 code when available.
	If ssse3 is not supported, suboptimal code (with pshufb and palignr replaced by workarounds) is dispatched; a C-level sketch of this dispatch follows.

*/
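/*
	A C-level sketch of the dispatch (illustration only: the C prototypes and the C-side
	spelling of __cpu_capabilities are assumptions; the actual probe and branch are done
	in assembly near the end of this file):

	extern int _cpu_capabilities;		// assumed C-level view of __cpu_capabilities

	void SHA1Transform(int HASH[], const int MESSAGE[], unsigned cnt)
	{
		if (_cpu_capabilities & kHasSupplementalSSE3)
			SHA1Transform_ssse3(HASH, MESSAGE, cnt);	// pshufb/palignr code path
		else
			SHA1Transform_nossse3(HASH, MESSAGE, cnt);	// workaround code path
	}
*/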
/* the code can be compiled into single block (64 bytes) per call mode by setting Multiple_Blocks to 0 */
#define Multiple_Blocks 1
#if defined (__x86_64__) || defined(__i386__)		// x86_64 or i386 architectures

#if defined(__x86_64__)

	#define stack_size  (8+16*11+16*4)  // 8 (alignment) + x0-x10 + 4 128-bit slots for intermediate WK(t) storage
	#define sp          %rsp            // unifying architectural stack pointer representation
	#define ctx         %rdi            // 1st input argument, will move to HASH_PTR (%r9)
	#define buf         %rsi            // 2nd input argument, will move to BUFFER_PTR (%r10)
	#define cnt         %r11            // will copy from the 3rd input argument (%rdx)
	#define K_BASE      %r8             // an aligned pointer to the table of K values (with the pshufb byte-swap control at offset 0x40)
	#define HASH_PTR    %r9             // pointer to Hash values (A,B,C,D,E)
	#define BUFFER_PTR  %r10            // pointer to input blocks
	#define stack_size  (12+16*2+16*11+16*4)    // 12 bytes (alignment) + 2 extra + 11 (W24/W28/XMM_SHUFB_BSWAP + xmm0-xmm7) + 4 (WK(t)), in 16-byte units
	#define sp          %esp                    // unifying architectural stack pointer representation
	#define HASH_PTR    stack_size+16+4(sp)     // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp)
	#define BUFFER_PTR  stack_size+16+8(sp)     // use 2nd input argument from caller function
	#define cnt         stack_size+16+12(sp)    // use 3rd input argument from caller function
	#define K_BASE      stack_size-4(sp)        // stack slot used for K_BASE
// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support

#if defined(__x86_64__)
	#define W24             %xmm8
	#define W28             %xmm9
	#define XMM_SHUFB_BSWAP %xmm10      // used only when ssse3 is supported
#else // defined (__i386__)
	#define W24             12*16(sp)
	#define W28             13*16(sp)
	#define XMM_SHUFB_BSWAP 14*16(sp)   // used only when ssse3 is supported
#endif

#define xmov    movaps      // aligned 16-byte move
#define xmovu   movups      // unaligned 16-byte move

// intermediate hash variables

#define WK(t)   (t&15)*4(sp)
// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }

// int F2(int B, int C, int D) { return (D ^ B ^ C); }

// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }

// for i=60:79, F4 is identical to F2
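
/*
	For reference, the selection of round function and constant in plain C (a sketch
	consistent with F1-F4 above and the K1-K4 constants defined at the end of this file):

	int F(int i, int B, int C, int D)
	{
		if (i < 20) return D ^ (B & (C ^ D));		// F1, with K1 = 0x5a827999
		if (i < 40) return D ^ B ^ C;				// F2, with K2 = 0x6ed9eba1
		if (i < 60) return (B & C) | (D & (B ^ C));	// F3, with K3 = 0x8f1bbcdc
		return D ^ B ^ C;							// F4 = F2, with K4 = 0xca62c1d6
	}
*/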
/*
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);

with ssse3 support, this is achieved via (an intrinsics sketch follows)

	for (i=0;i<16;i+=4) {
		1. W_TMP = new 16 bytes from MESSAGE[]
		2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
		3. W_TMP += {K,K,K,K};
		4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
	}
*/
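/*
	The same four steps as an intrinsics-level sketch (illustration only; the real code
	below is assembly, and the shuffle control shown here mirrors bswap_shufb_ctl at the
	end of this file):

	#include <tmmintrin.h>		// SSSE3
	// returns W = BIG_ENDIAN_LOAD of 4 message words; stores W+K to wk; kq = {K,K,K,K}
	static __m128i w_00_15(const void *msg, __m128i kq, __m128i *wk)
	{
		const __m128i bswap = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
		__m128i w = _mm_loadu_si128((const __m128i *)msg);	// step 1: possibly unaligned load
		w   = _mm_shuffle_epi8(w, bswap);					// step 2: pshufb big-endian conversion
		*wk = _mm_add_epi32(w, kq);							// step 3: add {K,K,K,K}
		return w;											// step 4: caller stores *wk to the stack
	}
*/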
// each step is represented in one of the following 4 macro definitions
.macro W_PRECALC_00_15_0_ssse3          // input argument $0 : 0/4/8/12
#if defined (__x86_64__)                // BUFFER_PTR is already an address register in x86_64
    xmovu   $0*4(BUFFER_PTR), W_TMP     // read 16 bytes into W_TMP; BUFFER_PTR possibly not 16-byte aligned
#else                                   // BUFFER_PTR is from the argument set up in the caller
    mov     BUFFER_PTR, T1              // T1 = BUFFER_PTR
    xmovu   $0*4(T1), W_TMP             // read 16 bytes into W_TMP; BUFFER_PTR possibly not 16-byte aligned
#endif
.endmacro
.macro W_PRECALC_00_15_1_ssse3          // input argument $0 : current 16 bytes in the circular buffer, one of W0,W4,W8,...,W28
    pshufb  XMM_SHUFB_BSWAP, W_TMP      // convert W_TMP from little-endian into big-endian
    xmov    W_TMP, $0                   // save W_TMP in the circular buffer
.endmacro
.macro W_PRECALC_00_15_2                // K_BASE points to the current K quadruple.
#if defined (__x86_64__)                // K_BASE is already an address register in x86_64
    paddd   (K_BASE), W_TMP             // W_TMP += {K,K,K,K};
#else                                   // K_BASE is previously set up in the stack memory
    mov     K_BASE, T1                  // T1 = K_BASE
    paddd   (T1), W_TMP                 // W_TMP += {K,K,K,K};
#endif
.endmacro
.macro W_PRECALC_00_15_3                // input argument $0 : i
    xmov    W_TMP, WK($0&~3)            // save quadruple W[i]+K in the stack memory, used later for updating the hashes A/B/C/D/E
.endmacro
/*
without ssse3 support, steps 1 and 2 need to be modified (a C sketch follows)
	1. sequentially load each of the 4 words into T1, bswap T1, and save it to 4 bytes of the stack space
	2. load the 16 bytes from the aligned stack memory into W_TMP
*/
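/*
	A C-level sketch of this fallback (illustration only; __builtin_bswap32 stands in for
	the bswap instruction, memcpy for the unaligned loads, and the helper name is made up):

	#include <stdint.h>
	#include <string.h>
	static void be_load_4words_scalar(uint32_t w[4], const uint8_t *msg)
	{
		uint32_t scratch[4] __attribute__((aligned(16)));
		for (int j = 0; j < 4; ++j) {
			uint32_t t;
			memcpy(&t, msg + 4*j, 4);			// load one word (possibly unaligned)
			scratch[j] = __builtin_bswap32(t);	// bswap, save to the aligned scratch area
		}
		memcpy(w, scratch, 16);					// one aligned 16-byte load in the assembly
	}
*/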
.macro W_PRECALC_00_15_0_nossse3        // input argument $0 : 0/4/8/12

#if defined (__x86_64__)
    #define BUFFERP BUFFER_PTR          // BUFFER_PTR is already an address register
#else
    mov     BUFFER_PTR, T2              // copy BUFFER_PTR (from caller 2nd argument) to T2
    #define BUFFERP T2
#endif

    // load 1st word, bswap it, save it to stack
    mov     $0*4(BUFFERP), T1
    bswap   T1
    mov     T1, 14*16(sp)

    // load 2nd word, bswap it, save it to stack
    mov     4+$0*4(BUFFERP), T1
    bswap   T1
    mov     T1, 4+14*16(sp)

    // load 3rd word, bswap it, save it to stack
    mov     8+$0*4(BUFFERP), T1
    bswap   T1
    mov     T1, 8+14*16(sp)

    // load 4th word, bswap it, save it to stack
    mov     12+$0*4(BUFFERP), T1
    bswap   T1
    mov     T1, 12+14*16(sp)

    #undef  BUFFERP
.endmacro
.macro W_PRECALC_00_15_1_nossse3        // input argument $0 : current 16 bytes in the circular buffer, one of W0,W4,W8,...,W28
    xmov    14*16(sp), W_TMP            // load the bswapped 16 bytes from the aligned stack memory
    xmov    W_TMP, $0                   // save W = W_TMP in the circular buffer
.endmacro
// rounds 16-31 compute W0 (the next four W values) using the vectorization approach by Dean Gaudet
/*
	W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
	W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
	W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
	W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1

	W[i+3] ^= W[i] rol 1;	// this W[i] has already been rotated left by 1; if we take it from the initial W before the rol 1, we should rol it by 2

The operation (updating W and W+K) is scheduled and divided into 4 steps (an intrinsics sketch follows):

	0. W_TMP = W3; W = W14 ^ W8;
	1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
	2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 between W_TMP2 and W
	3. W = W_TMP = ((W3 ^ W8 ^ W14 ^ W16) rol 1) ^ ((W[i] 0 0 0) rol 2); WK = W_TMP + K;
*/
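/*
	The same four steps as an intrinsics-level sketch (illustration only; lane 0 holds
	W[i], matching the macros below; kq = {K,K,K,K}):

	#include <tmmintrin.h>
	static __m128i w_16_31(__m128i w16, __m128i w12, __m128i w8, __m128i w4,
	                       __m128i kq, __m128i *wk)
	{
		__m128i w    = _mm_alignr_epi8(w12, w16, 8);	// W14
		__m128i wtmp = _mm_srli_si128(w4, 4);			// W3 (highest lane = 0)
		w    = _mm_xor_si128(w, w8);					// W8 ^ W14
		wtmp = _mm_xor_si128(wtmp, w16);				// W3 ^ W16
		w    = _mm_xor_si128(w, wtmp);					// W3 ^ W8 ^ W14 ^ W16
		__m128i hi = _mm_slli_si128(w, 12);				// (W[i] 0 0 0), pre-rotation
		w    = _mm_or_si128(_mm_slli_epi32(w, 1), _mm_srli_epi32(w, 31));	// ... rol 1
		hi   = _mm_or_si128(_mm_slli_epi32(hi, 2), _mm_srli_epi32(hi, 30));	// (W[i] 0 0 0) rol 2
		w    = _mm_xor_si128(w, hi);					// new W0
		*wk  = _mm_add_epi32(w, kq);					// W + K
		return w;
	}
*/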
.macro W_PRECALC_16_31_0_ssse3          // input arguments : W16,W12,W8,W4,W
    xmov    $1, $4                      // W = W12
    palignr $$8, $0, $4                 // W = W14
    xmov    $3, W_TMP                   // W_TMP = W4
    psrldq  $$4, W_TMP                  // W_TMP = W3
    pxor    $2, $4                      // W = W8 ^ W14
.endmacro
.macro W_PRECALC_16_31_1                // input arguments : W16,W
    pxor    $0, W_TMP                   // W_TMP = W3 ^ W16
    pxor    W_TMP, $1                   // W = W3 ^ W16 ^ W8 ^ W14
    xmov    $1, W_TMP2                  // W_TMP2 = W3 ^ W16 ^ W8 ^ W14
    xmov    $1, W_TMP                   // W_TMP = W3 ^ W16 ^ W8 ^ W14
    pslldq  $$12, W_TMP2                // W_TMP2 = (W[i] 0 0 0)
.endmacro
.macro W_PRECALC_16_31_2                // input argument : W
    psrld   $$31, $0                    // (W3 ^ W16 ^ W8 ^ W14) >> 31
    pslld   $$1, W_TMP                  // (W3 ^ W16 ^ W8 ^ W14) << 1
    por     $0, W_TMP                   // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
    xmov    W_TMP2, $0                  // copy W[i] to the location of W[i+3]
    psrld   $$30, W_TMP2                // W_TMP2 = W[i] lower 2 bits after rol 2
    pslld   $$2, $0                     // W = W[i] higher 30 bits after rol 2
.endmacro
.macro W_PRECALC_16_31_3                // input arguments : W, i, K_XMM
#if defined (__i386__)
    mov     K_BASE, T1                  // K_BASE is stored in the stack memory for i386
#endif
    pxor    W_TMP2, W_TMP               // W_TMP = ((W3 ^ W16 ^ W8 ^ W14) rol 1) ^ ((W[i] 0 0 0) rol 2)
    xmov    W_TMP, $0                   // save W = W_TMP in the W circular buffer
#if defined (__x86_64__)
    paddd   $2(K_BASE), W_TMP           // W+K
#else
    paddd   $2(T1), W_TMP               // W+K
#endif
    xmov    W_TMP, WK($1&~3)            // save WK = W+K for later update of the hashes A/B/C/D/E
.endmacro
// the following is a variant of W_PRECALC_16_31_0_ssse3 to be used for systems without ssse3; palignr is replaced with 4 instructions

.macro W_PRECALC_16_31_0_nossse3        // input arguments : W16,W12,W8,W4,W
    xmov    $1, $4                      // W = W12 = (w9 w10 w11 w12)

    // the following is a workaround for palignr
    xmov    $0, W_TMP                   // W16 = (w13 w14 w15 w16)
    pslldq  $$8, $4                     // shift left to make (w11 w12 0 0)
    psrldq  $$8, W_TMP                  // shift right to make (0 0 w13 w14)
    por     W_TMP, $4                   // W = W14 = (w11 w12 w13 w14)

    xmov    $3, W_TMP                   // W_TMP = W4 = (w1 w2 w3 w4)
    psrldq  $$4, W_TMP                  // W_TMP = W3 = (0 w1 w2 w3)
    pxor    $2, $4                      // W = W8 ^ W14
.endmacro
/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article

	W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);

where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
The operation is divided into 4 steps (an intrinsics sketch follows):

	0. W_TMP = W6; W = W28 ^ W32;
	1. W = W_TMP = W6 ^ W16 ^ W28 ^ W32;
	2. W_TMP = (W6 ^ W16 ^ W28 ^ W32) rol 2;
	3. W = W_TMP; WK = W_TMP + K;
*/
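/*
	The same four steps as an intrinsics-level sketch (illustration only; the result
	reuses W32's slot, as the macros below do; kq = {K,K,K,K}):

	#include <tmmintrin.h>
	static __m128i w_32_79(__m128i w32, __m128i w28, __m128i w16,
	                       __m128i w8, __m128i w4, __m128i kq, __m128i *wk)
	{
		__m128i w6 = _mm_alignr_epi8(w4, w8, 8);			// W6
		__m128i w  = _mm_xor_si128(_mm_xor_si128(w32, w28),
		                           _mm_xor_si128(w16, w6));	// W6 ^ W16 ^ W28 ^ W32
		w   = _mm_or_si128(_mm_slli_epi32(w, 2), _mm_srli_epi32(w, 30));	// ... rol 2
		*wk = _mm_add_epi32(w, kq);							// W + K
		return w;											// becomes the new W0
	}
*/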
.macro W_PRECALC_32_79_0_ssse3          // input arguments : W28,W8,W4,W
    xmov    $2, W_TMP                   // W_TMP = W4 = (w1 w2 w3 w4)
    pxor    $0, $3                      // W = W28 ^ W32
    palignr $$8, $1, W_TMP              // W_TMP = (w3 w4 w5 w6) = W6
.endmacro
// the following is a variant that will be used for systems without ssse3 support
.macro W_PRECALC_32_79_0_nossse3        // input arguments : W28,W8,W4,W
    xmov    $2, W_TMP                   // W_TMP = W4 = (w1 w2 w3 w4)
    xmov    $1, W_TMP2                  // W_TMP2 = W8 = (w5 w6 w7 w8)
    pxor    $0, $3                      // W = W28 ^ W32
    pslldq  $$8, W_TMP                  // (w3 w4 0 0)
    psrldq  $$8, W_TMP2                 // (0 0 w5 w6)
    por     W_TMP2, W_TMP               // W_TMP = (w3 w4 w5 w6) = W6
.endmacro
// this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers)
.macro W_PRECALC_32_79_0_i386_ssse3     // input arguments : W28,W8,W4,W
    xmov    $3, W_TMP                   // W32
    pxor    $0, W_TMP                   // W28 ^ W32
    xmov    W_TMP, $3                   // W = W28 ^ W32
    xmov    $2, W_TMP                   // W_TMP = W4 = (w1 w2 w3 w4)
    palignr $$8, $1, W_TMP              // W_TMP = (w3 w4 w5 w6) = W6
.endmacro
// this is a variant of W_PRECALC_32_79_0_nossse3 for i386 (as W24/W28 are stored in memory, not in registers)
.macro W_PRECALC_32_79_0_i386_nossse3   // input arguments : W28,W8,W4,W
    xmov    $3, W_TMP                   // W32
    pxor    $0, W_TMP                   // W28 ^ W32
    xmov    W_TMP, $3                   // W = W28 ^ W32
    xmov    $2, W_TMP                   // W4 = (w1 w2 w3 w4)
    xmov    $1, W_TMP2                  // W8 = (w5 w6 w7 w8)
    pslldq  $$8, W_TMP                  // (w3 w4 0 0)
    psrldq  $$8, W_TMP2                 // (0 0 w5 w6)
    por     W_TMP2, W_TMP               // W_TMP = (w3 w4 w5 w6) = W6
.endmacro
.macro W_PRECALC_32_79_1                // input arguments : W16,W
    pxor    $0, W_TMP                   // W_TMP = W6 ^ W16
    pxor    $1, W_TMP                   // W_TMP = W6 ^ W16 ^ W28 ^ W32
    xmov    W_TMP, $1                   // W = W_TMP = W6 ^ W16 ^ W28 ^ W32
.endmacro
.macro W_PRECALC_32_79_2                // input argument : W
    psrld   $$30, $0                    // W >> 30
    pslld   $$2, W_TMP                  // W_TMP << 2
    por     $0, W_TMP                   // W_TMP = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endmacro
// this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers)
// it should be used when the input is either W24 or W28 on the i386 architecture
.macro W_PRECALC_32_79_2_i386           // input argument : W
    xmov    $0, W_TMP2                  // W_TMP2 = W
    psrld   $$30, W_TMP2                // W >> 30
    xmov    W_TMP2, $0                  // save (W >> 30) at W
    pslld   $$2, W_TMP                  // W_TMP << 2
    por     $0, W_TMP                   // W_TMP = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endmacro
.macro W_PRECALC_32_79_3                // input arguments : W, i, K_XMM
#if defined (__x86_64__)
    xmov    W_TMP, $0                   // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
    paddd   $2(K_BASE), W_TMP           // W + K
    xmov    W_TMP, WK($1&~3)            // write W+K
#else
    mov     K_BASE, T1                  // T1 = K_BASE (which is kept in the stack memory)
    xmov    W_TMP, $0                   // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
    paddd   $2(T1), W_TMP               // W_TMP = W + K
    xmov    W_TMP, WK($1&~3)            // write WK
#endif
.endmacro
/* The hash update operation is completed by the following statements.

	A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
	B[i+1] = A[i];
	C[i+1] = ROTATE_LEFT( B[i], 30 );
	D[i+1] = C[i];
	E[i+1] = D[i];

Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:

	A1 = FN + E0 + rol(A0,5) + WK;
	B1 = A0;
	C1 = rol(B0, 30);
	D1 = C0;
	E1 = D0;

To avoid excessive data movement between registers,
	1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
	2. C1 = rol(B0,30) can be temporarily saved in B0.

Therefore, ignoring the time index, the update operation is equivalent to
	1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
	2. B = rol(B,30)
	3. the hashes are now stored in the order of E,A,B,C,D

To pack 2 hash update operations into 1 iteration, starting with A,B,C,D,E (a C sketch of one packed pair follows)
	1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
	2. B = rol(B,30)
	// now the hashes are in the order of E,A,B,C,D
	3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
	4. A = rol(A,30)
	// now the hashes are in the order of D,E,A,B,C
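
A C sketch of one such packed pair (illustration only; F stands for any of the round
functions F1-F4, with the already-rotated B fed to the second F, exactly as in RR1 below):

	#include <stdint.h>
	#define ROL32(x,n) (((x) << (n)) | ((x) >> (32-(n))))
	#define F(B,C,D)   ((D) ^ (B) ^ (C))		// say, F2; any of F1-F4 fits here

	// two rounds with no register-to-register moves of the 5 hash values
	static void rr_pair(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D,
	                    uint32_t *E, uint32_t WKi, uint32_t WKi1)
	{
		*E += ROL32(*A, 5) + F(*B, *C, *D) + WKi;	// E now holds the new "A"
		*B  = ROL32(*B, 30);						// B now holds the new "D"
		*D += ROL32(*E, 5) + F(*A, *B, *C) + WKi1;	// D now holds the following "A"
		*A  = ROL32(*A, 30);
		// the caller now treats the quintet in the order D,E,A,B,C
	}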
These operations are distributed into the following 2 macro definitions, RR0 and RR1.
*/
.macro RR0                              // input arguments : FN, A, B, C, D, E, i
    $0      $2, $3, $4                  // T1 = FN(B,C,D)
    add     WK($6), $5                  // E + WK(i)
    rol     $$30, $2                    // B = rol(B,30)
    mov     $1, T2                      // T2 = A
    add     WK($6+1), $4                // D + WK(i+1)
    rol     $$5, T2                     // rol(A,5)
    add     T1, $5                      // E = FN(B,C,D) + E + WK(i)
.endmacro

.macro RR1                              // input arguments : FN, A, B, C, D, E, i
    add     $5, T2                      // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
    mov     T2, $5                      // E = FN(B,C,D) + E + rol(A,5) + WK(i)
    rol     $$5, T2                     // rol(E,5)
    add     T2, $4                      // D + WK(i+1) + rol(E,5)
    $0      $1, $2, $3                  // T1 = FN(A,B,C)
    add     T1, $4                      // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
    rol     $$30, $1                    // A = rol(A,30)
.endmacro
/*
The following macro definitions are used to expand code for the per-block sha1 operation.

	INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64-byte block) into W (i=0:15) and store W+K into the stack memory
	INTERNAL_ssse3 : update W (i=16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
	ENDING : finish updating the digests A/B/C/D/E (i=64:79)

For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined
into 1 macro definition for software pipelining:

	SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64-byte block) into W (i=0:15) and store W+K into the stack, plus finish updating the digests A/B/C/D/E (i=64:79)

Assuming cnt (the number of blocks) >= 1, the main code body should look like

	INITIAL_W_PRECALC_ssse3			// W = big_endian_load and pre-compute W+K (i=0:15)

	do {
		INTERNAL_ssse3				// update W (i=16:79), and update hash digests A/B/C/D/E (i=0:63)
		BUFFER_PTR += 64;
		if (--cnt == 0) break;
		SOFTWARE_PIPELINING_ssse3;	// update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15)
		UPDATE_ALL_HASH
	} while (1);

	ENDING							// update hash digests A/B/C/D/E (i=64:79)
	UPDATE_ALL_HASH
*/
#define W_PRECALC_00_15_0       W_PRECALC_00_15_0_ssse3
#define W_PRECALC_00_15_1       W_PRECALC_00_15_1_ssse3
#define W_PRECALC_16_31_0       W_PRECALC_16_31_0_ssse3
#define W_PRECALC_32_79_0       W_PRECALC_32_79_0_ssse3
#define W_PRECALC_32_79_0_i386  W_PRECALC_32_79_0_i386_ssse3
.macro INITIAL_W_PRECALC_ssse3          // BIG_ENDIAN_LOAD(64-byte block) into W (i=0:15) and store W+K into the stack memory

    // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
    W_PRECALC_00_15_0   0               // W_TMP = (BUFFER_PTR)
    W_PRECALC_00_15_1   W0              // convert W_TMP to big-endian, and save W0 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W0 + K
    W_PRECALC_00_15_3   3               // (sp) = W_TMP = W0 + K

    // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
    W_PRECALC_00_15_0   4               // W_TMP = 16(BUFFER_PTR)
    W_PRECALC_00_15_1   W28             // convert W_TMP to big-endian, and save W28 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W28 + K
    W_PRECALC_00_15_3   7               // 16(sp) = W_TMP = W28 + K

    // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
    W_PRECALC_00_15_0   8               // W_TMP = 32(BUFFER_PTR)
    W_PRECALC_00_15_1   W24             // convert W_TMP to big-endian, and save W24 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W24 + K
    W_PRECALC_00_15_3   11              // 32(sp) = W_TMP = W24 + K

    // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_00_15_0   12              // W_TMP = 48(BUFFER_PTR)
    W_PRECALC_00_15_1   W20             // convert W_TMP to big-endian, and save W20 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W20 + K
    W_PRECALC_00_15_3   15              // 48(sp) = W_TMP = W20 + K

.endmacro
.macro INTERNAL_ssse3                   // update W (i=16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
    // note: the RR0/RR1 hash-update rounds are interleaved with the W-precalc steps below

    // i=16 : W12,W8,W4,W0,W28,W24,W20,W16
    W_PRECALC_16_31_0   W0,W28,W24,W20,W16
    W_PRECALC_16_31_1   W0,W16
    W_PRECALC_16_31_2   W16
    W_PRECALC_16_31_3   W16, 2, 0

    // i=20 : W8,W4,W0,W28,W24,W20,W16,W12
    W_PRECALC_16_31_0   W28,W24,W20,W16,W12
    W_PRECALC_16_31_1   W28,W12
    W_PRECALC_16_31_2   W12
    W_PRECALC_16_31_3   W12, 6, 16

    // i=24 : W4,W0,W28,W24,W20,W16,W12,W8
    W_PRECALC_16_31_0   W24,W20,W16,W12,W8
    W_PRECALC_16_31_1   W24,W8
    W_PRECALC_16_31_2   W8
    W_PRECALC_16_31_3   W8, 10, 16

    // i=28 : W0,W28,W24,W20,W16,W12,W8,W4
    W_PRECALC_16_31_0   W20,W16,W12,W8,W4
    W_PRECALC_16_31_1   W20,W4
    W_PRECALC_16_31_2   W4
    W_PRECALC_16_31_3   W4, 14, 16

    // i=32 : W28,W24,W20,W16,W12,W8,W4,W0
    W_PRECALC_32_79_0   W28,W8,W4,W0
    W_PRECALC_32_79_1   W16,W0
    W_PRECALC_32_79_2   W0
    W_PRECALC_32_79_3   W0, 18, 16
    // i=36 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
    W_PRECALC_32_79_0   W24,W4,W0,W28
#else
    W_PRECALC_32_79_0_i386  W24,W4,W0,W28
#endif
    W_PRECALC_32_79_1   W12,W28
#if defined (__x86_64__)
    W_PRECALC_32_79_2   W28
#else
    W_PRECALC_32_79_2_i386  W28
#endif
    W_PRECALC_32_79_3   W28, 22, 16
    // i=40 : W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
    W_PRECALC_32_79_0   W20,W0,W28,W24
#else
    W_PRECALC_32_79_0_i386  W20,W0,W28,W24
#endif
    W_PRECALC_32_79_1   W8,W24
#if defined (__x86_64__)
    W_PRECALC_32_79_2   W24
#else
    W_PRECALC_32_79_2_i386  W24
#endif
    W_PRECALC_32_79_3   W24, 26, K_XMM
    // i=44 : W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_32_79_0   W16,W28,W24,W20
    W_PRECALC_32_79_1   W4,W20
    W_PRECALC_32_79_2   W20
    W_PRECALC_32_79_3   W20, 30, K_XMM

    // i=48 : W12,W8,W4,W0,W28,W24,W20,W16
    W_PRECALC_32_79_0   W12,W24,W20,W16
    W_PRECALC_32_79_1   W0,W16
    W_PRECALC_32_79_2   W16
    W_PRECALC_32_79_3   W16, 34, K_XMM

    // i=52 : W8,W4,W0,W28,W24,W20,W16,W12
    W_PRECALC_32_79_0   W8,W20,W16,W12
    W_PRECALC_32_79_1   W28,W12
    W_PRECALC_32_79_2   W12
    W_PRECALC_32_79_3   W12, 38, K_XMM
    // i=56 : W4,W0,W28,W24,W20,W16,W12,W8
    W_PRECALC_32_79_0   W4,W16,W12,W8
    W_PRECALC_32_79_1   W24,W8
    W_PRECALC_32_79_2   W8
    W_PRECALC_32_79_3   W8, 42, K_XMM

    // i=60 : W0,W28,W24,W20,W16,W12,W8,W4
    W_PRECALC_32_79_0   W0,W12,W8,W4
    W_PRECALC_32_79_1   W20,W4
    W_PRECALC_32_79_2   W4
    W_PRECALC_32_79_3   W4, 46, K_XMM

    // i=64 : W28,W24,W20,W16,W12,W8,W4,W0
    W_PRECALC_32_79_0   W28,W8,W4,W0
    W_PRECALC_32_79_1   W16,W0
    W_PRECALC_32_79_2   W0
    W_PRECALC_32_79_3   W0, 50, K_XMM
    // i=68 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
    W_PRECALC_32_79_0   W24,W4,W0,W28
#else
    W_PRECALC_32_79_0_i386  W24,W4,W0,W28
#endif
    W_PRECALC_32_79_1   W12,W28
#if defined (__x86_64__)
    W_PRECALC_32_79_2   W28
#else
    W_PRECALC_32_79_2_i386  W28
#endif
    W_PRECALC_32_79_3   W28, 54, K_XMM
    // i=72 : W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
    W_PRECALC_32_79_0   W20,W0,W28,W24
#else
    W_PRECALC_32_79_0_i386  W20,W0,W28,W24
#endif
    W_PRECALC_32_79_1   W8,W24
#if defined (__x86_64__)
    W_PRECALC_32_79_2   W24
#else
    W_PRECALC_32_79_2_i386  W24
#endif
    W_PRECALC_32_79_3   W24, 58, K_XMM
    // i=76 : W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_32_79_0   W16,W28,W24,W20
    W_PRECALC_32_79_1   W4,W20
    W_PRECALC_32_79_2   W20
    W_PRECALC_32_79_3   W20, 62, K_XMM

.endmacro
.macro SOFTWARE_PIPELINING_ssse3
    // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
    W_PRECALC_00_15_0   0               // W_TMP = (BUFFER_PTR)
    W_PRECALC_00_15_1   W0              // convert W_TMP to big-endian, and save W0 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W0 + K
    W_PRECALC_00_15_3   3               // (sp) = W_TMP = W0 + K

    // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
    W_PRECALC_00_15_0   4               // W_TMP = 16(BUFFER_PTR)
    W_PRECALC_00_15_1   W28             // convert W_TMP to big-endian, and save W28 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W28 + K
    W_PRECALC_00_15_3   7               // 16(sp) = W_TMP = W28 + K

    // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
    W_PRECALC_00_15_0   8               // W_TMP = 32(BUFFER_PTR)
    W_PRECALC_00_15_1   W24             // convert W_TMP to big-endian, and save W24 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W24 + K
    W_PRECALC_00_15_3   11              // 32(sp) = W_TMP = W24 + K

    // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_00_15_0   12              // W_TMP = 48(BUFFER_PTR)
    W_PRECALC_00_15_1   W20             // convert W_TMP to big-endian, and save W20 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W20 + K
    W_PRECALC_00_15_3   15              // 48(sp) = W_TMP = W20 + K

.endmacro
#undef W_PRECALC_00_15_0
#undef W_PRECALC_00_15_1
#undef W_PRECALC_16_31_0
#undef W_PRECALC_32_79_0
#undef W_PRECALC_32_79_0_i386
/*
The following are the no-ssse3 variants of the previous 3 macro definitions:

	INITIAL_W_PRECALC_nossse3
	INTERNAL_nossse3
	SOFTWARE_PIPELINING_nossse3

They will be used in a sha1 code main body definition for systems without ssse3 support.
*/
#define W_PRECALC_00_15_0       W_PRECALC_00_15_0_nossse3
#define W_PRECALC_00_15_1       W_PRECALC_00_15_1_nossse3
#define W_PRECALC_16_31_0       W_PRECALC_16_31_0_nossse3
#define W_PRECALC_32_79_0       W_PRECALC_32_79_0_nossse3
#define W_PRECALC_32_79_0_i386  W_PRECALC_32_79_0_i386_nossse3
.macro INITIAL_W_PRECALC_nossse3

    // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
    W_PRECALC_00_15_0   0               // W_TMP = (BUFFER_PTR)
    W_PRECALC_00_15_1   W0              // convert W_TMP to big-endian, and save W0 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W0 + K
    W_PRECALC_00_15_3   3               // (sp) = W_TMP = W0 + K

    // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
    W_PRECALC_00_15_0   4               // W_TMP = 16(BUFFER_PTR)
    W_PRECALC_00_15_1   W28             // convert W_TMP to big-endian, and save W28 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W28 + K
    W_PRECALC_00_15_3   7               // 16(sp) = W_TMP = W28 + K

    // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
    W_PRECALC_00_15_0   8               // W_TMP = 32(BUFFER_PTR)
    W_PRECALC_00_15_1   W24             // convert W_TMP to big-endian, and save W24 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W24 + K
    W_PRECALC_00_15_3   11              // 32(sp) = W_TMP = W24 + K

    // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_00_15_0   12              // W_TMP = 48(BUFFER_PTR)
    W_PRECALC_00_15_1   W20             // convert W_TMP to big-endian, and save W20 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W20 + K
    W_PRECALC_00_15_3   15              // 48(sp) = W_TMP = W20 + K

.endmacro
.macro INTERNAL_nossse3
    // note: the RR0/RR1 hash-update rounds are interleaved with the W-precalc steps below

    // circular buffer : W12,W8,W4,W0,W28,W24,W20,W16
    W_PRECALC_16_31_0   W0,W28,W24,W20,W16
    W_PRECALC_16_31_1   W0,W16
    W_PRECALC_16_31_2   W16
    W_PRECALC_16_31_3   W16, 2, 0

    // W8,W4,W0,W28,W24,W20,W16,W12
    W_PRECALC_16_31_0   W28,W24,W20,W16,W12
    W_PRECALC_16_31_1   W28,W12
    W_PRECALC_16_31_2   W12
    W_PRECALC_16_31_3   W12, 6, 16

    // W4,W0,W28,W24,W20,W16,W12,W8
    W_PRECALC_16_31_0   W24,W20,W16,W12,W8
    W_PRECALC_16_31_1   W24,W8
    W_PRECALC_16_31_2   W8
    W_PRECALC_16_31_3   W8, 10, 16

    // W0,W28,W24,W20,W16,W12,W8,W4
    W_PRECALC_16_31_0   W20,W16,W12,W8,W4
    W_PRECALC_16_31_1   W20,W4
    W_PRECALC_16_31_2   W4
    W_PRECALC_16_31_3   W4, 14, 16

    // W28,W24,W20,W16,W12,W8,W4,W0
    W_PRECALC_32_79_0   W28,W8,W4,W0
    W_PRECALC_32_79_1   W16,W0
    W_PRECALC_32_79_2   W0
    W_PRECALC_32_79_3   W0, 18, 16
    // W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
    W_PRECALC_32_79_0   W24,W4,W0,W28
#else
    W_PRECALC_32_79_0_i386  W24,W4,W0,W28
#endif
    W_PRECALC_32_79_1   W12,W28
#if defined (__x86_64__)
    W_PRECALC_32_79_2   W28
#else
    W_PRECALC_32_79_2_i386  W28
#endif
    W_PRECALC_32_79_3   W28, 22, 16
    // W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
    W_PRECALC_32_79_0   W20,W0,W28,W24
#else
    W_PRECALC_32_79_0_i386  W20,W0,W28,W24
#endif
    W_PRECALC_32_79_1   W8,W24
#if defined (__x86_64__)
    W_PRECALC_32_79_2   W24
#else
    W_PRECALC_32_79_2_i386  W24
#endif
    W_PRECALC_32_79_3   W24, 26, K_XMM
    // W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_32_79_0   W16,W28,W24,W20
    W_PRECALC_32_79_1   W4,W20
    W_PRECALC_32_79_2   W20
    W_PRECALC_32_79_3   W20, 30, K_XMM

    // W12,W8,W4,W0,W28,W24,W20,W16
    W_PRECALC_32_79_0   W12,W24,W20,W16
    W_PRECALC_32_79_1   W0,W16
    W_PRECALC_32_79_2   W16
    W_PRECALC_32_79_3   W16, 34, K_XMM

    // W8,W4,W0,W28,W24,W20,W16,W12
    W_PRECALC_32_79_0   W8,W20,W16,W12
    W_PRECALC_32_79_1   W28,W12
    W_PRECALC_32_79_2   W12
    W_PRECALC_32_79_3   W12, 38, K_XMM

    // W4,W0,W28,W24,W20,W16,W12,W8
    W_PRECALC_32_79_0   W4,W16,W12,W8
    W_PRECALC_32_79_1   W24,W8
    W_PRECALC_32_79_2   W8
    W_PRECALC_32_79_3   W8, 42, K_XMM

    // W0,W28,W24,W20,W16,W12,W8,W4
    W_PRECALC_32_79_0   W0,W12,W8,W4
    W_PRECALC_32_79_1   W20,W4
    W_PRECALC_32_79_2   W4
    W_PRECALC_32_79_3   W4, 46, K_XMM

    // W28,W24,W20,W16,W12,W8,W4,W0
    W_PRECALC_32_79_0   W28,W8,W4,W0
    W_PRECALC_32_79_1   W16,W0
    W_PRECALC_32_79_2   W0
    W_PRECALC_32_79_3   W0, 50, K_XMM
1030 // W24,W20,W16,W12,W8,W4,W0,W28
1031 #if defined (__x86_64__)
1032 W_PRECALC_32_79_0 W24,W4,W0,W28
1034 W_PRECALC_32_79_0_i386 W24,W4,W0,W28
1037 W_PRECALC_32_79_1 W12,W28
1039 #if defined (__x86_64__)
1040 W_PRECALC_32_79_2 W28
1042 W_PRECALC_32_79_2_i386 W28
1045 W_PRECALC_32_79_3 W28,54,K_XMM
    // W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
    W_PRECALC_32_79_0   W20,W0,W28,W24
#else
    W_PRECALC_32_79_0_i386  W20,W0,W28,W24
#endif
    W_PRECALC_32_79_1   W8,W24
#if defined (__x86_64__)
    W_PRECALC_32_79_2   W24
#else
    W_PRECALC_32_79_2_i386  W24
#endif
    W_PRECALC_32_79_3   W24, 58, K_XMM
    // start using F4

    // W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_32_79_0   W16,W28,W24,W20
    W_PRECALC_32_79_1   W4,W20
    W_PRECALC_32_79_2   W20
    W_PRECALC_32_79_3   W20, 62, K_XMM

.endmacro
.macro SOFTWARE_PIPELINING_nossse3
    // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
    W_PRECALC_00_15_0   0               // W_TMP = (BUFFER_PTR)
    W_PRECALC_00_15_1   W0              // convert W_TMP to big-endian, and save W0 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W0 + K
    W_PRECALC_00_15_3   3               // (sp) = W_TMP = W0 + K

    // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
    W_PRECALC_00_15_0   4               // W_TMP = 16(BUFFER_PTR)
    W_PRECALC_00_15_1   W28             // convert W_TMP to big-endian, and save W28 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W28 + K
    W_PRECALC_00_15_3   7               // 16(sp) = W_TMP = W28 + K

    // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
    W_PRECALC_00_15_0   8               // W_TMP = 32(BUFFER_PTR)
    W_PRECALC_00_15_1   W24             // convert W_TMP to big-endian, and save W24 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W24 + K
    W_PRECALC_00_15_3   11              // 32(sp) = W_TMP = W24 + K

    // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
    W_PRECALC_00_15_0   12              // W_TMP = 48(BUFFER_PTR)
    W_PRECALC_00_15_1   W20             // convert W_TMP to big-endian, and save W20 = W_TMP
    W_PRECALC_00_15_2                   // W_TMP = W20 + K
    W_PRECALC_00_15_3   15              // 48(sp) = W_TMP = W20 + K

.endmacro
.macro ENDING                           // finish up updating the hash digests (i=64:79)
// load hash digests A,B,C,D,E from memory into registers
.macro LOAD_HASH
#if defined (__x86_64__)
    mov     (HASH_PTR), A
    mov     4(HASH_PTR), B
    mov     8(HASH_PTR), C
    mov     12(HASH_PTR), D
    mov     16(HASH_PTR), E
#else
    mov     HASH_PTR, T1                // T1 = HASH_PTR (stack argument on i386)
    mov     (T1), A
    mov     4(T1), B
    mov     8(T1), C
    mov     12(T1), D
    mov     16(T1), E
#endif
.endmacro
// UPDATE_HASH  mem, reg  computes  reg += mem; mem = reg
// (the registers keep the running digests, which carry over into the next block)
.macro UPDATE_HASH
    add     $0, $1
    mov     $1, $0
.endmacro

.macro UPDATE_ALL_HASH
#if defined (__x86_64__)
    UPDATE_HASH     (HASH_PTR), A
    UPDATE_HASH     4(HASH_PTR), B
    UPDATE_HASH     8(HASH_PTR), C
    UPDATE_HASH     12(HASH_PTR), D
    UPDATE_HASH     16(HASH_PTR), E
#else
    mov             HASH_PTR, T1        // T1 = HASH_PTR (stack argument on i386)
    UPDATE_HASH     (T1), A
    UPDATE_HASH     4(T1), B
    UPDATE_HASH     8(T1), C
    UPDATE_HASH     12(T1), D
    UPDATE_HASH     16(T1), E
#endif
.endmacro
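
/*
	In C terms, UPDATE_ALL_HASH performs the per-block epilogue of the reference
	function at the top of this file, i.e. (a sketch)

	HASH[0] += A; HASH[1] += B; HASH[2] += C; HASH[3] += D; HASH[4] += E;

	while also keeping the updated values in the registers for the next block.
*/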
/* main sha1 code for systems without ssse3 support */
.macro SHA1_PIPELINED_MAIN_BODY_nossse3

    LOAD_HASH                           // load initial hashes into A,B,C,D,E (registers)

    INITIAL_W_PRECALC_nossse3           // big_endian_load(W) and W+K (i=0:15)

0:
    INTERNAL_nossse3                    // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined(__x86_64__)
    add     $$64, BUFFER_PTR            // BUFFER_PTR += 64;
    sub     $$1, cnt                    // pre-decrement cnt by 1
#else
    addl    $$64, BUFFER_PTR            // BUFFER_PTR += 64;
    subl    $$1, cnt                    // pre-decrement cnt by 1
#endif
    jbe     1f                          // if cnt <= 0, branch to finish off
    SOFTWARE_PIPELINING_nossse3         // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
    UPDATE_ALL_HASH                     // update output hashes
    jmp     0b                          // repeat for next block
1:
#endif
    ENDING                              // update ABCDE (i=64:79)
    UPDATE_ALL_HASH                     // update output hashes

.endmacro
/* main sha1 code for systems with ssse3 support */
.macro SHA1_PIPELINED_MAIN_BODY_ssse3

    LOAD_HASH                           // load initial hashes into A,B,C,D,E

    INITIAL_W_PRECALC_ssse3             // big_endian_load(W) and W+K (i=0:15)

0:
    INTERNAL_ssse3                      // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined(__x86_64__)
    add     $$64, BUFFER_PTR            // BUFFER_PTR += 64;
    sub     $$1, cnt                    // pre-decrement cnt by 1
#else
    addl    $$64, BUFFER_PTR            // BUFFER_PTR += 64;
    subl    $$1, cnt                    // pre-decrement cnt by 1
#endif
    jbe     1f                          // if cnt <= 0, branch to finish off
    SOFTWARE_PIPELINING_ssse3           // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
    UPDATE_ALL_HASH                     // update output hashes
    jmp     0b                          // repeat for next block
1:
#endif
    ENDING                              // update ABCDE (i=64:79)
    UPDATE_ALL_HASH                     // update output hashes

.endmacro
#if defined(KERNEL)
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif
.globl _SHA1Transform
//.private_extern _SHA1Transform
_SHA1Transform:
    // detect SSSE3 and dispatch appropriate code branch
#if defined __x86_64__
    movq    __cpu_capabilities@GOTPCREL(%rip), %rax     // %rax -> __cpu_capabilities
    mov     (%rax), %eax                                // %eax = __cpu_capabilities
#else       // i386
#if defined(KERNEL)
    leal    __cpu_capabilities, %eax                    // %eax -> __cpu_capabilities
    mov     (%eax), %eax                                // %eax = __cpu_capabilities
#else
    mov     _COMM_PAGE_CPU_CAPABILITIES, %eax           // %eax = __cpu_capabilities (from the comm page)
#endif
#endif
    test    $(kHasSupplementalSSE3), %eax
    je      _SHA1Transform_nossse3                      // branch to no-ssse3 code
    // start the sha1 code with ssse3 support

    // save callee-save registers
#if defined (__x86_64__)
    push    %rbx
    push    %rbp
#else
    push    %esi
    push    %edi
    push    %ebx
    push    %ebp
#endif

    sub     $stack_size, sp             // allocate stack memory for use
    // save used xmm registers if this is for kernel
#if defined(KERNEL)
    xmov    %xmm0, 4*16(sp)
    xmov    %xmm1, 5*16(sp)
    xmov    %xmm2, 6*16(sp)
    xmov    %xmm3, 7*16(sp)
    xmov    %xmm4, 8*16(sp)
    xmov    %xmm5, 9*16(sp)
    xmov    %xmm6, 10*16(sp)
    xmov    %xmm7, 11*16(sp)
#if defined (__x86_64__)
    xmov    %xmm8, 12*16(sp)
    xmov    %xmm9, 13*16(sp)
    xmov    %xmm10, 14*16(sp)
#endif
#endif      // KERNEL
#if defined (__x86_64__)

    // set up registers to free %edx/%edi/%esi for other use (ABCDE)
    mov     ctx, HASH_PTR               // HASH_PTR (%r9) = 1st argument (%rdi)
    mov     buf, BUFFER_PTR             // BUFFER_PTR (%r10) = 2nd argument (%rsi)
    mov     %rdx, cnt                   // cnt (%r11) = 3rd argument (%rdx)

    lea     K_XMM_AR(%rip), K_BASE
    xmov    0x40(K_BASE), XMM_SHUFB_BSWAP

#else       // defined (__i386__)
    // get the address of label 0 into %eax (PIC access to K_XMM_AR)
    call    0f                          // push program counter onto stack
0:  pop     %eax                        // get program counter
    lea     K_XMM_AR-0b(%eax), %eax     // %eax -> K_XMM_AR
    mov     %eax, K_BASE                // save K_BASE in its stack slot

    xmov    0x40(%eax), %xmm0
    xmov    %xmm0, XMM_SHUFB_BSWAP

#endif
    SHA1_PIPELINED_MAIN_BODY_ssse3
    // restore used xmm registers if this is for kernel
#if defined(KERNEL)
    xmov    4*16(sp), %xmm0
    xmov    5*16(sp), %xmm1
    xmov    6*16(sp), %xmm2
    xmov    7*16(sp), %xmm3
    xmov    8*16(sp), %xmm4
    xmov    9*16(sp), %xmm5
    xmov    10*16(sp), %xmm6
    xmov    11*16(sp), %xmm7
#if defined (__x86_64__)
    xmov    12*16(sp), %xmm8
    xmov    13*16(sp), %xmm9
    xmov    14*16(sp), %xmm10
#endif
#endif      // KERNEL
    add     $stack_size, sp             // deallocate stack memory

    // restore callee-save registers
#if defined (__x86_64__)
    pop     %rbp
    pop     %rbx
#else
    pop     %ebp
    pop     %ebx
    pop     %edi
    pop     %esi
#endif
    ret
// this is equivalent to the above function _SHA1Transform, but it does not use ssse3 instructions

.globl _SHA1Transform_nossse3
.private_extern _SHA1Transform_nossse3
_SHA1Transform_nossse3:
    // push callee-save registers
#if defined (__x86_64__)
    push    %rbx
    push    %rbp
#else
    push    %esi
    push    %edi
    push    %ebx
    push    %ebp
#endif

    sub     $stack_size, sp             // allocate stack memory for local use
    // save used xmm registers if this is for kernel
#if defined(KERNEL)
    xmov    %xmm0, 4*16(sp)
    xmov    %xmm1, 5*16(sp)
    xmov    %xmm2, 6*16(sp)
    xmov    %xmm3, 7*16(sp)
    xmov    %xmm4, 8*16(sp)
    xmov    %xmm5, 9*16(sp)
    xmov    %xmm6, 10*16(sp)
    xmov    %xmm7, 11*16(sp)
#if defined (__x86_64__)
    xmov    %xmm8, 12*16(sp)
    xmov    %xmm9, 13*16(sp)
#endif
#endif      // KERNEL
#if defined (__x86_64__)

    // set up registers to free %edx/%edi/%esi for other use (ABCDE)
    mov     ctx, HASH_PTR               // HASH_PTR (%r9) = 1st argument (%rdi)
    mov     buf, BUFFER_PTR             // BUFFER_PTR (%r10) = 2nd argument (%rsi)
    mov     %rdx, cnt                   // cnt (%r11) = 3rd argument (%rdx)

    lea     K_XMM_AR(%rip), K_BASE

#else       // defined (__i386__)
    // get the address of label 0 into %eax (PIC access to K_XMM_AR)
    call    0f                          // push program counter onto stack
0:  pop     %eax                        // get program counter
    lea     K_XMM_AR-0b(%eax), %eax     // %eax -> K_XMM_AR
    mov     %eax, K_BASE                // save K_BASE in its stack slot

#endif
    SHA1_PIPELINED_MAIN_BODY_nossse3
    // restore used xmm registers if this is for kernel
#if defined(KERNEL)
    xmov    4*16(sp), %xmm0
    xmov    5*16(sp), %xmm1
    xmov    6*16(sp), %xmm2
    xmov    7*16(sp), %xmm3
    xmov    8*16(sp), %xmm4
    xmov    9*16(sp), %xmm5
    xmov    10*16(sp), %xmm6
    xmov    11*16(sp), %xmm7
#if defined (__x86_64__)
    xmov    12*16(sp), %xmm8
    xmov    13*16(sp), %xmm9
#endif
#endif      // KERNEL
    add     $stack_size, sp             // deallocate stack memory

    // restore callee-save registers
#if defined (__x86_64__)
    pop     %rbp
    pop     %rbx
#else
    pop     %ebp
    pop     %ebx
    pop     %edi
    pop     %esi
#endif
    ret
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

    .const
    .align  4

K_XMM_AR:
    .long   K1, K1, K1, K1
    .long   K2, K2, K2, K2
    .long   K3, K3, K3, K3
    .long   K4, K4, K4, K4
// bswap_shufb_ctl: invoked through 0x40(K_XMM_AR)
bswap_shufb_ctl:
    .long   0x00010203
    .long   0x04050607
    .long   0x08090a0b
    .long   0x0c0d0e0f
#endif // architecture x86_64 or i386