1 /* sha1edp.s : this file provides optimized x86_64 and i386 implementation of the sha1 function
2 CoreOS - vector and numerics group
3 cclee 6-21-10
4
5 The implementation is based on the principle described in an Intel online article
6 "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
7 http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
8
9
10	Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function
11
12 void SHA1( int HASH[], int MESSAGE[] )
13 {
14 int A[81], B[81], C[81], D[81], E[81];
15 int W[80];
16
17 int i, FN;
18
19 A[0] = HASH[0];
20 B[0] = HASH[1];
21 C[0] = HASH[2];
22 D[0] = HASH[3];
23 E[0] = HASH[4];
24
25 for ( i=0; i<80; ++i )
26 {
27 if ( i < 16 )
28 W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
29 else
30 W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
31
32 FN = F( i, B[i], C[i], D[i] );
33
34 A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
35 B[i+1] = A[i];
36 C[i+1] = ROTATE_LEFT( B[i], 30 );
37 D[i+1] = C[i];
38 E[i+1] = D[i];
39 }
40
41 HASH[0] += A[80];
42 HASH[1] += B[80];
43 HASH[2] += C[80];
44 HASH[3] += D[80];
45 HASH[4] += E[80];
46 }
47
48 For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
49
50 The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
51
52	1. done on 4 consecutive W[i] values in a single XMM register
53 W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
54 W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
55 W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
56 W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
57
58	2. W[i+3] above still depends on W[i], so this approach unfortunately requires an additional correction step
59 W[i+3] ^= W[i] rol 1
60
61 3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
62 W[i:i+3] += {K,K,K,K}
63
64 Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
65 The Dean Gaudet approach can be expressed as
66
67 1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
68 2. W[i+3] ^= W[i] rol 1
69 3. W0 += {K,K,K,K}
70
71 For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
72
73 1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
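
	For reference, the whole message schedule can also be written in scalar C (a minimal sketch for
	illustration only; the names ROL32, sha1_schedule and M[] below are not part of this file, and the
	message words M[] are assumed to be already loaded big-endian). The i>=32 loop is the element-wise
	form of equation 1. above.

		#include <stdint.h>
		#define ROL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

		static void sha1_schedule(const uint32_t M[16], uint32_t W[80])
		{
			int i;
			for (i = 0;  i < 16; ++i) W[i] = M[i];                                           // i=0:15  : message words
			for (i = 16; i < 32; ++i) W[i] = ROL32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);  // i=16:31 : standard recurrence
			for (i = 32; i < 80; ++i) W[i] = ROL32(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2); // i>=32   : rol-2 form used here
		}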
74
75 Note:
76	1. In total, we need 8 16-byte registers or memory slots for W0,W4,...,W28. W0 and W32 can be the same register or memory.
77	2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 holding the most recent 16 bytes)
78 i=0, W28,W24,...,W0
79 i=4, W24,W20,...,W28
80 i=8, W20,W16,...,W24
81 .
82 .
83 and so forth.
84 3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
85 a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
86 b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
87	4. we probe __cpu_capabilities to detect ssse3 support and dispatch the ssse3-optimized code when available.
88	If ssse3 is not supported, a suboptimal code path (with workarounds for pshufb and palignr) is dispatched.
89
90 */
91
92	/* the code can be compiled into single-block (64 bytes) per-call mode by setting Multiple_Blocks to 0 */
93 #define Multiple_Blocks 1
94
95 #if defined (__x86_64__) || defined(__i386__) // x86_64 or i386 architectures
96
97 #if defined(__x86_64__)
98
99 // set up for x86_64
100	#define	stack_size	(8+16*11+16*4)	// 8 (alignment) + xmm0-xmm10 + 4 128-bit slots for intermediate WK(t) storage
101 #define sp %rsp // unifying architectural stack pointer representation
102 #define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
103 #define buf %rsi // 2nd input argument, will move to BUFFER_PTR (%r10)
104	#define	cnt		%r11		// copied from the 3rd input argument (%rdx)
105	#define	K_BASE		%r8		// an aligned pointer to the table of K values (followed by the pshufb byte-swap constant)
106 #define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
107 #define BUFFER_PTR %r10 // pointer to input blocks
108
109 #else // !__x86_64__
110
111 // set up for i386
112 #define stack_size (12+16*2+16*11+16*4) // 12-bytes (alignment) + extra 2 + 3 (W24/W28/XMM_SHUFB_BSWAP) + 8 (xmm0-xmm7) + 4 (WK(t))
113 #define sp %esp // unifying architectural stack pointer representation
114 #define HASH_PTR stack_size+16+4(sp) // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp)
115 #define BUFFER_PTR stack_size+16+8(sp) // use 2nd input argument from caller function
116 #define cnt stack_size+16+12(sp) // use 3rd input argument from caller function
117 #define K_BASE stack_size-4(sp) // use for K_BASE
118
119 #endif // __x86_64__
120
121 // symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support
122
123 #define W_TMP %xmm0
124 #define W_TMP2 %xmm1
125 #define W0 %xmm2
126 #define W4 %xmm3
127 #define W8 %xmm4
128 #define W12 %xmm5
129 #define W16 %xmm6
130 #define W20 %xmm7
131 #if defined(__x86_64__)
132 #define W24 %xmm8
133 #define W28 %xmm9
134 #define XMM_SHUFB_BSWAP %xmm10 // used only when ssse3 is supported
135 #else // defined (__i386__)
136 #define W24 12*16(sp)
137 #define W28 13*16(sp)
138 #define XMM_SHUFB_BSWAP 14*16(sp) // used only when ssse3 is supported
139 #endif
140
141 #define xmov movaps // aligned 16-byte move
142 #define xmovu movups // unaligned 16-byte move
143
144 // intermediate hash variables
145 #define A %ecx
146 #define B %esi
147 #define C %edi
148 #define D %ebp
149 #define E %edx
150
151 // temp variables
152 #define T1 %eax
153 #define T2 %ebx
154
155 #define WK(t) (t&15)*4(sp)
156
157	// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }
158 // result in T1
159 .macro F1
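	// $0, $1, $2 correspond to B, C, D in the formula above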
160 mov $1, T1
161 xor $2, T1
162 and $0, T1
163 xor $2, T1
164 .endm
165
166 // int F2(int B, int C, int D) { return (D ^ B ^ C); }
167 // result in T1
168 .macro F2
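	// $0, $1, $2 correspond to B, C, D in the formula above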
169 mov $2, T1
170 xor $1, T1
171 xor $0, T1
172 .endm
173
174 // int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
175 // result in T1
176 .macro F3
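	// $0, $1, $2 correspond to B, C, D above; computed as (B & C) | (D & (B | C)), an equivalent form of the majority function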
177 mov $1, T1
178 mov $0, T2
179 or $0, T1
180 and $1, T2
181 and $2, T1
182 or T2, T1
183 .endm
184
185 // for i=60:79, F4 is identical to F2
186 #define F4 F2
187
188
189 /*
190 i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
191
192	with ssse3 support, this is achieved via
193 for (i=0;i<16;i+=4) {
194 1. W_TMP = new 16 bytes from MESSAGE[]
195 2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
196	3. W_TMP += {K,K,K,K};
197 4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
198 }
199
200 each step is represented in one of the following 4 macro definitions
201
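	A rough C-intrinsics restatement of these 4 steps (a sketch for illustration only; the function and
	variable names are not from this file):

		#include <tmmintrin.h>                               // SSSE3 intrinsics
		static inline void w_precalc_00_15(const __m128i *msg, __m128i bswap_mask,
		                                   __m128i k, __m128i *w, __m128i *wk)
		{
			__m128i t = _mm_loadu_si128(msg);            // 1. 16 new message bytes (possibly unaligned)
			t   = _mm_shuffle_epi8(t, bswap_mask);       // 2. pshufb: byte-swap each 32-bit word
			*w  = t;                                     //    save W in the circular buffer
			t   = _mm_add_epi32(t, k);                   // 3. W_TMP += {K,K,K,K}
			*wk = t;                                     // 4. store W+K for the round updates
		}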
202 */
203
204 .macro W_PRECALC_00_15_0_ssse3 // input argument $0 : 0/4/8/12
205 #if defined (__x86_64__) // BUFFER_PTR is already an address register in x86_64
206 xmovu $0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
207 #else // BUFFER_PTR is from the argument set up in the caller
208 mov BUFFER_PTR, T1 // T1 = BUFFER_PTR
209 xmovu $0*4(T1), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
210 #endif
211 .endm
212
213 .macro W_PRECALC_00_15_1_ssse3 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
214 pshufb XMM_SHUFB_BSWAP, W_TMP // convert W_TMP from little-endian into big-endian
215 xmov W_TMP, $0 // save W_TMP in the circular buffer
216 .endm
217
218 .macro W_PRECALC_00_15_2 // K_BASE points to the current K quadruple.
219 #if defined (__x86_64__) // K_BASE is already an address register in x86_64
220 paddd (K_BASE), W_TMP // W_TMP += {K,K,K,K};
221 #else // K_BASE is previously set up in the stack memory
222 mov K_BASE, T1 // T1 = K_BASE
223 paddd (T1), W_TMP // W_TMP += {K,K,K,K};
224 #endif
225 .endm
226
227 .macro W_PRECALC_00_15_3
228 xmov W_TMP, WK($0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
229 .endm
230
231 /*
232 without ssse3 support, steps 1 and 2 need to be modified
233	1. sequentially load the 4 words into T1, bswap T1, and save each to a 4-byte slot in the stack space
234 2. load the 16-bytes from the aligned stack memory into W_TMP
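
	in rough C terms (a sketch for illustration only; names are not from this file, and __builtin_bswap32
	is the GCC/clang builtin):

		#include <emmintrin.h>                               // SSE2 intrinsics
		static inline __m128i bswap_load_16bytes(const uint32_t *msg32, uint32_t scratch[4])
		{
			for (int j = 0; j < 4; ++j)
				scratch[j] = __builtin_bswap32(msg32[j]);    // 1. bswap each word through an integer register
			return _mm_load_si128((const __m128i *)scratch); // 2. one aligned 16-byte load (scratch must be 16-byte aligned)
		}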
235 */
236
237 .macro W_PRECALC_00_15_0_nossse3 // input argument $0 : 0/4/8/12
238
239 #if defined (__x86_64__)
240 #define BUFFERP BUFFER_PTR
241 #else
242 mov BUFFER_PTR, T2 // copy BUFFER_PTR (from caller 2nd argument) to T2
243 #define BUFFERP T2
244 #endif
245
246 // load 1st word, bswap it, save it to stack
247 mov $0*4(BUFFERP), T1
248 bswap T1
249 mov T1, 14*16(sp)
250
251 // load 2nd word, bswap it, save it to stack
252 mov 4+$0*4(BUFFERP), T1
253 bswap T1
254 mov T1, 4+14*16(sp)
255
256 // load 3rd word, bswap it, save it to stack
257 mov 8+$0*4(BUFFERP), T1
258 bswap T1
259 mov T1, 8+14*16(sp)
260
261 // load 4th word, bswap it, save it to stack
262 mov 12+$0*4(BUFFERP), T1
263 bswap T1
264 mov T1, 12+14*16(sp)
265 .endm
266
267 .macro W_PRECALC_00_15_1_nossse3 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
268 xmov 14*16(sp), W_TMP // load the bswapped 16-bytes from the aligned stack memory
269 xmov W_TMP, $0 // save W = W_TMP in the circular buffer
270 .endm
271
272 // rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
273 /*
274 W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
275 W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
276 W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
277 W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
278
279	W[i+3] ^= W[i] rol 1;	// this W[i] has already been rotated left by 1; if we take it from the initial W before the rol 1, we should rotate it by 2
280
281	The operation (updating W and W+K) is scheduled and divided into 4 steps
282
283 0. W_tmp = W3; W = W14 ^ W8
284 1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
285 2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
286	3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W_TMP + K;
287
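	A rough C-intrinsics restatement of these 4 steps (a sketch for illustration only; function and
	variable names are not from this file):

		#include <tmmintrin.h>                                   // SSSE3 intrinsics
		static inline void w_precalc_16_31(__m128i w16, __m128i w12, __m128i w8, __m128i w4,
		                                   __m128i k, __m128i *w, __m128i *wk)
		{
			__m128i w14  = _mm_alignr_epi8(w12, w16, 8);         // {W[i-14] W[i-13] W[i-12] W[i-11]} = W14
			__m128i w3   = _mm_srli_si128(w4, 4);                // {W[i-3]  W[i-2]  W[i-1]  0}       = W3
			__m128i x    = _mm_xor_si128(_mm_xor_si128(w3, w8),
			                             _mm_xor_si128(w14, w16)); // W3 ^ W8 ^ W14 ^ W16
			__m128i r1   = _mm_or_si128(_mm_slli_epi32(x, 1),
			                            _mm_srli_epi32(x, 31));  // (W3 ^ W8 ^ W14 ^ W16) rol 1
			__m128i head = _mm_slli_si128(x, 12);                // (W[i] 0 0 0), taken before the rol 1
			__m128i r2   = _mm_or_si128(_mm_slli_epi32(head, 2),
			                            _mm_srli_epi32(head, 30)); // (W[i] 0 0 0) rol 2
			*w  = _mm_xor_si128(r1, r2);                         // fix up the W[i+3] lane
			*wk = _mm_add_epi32(*w, k);                          // WK = W + K
		}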
288 */
289
290 .macro W_PRECALC_16_31_0_ssse3 // input arguments : W16,W12,W8,W4,W
291 xmov $1, $4 // W = W12
292 palignr $$8, $0, $4 // W = W14
293 xmov $3, W_TMP // W_TMP = W4
294 psrldq $$4, W_TMP // W_TMP = W3
295 pxor $2, $4 // W = W8 ^ W14
296 .endm
297
298 .macro W_PRECALC_16_31_1 // input arguments : W16,W
299 pxor $0, W_TMP // W_TMP = W3 ^ W16
300 pxor W_TMP, $1 // W = W3 ^ W16 ^ W8 ^ W14
301 xmov $1, W_TMP2 // W_TMP2 = W3 ^ W16 ^ W8 ^ W14
302 xmov $1, W_TMP // W_TMP = W3 ^ W16 ^ W8 ^ W14
303 pslldq $$12, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
304 .endm
305
306 .macro W_PRECALC_16_31_2 // input argument : W
307 psrld $$31, $0 // (W3 ^ W16 ^ W8 ^ W14)>>31
308 pslld $$1, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
309 por $0, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
310 xmov W_TMP2, $0 // copy W[i] at location of W[i+3]
311 psrld $$30, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
312 pslld $$2, $0 // W = W[i] higher 30 bits after rol 2
313 .endm
314
315 .macro W_PRECALC_16_31_3 // input arguments: W, i, K_XMM
316 #if defined (__i386__)
317	mov	K_BASE, T1			// K_BASE is stored in the stack memory for i386
318 #endif
319 pxor $0, W_TMP
320 pxor W_TMP2, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
321 xmov W_TMP, $0 // save W = W_TMP in the W circular buffer
322 #if defined (__x86_64__)
323 paddd $2(K_BASE), W_TMP // W+K
324 #else
325 paddd $2(T1), W_TMP // W+K
326 #endif
327 xmov W_TMP, WK($1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E
328 .endm
329
330	// the following is a variant of W_PRECALC_16_31_0_ssse3 to be used for systems without ssse3; palignr is replaced with 4 instructions
331
332 .macro W_PRECALC_16_31_0_nossse3 // input arguments : W16,W12,W8,W4,W
333 xmov $1, $4 // W = W12 = (w9 w10 w11 w12)
334
335	// the following is a workaround for palignr
336 xmov $0, W_TMP // W16 = (w13 w14 w15 w16)
337 pslldq $$8, $4 // shift left to make (w11 w12 0 0)
338 psrldq $$8, W_TMP // shift right to make (0 0 w13 w14)
339 por W_TMP, $4 // W = W14 = (w11 w12 w13 w14)
340
341 xmov $3, W_TMP // W_TMP = W4 = (w1 w2 w3 w4)
342 psrldq $$4, W_TMP // W_TMP = W3 = (0 w1 w2 w3)
343 pxor $2, $4 // W = W8 ^ W14
344 .endm
345
346	/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
347
348 W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
349
350 where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
351
352
353 0. W_tmp = W6; W = W28 ^ W32;
354 1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
355 2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
356 3. W = W_Tmp; WK = W_tmp + K;
357
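	A rough C-intrinsics restatement of these 4 steps (a sketch for illustration only; function and
	variable names are not from this file; as noted above, W32 and the output W occupy the same
	register in the assembly code):

		#include <tmmintrin.h>                               // SSSE3 intrinsics
		static inline void w_precalc_32_79(__m128i w32, __m128i w28, __m128i w16,
		                                   __m128i w8, __m128i w4,
		                                   __m128i k, __m128i *w, __m128i *wk)
		{
			__m128i w6 = _mm_alignr_epi8(w4, w8, 8);         // {W[i-6] W[i-5] W[i-4] W[i-3]} = W6
			__m128i x  = _mm_xor_si128(_mm_xor_si128(w6, w16),
			                           _mm_xor_si128(w28, w32)); // W6 ^ W16 ^ W28 ^ W32
			*w  = _mm_or_si128(_mm_slli_epi32(x, 2),
			                   _mm_srli_epi32(x, 30));       // element-wise rol 2 -> new W
			*wk = _mm_add_epi32(*w, k);                      // WK = W + K
		}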
358 */
359
360
361	.macro	W_PRECALC_32_79_0_ssse3		// input arguments : W28,W8,W4,W
362 xmov $2, W_TMP // (w1 w2 w3 w4)
363 pxor $0, $3 // W = W28 ^ W32;
364 palignr $$8, $1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
365 .endm
366
367	// the following is a variant and will be used for systems without ssse3 support
368 .macro W_PRECALC_32_79_0_nossse3 // input arguments : W28,W8,W4,W
369 xmov $2, W_TMP // (w1 w2 w3 w4)
370 xmov $1, W_TMP2 // (w5 w6 w7 w8)
371 pxor $0, $3 // W = W28 ^ W32
372 pslldq $$8, W_TMP // (w3 w4 0 0)
373 psrldq $$8, W_TMP2 // (0 0 w5 w6)
374 por W_TMP2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6
375 .endm
376
377 // this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers)
378 .macro W_PRECALC_32_79_0_i386_ssse3 // input arguments : W28,W8,W4,W
379 xmov $3, W_TMP // W32
380 pxor $0, W_TMP // W28 ^ W32
381 xmov W_TMP, $3 // W = W28 ^ W32;
382 xmov $2, W_TMP // W4
383 palignr $$8, $1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
384 .endm
385
386 // this is a variant of W_PRECALC_32_79_0_nossse3 for i386 (as W24/W28 are stored in memory, not in registers)
387 .macro W_PRECALC_32_79_0_i386_nossse3 // input arguments : W28,W8,W4,W
388 xmov $3, W_TMP // W32
389 pxor $0, W_TMP // W28 ^ W32
390 xmov W_TMP, $3 // W = W28 ^ W32
391 xmov $2, W_TMP // W4 = (w1 w2 w3 w4)
392 xmov $1, W_TMP2 // W8 = (w5 w6 w7 w8)
393 pslldq $$8, W_TMP // (w3 w4 0 0)
394 psrldq $$8, W_TMP2 // (0 0 w5 w6)
395 por W_TMP2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6
396 .endm
397
398 .macro W_PRECALC_32_79_1 // input arguments : W16,W
399 pxor $0, W_TMP // W_tmp = W6 ^ W16
400 pxor $1, W_TMP // W_tmp = W6 ^ W16 ^ W28 ^ W32
401 xmov W_TMP, $1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
402 .endm
403
404 .macro W_PRECALC_32_79_2 // input argument : W
405 psrld $$30, $0 // W >> 30
406 pslld $$2, W_TMP // W << 2
407 por $0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
408 .endm
409
410 // this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers)
411 // this should be used when the input is either W24 or W28 on i386 architecture
412 .macro W_PRECALC_32_79_2_i386 // input argument : W
413 xmov $0, W_TMP2 // W
414 psrld $$30, W_TMP2 // W >> 30
415 xmov W_TMP2, $0 // save (W >> 30) at W
416 pslld $$2, W_TMP // W_tmp << 2
417 por $0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
418 .endm
419
420 .macro W_PRECALC_32_79_3 // input argument W, i, K_XMM
421 #if defined (__x86_64__)
422 xmov W_TMP, $0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
423 paddd $2(K_BASE), W_TMP // W + K
424 xmov W_TMP, WK($1&~3) // write W+K
425 #else
426	mov	K_BASE, T1		// T1 = K_BASE (which is stored in the stack memory for i386)
427 xmov W_TMP, $0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
428 paddd $2(T1), W_TMP // W_tmp = W + K
429 xmov W_TMP, WK($1&~3) // write WK
430 #endif
431 .endm
432
433
434 /* The hash update operation is completed by the following statements.
435
436 A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
437 B[i+1] = A[i];
438 C[i+1] = ROTATE_LEFT( B[i], 30 );
439 D[i+1] = C[i];
440 E[i+1] = D[i];
441
442 Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
443
444 A1 = FN + E0 + rol(A0,5) + WK;
445 B1 = A0;
446 C1 = rol(B0, 30);
447 D1 = C0;
448 E1 = D0;
449
450	to avoid excessive data movement between registers,
451 1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
452 2. C1 = rol(B0,30) can be temporarily saved in B0.
453
454 Therefore, ignoring the time index, the update operation is equivalent to
455 1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
456 2. B = rol(B,30)
457 3. the hashes are now stored in the order of E,A,B,C,D
458
459
460 To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
461 1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
462 2. B = rol(B,30)
463 // now the hashes are in the order of E,A,B,C,D
464 3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
465 4. A = rol(A,30)
466 // now the hashes are in the order of D,E,A,B,C
467
468 These operations are distributed into the following 2 macro definitions RR0 and RR1.
469
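	In rough scalar C (a sketch for illustration only; F stands for the round function F1/F2/F3/F4
	and wk[] for the precomputed W+K values), one RR0/RR1 pair is

		#define ROL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
		e += ROL32(a, 5) + F(b, c, d) + wk[i];    // round i   : result kept in e
		b  = ROL32(b, 30);
		d += ROL32(e, 5) + F(a, b, c) + wk[i+1];  // round i+1 : result kept in d (b already rotated)
		a  = ROL32(a, 30);
		// after the pair, the digest values are read in the order d,e,a,b,c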
470 */
471
472 .macro RR0 // input arguments : FN, A, B, C, D, E, i
473 $0 $2, $3, $4 // T1 = FN(B,C,D)
474 add WK($6), $5 // E + WK(i)
475 rol $$30, $2 // B = rol(B,30)
476 mov $1, T2 // T2 = A
477 add WK($6+1), $4 // D + WK(i+1)
478 rol $$5, T2 // rol(A,5)
479 add T1, $5 // E = FN(B,C,D) + E + WK(i)
480 .endm
481
482 .macro RR1
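	// input arguments : FN, A, B, C, D, E, i (same layout as RR0)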
483 add $5, T2 // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
484 mov T2, $5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
485 rol $$5, T2 // rol(E,5)
486 add T2, $4 // D + WK(i+1) + rol(E,5)
487 $0 $1, $2, $3 // FN(A,B,C)
488 add T1, $4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
489 rol $$30, $1 // A = rol(A,30)
490 .endm
491
492
493
494 /*
495
496 The following macro definitions are used to expand code for the per-block sha1 operation.
497
498	INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64-byte block) into W (i=0:15) and store W+K into the stack memory
499	INTERNAL_ssse3 : update W (i=16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
500	ENDING : finish updating the digests A/B/C/D/E (i=64:79)
501
502	For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined
503	into 1 macro definition for software pipelining.
504	
505	SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64-byte block) into W (i=0:15), store W+K into the stack, and finish updating the digests A/B/C/D/E (i=64:79)
506
507	assuming cnt (the number of blocks) >= 1, the main code body looks like
508
509 INITIAL_W_PRECALC_ssse3 // W = big_endian_load and pre-compute W+K (i=0:15)
510 do {
511 INTERNAL_ssse3 // update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63)
512 cnt--;
513 if (cnt==0) break;
514 BUFFER_PTR += 64;
515 SOFTWARE_PIPELINING_ssse3; // update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15)
516	} while (1);
517 ENDING // update hash digests A/B/C/D/E (i=64:79)
518
519 */
520
521 #define W_PRECALC_00_15_0 W_PRECALC_00_15_0_ssse3
522 #define W_PRECALC_00_15_1 W_PRECALC_00_15_1_ssse3
523 #define W_PRECALC_16_31_0 W_PRECALC_16_31_0_ssse3
524 #define W_PRECALC_32_79_0 W_PRECALC_32_79_0_ssse3
525 #define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_ssse3
526
527
528 .macro INITIAL_W_PRECALC_ssse3 // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
529
530 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
531 W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
532 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
533 W_PRECALC_00_15_2 // W_TMP = W0 + K
534 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
535
536 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
537 W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
538 W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
539 W_PRECALC_00_15_2 // W_TMP = W28 + K
540 W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
541
542 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
543 W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
544 W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
545 W_PRECALC_00_15_2 // W_TMP = W24 + K
546 W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
547
548 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
549 W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
550 W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
551 W_PRECALC_00_15_2 // W_TMP = W20 + K
552 W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
553
554 .endm
555
556
557 .macro INTERNAL_ssse3 // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
558
559 // i=16 : W12,W8,W4,W0,W28,W24,W20,W16
560 W_PRECALC_16_31_0 W0,W28,W24,W20,W16
561 RR0 F1,A,B,C,D,E,0
562 W_PRECALC_16_31_1 W0,W16
563 RR1 F1,A,B,C,D,E,0
564 W_PRECALC_16_31_2 W16
565 RR0 F1,D,E,A,B,C,2
566 W_PRECALC_16_31_3 W16, 2, 0
567 RR1 F1,D,E,A,B,C,2
568
569 // i=20 : W8,W4,W0,W28,W24,W20,W16,W12
570 W_PRECALC_16_31_0 W28,W24,W20,W16,W12
571 RR0 F1,B,C,D,E,A,4
572 W_PRECALC_16_31_1 W28,W12
573 RR1 F1,B,C,D,E,A,4
574 W_PRECALC_16_31_2 W12
575 RR0 F1,E,A,B,C,D,6
576 W_PRECALC_16_31_3 W12, 6, 16
577 RR1 F1,E,A,B,C,D,6
578
579 // i=24 : W4,W0,W28,W24,W20,W16,W12,W8
580 W_PRECALC_16_31_0 W24,W20,W16,W12,W8
581 RR0 F1,C,D,E,A,B,8
582 W_PRECALC_16_31_1 W24,W8
583 RR1 F1,C,D,E,A,B,8
584 W_PRECALC_16_31_2 W8
585 RR0 F1,A,B,C,D,E,10
586 W_PRECALC_16_31_3 W8,10,16
587 RR1 F1,A,B,C,D,E,10
588
589 // i=28 : W0,W28,W24,W20,W16,W12,W8,W4
590 W_PRECALC_16_31_0 W20,W16,W12,W8,W4
591 RR0 F1,D,E,A,B,C,12
592 W_PRECALC_16_31_1 W20,W4
593 RR1 F1,D,E,A,B,C,12
594 W_PRECALC_16_31_2 W4
595 RR0 F1,B,C,D,E,A,14
596 W_PRECALC_16_31_3 W4,14,16
597 RR1 F1,B,C,D,E,A,14
598
599 // i=32 : W28,W24,W20,W16,W12,W8,W4,W0
600 W_PRECALC_32_79_0 W28,W8,W4,W0
601 RR0 F1,E,A,B,C,D,16
602 W_PRECALC_32_79_1 W16,W0
603 RR1 F1,E,A,B,C,D,16
604 W_PRECALC_32_79_2 W0
605 RR0 F1,C,D,E,A,B,18
606 W_PRECALC_32_79_3 W0,18,16
607 RR1 F1,C,D,E,A,B,18
608
609	// start using F2
610
611 // i=36 : W24,W20,W16,W12,W8,W4,W0,W28
612 #if defined (__x86_64__)
613 W_PRECALC_32_79_0 W24,W4,W0,W28
614 #else
615 W_PRECALC_32_79_0_i386 W24,W4,W0,W28
616 #endif
617 RR0 F2,A,B,C,D,E,20
618 W_PRECALC_32_79_1 W12,W28
619 RR1 F2,A,B,C,D,E,20
620 #if defined (__x86_64__)
621 W_PRECALC_32_79_2 W28
622 #else
623 W_PRECALC_32_79_2_i386 W28
624 #endif
625 RR0 F2,D,E,A,B,C,22
626 W_PRECALC_32_79_3 W28,22,16
627 RR1 F2,D,E,A,B,C,22
628
629 // i=40 : W20,W16,W12,W8,W4,W0,W28,W24
630 #undef K_XMM
631 #define K_XMM 32
632 #if defined (__x86_64__)
633 W_PRECALC_32_79_0 W20,W0,W28,W24
634 #else
635 W_PRECALC_32_79_0_i386 W20,W0,W28,W24
636 #endif
637 RR0 F2,B,C,D,E,A,24
638 W_PRECALC_32_79_1 W8,W24
639 RR1 F2,B,C,D,E,A,24
640 #if defined (__x86_64__)
641 W_PRECALC_32_79_2 W24
642 #else
643 W_PRECALC_32_79_2_i386 W24
644 #endif
645 RR0 F2,E,A,B,C,D,26
646 W_PRECALC_32_79_3 W24,26,K_XMM
647 RR1 F2,E,A,B,C,D,26
648
649 // i=44 : W16,W12,W8,W4,W0,W28,W24,W20
650 W_PRECALC_32_79_0 W16,W28,W24,W20
651 RR0 F2,C,D,E,A,B,28
652 W_PRECALC_32_79_1 W4,W20
653 RR1 F2,C,D,E,A,B,28
654 W_PRECALC_32_79_2 W20
655 RR0 F2,A,B,C,D,E,30
656 W_PRECALC_32_79_3 W20,30,K_XMM
657 RR1 F2,A,B,C,D,E,30
658
659 // i=48 : W12,W8,W4,W0,W28,W24,W20,W16
660 W_PRECALC_32_79_0 W12,W24,W20,W16
661 RR0 F2,D,E,A,B,C,32
662 W_PRECALC_32_79_1 W0,W16
663 RR1 F2,D,E,A,B,C,32
664 W_PRECALC_32_79_2 W16
665 RR0 F2,B,C,D,E,A,34
666 W_PRECALC_32_79_3 W16,34,K_XMM
667 RR1 F2,B,C,D,E,A,34
668
669 // i=52 : W8,W4,W0,W28,W24,W20,W16,W12
670 W_PRECALC_32_79_0 W8,W20,W16,W12
671 RR0 F2,E,A,B,C,D,36
672 W_PRECALC_32_79_1 W28,W12
673 RR1 F2,E,A,B,C,D,36
674 W_PRECALC_32_79_2 W12
675 RR0 F2,C,D,E,A,B,38
676 W_PRECALC_32_79_3 W12,38,K_XMM
677 RR1 F2,C,D,E,A,B,38
678
679	// start using F3
680
681 // i=56 : W4,W0,W28,W24,W20,W16,W12,W8
682 W_PRECALC_32_79_0 W4,W16,W12,W8
683 RR0 F3,A,B,C,D,E,40
684 W_PRECALC_32_79_1 W24,W8
685 RR1 F3,A,B,C,D,E,40
686 W_PRECALC_32_79_2 W8
687 RR0 F3,D,E,A,B,C,42
688 W_PRECALC_32_79_3 W8,42,K_XMM
689 RR1 F3,D,E,A,B,C,42
690
691 // i=60 : W0,W28,W24,W20,W16,W12,W8,W4
692 #undef K_XMM
693 #define K_XMM 48
694 W_PRECALC_32_79_0 W0,W12,W8,W4
695 RR0 F3,B,C,D,E,A,44
696 W_PRECALC_32_79_1 W20,W4
697 RR1 F3,B,C,D,E,A,44
698 W_PRECALC_32_79_2 W4
699 RR0 F3,E,A,B,C,D,46
700 W_PRECALC_32_79_3 W4,46,K_XMM
701 RR1 F3,E,A,B,C,D,46
702
703 // i=64 : W28,W24,W20,W16,W12,W8,W4,W0
704 W_PRECALC_32_79_0 W28,W8,W4,W0
705 RR0 F3,C,D,E,A,B,48
706 W_PRECALC_32_79_1 W16,W0
707 RR1 F3,C,D,E,A,B,48
708 W_PRECALC_32_79_2 W0
709 RR0 F3,A,B,C,D,E,50
710 W_PRECALC_32_79_3 W0,50,K_XMM
711 RR1 F3,A,B,C,D,E,50
712
713 // i=68 : W24,W20,W16,W12,W8,W4,W0,W28
714 #if defined (__x86_64__)
715 W_PRECALC_32_79_0 W24,W4,W0,W28
716 #else
717 W_PRECALC_32_79_0_i386 W24,W4,W0,W28
718 #endif
719 RR0 F3,D,E,A,B,C,52
720 W_PRECALC_32_79_1 W12,W28
721 RR1 F3,D,E,A,B,C,52
722 #if defined (__x86_64__)
723 W_PRECALC_32_79_2 W28
724 #else
725 W_PRECALC_32_79_2_i386 W28
726 #endif
727 RR0 F3,B,C,D,E,A,54
728 W_PRECALC_32_79_3 W28,54,K_XMM
729 RR1 F3,B,C,D,E,A,54
730
731 // i=72 : W20,W16,W12,W8,W4,W0,W28,W24
732 #if defined (__x86_64__)
733 W_PRECALC_32_79_0 W20,W0,W28,W24
734 #else
735 W_PRECALC_32_79_0_i386 W20,W0,W28,W24
736 #endif
737 RR0 F3,E,A,B,C,D,56
738 W_PRECALC_32_79_1 W8,W24
739 RR1 F3,E,A,B,C,D,56
740 #if defined (__x86_64__)
741 W_PRECALC_32_79_2 W24
742 #else
743 W_PRECALC_32_79_2_i386 W24
744 #endif
745 RR0 F3,C,D,E,A,B,58
746 W_PRECALC_32_79_3 W24,58,K_XMM
747 RR1 F3,C,D,E,A,B,58
748
749	// start using F4
750
751 // i=76 : W16,W12,W8,W4,W0,W28,W24,W20
752 W_PRECALC_32_79_0 W16,W28,W24,W20
753 RR0 F4,A,B,C,D,E,60
754 W_PRECALC_32_79_1 W4,W20
755 RR1 F4,A,B,C,D,E,60
756 W_PRECALC_32_79_2 W20
757 RR0 F4,D,E,A,B,C,62
758 W_PRECALC_32_79_3 W20,62,K_XMM
759 RR1 F4,D,E,A,B,C,62
760
761 .endm
762
763 .macro SOFTWARE_PIPELINING_ssse3
764 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
765 W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
766 RR0 F4,B,C,D,E,A,64
767 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
768 RR1 F4,B,C,D,E,A,64
769 W_PRECALC_00_15_2 // W_TMP = W0 + K
770 RR0 F4,E,A,B,C,D,66
771 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
772 RR1 F4,E,A,B,C,D,66
773
774 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
775 W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
776 RR0 F4,C,D,E,A,B,68
777 W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
778 RR1 F4,C,D,E,A,B,68
779 W_PRECALC_00_15_2 // W_TMP = W28 + K
780 RR0 F4,A,B,C,D,E,70
781	W_PRECALC_00_15_3	7		// 16(sp) = W_TMP = W28 + K
782 RR1 F4,A,B,C,D,E,70
783
784 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
785 W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
786 RR0 F4,D,E,A,B,C,72
787 W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
788 RR1 F4,D,E,A,B,C,72
789 W_PRECALC_00_15_2 // W_TMP = W24 + K
790 RR0 F4,B,C,D,E,A,74
791 W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
792 RR1 F4,B,C,D,E,A,74
793
794 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
795 W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
796 RR0 F4,E,A,B,C,D,76
797 W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
798 RR1 F4,E,A,B,C,D,76
799 W_PRECALC_00_15_2 // W_TMP = W20 + K
800 RR0 F4,C,D,E,A,B,78
801 W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
802 RR1 F4,C,D,E,A,B,78
803 .endm
804
805
806 #undef W_PRECALC_00_15_0
807 #undef W_PRECALC_00_15_1
808 #undef W_PRECALC_16_31_0
809 #undef W_PRECALC_32_79_0
810 #undef W_PRECALC_32_79_0_i386
811
812
813
814 /*
815
816 The following are 3 macro definitions that are no-ssse3 variants of the previous 3 macro definitions.
817
818 INITIAL_W_PRECALC_nossse3
819 INTERNAL_nossse3
820 SOFTWARE_PIPELINING_nossse3
821
822	They are used in a sha1 main-body definition for systems without ssse3 support.
823
824 */
825
826 #define W_PRECALC_00_15_0 W_PRECALC_00_15_0_nossse3
827 #define W_PRECALC_00_15_1 W_PRECALC_00_15_1_nossse3
828 #define W_PRECALC_16_31_0 W_PRECALC_16_31_0_nossse3
829 #define W_PRECALC_32_79_0 W_PRECALC_32_79_0_nossse3
830 #define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_nossse3
831
832
833 .macro INITIAL_W_PRECALC_nossse3
834
835 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
836 W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
837 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
838 W_PRECALC_00_15_2 // W_TMP = W0 + K
839 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
840
841 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
842 W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
843 W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
844 W_PRECALC_00_15_2 // W_TMP = W28 + K
845 W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
846
847 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
848 W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
849 W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
850 W_PRECALC_00_15_2 // W_TMP = W24 + K
851 W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
852
853 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
854 W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
855 W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
856 W_PRECALC_00_15_2 // W_TMP = W20 + K
857 W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
858
859 .endm
860
861
862 .macro INTERNAL_nossse3
863 // i=16
864 // circular buffer : W12,W8,W4,W0,W28,W24,W20,W16
865 W_PRECALC_16_31_0 W0,W28,W24,W20,W16
866 RR0 F1,A,B,C,D,E,0
867 W_PRECALC_16_31_1 W0,W16
868 RR1 F1,A,B,C,D,E,0
869 W_PRECALC_16_31_2 W16
870 RR0 F1,D,E,A,B,C,2
871 W_PRECALC_16_31_3 W16, 2, 0
872 RR1 F1,D,E,A,B,C,2
873
874 // i=20,
875 // W8,W4,W0,W28,W24,W20,W16,W12
876 W_PRECALC_16_31_0 W28,W24,W20,W16,W12
877 RR0 F1,B,C,D,E,A,4
878 W_PRECALC_16_31_1 W28,W12
879 RR1 F1,B,C,D,E,A,4
880
881 W_PRECALC_16_31_2 W12
882 RR0 F1,E,A,B,C,D,6
883 W_PRECALC_16_31_3 W12, 6, 16
884 RR1 F1,E,A,B,C,D,6
885
886 // i=24,
887 // W4,W0,W28,W24,W20,W16,W12,W8
888 W_PRECALC_16_31_0 W24,W20,W16,W12,W8
889 RR0 F1,C,D,E,A,B,8
890 W_PRECALC_16_31_1 W24,W8
891 RR1 F1,C,D,E,A,B,8
892
893 W_PRECALC_16_31_2 W8
894 RR0 F1,A,B,C,D,E,10
895 W_PRECALC_16_31_3 W8,10,16
896 RR1 F1,A,B,C,D,E,10
897
898 // i=28
899 // W0,W28,W24,W20,W16,W12,W8,W4
900 W_PRECALC_16_31_0 W20,W16,W12,W8,W4
901 RR0 F1,D,E,A,B,C,12
902 W_PRECALC_16_31_1 W20,W4
903 RR1 F1,D,E,A,B,C,12
904
905 W_PRECALC_16_31_2 W4
906 RR0 F1,B,C,D,E,A,14
907 W_PRECALC_16_31_3 W4,14,16
908 RR1 F1,B,C,D,E,A,14
909
910 //i=32
911 // W28,W24,W20,W16,W12,W8,W4,W0
912 W_PRECALC_32_79_0 W28,W8,W4,W0
913 RR0 F1,E,A,B,C,D,16
914 W_PRECALC_32_79_1 W16,W0
915 RR1 F1,E,A,B,C,D,16
916 W_PRECALC_32_79_2 W0
917 RR0 F1,C,D,E,A,B,18
918 W_PRECALC_32_79_3 W0,18,16
919 RR1 F1,C,D,E,A,B,18
920
921 //i=36
922 // W24,W20,W16,W12,W8,W4,W0,W28
923 #if defined (__x86_64__)
924 W_PRECALC_32_79_0 W24,W4,W0,W28
925 #else
926 W_PRECALC_32_79_0_i386 W24,W4,W0,W28
927 #endif
928 RR0 F2,A,B,C,D,E,20
929 W_PRECALC_32_79_1 W12,W28
930 RR1 F2,A,B,C,D,E,20
931 #if defined (__x86_64__)
932 W_PRECALC_32_79_2 W28
933 #else
934 W_PRECALC_32_79_2_i386 W28
935 #endif
936 RR0 F2,D,E,A,B,C,22
937 W_PRECALC_32_79_3 W28,22,16
938 RR1 F2,D,E,A,B,C,22
939
940 //i=40
941 #undef K_XMM
942 #define K_XMM 32
943 // W20,W16,W12,W8,W4,W0,W28,W24
944 #if defined (__x86_64__)
945 W_PRECALC_32_79_0 W20,W0,W28,W24
946 #else
947 W_PRECALC_32_79_0_i386 W20,W0,W28,W24
948 #endif
949 RR0 F2,B,C,D,E,A,24
950 W_PRECALC_32_79_1 W8,W24
951 RR1 F2,B,C,D,E,A,24
952 #if defined (__x86_64__)
953 W_PRECALC_32_79_2 W24
954 #else
955 W_PRECALC_32_79_2_i386 W24
956 #endif
957 RR0 F2,E,A,B,C,D,26
958 W_PRECALC_32_79_3 W24,26,K_XMM
959 RR1 F2,E,A,B,C,D,26
960
961 //i=44
962 // W16,W12,W8,W4,W0,W28,W24,W20
963 W_PRECALC_32_79_0 W16,W28,W24,W20
964 RR0 F2,C,D,E,A,B,28
965 W_PRECALC_32_79_1 W4,W20
966 RR1 F2,C,D,E,A,B,28
967 W_PRECALC_32_79_2 W20
968 RR0 F2,A,B,C,D,E,30
969 W_PRECALC_32_79_3 W20,30,K_XMM
970 RR1 F2,A,B,C,D,E,30
971
972 //i=48
973 // W12,W8,W4,W0,W28,W24,W20,W16
974 W_PRECALC_32_79_0 W12,W24,W20,W16
975 RR0 F2,D,E,A,B,C,32
976 W_PRECALC_32_79_1 W0,W16
977 RR1 F2,D,E,A,B,C,32
978 W_PRECALC_32_79_2 W16
979 RR0 F2,B,C,D,E,A,34
980 W_PRECALC_32_79_3 W16,34,K_XMM
981 RR1 F2,B,C,D,E,A,34
982
983 //i=52
984 // W8,W4,W0,W28,W24,W20,W16,W12
985 W_PRECALC_32_79_0 W8,W20,W16,W12
986 RR0 F2,E,A,B,C,D,36
987 W_PRECALC_32_79_1 W28,W12
988 RR1 F2,E,A,B,C,D,36
989 W_PRECALC_32_79_2 W12
990 RR0 F2,C,D,E,A,B,38
991 W_PRECALC_32_79_3 W12,38,K_XMM
992 RR1 F2,C,D,E,A,B,38
993
994 //i=56
995 // W4,W0,W28,W24,W20,W16,W12,W8
996 W_PRECALC_32_79_0 W4,W16,W12,W8
997 RR0 F3,A,B,C,D,E,40
998 W_PRECALC_32_79_1 W24,W8
999 RR1 F3,A,B,C,D,E,40
1000 W_PRECALC_32_79_2 W8
1001 RR0 F3,D,E,A,B,C,42
1002 W_PRECALC_32_79_3 W8,42,K_XMM
1003 RR1 F3,D,E,A,B,C,42
1004
1005 //i=60
1006 #undef K_XMM
1007 #define K_XMM 48
1008 // W0,W28,W24,W20,W16,W12,W8,W4
1009 W_PRECALC_32_79_0 W0,W12,W8,W4
1010 RR0 F3,B,C,D,E,A,44
1011 W_PRECALC_32_79_1 W20,W4
1012 RR1 F3,B,C,D,E,A,44
1013 W_PRECALC_32_79_2 W4
1014 RR0 F3,E,A,B,C,D,46
1015 W_PRECALC_32_79_3 W4,46,K_XMM
1016 RR1 F3,E,A,B,C,D,46
1017
1018 //i=64
1019 // W28,W24,W20,W16,W12,W8,W4,W0
1020 W_PRECALC_32_79_0 W28,W8,W4,W0
1021 RR0 F3,C,D,E,A,B,48
1022 W_PRECALC_32_79_1 W16,W0
1023 RR1 F3,C,D,E,A,B,48
1024 W_PRECALC_32_79_2 W0
1025 RR0 F3,A,B,C,D,E,50
1026 W_PRECALC_32_79_3 W0,50,K_XMM
1027 RR1 F3,A,B,C,D,E,50
1028
1029 //i=68
1030 // W24,W20,W16,W12,W8,W4,W0,W28
1031 #if defined (__x86_64__)
1032 W_PRECALC_32_79_0 W24,W4,W0,W28
1033 #else
1034 W_PRECALC_32_79_0_i386 W24,W4,W0,W28
1035 #endif
1036 RR0 F3,D,E,A,B,C,52
1037 W_PRECALC_32_79_1 W12,W28
1038 RR1 F3,D,E,A,B,C,52
1039 #if defined (__x86_64__)
1040 W_PRECALC_32_79_2 W28
1041 #else
1042 W_PRECALC_32_79_2_i386 W28
1043 #endif
1044 RR0 F3,B,C,D,E,A,54
1045 W_PRECALC_32_79_3 W28,54,K_XMM
1046 RR1 F3,B,C,D,E,A,54
1047
1048 //i=72
1049 // W20,W16,W12,W8,W4,W0,W28,W24
1050 #if defined (__x86_64__)
1051 W_PRECALC_32_79_0 W20,W0,W28,W24
1052 #else
1053 W_PRECALC_32_79_0_i386 W20,W0,W28,W24
1054 #endif
1055 RR0 F3,E,A,B,C,D,56
1056 W_PRECALC_32_79_1 W8,W24
1057 RR1 F3,E,A,B,C,D,56
1058 #if defined (__x86_64__)
1059 W_PRECALC_32_79_2 W24
1060 #else
1061 W_PRECALC_32_79_2_i386 W24
1062 #endif
1063 RR0 F3,C,D,E,A,B,58
1064 W_PRECALC_32_79_3 W24,58,K_XMM
1065 RR1 F3,C,D,E,A,B,58
1066
1067	// start using F4
1068
1069 //i=76
1070 // W16,W12,W8,W4,W0,W28,W24,W20
1071 W_PRECALC_32_79_0 W16,W28,W24,W20
1072 RR0 F4,A,B,C,D,E,60
1073 W_PRECALC_32_79_1 W4,W20
1074 RR1 F4,A,B,C,D,E,60
1075 W_PRECALC_32_79_2 W20
1076 RR0 F4,D,E,A,B,C,62
1077 W_PRECALC_32_79_3 W20,62,K_XMM
1078 RR1 F4,D,E,A,B,C,62
1079
1080 .endm
1081
1082 .macro SOFTWARE_PIPELINING_nossse3
1083 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
1084 W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
1085 RR0 F4,B,C,D,E,A,64
1086 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
1087 RR1 F4,B,C,D,E,A,64
1088 W_PRECALC_00_15_2 // W_TMP = W0 + K
1089 RR0 F4,E,A,B,C,D,66
1090 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
1091 RR1 F4,E,A,B,C,D,66
1092
1093 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
1094 W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
1095 RR0 F4,C,D,E,A,B,68
1096 W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
1097 RR1 F4,C,D,E,A,B,68
1098 W_PRECALC_00_15_2 // W_TMP = W28 + K
1099 RR0 F4,A,B,C,D,E,70
1100	W_PRECALC_00_15_3	7		// 16(sp) = W_TMP = W28 + K
1101 RR1 F4,A,B,C,D,E,70
1102
1103 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
1104 W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
1105 RR0 F4,D,E,A,B,C,72
1106 W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
1107 RR1 F4,D,E,A,B,C,72
1108 W_PRECALC_00_15_2 // W_TMP = W24 + K
1109 RR0 F4,B,C,D,E,A,74
1110 W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
1111 RR1 F4,B,C,D,E,A,74
1112
1113 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
1114 W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
1115 RR0 F4,E,A,B,C,D,76
1116 W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
1117 RR1 F4,E,A,B,C,D,76
1118 W_PRECALC_00_15_2 // W_TMP = W20 + K
1119 RR0 F4,C,D,E,A,B,78
1120 W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
1121 RR1 F4,C,D,E,A,B,78
1122 .endm
1123
1124 .macro ENDING // finish up updating hash digests (i=64:79)
1125 //i=80
1126 RR0 F4,B,C,D,E,A,64
1127 RR1 F4,B,C,D,E,A,64
1128 RR0 F4,E,A,B,C,D,66
1129 RR1 F4,E,A,B,C,D,66
1130
1131 //i=84
1132 RR0 F4,C,D,E,A,B,68
1133 RR1 F4,C,D,E,A,B,68
1134 RR0 F4,A,B,C,D,E,70
1135 RR1 F4,A,B,C,D,E,70
1136
1137 //i=88
1138 RR0 F4,D,E,A,B,C,72
1139 RR1 F4,D,E,A,B,C,72
1140 RR0 F4,B,C,D,E,A,74
1141 RR1 F4,B,C,D,E,A,74
1142
1143 //i=92
1144 RR0 F4,E,A,B,C,D,76
1145 RR1 F4,E,A,B,C,D,76
1146 RR0 F4,C,D,E,A,B,78
1147 RR1 F4,C,D,E,A,B,78
1148 .endm
1149
1150 // load hash digests A,B,C,D,E from memory into registers
1151 .macro LOAD_HASH
1152 #if defined (__x86_64__)
1153 mov (HASH_PTR), A
1154 mov 4(HASH_PTR), B
1155 mov 8(HASH_PTR), C
1156 mov 12(HASH_PTR), D
1157 mov 16(HASH_PTR), E
1158 #else
1159 mov HASH_PTR, T1
1160 mov (T1), A
1161 mov 4(T1), B
1162 mov 8(T1), C
1163 mov 12(T1), D
1164 mov 16(T1), E
1165 #endif
1166 .endm
1167
1168 .macro UPDATE_HASH
1169 add $0, $1
1170 mov $1, $0
1171 .endm
1172
1173 .macro UPDATE_ALL_HASH
1174 #if defined (__x86_64__)
1175 UPDATE_HASH (HASH_PTR), A
1176 UPDATE_HASH 4(HASH_PTR), B
1177 UPDATE_HASH 8(HASH_PTR), C
1178 UPDATE_HASH 12(HASH_PTR), D
1179 UPDATE_HASH 16(HASH_PTR), E
1180 #else
1181 mov HASH_PTR, T1
1182 UPDATE_HASH (T1), A
1183 UPDATE_HASH 4(T1), B
1184 UPDATE_HASH 8(T1), C
1185 UPDATE_HASH 12(T1), D
1186 UPDATE_HASH 16(T1), E
1187 #endif
1188 .endm
1189
1190
1191 /*
1192 main sha1 code for system without ssse3 support
1193 */
1194
1195 .macro SHA1_PIPELINED_MAIN_BODY_nossse3
1196 LOAD_HASH // load initial hashes into A,B,C,D,E (registers)
1197 INITIAL_W_PRECALC_nossse3 // big_endian_load(W) and W+K (i=0:15)
1198 .align 4,0x90
1199 0:
1200 INTERNAL_nossse3 // update W (i=16:79) and update ABCDE (i=0:63)
1201 #if Multiple_Blocks
1202 #if defined(__x86_64__)
1203 add $$64, BUFFER_PTR // BUFFER_PTR+=64;
1204 sub $$1, cnt // pre-decrement cnt by 1
1205 #else
1206 addl $$64, BUFFER_PTR // BUFFER_PTR+=64;
1207 subl $$1, cnt // pre-decrement cnt by 1
1208 #endif
1209 jbe 1f // if cnt <= 0, branch to finish off
1210 SOFTWARE_PIPELINING_nossse3 // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
1211 UPDATE_ALL_HASH // update output hashes
1212 jmp 0b // repeat for next block
1213 .align 4,0x90
1214 1:
1215 #endif
1216 ENDING // update ABCDE (i=64:79)
1217 UPDATE_ALL_HASH // update output hashes
1218 .endm
1219
1220 /*
1221 main sha1 code for system with ssse3 support
1222 */
1223
1224 .macro SHA1_PIPELINED_MAIN_BODY_ssse3
1225 LOAD_HASH // load initial hashes into A,B,C,D,E
1226 INITIAL_W_PRECALC_ssse3 // big_endian_load(W) and W+K (i=0:15)
1227 .align 4,0x90
1228 0:
1229 INTERNAL_ssse3 // update W (i=16:79) and update ABCDE (i=0:63)
1230 #if Multiple_Blocks
1231 #if defined(__x86_64__)
1232 add $$64, BUFFER_PTR // BUFFER_PTR+=64;
1233 sub $$1, cnt // pre-decrement cnt by 1
1234 #else
1235 addl $$64, BUFFER_PTR // BUFFER_PTR+=64;
1236 subl $$1, cnt // pre-decrement cnt by 1
1237 #endif
1238 jbe 1f // if cnt <= 0, branch to finish off
1239 SOFTWARE_PIPELINING_ssse3 // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
1240 UPDATE_ALL_HASH // update output hashes
1241 jmp 0b // repeat for next block
1242 .align 4,0x90
1243 1:
1244 #endif
1245 ENDING // update ABCDE (i=64:79)
1246 UPDATE_ALL_HASH // update output hashes
1247 .endm
1248
1249 #ifdef KERNEL
1250 #include <i386/cpu_capabilities.h>
1251 #else
1252 #include <System/i386/cpu_capabilities.h>
1253 #endif
1254
1255 .text
1256
1257 .globl _SHA1Transform
1258 //.private_extern _SHA1Transform
1259 _SHA1Transform:
1260
1261 // detect SSSE3 and dispatch appropriate code branch
1262 #if defined __x86_64__
1263 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
1264 mov (%rax), %eax // %eax = __cpu_capabilities
1265 #else // i386
1266 #if defined KERNEL
1267 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
1268 mov (%eax), %eax // %eax = __cpu_capabilities
1269 #else
1270 mov _COMM_PAGE_CPU_CAPABILITIES, %eax
1271 #endif
1272 #endif
1273 test $(kHasSupplementalSSE3), %eax
1274 je _SHA1Transform_nossse3 // branch to no-ssse3 code
1275
1276
1277 // start the sha1 code with ssse3 support
1278
1279 // save callee-save registers
1280 #if defined (__x86_64__)
1281 push %rbx
1282 push %rbp
1283 #else
1284 push %ebx
1285 push %ebp
1286 push %esi
1287 push %edi
1288 #endif
1289
1290 sub $stack_size, sp // allocate stack memory for use
1291
1292 // save used xmm register if this is for kernel
1293 #if KERNEL
1294 xmov %xmm0, 4*16(sp)
1295 xmov %xmm1, 5*16(sp)
1296 xmov %xmm2, 6*16(sp)
1297 xmov %xmm3, 7*16(sp)
1298 xmov %xmm4, 8*16(sp)
1299 xmov %xmm5, 9*16(sp)
1300 xmov %xmm6, 10*16(sp)
1301 xmov %xmm7, 11*16(sp)
1302 #if defined (__x86_64__)
1303 xmov %xmm8, 12*16(sp)
1304 xmov %xmm9, 13*16(sp)
1305 xmov %xmm10, 14*16(sp)
1306 #endif
1307 #endif
1308
1309 #if defined (__x86_64__)
1310
1311 // set up registers to free %edx/%edi/%esi for other use (ABCDE)
1312 mov ctx, HASH_PTR
1313 mov buf, BUFFER_PTR
1314 #if Multiple_Blocks
1315 mov %rdx, cnt
1316 #endif
1317 lea K_XMM_AR(%rip), K_BASE
1318 xmov 0x40(K_BASE), XMM_SHUFB_BSWAP
1319
1320 #else // __i386__
1321
1322 #if KERNEL
1323 lea K_XMM_AR, %eax
1324 #else
1325	// Get the address of label 0 into a register (PC-relative addressing for position-independent code).
1326 call 0f // Push program counter onto stack.
1327 0: pop %eax // Get program counter.
1328 lea K_XMM_AR-0b(%eax), %eax
1329 #endif
1330 mov %eax, K_BASE
1331 xmov 0x40(%eax), %xmm0
1332 xmov %xmm0, XMM_SHUFB_BSWAP
1333
1334 #endif
1335
1336 SHA1_PIPELINED_MAIN_BODY_ssse3
1337
1338 // restore used xmm registers if this is for kernel
1339 #if KERNEL
1340 xmov 4*16(sp), %xmm0
1341 xmov 5*16(sp), %xmm1
1342 xmov 6*16(sp), %xmm2
1343 xmov 7*16(sp), %xmm3
1344 xmov 8*16(sp), %xmm4
1345 xmov 9*16(sp), %xmm5
1346 xmov 10*16(sp), %xmm6
1347 xmov 11*16(sp), %xmm7
1348 #if defined (__x86_64__)
1349 xmov 12*16(sp), %xmm8
1350 xmov 13*16(sp), %xmm9
1351 xmov 14*16(sp), %xmm10
1352 #endif
1353 #endif
1354
1355 add $stack_size, sp // deallocate stack memory
1356
1357 // restore callee-save registers
1358 #if defined (__x86_64__)
1359 pop %rbp
1360 pop %rbx
1361 #else
1362 pop %edi
1363 pop %esi
1364 pop %ebp
1365 pop %ebx
1366 #endif
1367
1368 ret // return
1369
1370 // this is equivalent to the above function _SHA1Transform, but it does not use ssse3 instructions
1371
1372 .globl _SHA1Transform_nossse3
1373 .private_extern _SHA1Transform_nossse3
1374 _SHA1Transform_nossse3:
1375
1376 // push callee-save registers
1377 #if defined (__x86_64__)
1378 push %rbx
1379 push %rbp
1380 #else
1381 push %ebx
1382 push %ebp
1383 push %esi
1384 push %edi
1385 #endif
1386
1387 sub $stack_size, sp // allocate stack memory for local use
1388
1389 // save used xmm registers if this is for kernel
1390 #if KERNEL
1391 xmov %xmm0, 4*16(sp)
1392 xmov %xmm1, 5*16(sp)
1393 xmov %xmm2, 6*16(sp)
1394 xmov %xmm3, 7*16(sp)
1395 xmov %xmm4, 8*16(sp)
1396 xmov %xmm5, 9*16(sp)
1397 xmov %xmm6, 10*16(sp)
1398 xmov %xmm7, 11*16(sp)
1399 #if defined (__x86_64__)
1400 xmov %xmm8, 12*16(sp)
1401 xmov %xmm9, 13*16(sp)
1402 #endif
1403 #endif
1404
1405 #if defined (__x86_64__)
1406
1407 // set up registers to free %edx/%edi/%esi for other use (ABCDE)
1408 mov ctx, HASH_PTR
1409 mov buf, BUFFER_PTR
1410 #if Multiple_Blocks
1411 mov %rdx, cnt
1412 #endif
1413 lea K_XMM_AR(%rip), K_BASE
1414
1415 #else // __i386__
1416
1417 #if KERNEL
1418 lea K_XMM_AR, %eax
1419 #else
1420	// Get the address of label 0 into a register (PC-relative addressing for position-independent code).
1421 call 0f // Push program counter onto stack.
1422 0: pop %eax // Get program counter.
1423 lea K_XMM_AR-0b(%eax), %eax
1424 #endif
1425 mov %eax, K_BASE
1426
1427 #endif
1428
1429 SHA1_PIPELINED_MAIN_BODY_nossse3
1430
1431 // restore used xmm registers if this is for kernel
1432 #if KERNEL
1433 xmov 4*16(sp), %xmm0
1434 xmov 5*16(sp), %xmm1
1435 xmov 6*16(sp), %xmm2
1436 xmov 7*16(sp), %xmm3
1437 xmov 8*16(sp), %xmm4
1438 xmov 9*16(sp), %xmm5
1439 xmov 10*16(sp), %xmm6
1440 xmov 11*16(sp), %xmm7
1441 #if defined (__x86_64__)
1442 xmov 12*16(sp), %xmm8
1443 xmov 13*16(sp), %xmm9
1444 #endif
1445 #endif
1446
1447 add $stack_size, sp // deallocate stack memory
1448
1449 // restore callee-save registers
1450 #if defined (__x86_64__)
1451 pop %rbp
1452 pop %rbx
1453 #else
1454 pop %edi
1455 pop %esi
1456 pop %ebp
1457 pop %ebx
1458 #endif
1459
1460 ret // return
1461
1462 .const
1463 .align 4, 0x90
1464
1465 #define K1 0x5a827999
1466 #define K2 0x6ed9eba1
1467 #define K3 0x8f1bbcdc
1468 #define K4 0xca62c1d6
1469
1470 K_XMM_AR:
1471 .long K1
1472 .long K1
1473 .long K1
1474 .long K1
1475 .long K2
1476 .long K2
1477 .long K2
1478 .long K2
1479 .long K3
1480 .long K3
1481 .long K3
1482 .long K3
1483 .long K4
1484 .long K4
1485 .long K4
1486 .long K4
1487 // bswap_shufb_ctl: invoked thru 0x40(K_XMM_AR)
1488 .long 0x00010203
1489 .long 0x04050607
1490 .long 0x08090a0b
1491 .long 0x0c0d0e0f
1492
1493
1494
1495 #endif // architecture x86_64 or i386