]>
Commit | Line | Data |
---|---|---|
6d2010ae A |
1 | /* sha1edp.s : this file provides optimized x86_64 and i386 implementation of the sha1 function |
2 | CoreOS - vector and numerics group | |
3 | cclee 6-21-10 | |
4 | ||
5 | The implementation is based on the principle described in an Intel online article | |
6 | "Improving the Performance of the Secure Hash Algorithm (SHA-1)" | |
7 | http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ | |
8 | ||
9 | ||
10 | Update HASH[] by processing a one 64-byte block in MESSAGE[] can be represented by the following C function | |
11 | ||
12 | void SHA1( int HASH[], int MESSAGE[] ) | |
13 | { | |
14 | int A[81], B[81], C[81], D[81], E[81]; | |
15 | int W[80]; | |
16 | ||
17 | int i, FN; | |
18 | ||
19 | A[0] = HASH[0]; | |
20 | B[0] = HASH[1]; | |
21 | C[0] = HASH[2]; | |
22 | D[0] = HASH[3]; | |
23 | E[0] = HASH[4]; | |
24 | ||
25 | for ( i=0; i<80; ++i ) | |
26 | { | |
27 | if ( i < 16 ) | |
28 | W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] ); | |
29 | else | |
30 | W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 ); | |
31 | ||
32 | FN = F( i, B[i], C[i], D[i] ); | |
33 | ||
34 | A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i); | |
35 | B[i+1] = A[i]; | |
36 | C[i+1] = ROTATE_LEFT( B[i], 30 ); | |
37 | D[i+1] = C[i]; | |
38 | E[i+1] = D[i]; | |
39 | } | |
40 | ||
41 | HASH[0] += A[80]; | |
42 | HASH[1] += B[80]; | |
43 | HASH[2] += C[80]; | |
44 | HASH[3] += D[80]; | |
45 | HASH[4] += E[80]; | |
46 | } | |
47 | ||
48 | For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 ); | |
49 | ||
50 | The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79, | |
51 | ||
	1. done on 4 consecutive W[i] values in a single XMM register
53 | W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 | |
54 | W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1 | |
55 | W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1 | |
56 | W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 | |
57 | ||
58 | 2. this additional calculation unfortunately requires many additional operations | |
59 | W[i+3] ^= W[i] rol 1 | |
60 | ||
61 | 3. once we have 4 W[i] values in XMM we can also add four K values with one instruction | |
62 | W[i:i+3] += {K,K,K,K} | |
63 | ||
64 | Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on | |
65 | The Dean Gaudet approach can be expressed as | |
66 | ||
67 | 1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1); | |
68 | 2. W[i+3] ^= W[i] rol 1 | |
69 | 3. W0 += {K,K,K,K} | |
70 | ||
71 | For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to | |
72 | ||
73 | 1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2); | |
74 | ||
75 | Note: | |
76 | 1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory. | |
77 | 2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte) | |
78 | i=0, W28,W24,...,W0 | |
79 | i=4, W24,W20,...,W28 | |
80 | i=8, W20,W16,...,W24 | |
81 | . | |
82 | . | |
83 | and so forth. | |
84 | 3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr. | |
85 | a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation | |
86 | b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64) | |
87 | 4. we probe __cpu_capabilities to detect ssse3 support and dispatch code with ssse3 support when available. | |
88 | If ssse3 is not supported, a suboptimal code (pshufb and palignr workaround) is dispatched. | |
89 | ||
90 | */ | |
91 | ||
92 | /* the code can be compiled into single block (64 bytes) per call mode by setting Multiple_blocks to 0 */ | |
93 | #define Multiple_Blocks 1 | |
94 | ||
95 | #if defined (__x86_64__) || defined(__i386__) // x86_64 or i386 architectures | |
96 | ||
97 | #if defined(__x86_64__) | |
98 | ||
99 | // set up for x86_64 | |
100 | #define stack_size (8+16*11+16*4) // 8 (alignedment) + x0-x10 + 4 128-bits for intermediate WK(t) storage | |
101 | #define sp %rsp // unifying architectural stack pointer representation | |
102 | #define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9) | |
103 | #define buf %rsi // 2nd input argument, will move to BUFFER_PTR (%r10) | |
104 | #define cnt %r11 // will copy from the 3rd input argument (%rdx) | |
105 | #define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values | |
106 | #define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E) | |
107 | #define BUFFER_PTR %r10 // pointer to input blocks | |
108 | ||
109 | #else // !__x86_64__ | |
110 | ||
111 | // set up for i386 | |
112 | #define stack_size (12+16*2+16*11+16*4) // 12-bytes (alignment) + extra 2 + 3 (W24/W28/XMM_SHUFB_BSWAP) + 8 (xmm0-xmm7) + 4 (WK(t)) | |
113 | #define sp %esp // unifying architectural stack pointer representation | |
114 | #define HASH_PTR stack_size+16+4(sp) // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp) | |
115 | #define BUFFER_PTR stack_size+16+8(sp) // use 2nd input argument from caller function | |
116 | #define cnt stack_size+16+12(sp) // use 3rd input argument from caller function | |
117 | #define K_BASE stack_size-4(sp) // use for K_BASE | |
118 | ||
119 | #endif // __x86_64__ | |
120 | ||
121 | // symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support | |
122 | ||
123 | #define W_TMP %xmm0 | |
124 | #define W_TMP2 %xmm1 | |
125 | #define W0 %xmm2 | |
126 | #define W4 %xmm3 | |
127 | #define W8 %xmm4 | |
128 | #define W12 %xmm5 | |
129 | #define W16 %xmm6 | |
130 | #define W20 %xmm7 | |
131 | #if defined(__x86_64__) | |
132 | #define W24 %xmm8 | |
133 | #define W28 %xmm9 | |
134 | #define XMM_SHUFB_BSWAP %xmm10 // used only when ssse3 is supported | |
135 | #else // defined (__i386__) | |
136 | #define W24 12*16(sp) | |
137 | #define W28 13*16(sp) | |
138 | #define XMM_SHUFB_BSWAP 14*16(sp) // used only when ssse3 is supported | |
139 | #endif | |
140 | ||
141 | #define xmov movaps // aligned 16-byte move | |
142 | #define xmovu movups // unaligned 16-byte move | |
143 | ||
144 | // intermediate hash variables | |
145 | #define A %ecx | |
146 | #define B %esi | |
147 | #define C %edi | |
148 | #define D %ebp | |
149 | #define E %edx | |
150 | ||
151 | // temp variables | |
152 | #define T1 %eax | |
153 | #define T2 %ebx | |
154 | ||
155 | #define WK(t) (t&15)*4(sp) | |
156 | ||
// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }   // SHA-1 "choose" function, rounds 0:19
// arguments: $0 = B, $1 = C, $2 = D; result left in T1 (T1 is clobbered)
.macro F1
	mov	$1, T1		// T1 = C
	xor	$2, T1		// T1 = C ^ D
	and	$0, T1		// T1 = B & (C ^ D)
	xor	$2, T1		// T1 = D ^ (B & (C ^ D))
.endm
165 | ||
// int F2(int B, int C, int D) { return (D ^ B ^ C); }           // SHA-1 "parity" function, rounds 20:39 (reused as F4 for 60:79)
// arguments: $0 = B, $1 = C, $2 = D; result left in T1 (T1 is clobbered)
.macro F2
	mov	$2, T1		// T1 = D
	xor	$1, T1		// T1 = D ^ C
	xor	$0, T1		// T1 = D ^ C ^ B
.endm
173 | ||
// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }   // SHA-1 "majority" function, rounds 40:59
// arguments: $0 = B, $1 = C, $2 = D; result left in T1 (clobbers T1 and T2)
.macro F3
	mov	$1, T1		// T1 = C
	mov	$0, T2		// T2 = B
	or	$0, T1		// T1 = B | C
	and	$1, T2		// T2 = B & C
	and	$2, T1		// T1 = D & (B | C)
	or	T2, T1		// T1 = (B & C) | (D & (B | C)) == (B & C) | (D & (B ^ C))
.endm
184 | ||
185 | // for i=60:79, F4 is identical to F2 | |
186 | #define F4 F2 | |
187 | ||
188 | ||
189 | /* | |
190 | i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]); | |
191 | ||
	with ssse3 support, this is achieved via
193 | for (i=0;i<16;i+=4) { | |
194 | 1. W_TMP = new 16 bytes from MESSAGE[] | |
195 | 2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W | |
		3. W_TMP += {K,K,K,K};
197 | 4. save quadruple W[i]+K[i] = W_TMP in the stack memory; | |
198 | } | |
199 | ||
200 | each step is represented in one of the following 4 macro definitions | |
201 | ||
202 | */ | |
203 | ||
.macro W_PRECALC_00_15_0_ssse3		// input argument $0 : message word offset, one of 0/4/8/12
#if defined (__x86_64__)		// BUFFER_PTR is already an address register in x86_64
	xmovu	$0*4(BUFFER_PTR), W_TMP	// read 16 bytes into W_TMP; BUFFER_PTR is possibly not 16-byte aligned
#else					// BUFFER_PTR is the caller's argument saved in the stack
	mov	BUFFER_PTR, T1		// T1 = BUFFER_PTR
	xmovu	$0*4(T1), W_TMP		// read 16 bytes into W_TMP; buffer is possibly not 16-byte aligned
#endif
.endm
212 | ||
.macro W_PRECALC_00_15_1_ssse3		// input argument $0 : current 16 bytes in the circular buffer, one of W0,W4,...,W28
	pshufb	XMM_SHUFB_BSWAP, W_TMP	// byte-shuffle each word: convert W_TMP from little-endian into big-endian
	xmov	W_TMP, $0		// save W_TMP in the W circular buffer
.endm
217 | ||
.macro W_PRECALC_00_15_2		// W_TMP += {K,K,K,K}; K_BASE points to the current K quadruple
#if defined (__x86_64__)		// K_BASE is already an address register in x86_64
	paddd	(K_BASE), W_TMP		// W_TMP += {K,K,K,K};
#else					// K_BASE was previously saved in the stack memory
	mov	K_BASE, T1		// T1 = K_BASE
	paddd	(T1), W_TMP		// W_TMP += {K,K,K,K};
#endif
.endm
226 | ||
.macro W_PRECALC_00_15_3		// input argument $0 : round index i; $0&~3 selects the 16-byte WK slot
	xmov	W_TMP, WK($0&~3)	// save quadruple W[i]+K in the stack memory; used later when updating the hashes A/B/C/D/E
.endm
230 | ||
231 | /* | |
232 | without ssse3 support, steps 1 and 2 need to be modified | |
233 | 1. sequentially load 4 words into T1, bswap T1, and save it to 4-bytes in the stack space | |
234 | 2. load the 16-bytes from the aligned stack memory into W_TMP | |
235 | */ | |
236 | ||
.macro W_PRECALC_00_15_0_nossse3	// input argument $0 : message word offset, one of 0/4/8/12
					// no-ssse3 replacement for steps 1+2: bswap word-by-word through the
					// integer unit into an aligned stack scratch area at 14*16(sp)
#if defined (__x86_64__)
#define BUFFERP	BUFFER_PTR		// BUFFER_PTR is already an address register
#else
	mov	BUFFER_PTR, T2		// copy BUFFER_PTR (from caller's 2nd argument) to T2
#define BUFFERP	T2
#endif

	// load 1st word, bswap it, save it to stack
	mov	$0*4(BUFFERP), T1
	bswap	T1
	mov	T1, 14*16(sp)		// NOTE(review): reuses the XMM_SHUFB_BSWAP slot (unused without ssse3) as scratch

	// load 2nd word, bswap it, save it to stack
	mov	4+$0*4(BUFFERP), T1
	bswap	T1
	mov	T1, 4+14*16(sp)

	// load 3rd word, bswap it, save it to stack
	mov	8+$0*4(BUFFERP), T1
	bswap	T1
	mov	T1, 8+14*16(sp)

	// load 4th word, bswap it, save it to stack
	mov	12+$0*4(BUFFERP), T1
	bswap	T1
	mov	T1, 12+14*16(sp)
.endm
266 | ||
.macro W_PRECALC_00_15_1_nossse3	// input argument $0 : current 16 bytes in the circular buffer, one of W0,W4,...,W28
	xmov	14*16(sp), W_TMP	// load the bswapped 16 bytes from the aligned stack scratch area
	xmov	W_TMP, $0		// save W = W_TMP in the circular buffer
.endm
271 | ||
272 | // rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet | |
273 | /* | |
274 | W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 | |
275 | W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1 | |
276 | W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1 | |
277 | W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 | |
278 | ||
	W[i+3] ^= W[i] rol 1;	// this W[i] is already rol by 1; if we are taking from the initial W before rol 1, we should rol this by 2
280 | ||
281 | The operation (updating W and W+K) is scheduled as and divided into 4 steps | |
282 | ||
283 | 0. W_tmp = W3; W = W14 ^ W8 | |
284 | 1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0); | |
285 | 2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W | |
286 | 3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W _TMP+K; | |
287 | ||
288 | */ | |
289 | ||
.macro W_PRECALC_16_31_0_ssse3	// input arguments : $0=W16, $1=W12, $2=W8, $3=W4, $4=W
	xmov	$1, $4		// W = W12
	palignr	$$8, $0, $4	// W = W14 (concatenate W16:W12 and take the middle 16 bytes)
	xmov	$3, W_TMP	// W_TMP = W4
	psrldq	$$4, W_TMP	// W_TMP = W3 (shift one word off the low end)
	pxor	$2, $4		// W = W8 ^ W14
.endm
297 | ||
.macro W_PRECALC_16_31_1	// input arguments : $0=W16, $1=W
	pxor	$0, W_TMP	// W_TMP = W3 ^ W16
	pxor	W_TMP, $1	// W = W3 ^ W16 ^ W8 ^ W14
	xmov	$1, W_TMP2	// W_TMP2 = W3 ^ W16 ^ W8 ^ W14
	xmov	$1, W_TMP	// W_TMP  = W3 ^ W16 ^ W8 ^ W14
	pslldq	$$12, W_TMP2	// W_TMP2 = (W[i] 0 0 0), isolating the lane needed for the Gaudet fixup
.endm
305 | ||
.macro W_PRECALC_16_31_2	// input argument : $0=W
				// build rol-by-1 of the 4-lane xor, and split rol-by-2 of (W[i] 0 0 0)
	psrld	$$31, $0	// (W3 ^ W16 ^ W8 ^ W14) >> 31
	pslld	$$1, W_TMP	// (W3 ^ W16 ^ W8 ^ W14) << 1
	por	$0, W_TMP	// W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
	xmov	W_TMP2, $0	// copy W[i] into the W[i+3] position
	psrld	$$30, W_TMP2	// W_TMP2 = W[i] lower 2 bits after rol 2
	pslld	$$2, $0		// W = W[i] upper 30 bits after rol 2
.endm
314 | ||
.macro W_PRECALC_16_31_3	// input arguments : $0=W, $1=i, $2=K_XMM (byte offset of the current K quadruple)
#if defined (__i386__)
	mov	K_BASE, T1	// K_BASE is stored in the stack memory for i386
#endif
	pxor	$0, W_TMP
	pxor	W_TMP2, W_TMP	// W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
	xmov	W_TMP, $0	// save W = W_TMP in the W circular buffer
#if defined (__x86_64__)
	paddd	$2(K_BASE), W_TMP	// W + K
#else
	paddd	$2(T1), W_TMP		// W + K
#endif
	xmov	W_TMP, WK($1&~3)	// save WK = W+K for the later update of the hashes A/B/C/D/E
.endm
329 | ||
// the following is a variant of W_PRECALC_16_31_0_ssse3 to be used on systems without ssse3:
// palignr is replaced with 4 sse2 instructions (2 shifts + or)

.macro W_PRECALC_16_31_0_nossse3	// input arguments : $0=W16, $1=W12, $2=W8, $3=W4, $4=W
	xmov	$1, $4		// W = W12 = (w9 w10 w11 w12)

	// the following is a workaround for palignr
	xmov	$0, W_TMP	// W16 = (w13 w14 w15 w16)
	pslldq	$$8, $4		// shift left to make (w11 w12 0 0)
	psrldq	$$8, W_TMP	// shift right to make (0 0 w13 w14)
	por	W_TMP, $4	// W = W14 = (w11 w12 w13 w14)

	xmov	$3, W_TMP	// W_TMP = W4 = (w1 w2 w3 w4)
	psrldq	$$4, W_TMP	// W_TMP = W3 = (0 w1 w2 w3)
	pxor	$2, $4		// W = W8 ^ W14
.endm
345 | ||
/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
347 | ||
348 | W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2); | |
349 | ||
350 | where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register. | |
351 | ||
352 | ||
353 | 0. W_tmp = W6; W = W28 ^ W32; | |
354 | 1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32; | |
355 | 2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2; | |
356 | 3. W = W_Tmp; WK = W_tmp + K; | |
357 | ||
358 | */ | |
359 | ||
360 | ||
.macro W_PRECALC_32_79_0_ssse3	// input arguments : $0=W28, $1=W8, $2=W4, $3=W (W also serves as W32)
	xmov	$2, W_TMP	// (w1 w2 w3 w4)
	pxor	$0, $3		// W = W28 ^ W32;
	palignr	$$8, $1, W_TMP	// W_TMP = (w3 w4 w5 w6) = W6;
.endm
366 | ||
// the following is a variant of W_PRECALC_32_79_0_ssse3 for systems without ssse3 support
.macro W_PRECALC_32_79_0_nossse3	// input arguments : $0=W28, $1=W8, $2=W4, $3=W (W also serves as W32)
	xmov	$2, W_TMP	// (w1 w2 w3 w4)
	xmov	$1, W_TMP2	// (w5 w6 w7 w8)
	pxor	$0, $3		// W = W28 ^ W32
	pslldq	$$8, W_TMP	// (w3 w4 0 0)
	psrldq	$$8, W_TMP2	// (0 0 w5 w6)
	por	W_TMP2, W_TMP	// W_TMP = (w3 w4 w5 w6) = W6  (palignr workaround)
.endm
376 | ||
// this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers,
// pxor cannot target them directly; go through W_TMP instead)
.macro W_PRECALC_32_79_0_i386_ssse3	// input arguments : $0=W28, $1=W8, $2=W4, $3=W (W also serves as W32)
	xmov	$3, W_TMP	// W32
	pxor	$0, W_TMP	// W28 ^ W32
	xmov	W_TMP, $3	// W = W28 ^ W32;
	xmov	$2, W_TMP	// W4
	palignr	$$8, $1, W_TMP	// W_TMP = (w3 w4 w5 w6) = W6;
.endm
385 | ||
// this is a variant of W_PRECALC_32_79_0_nossse3 for i386 (as W24/W28 are stored in memory, not in registers)
.macro W_PRECALC_32_79_0_i386_nossse3	// input arguments : $0=W28, $1=W8, $2=W4, $3=W (W also serves as W32)
	xmov	$3, W_TMP	// W32
	pxor	$0, W_TMP	// W28 ^ W32
	xmov	W_TMP, $3	// W = W28 ^ W32
	xmov	$2, W_TMP	// W4 = (w1 w2 w3 w4)
	xmov	$1, W_TMP2	// W8 = (w5 w6 w7 w8)
	pslldq	$$8, W_TMP	// (w3 w4 0 0)
	psrldq	$$8, W_TMP2	// (0 0 w5 w6)
	por	W_TMP2, W_TMP	// W_TMP = (w3 w4 w5 w6) = W6  (palignr workaround)
.endm
397 | ||
.macro W_PRECALC_32_79_1	// input arguments : $0=W16, $1=W
	pxor	$0, W_TMP	// W_TMP = W6 ^ W16
	pxor	$1, W_TMP	// W_TMP = W6 ^ W16 ^ W28 ^ W32
	xmov	W_TMP, $1	// W = W_TMP = W6 ^ W16 ^ W28 ^ W32
.endm
403 | ||
.macro W_PRECALC_32_79_2	// input argument : $0=W
	psrld	$$30, $0	// W >> 30
	pslld	$$2, W_TMP	// W << 2
	por	$0, W_TMP	// W_TMP = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
409 | ||
// this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers);
// use it whenever the input is W24 or W28 on the i386 architecture (shifts cannot target memory)
.macro W_PRECALC_32_79_2_i386	// input argument : $0=W
	xmov	$0, W_TMP2	// W
	psrld	$$30, W_TMP2	// W >> 30
	xmov	W_TMP2, $0	// save (W >> 30) back at W
	pslld	$$2, W_TMP	// W_TMP << 2
	por	$0, W_TMP	// W_TMP = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
419 | ||
.macro W_PRECALC_32_79_3	// input arguments : $0=W, $1=i, $2=K_XMM (byte offset of the current K quadruple)
#if defined (__x86_64__)
	xmov	W_TMP, $0	// W = (W6 ^ W16 ^ W28 ^ W32) rol 2
	paddd	$2(K_BASE), W_TMP	// W + K
	xmov	W_TMP, WK($1&~3)	// write W+K to its stack slot
#else
	mov	K_BASE, T1	// T1 = K_BASE (stored in the stack for i386)
	xmov	W_TMP, $0	// W = (W6 ^ W16 ^ W28 ^ W32) rol 2
	paddd	$2(T1), W_TMP	// W_TMP = W + K
	xmov	W_TMP, WK($1&~3)	// write WK to its stack slot
#endif
.endm
432 | ||
433 | ||
434 | /* The hash update operation is completed by the following statements. | |
435 | ||
436 | A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i); | |
437 | B[i+1] = A[i]; | |
438 | C[i+1] = ROTATE_LEFT( B[i], 30 ); | |
439 | D[i+1] = C[i]; | |
440 | E[i+1] = D[i]; | |
441 | ||
442 | Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows: | |
443 | ||
444 | A1 = FN + E0 + rol(A0,5) + WK; | |
445 | B1 = A0; | |
446 | C1 = rol(B0, 30); | |
447 | D1 = C0; | |
448 | E1 = D0; | |
449 | ||
450 | to avoid excessive memory movement between registers, | |
451 | 1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0, | |
452 | 2. C1 = rol(B0,30) can be temporarily saved in B0. | |
453 | ||
454 | Therefore, ignoring the time index, the update operation is equivalent to | |
455 | 1. E = FN(B,C,D) + E + rol(A,5) + WK(i) | |
456 | 2. B = rol(B,30) | |
457 | 3. the hashes are now stored in the order of E,A,B,C,D | |
458 | ||
459 | ||
460 | To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E | |
461 | 1. E = FN(B,C,D) + E + rol(A,5) + WK(i) | |
462 | 2. B = rol(B,30) | |
463 | // now the hashes are in the order of E,A,B,C,D | |
464 | 3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1) | |
465 | 4. A = rol(A,30) | |
466 | // now the hashes are in the order of D,E,A,B,C | |
467 | ||
468 | These operations are distributed into the following 2 macro definitions RR0 and RR1. | |
469 | ||
470 | */ | |
471 | ||
.macro RR0	// input arguments : $0=FN, $1=A, $2=B, $3=C, $4=D, $5=E, $6=i
		// first half of a packed pair of round updates (rounds i and i+1); RR1 completes it
	$0	$2, $3, $4	// T1 = FN(B,C,D)
	add	WK($6), $5	// E + WK(i)
	rol	$$30, $2	// B = rol(B,30)
	mov	$1, T2		// T2 = A
	add	WK($6+1), $4	// D + WK(i+1), pre-accumulated for round i+1
	rol	$$5, T2		// T2 = rol(A,5)
	add	T1, $5		// E = FN(B,C,D) + E + WK(i)
.endm
481 | ||
.macro RR1	// input arguments : $0=FN, $1=A, $2=B, $3=C, $4=D, $5=E, $6=i (same operands as the matching RR0)
		// second half of the packed pair: finishes round i and performs round i+1
	add	$5, T2		// T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
	mov	T2, $5		// E = FN(B,C,D) + E + rol(A,5) + WK(i)
	rol	$$5, T2		// T2 = rol(E,5)
	add	T2, $4		// D + WK(i+1) + rol(E,5)
	$0	$1, $2, $3	// T1 = FN(A,B,C)
	add	T1, $4		// D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
	rol	$$30, $1	// A = rol(A,30)
.endm
491 | ||
492 | ||
493 | ||
494 | /* | |
495 | ||
496 | The following macro definitions are used to expand code for the per-block sha1 operation. | |
497 | ||
498 | INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory | |
499 | INTERNAL_ssse3 : updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory) | |
500 | ENDING : finishing up update the digests A/B/C/D/E (i=64:79) | |
501 | ||
502 | For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined | |
	into 1 macro definition for software pipelining.
504 | ||
505 | SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack, and finishing up update the digests A/B/C/D/E (i=64:79) | |
506 | ||
507 | assume cnt (the number of blocks) >= 1, the main code body should look like | |
508 | ||
509 | INITIAL_W_PRECALC_ssse3 // W = big_endian_load and pre-compute W+K (i=0:15) | |
510 | do { | |
511 | INTERNAL_ssse3 // update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63) | |
512 | cnt--; | |
513 | if (cnt==0) break; | |
514 | BUFFER_PTR += 64; | |
515 | SOFTWARE_PIPELINING_ssse3; // update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15) | |
516 | } | |
517 | ENDING // update hash digests A/B/C/D/E (i=64:79) | |
518 | ||
519 | */ | |
520 | ||
521 | #define W_PRECALC_00_15_0 W_PRECALC_00_15_0_ssse3 | |
522 | #define W_PRECALC_00_15_1 W_PRECALC_00_15_1_ssse3 | |
523 | #define W_PRECALC_16_31_0 W_PRECALC_16_31_0_ssse3 | |
524 | #define W_PRECALC_32_79_0 W_PRECALC_32_79_0_ssse3 | |
525 | #define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_ssse3 | |
526 | ||
527 | ||
.macro INITIAL_W_PRECALC_ssse3	// BIG_ENDIAN_LOAD(64-byte block) into W (i=0:15) and store W+K into the stack memory

	// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_00_15_0	0	// W_TMP = (BUFFER_PTR)
	W_PRECALC_00_15_1	W0	// convert W_TMP to big-endian, and save W0 = W_TMP
	W_PRECALC_00_15_2		// W_TMP = W0 + K
	W_PRECALC_00_15_3	3	// (sp) = W_TMP = W0 + K

	// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
	W_PRECALC_00_15_0	4	// W_TMP = 16(BUFFER_PTR)
	W_PRECALC_00_15_1	W28	// convert W_TMP to big-endian, and save W28 = W_TMP
	W_PRECALC_00_15_2		// W_TMP = W28 + K
	W_PRECALC_00_15_3	7	// 16(sp) = W_TMP = W28 + K

	// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
	W_PRECALC_00_15_0	8	// W_TMP = 32(BUFFER_PTR)
	W_PRECALC_00_15_1	W24	// convert W_TMP to big-endian, and save W24 = W_TMP
	W_PRECALC_00_15_2		// W_TMP = W24 + K
	W_PRECALC_00_15_3	11	// 32(sp) = W_TMP = W24 + K

	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_00_15_0	12	// W_TMP = 48(BUFFER_PTR)
	W_PRECALC_00_15_1	W20	// convert W_TMP to big-endian, and save W20 = W_TMP
	W_PRECALC_00_15_2		// W_TMP = W20 + K
	W_PRECALC_00_15_3	15	// 48(sp) = W_TMP = W20 + K

.endm
555 | ||
556 | ||
.macro INTERNAL_ssse3	// update W (i=16:79) interleaved with updating the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory);
			// W-precalc steps and RR round pairs are interleaved for instruction-level parallelism

	// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
	W_PRECALC_16_31_0	W0,W28,W24,W20,W16
	RR0			F1,A,B,C,D,E,0
	W_PRECALC_16_31_1	W0,W16
	RR1			F1,A,B,C,D,E,0
	W_PRECALC_16_31_2	W16
	RR0			F1,D,E,A,B,C,2
	W_PRECALC_16_31_3	W16, 2, 0
	RR1			F1,D,E,A,B,C,2

	// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
	W_PRECALC_16_31_0	W28,W24,W20,W16,W12
	RR0			F1,B,C,D,E,A,4
	W_PRECALC_16_31_1	W28,W12
	RR1			F1,B,C,D,E,A,4
	W_PRECALC_16_31_2	W12
	RR0			F1,E,A,B,C,D,6
	W_PRECALC_16_31_3	W12, 6, 16
	RR1			F1,E,A,B,C,D,6

	// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
	W_PRECALC_16_31_0	W24,W20,W16,W12,W8
	RR0			F1,C,D,E,A,B,8
	W_PRECALC_16_31_1	W24,W8
	RR1			F1,C,D,E,A,B,8
	W_PRECALC_16_31_2	W8
	RR0			F1,A,B,C,D,E,10
	W_PRECALC_16_31_3	W8,10,16
	RR1			F1,A,B,C,D,E,10

	// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
	W_PRECALC_16_31_0	W20,W16,W12,W8,W4
	RR0			F1,D,E,A,B,C,12
	W_PRECALC_16_31_1	W20,W4
	RR1			F1,D,E,A,B,C,12
	W_PRECALC_16_31_2	W4
	RR0			F1,B,C,D,E,A,14
	W_PRECALC_16_31_3	W4,14,16
	RR1			F1,B,C,D,E,A,14

	// i=32 : W28,W24,W20,W16,W12,W8,W4,W0 (switch to the rol-2 recurrence for i>=32)
	W_PRECALC_32_79_0	W28,W8,W4,W0
	RR0			F1,E,A,B,C,D,16
	W_PRECALC_32_79_1	W16,W0
	RR1			F1,E,A,B,C,D,16
	W_PRECALC_32_79_2	W0
	RR0			F1,C,D,E,A,B,18
	W_PRECALC_32_79_3	W0,18,16
	RR1			F1,C,D,E,A,B,18

	// start using F2

	// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W24,W4,W0,W28
#else
	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
#endif
	RR0			F2,A,B,C,D,E,20
	W_PRECALC_32_79_1	W12,W28
	RR1			F2,A,B,C,D,E,20
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W28
#else
	W_PRECALC_32_79_2_i386	W28
#endif
	RR0			F2,D,E,A,B,C,22
	W_PRECALC_32_79_3	W28,22,16
	RR1			F2,D,E,A,B,C,22

	// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
#undef	K_XMM
#define	K_XMM	32		// K quadruple for rounds 40:59
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W20,W0,W28,W24
#else
	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
#endif
	RR0			F2,B,C,D,E,A,24
	W_PRECALC_32_79_1	W8,W24
	RR1			F2,B,C,D,E,A,24
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W24
#else
	W_PRECALC_32_79_2_i386	W24
#endif
	RR0			F2,E,A,B,C,D,26
	W_PRECALC_32_79_3	W24,26,K_XMM
	RR1			F2,E,A,B,C,D,26

	// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_32_79_0	W16,W28,W24,W20
	RR0			F2,C,D,E,A,B,28
	W_PRECALC_32_79_1	W4,W20
	RR1			F2,C,D,E,A,B,28
	W_PRECALC_32_79_2	W20
	RR0			F2,A,B,C,D,E,30
	W_PRECALC_32_79_3	W20,30,K_XMM
	RR1			F2,A,B,C,D,E,30

	// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
	W_PRECALC_32_79_0	W12,W24,W20,W16
	RR0			F2,D,E,A,B,C,32
	W_PRECALC_32_79_1	W0,W16
	RR1			F2,D,E,A,B,C,32
	W_PRECALC_32_79_2	W16
	RR0			F2,B,C,D,E,A,34
	W_PRECALC_32_79_3	W16,34,K_XMM
	RR1			F2,B,C,D,E,A,34

	// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
	W_PRECALC_32_79_0	W8,W20,W16,W12
	RR0			F2,E,A,B,C,D,36
	W_PRECALC_32_79_1	W28,W12
	RR1			F2,E,A,B,C,D,36
	W_PRECALC_32_79_2	W12
	RR0			F2,C,D,E,A,B,38
	W_PRECALC_32_79_3	W12,38,K_XMM
	RR1			F2,C,D,E,A,B,38

	// start using F3

	// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
	W_PRECALC_32_79_0	W4,W16,W12,W8
	RR0			F3,A,B,C,D,E,40
	W_PRECALC_32_79_1	W24,W8
	RR1			F3,A,B,C,D,E,40
	W_PRECALC_32_79_2	W8
	RR0			F3,D,E,A,B,C,42
	W_PRECALC_32_79_3	W8,42,K_XMM
	RR1			F3,D,E,A,B,C,42

	// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
#undef	K_XMM
#define	K_XMM	48		// K quadruple for rounds 60:79
	W_PRECALC_32_79_0	W0,W12,W8,W4
	RR0			F3,B,C,D,E,A,44
	W_PRECALC_32_79_1	W20,W4
	RR1			F3,B,C,D,E,A,44
	W_PRECALC_32_79_2	W4
	RR0			F3,E,A,B,C,D,46
	W_PRECALC_32_79_3	W4,46,K_XMM
	RR1			F3,E,A,B,C,D,46

	// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_32_79_0	W28,W8,W4,W0
	RR0			F3,C,D,E,A,B,48
	W_PRECALC_32_79_1	W16,W0
	RR1			F3,C,D,E,A,B,48
	W_PRECALC_32_79_2	W0
	RR0			F3,A,B,C,D,E,50
	W_PRECALC_32_79_3	W0,50,K_XMM
	RR1			F3,A,B,C,D,E,50

	// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W24,W4,W0,W28
#else
	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
#endif
	RR0			F3,D,E,A,B,C,52
	W_PRECALC_32_79_1	W12,W28
	RR1			F3,D,E,A,B,C,52
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W28
#else
	W_PRECALC_32_79_2_i386	W28
#endif
	RR0			F3,B,C,D,E,A,54
	W_PRECALC_32_79_3	W28,54,K_XMM
	RR1			F3,B,C,D,E,A,54

	// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W20,W0,W28,W24
#else
	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
#endif
	RR0			F3,E,A,B,C,D,56
	W_PRECALC_32_79_1	W8,W24
	RR1			F3,E,A,B,C,D,56
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W24
#else
	W_PRECALC_32_79_2_i386	W24
#endif
	RR0			F3,C,D,E,A,B,58
	W_PRECALC_32_79_3	W24,58,K_XMM
	RR1			F3,C,D,E,A,B,58

	// start using F4

	// i=76 : W16,W12,W8,W4,W0,W28,W24,W20 (last W-precalc quadruple; rounds 64:79 are finished by SOFTWARE_PIPELINING/ENDING)
	W_PRECALC_32_79_0	W16,W28,W24,W20
	RR0			F4,A,B,C,D,E,60
	W_PRECALC_32_79_1	W4,W20
	RR1			F4,A,B,C,D,E,60
	W_PRECALC_32_79_2	W20
	RR0			F4,D,E,A,B,C,62
	W_PRECALC_32_79_3	W20,62,K_XMM
	RR1			F4,D,E,A,B,C,62

.endm
762 | ||
.macro SOFTWARE_PIPELINING_ssse3	// finish rounds i=64:79 of the current block while big-endian-loading
					// the NEXT block into W (i=0:15) and pre-computing its W+K
	// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_00_15_0	0	// W_TMP = (BUFFER_PTR)
	RR0			F4,B,C,D,E,A,64
	W_PRECALC_00_15_1	W0	// convert W_TMP to big-endian, and save W0 = W_TMP
	RR1			F4,B,C,D,E,A,64
	W_PRECALC_00_15_2		// W_TMP = W0 + K
	RR0			F4,E,A,B,C,D,66
	W_PRECALC_00_15_3	3	// (sp) = W_TMP = W0 + K
	RR1			F4,E,A,B,C,D,66

	// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
	W_PRECALC_00_15_0	4	// W_TMP = 16(BUFFER_PTR)
	RR0			F4,C,D,E,A,B,68
	W_PRECALC_00_15_1	W28	// convert W_TMP to big-endian, and save W28 = W_TMP
	RR1			F4,C,D,E,A,B,68
	W_PRECALC_00_15_2		// W_TMP = W28 + K
	RR0			F4,A,B,C,D,E,70
	W_PRECALC_00_15_3	7	// 16(sp) = W_TMP = W28 + K
	RR1			F4,A,B,C,D,E,70

	// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
	W_PRECALC_00_15_0	8	// W_TMP = 32(BUFFER_PTR)
	RR0			F4,D,E,A,B,C,72
	W_PRECALC_00_15_1	W24	// convert W_TMP to big-endian, and save W24 = W_TMP
	RR1			F4,D,E,A,B,C,72
	W_PRECALC_00_15_2		// W_TMP = W24 + K
	RR0			F4,B,C,D,E,A,74
	W_PRECALC_00_15_3	11	// 32(sp) = W_TMP = W24 + K
	RR1			F4,B,C,D,E,A,74

	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_00_15_0	12	// W_TMP = 48(BUFFER_PTR)
	RR0			F4,E,A,B,C,D,76
	W_PRECALC_00_15_1	W20	// convert W_TMP to big-endian, and save W20 = W_TMP
	RR1			F4,E,A,B,C,D,76
	W_PRECALC_00_15_2		// W_TMP = W20 + K
	RR0			F4,C,D,E,A,B,78
	W_PRECALC_00_15_3	15	// 48(sp) = W_TMP = W20 + K
	RR1			F4,C,D,E,A,B,78
.endm
804 | ||
805 | ||
// Tear down the ssse3 bindings of the W-precalc helper macro names so they
// can be rebound to the scalar implementations below.
#undef	W_PRECALC_00_15_0
#undef	W_PRECALC_00_15_1
#undef	W_PRECALC_16_31_0
#undef	W_PRECALC_32_79_0
#undef	W_PRECALC_32_79_0_i386



/*

	The following are 3 macro definitions that are no-ssse3 variants of the previous 3 macro definitions.

	INITIAL_W_PRECALC_nossse3
	INTERNAL_nossse3
	SOFTWARE_PIPELINING_nossse3

	They will be used in a sha1 code main body definition that will be used for systems without ssse3 support.

*/

// Rebind the W-precalc helper names to their no-ssse3 implementations;
// the round-scheduling macros below expand to whichever variant is bound.
#define	W_PRECALC_00_15_0	W_PRECALC_00_15_0_nossse3
#define	W_PRECALC_00_15_1	W_PRECALC_00_15_1_nossse3
#define	W_PRECALC_16_31_0	W_PRECALC_16_31_0_nossse3
#define	W_PRECALC_32_79_0	W_PRECALC_32_79_0_nossse3
#define	W_PRECALC_32_79_0_i386	W_PRECALC_32_79_0_i386_nossse3
832 | ||
/*
	INITIAL_W_PRECALC_nossse3 -- prologue W precalc for the first block
	(no-ssse3 path): big-endian load of MESSAGE[0:15] into the circular
	buffer W0/W28/W24/W20 and store of W+K to the stack, with no round
	computation interleaved (there is no previous block to finish).
*/
.macro	INITIAL_W_PRECALC_nossse3

	// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_00_15_0	0		// W_TMP = (BUFFER_PTR)
	W_PRECALC_00_15_1	W0		// convert W_TMP to big-endian, and save W0 = W_TMP
	W_PRECALC_00_15_2			// W_TMP = W0 + K
	W_PRECALC_00_15_3	3		// (sp) = W_TMP = W0 + K

	// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
	W_PRECALC_00_15_0	4		// W_TMP = 16(BUFFER_PTR)
	W_PRECALC_00_15_1	W28		// convert W_TMP to big-endian, and save W28 = W_TMP
	W_PRECALC_00_15_2			// W_TMP = W28 + K
	W_PRECALC_00_15_3	7		// 16(sp) = W_TMP = W28 + K

	// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
	W_PRECALC_00_15_0	8		// W_TMP = 32(BUFFER_PTR)
	W_PRECALC_00_15_1	W24		// convert W_TMP to big-endian, and save W24 = W_TMP
	W_PRECALC_00_15_2			// W_TMP = W24 + K
	W_PRECALC_00_15_3	11		// 32(sp) = W_TMP = W24 + K

	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_00_15_0	12		// W_TMP = 48(BUFFER_PTR)
	W_PRECALC_00_15_1	W20		// convert W_TMP to big-endian, and save W20 = W_TMP
	W_PRECALC_00_15_2			// W_TMP = W20 + K
	W_PRECALC_00_15_3	15		// 48(sp) = W_TMP = W20 + K

.endm
860 | ||
861 | ||
/*
	INTERNAL_nossse3 -- main round schedule (no-ssse3 path).
	Interleaves the W-update precalc for i=16:79 (four W values per group,
	in the 8-slot circular buffer W0,W4,...,W28) with the SHA-1 round
	computations RR0/RR1 for rounds 0:63.  F1/F2/F3/F4 select the round
	function for rounds 0-19 / 20-39 / 40-59 / 60-79 respectively.
	K_XMM is the byte offset into the K constant table; it is assumed to
	select the K2 quadrant on entry and is redefined to 32 (K3) at i=40
	and 48 (K4) at i=60.  The instruction ordering is hand-scheduled --
	do not reorder.
*/
.macro	INTERNAL_nossse3
	// i=16
	// circular buffer : W12,W8,W4,W0,W28,W24,W20,W16
	W_PRECALC_16_31_0	W0,W28,W24,W20,W16
	RR0	F1,A,B,C,D,E,0
	W_PRECALC_16_31_1	W0,W16
	RR1	F1,A,B,C,D,E,0
	W_PRECALC_16_31_2	W16
	RR0	F1,D,E,A,B,C,2
	W_PRECALC_16_31_3	W16, 2, 0
	RR1	F1,D,E,A,B,C,2

	// i=20
	// W8,W4,W0,W28,W24,W20,W16,W12
	W_PRECALC_16_31_0	W28,W24,W20,W16,W12
	RR0	F1,B,C,D,E,A,4
	W_PRECALC_16_31_1	W28,W12
	RR1	F1,B,C,D,E,A,4

	W_PRECALC_16_31_2	W12
	RR0	F1,E,A,B,C,D,6
	W_PRECALC_16_31_3	W12, 6, 16
	RR1	F1,E,A,B,C,D,6

	// i=24
	// W4,W0,W28,W24,W20,W16,W12,W8
	W_PRECALC_16_31_0	W24,W20,W16,W12,W8
	RR0	F1,C,D,E,A,B,8
	W_PRECALC_16_31_1	W24,W8
	RR1	F1,C,D,E,A,B,8

	W_PRECALC_16_31_2	W8
	RR0	F1,A,B,C,D,E,10
	W_PRECALC_16_31_3	W8,10,16
	RR1	F1,A,B,C,D,E,10

	// i=28
	// W0,W28,W24,W20,W16,W12,W8,W4
	W_PRECALC_16_31_0	W20,W16,W12,W8,W4
	RR0	F1,D,E,A,B,C,12
	W_PRECALC_16_31_1	W20,W4
	RR1	F1,D,E,A,B,C,12

	W_PRECALC_16_31_2	W4
	RR0	F1,B,C,D,E,A,14
	W_PRECALC_16_31_3	W4,14,16
	RR1	F1,B,C,D,E,A,14

	// i=32 : switch to the W[i-3]^W[i-8]^W[i-14]^W[i-16] recurrence form
	// W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_32_79_0	W28,W8,W4,W0
	RR0	F1,E,A,B,C,D,16
	W_PRECALC_32_79_1	W16,W0
	RR1	F1,E,A,B,C,D,16
	W_PRECALC_32_79_2	W0
	RR0	F1,C,D,E,A,B,18
	W_PRECALC_32_79_3	W0,18,16
	RR1	F1,C,D,E,A,B,18

	// i=36
	// W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W24,W4,W0,W28
#else
	// i386 is register-starved; _i386 variant uses memory-based scratch
	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
#endif
	RR0	F2,A,B,C,D,E,20
	W_PRECALC_32_79_1	W12,W28
	RR1	F2,A,B,C,D,E,20
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W28
#else
	W_PRECALC_32_79_2_i386	W28
#endif
	RR0	F2,D,E,A,B,C,22
	W_PRECALC_32_79_3	W28,22,16
	RR1	F2,D,E,A,B,C,22

	// i=40 : advance K table offset to the K3 quadrant
#undef	K_XMM
#define	K_XMM	32
	// W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W20,W0,W28,W24
#else
	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
#endif
	RR0	F2,B,C,D,E,A,24
	W_PRECALC_32_79_1	W8,W24
	RR1	F2,B,C,D,E,A,24
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W24
#else
	W_PRECALC_32_79_2_i386	W24
#endif
	RR0	F2,E,A,B,C,D,26
	W_PRECALC_32_79_3	W24,26,K_XMM
	RR1	F2,E,A,B,C,D,26

	// i=44
	// W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_32_79_0	W16,W28,W24,W20
	RR0	F2,C,D,E,A,B,28
	W_PRECALC_32_79_1	W4,W20
	RR1	F2,C,D,E,A,B,28
	W_PRECALC_32_79_2	W20
	RR0	F2,A,B,C,D,E,30
	W_PRECALC_32_79_3	W20,30,K_XMM
	RR1	F2,A,B,C,D,E,30

	// i=48
	// W12,W8,W4,W0,W28,W24,W20,W16
	W_PRECALC_32_79_0	W12,W24,W20,W16
	RR0	F2,D,E,A,B,C,32
	W_PRECALC_32_79_1	W0,W16
	RR1	F2,D,E,A,B,C,32
	W_PRECALC_32_79_2	W16
	RR0	F2,B,C,D,E,A,34
	W_PRECALC_32_79_3	W16,34,K_XMM
	RR1	F2,B,C,D,E,A,34

	// i=52
	// W8,W4,W0,W28,W24,W20,W16,W12
	W_PRECALC_32_79_0	W8,W20,W16,W12
	RR0	F2,E,A,B,C,D,36
	W_PRECALC_32_79_1	W28,W12
	RR1	F2,E,A,B,C,D,36
	W_PRECALC_32_79_2	W12
	RR0	F2,C,D,E,A,B,38
	W_PRECALC_32_79_3	W12,38,K_XMM
	RR1	F2,C,D,E,A,B,38

	// i=56
	// W4,W0,W28,W24,W20,W16,W12,W8
	W_PRECALC_32_79_0	W4,W16,W12,W8
	RR0	F3,A,B,C,D,E,40
	W_PRECALC_32_79_1	W24,W8
	RR1	F3,A,B,C,D,E,40
	W_PRECALC_32_79_2	W8
	RR0	F3,D,E,A,B,C,42
	W_PRECALC_32_79_3	W8,42,K_XMM
	RR1	F3,D,E,A,B,C,42

	// i=60 : advance K table offset to the K4 quadrant
#undef	K_XMM
#define	K_XMM	48
	// W0,W28,W24,W20,W16,W12,W8,W4
	W_PRECALC_32_79_0	W0,W12,W8,W4
	RR0	F3,B,C,D,E,A,44
	W_PRECALC_32_79_1	W20,W4
	RR1	F3,B,C,D,E,A,44
	W_PRECALC_32_79_2	W4
	RR0	F3,E,A,B,C,D,46
	W_PRECALC_32_79_3	W4,46,K_XMM
	RR1	F3,E,A,B,C,D,46

	// i=64
	// W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_32_79_0	W28,W8,W4,W0
	RR0	F3,C,D,E,A,B,48
	W_PRECALC_32_79_1	W16,W0
	RR1	F3,C,D,E,A,B,48
	W_PRECALC_32_79_2	W0
	RR0	F3,A,B,C,D,E,50
	W_PRECALC_32_79_3	W0,50,K_XMM
	RR1	F3,A,B,C,D,E,50

	// i=68
	// W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W24,W4,W0,W28
#else
	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
#endif
	RR0	F3,D,E,A,B,C,52
	W_PRECALC_32_79_1	W12,W28
	RR1	F3,D,E,A,B,C,52
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W28
#else
	W_PRECALC_32_79_2_i386	W28
#endif
	RR0	F3,B,C,D,E,A,54
	W_PRECALC_32_79_3	W28,54,K_XMM
	RR1	F3,B,C,D,E,A,54

	// i=72
	// W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
	W_PRECALC_32_79_0	W20,W0,W28,W24
#else
	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
#endif
	RR0	F3,E,A,B,C,D,56
	W_PRECALC_32_79_1	W8,W24
	RR1	F3,E,A,B,C,D,56
#if defined (__x86_64__)
	W_PRECALC_32_79_2	W24
#else
	W_PRECALC_32_79_2_i386	W24
#endif
	RR0	F3,C,D,E,A,B,58
	W_PRECALC_32_79_3	W24,58,K_XMM
	RR1	F3,C,D,E,A,B,58

	// start using F4 (rounds 60-79)

	// i=76
	// W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_32_79_0	W16,W28,W24,W20
	RR0	F4,A,B,C,D,E,60
	W_PRECALC_32_79_1	W4,W20
	RR1	F4,A,B,C,D,E,60
	W_PRECALC_32_79_2	W20
	RR0	F4,D,E,A,B,C,62
	W_PRECALC_32_79_3	W20,62,K_XMM
	RR1	F4,D,E,A,B,C,62

.endm
1081 | ||
/*
	SOFTWARE_PIPELINING_nossse3 -- pipelined block transition (no-ssse3
	path): finishes rounds 64-79 of the current block (RR0/RR1 with F4)
	interleaved with the i=0:15 W precalc of the NEXT block.  Assumes
	BUFFER_PTR has already been advanced to the next block.
*/
.macro	SOFTWARE_PIPELINING_nossse3
	// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
	W_PRECALC_00_15_0	0		// W_TMP = (BUFFER_PTR)
	RR0	F4,B,C,D,E,A,64
	W_PRECALC_00_15_1	W0		// convert W_TMP to big-endian, and save W0 = W_TMP
	RR1	F4,B,C,D,E,A,64
	W_PRECALC_00_15_2			// W_TMP = W0 + K
	RR0	F4,E,A,B,C,D,66
	W_PRECALC_00_15_3	3		// (sp) = W_TMP = W0 + K
	RR1	F4,E,A,B,C,D,66

	// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
	W_PRECALC_00_15_0	4		// W_TMP = 16(BUFFER_PTR)
	RR0	F4,C,D,E,A,B,68
	W_PRECALC_00_15_1	W28		// convert W_TMP to big-endian, and save W28 = W_TMP
	RR1	F4,C,D,E,A,B,68
	W_PRECALC_00_15_2			// W_TMP = W28 + K
	RR0	F4,A,B,C,D,E,70
	W_PRECALC_00_15_3	7		// 16(sp) = W_TMP = W28 + K
	RR1	F4,A,B,C,D,E,70

	// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
	W_PRECALC_00_15_0	8		// W_TMP = 32(BUFFER_PTR)
	RR0	F4,D,E,A,B,C,72
	W_PRECALC_00_15_1	W24		// convert W_TMP to big-endian, and save W24 = W_TMP
	RR1	F4,D,E,A,B,C,72
	W_PRECALC_00_15_2			// W_TMP = W24 + K
	RR0	F4,B,C,D,E,A,74
	W_PRECALC_00_15_3	11		// 32(sp) = W_TMP = W24 + K
	RR1	F4,B,C,D,E,A,74

	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
	W_PRECALC_00_15_0	12		// W_TMP = 48(BUFFER_PTR)
	RR0	F4,E,A,B,C,D,76
	W_PRECALC_00_15_1	W20		// convert W_TMP to big-endian, and save W20 = W_TMP
	RR1	F4,E,A,B,C,D,76
	W_PRECALC_00_15_2			// W_TMP = W20 + K
	RR0	F4,C,D,E,A,B,78
	W_PRECALC_00_15_3	15		// 48(sp) = W_TMP = W20 + K
	RR1	F4,C,D,E,A,B,78
.endm
1123 | ||
// ENDING -- finish up updating hash digests: rounds 64-79 (F4) of the LAST
// block, with no W precalc interleaved (there is no following block).
// The //i=80.. labels continue the pipelined instruction numbering scheme.
.macro	ENDING
	//i=80
	RR0	F4,B,C,D,E,A,64
	RR1	F4,B,C,D,E,A,64
	RR0	F4,E,A,B,C,D,66
	RR1	F4,E,A,B,C,D,66

	//i=84
	RR0	F4,C,D,E,A,B,68
	RR1	F4,C,D,E,A,B,68
	RR0	F4,A,B,C,D,E,70
	RR1	F4,A,B,C,D,E,70

	//i=88
	RR0	F4,D,E,A,B,C,72
	RR1	F4,D,E,A,B,C,72
	RR0	F4,B,C,D,E,A,74
	RR1	F4,B,C,D,E,A,74

	//i=92
	RR0	F4,E,A,B,C,D,76
	RR1	F4,E,A,B,C,D,76
	RR0	F4,C,D,E,A,B,78
	RR1	F4,C,D,E,A,B,78
.endm
1149 | ||
// LOAD_HASH -- load hash digests A,B,C,D,E from memory into registers.
// On x86_64, HASH_PTR is a register holding the context address; on i386
// it is a memory location, so it is first loaded into scratch register T1.
.macro	LOAD_HASH
#if defined (__x86_64__)
	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E
#else
	mov	HASH_PTR, T1
	mov	(T1), A
	mov	4(T1), B
	mov	8(T1), C
	mov	12(T1), D
	mov	16(T1), E
#endif
.endm
1167 | ||
// UPDATE_HASH mem, reg -- HASH[i] += working variable.
//   $0 = memory operand holding one hash word, $1 = register with the
//   round-updated value.  reg += mem, then store reg back to mem.
.macro	UPDATE_HASH
	add	$0, $1
	mov	$1, $0
.endm
1172 | ||
// UPDATE_ALL_HASH -- fold the five working variables A..E back into the
// hash state in memory (HASH[j] += {A,B,C,D,E}).  On i386, HASH_PTR is a
// memory location and is loaded into T1 first.
.macro	UPDATE_ALL_HASH
#if defined (__x86_64__)
	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), B
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E
#else
	mov	HASH_PTR, T1
	UPDATE_HASH	(T1), A
	UPDATE_HASH	4(T1), B
	UPDATE_HASH	8(T1), C
	UPDATE_HASH	12(T1), D
	UPDATE_HASH	16(T1), E
#endif
.endm
1189 | ||
1190 | ||
1191 | /* | |
1192 | main sha1 code for system without ssse3 support | |
1193 | */ | |
1194 | ||
// SHA1_PIPELINED_MAIN_BODY_nossse3 -- full transform for systems without
// ssse3: load hash state, precompute W for the first block, then loop:
// run rounds 0-63 interleaved with the W update, advance BUFFER_PTR and
// decrement cnt, and either pipeline into the next block (rounds 64-79
// overlapped with the next block's W precalc) or fall through to ENDING
// on the final block.  `$$` yields a literal `$` inside .macro expansion.
.macro	SHA1_PIPELINED_MAIN_BODY_nossse3
	LOAD_HASH						// load initial hashes into A,B,C,D,E (registers)
	INITIAL_W_PRECALC_nossse3		// big_endian_load(W) and W+K (i=0:15)
	.align	4,0x90
0:
	INTERNAL_nossse3				// update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined(__x86_64__)
	add	$$64, BUFFER_PTR			// BUFFER_PTR += 64;
	sub	$$1, cnt					// pre-decrement cnt by 1
#else
	addl	$$64, BUFFER_PTR		// BUFFER_PTR += 64;
	subl	$$1, cnt				// pre-decrement cnt by 1
#endif
	jbe	1f							// if cnt <= 0, branch to finish off
	SOFTWARE_PIPELINING_nossse3		// update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
	UPDATE_ALL_HASH					// update output hashes
	jmp	0b							// repeat for next block
	.align	4,0x90
1:
#endif
	ENDING							// update ABCDE (i=64:79)
	UPDATE_ALL_HASH					// update output hashes
.endm
1219 | ||
1220 | /* | |
1221 | main sha1 code for system with ssse3 support | |
1222 | */ | |
1223 | ||
// SHA1_PIPELINED_MAIN_BODY_ssse3 -- full transform for systems with ssse3
// support; same structure as the nossse3 body but expands the ssse3
// (SIMD) variants of the W-precalc and round-scheduling macros.
// `$$` yields a literal `$` inside .macro expansion.
.macro	SHA1_PIPELINED_MAIN_BODY_ssse3
	LOAD_HASH						// load initial hashes into A,B,C,D,E
	INITIAL_W_PRECALC_ssse3			// big_endian_load(W) and W+K (i=0:15)
	.align	4,0x90
0:
	INTERNAL_ssse3					// update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined(__x86_64__)
	add	$$64, BUFFER_PTR			// BUFFER_PTR += 64;
	sub	$$1, cnt					// pre-decrement cnt by 1
#else
	addl	$$64, BUFFER_PTR		// BUFFER_PTR += 64;
	subl	$$1, cnt				// pre-decrement cnt by 1
#endif
	jbe	1f							// if cnt <= 0, branch to finish off
	SOFTWARE_PIPELINING_ssse3		// update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
	UPDATE_ALL_HASH					// update output hashes
	jmp	0b							// repeat for next block
	.align	4,0x90
1:
#endif
	ENDING							// update ABCDE (i=64:79)
	UPDATE_ALL_HASH					// update output hashes
.endm
1248 | ||
316670eb | 1249 | #ifdef KERNEL |
6d2010ae | 1250 | #include <i386/cpu_capabilities.h> |
316670eb A |
1251 | #else |
1252 | #include <System/i386/cpu_capabilities.h> | |
1253 | #endif | |
6d2010ae A |
1254 | |
1255 | .text | |
1256 | ||
// _SHA1Transform -- public entry point.
// Dispatches at runtime on the Supplemental SSE3 capability bit: if SSSE3
// is absent, branches to _SHA1Transform_nossse3; otherwise runs the ssse3
// implementation inline.  Saves/restores callee-save GPRs, allocates
// stack_size bytes of scratch, and (kernel only) preserves the xmm
// registers it clobbers, since the kernel does not save FP state for us.
.globl	_SHA1Transform
//.private_extern	_SHA1Transform
_SHA1Transform:

	// detect SSSE3 and dispatch appropriate code branch
#if defined	__x86_64__
	movq	__cpu_capabilities@GOTPCREL(%rip), %rax		// %rax -> __cpu_capabilities
	mov	(%rax), %eax									// %eax = __cpu_capabilities
#else	// i386
#if defined	KERNEL
	leal	__cpu_capabilities, %eax					// %eax -> __cpu_capabilities
	mov	(%eax), %eax									// %eax = __cpu_capabilities
#else
	// user space reads capabilities from the comm page
	mov	_COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
	test	$(kHasSupplementalSSE3), %eax
	je	_SHA1Transform_nossse3						// branch to no-ssse3 code if bit is clear


	// start the sha1 code with ssse3 support

	// save callee-save registers
#if defined	(__x86_64__)
	push	%rbx
	push	%rbp
#else
	push	%ebx
	push	%ebp
	push	%esi
	push	%edi
#endif

	sub	$stack_size, sp					// allocate stack memory for use

	// save used xmm registers if this is for kernel
#if	KERNEL
	xmov	%xmm0, 4*16(sp)
	xmov	%xmm1, 5*16(sp)
	xmov	%xmm2, 6*16(sp)
	xmov	%xmm3, 7*16(sp)
	xmov	%xmm4, 8*16(sp)
	xmov	%xmm5, 9*16(sp)
	xmov	%xmm6, 10*16(sp)
	xmov	%xmm7, 11*16(sp)
#if defined	(__x86_64__)
	xmov	%xmm8, 12*16(sp)
	xmov	%xmm9, 13*16(sp)
	xmov	%xmm10, 14*16(sp)
#endif
#endif

#if defined	(__x86_64__)

	// set up registers to free %edx/%edi/%esi for other use (ABCDE)
	mov	ctx, HASH_PTR
	mov	buf, BUFFER_PTR
#if	Multiple_Blocks
	mov	%rdx, cnt					// 3rd argument: number of 64-byte blocks
#endif
	lea	K_XMM_AR(%rip), K_BASE
	xmov	0x40(K_BASE), XMM_SHUFB_BSWAP	// pshufb byte-swap mask stored after the K table

#else	// __i386__

#if	KERNEL
	lea	K_XMM_AR, %eax
#else
	// PIC: get address of local label 0 to locate K_XMM_AR relative to it.
	call	0f					// Push program counter onto stack.
0:	pop	%eax					// Get program counter.
	lea	K_XMM_AR-0b(%eax), %eax
#endif
	mov	%eax, K_BASE
	xmov	0x40(%eax), %xmm0
	xmov	%xmm0, XMM_SHUFB_BSWAP

#endif

	SHA1_PIPELINED_MAIN_BODY_ssse3

	// restore used xmm registers if this is for kernel
#if	KERNEL
	xmov	4*16(sp), %xmm0
	xmov	5*16(sp), %xmm1
	xmov	6*16(sp), %xmm2
	xmov	7*16(sp), %xmm3
	xmov	8*16(sp), %xmm4
	xmov	9*16(sp), %xmm5
	xmov	10*16(sp), %xmm6
	xmov	11*16(sp), %xmm7
#if defined	(__x86_64__)
	xmov	12*16(sp), %xmm8
	xmov	13*16(sp), %xmm9
	xmov	14*16(sp), %xmm10
#endif
#endif

	add	$stack_size, sp				// deallocate stack memory

	// restore callee-save registers
#if defined	(__x86_64__)
	pop	%rbp
	pop	%rbx
#else
	pop	%edi
	pop	%esi
	pop	%ebp
	pop	%ebx
#endif

	ret						// return
1369 | ||
// _SHA1Transform_nossse3 -- equivalent to _SHA1Transform above, but uses
// no ssse3 instructions; jumped to by the dispatcher when the CPU lacks
// Supplemental SSE3.  Same prologue/epilogue structure, except it does
// not load XMM_SHUFB_BSWAP (no pshufb available) and only preserves
// xmm0-xmm9 in the kernel (xmm10 is not used by this path).

.globl	_SHA1Transform_nossse3
.private_extern	_SHA1Transform_nossse3
_SHA1Transform_nossse3:

	// push callee-save registers
#if defined	(__x86_64__)
	push	%rbx
	push	%rbp
#else
	push	%ebx
	push	%ebp
	push	%esi
	push	%edi
#endif

	sub	$stack_size, sp				// allocate stack memory for local use

	// save used xmm registers if this is for kernel
#if	KERNEL
	xmov	%xmm0, 4*16(sp)
	xmov	%xmm1, 5*16(sp)
	xmov	%xmm2, 6*16(sp)
	xmov	%xmm3, 7*16(sp)
	xmov	%xmm4, 8*16(sp)
	xmov	%xmm5, 9*16(sp)
	xmov	%xmm6, 10*16(sp)
	xmov	%xmm7, 11*16(sp)
#if defined	(__x86_64__)
	xmov	%xmm8, 12*16(sp)
	xmov	%xmm9, 13*16(sp)
#endif
#endif

#if defined	(__x86_64__)

	// set up registers to free %edx/%edi/%esi for other use (ABCDE)
	mov	ctx, HASH_PTR
	mov	buf, BUFFER_PTR
#if	Multiple_Blocks
	mov	%rdx, cnt					// 3rd argument: number of 64-byte blocks
#endif
	lea	K_XMM_AR(%rip), K_BASE

#else	// __i386__

#if	KERNEL
	lea	K_XMM_AR, %eax
#else
	// PIC: get address of local label 0 to locate K_XMM_AR relative to it.
	call	0f					// Push program counter onto stack.
0:	pop	%eax					// Get program counter.
	lea	K_XMM_AR-0b(%eax), %eax
#endif
	mov	%eax, K_BASE

#endif

	SHA1_PIPELINED_MAIN_BODY_nossse3

	// restore used xmm registers if this is for kernel
#if	KERNEL
	xmov	4*16(sp), %xmm0
	xmov	5*16(sp), %xmm1
	xmov	6*16(sp), %xmm2
	xmov	7*16(sp), %xmm3
	xmov	8*16(sp), %xmm4
	xmov	9*16(sp), %xmm5
	xmov	10*16(sp), %xmm6
	xmov	11*16(sp), %xmm7
#if defined	(__x86_64__)
	xmov	12*16(sp), %xmm8
	xmov	13*16(sp), %xmm9
#endif
#endif

	add	$stack_size, sp				// deallocate stack memory

	// restore callee-save registers
#if defined	(__x86_64__)
	pop	%rbp
	pop	%rbx
#else
	pop	%edi
	pop	%esi
	pop	%ebp
	pop	%ebx
#endif

	ret						// return
1461 | ||
.const
.align	4, 0x90

// SHA-1 round constants, one per 20-round group (FIPS 180):
#define	K1	0x5a827999		// rounds  0-19
#define	K2	0x6ed9eba1		// rounds 20-39
#define	K3	0x8f1bbcdc		// rounds 40-59
#define	K4	0xca62c1d6		// rounds 60-79

// Each constant is replicated 4x so a 16-byte load yields {K,K,K,K} for
// 4-lane SIMD adds; K_XMM (0/16/32/48) indexes the quadrant.
K_XMM_AR:
	.long	K1
	.long	K1
	.long	K1
	.long	K1
	.long	K2
	.long	K2
	.long	K2
	.long	K2
	.long	K3
	.long	K3
	.long	K3
	.long	K3
	.long	K4
	.long	K4
	.long	K4
	.long	K4
// bswap_shufb_ctl: pshufb control mask that byte-swaps each 32-bit lane
// (little- to big-endian); invoked thru 0x40(K_XMM_AR)
	.long	0x00010203
	.long	0x04050607
	.long	0x08090a0b
	.long	0x0c0d0e0f
1492 | ||
1493 | ||
1494 | ||
1495 | #endif // architecture x86_64 or i386 |