]>
Commit | Line | Data |
---|---|---|
6d2010ae A |
1 | /* This file defines _aes_decrypt_key, _aes_decrypt_key128, |
2 | _aes_decrypt_key192, and _aes_decrypt_key256. It is designed to be | |
3 | included in another assembly file with the preprocessor #include directive, | |
4 | to benefit from some assembly-time calculations. | |
5 | ||
6 | Written by Eric Postpischil, January 2008. | |
7 | ||
8 | The comments here do not say much about the algorithm; the code just | |
9 | follows the FIPS-197 specification. I recommend reading the specification | |
10 | before working with this code or examining the C code in the parent | |
11 | directory that illustrates key expansion. | |
12 | ||
13 | One complication is that this routine both expands the key and applies | |
14 | InvMixColumn to most of the words in the expanded key. This modifies the | |
15 | key for use with the Equivalent Inverse Cipher. | |
16 | ||
17 | During key expansion, there are sequences of four or six words that are | |
18 | produced like this: | |
19 | ||
20 | E[i+0] = E[i+0-Nk] ^ f(E[i-1]), where f is some function. | |
21 | E[i+1] = E[i+1-Nk] ^ E[i+0]. | |
22 | E[i+2] = E[i+2-Nk] ^ E[i+1]. | |
23 | E[i+3] = E[i+3-Nk] ^ E[i+2]. | |
24 | ||
25 | When Nk is four or eight, the sequence stops there. When it is six, it | |
26 | goes on for two more words. Let I be the InvMixColumn function. For the | |
27 | Equivalent Inverse Cipher, we want to store I(E[i+0]), I(E[i+1]), | |
28 | I(E[i+2]), I(E[i+3]) (and two more when Nk is six). However, we do not | |
29 | need to calculate I four times. In AES' finite field, I is a linear | |
30 | combination of the four bytes of its input. The ^ operation on the bits | |
31 | that represent field elements is an addition in the Galois field. So | |
32 | I(a ^ b) = I(a) ^ I(b). Then we have: | |
33 | ||
34 | I(E[i+0]) = I(E[i+0-Nk] ^ f(E[i-1])) = I(E[i+0-Nk]) ^ I(f(E[i-1])). | |
35 | I(E[i+1]) = I(E[i+1-Nk]) ^ I(E[i+0]). | |
36 | I(E[i+2]) = I(E[i+2-Nk]) ^ I(E[i+1]). | |
37 | I(E[i+3]) = I(E[i+3-Nk]) ^ I(E[i+2]). | |
38 | ||
39 | To compute this, we compute I(f(E[i-1])) and XOR it with the previously | |
40 | stored I(E[i+0-Nk]) to get I(E[i+0]). Then we XOR that with the previously | |
41 | stored I(E[i+1-Nk]) to get I(E[i+1]), and so on. | |
42 | ||
43 | Note that to compute I(f(E[i-1])), we need to have E[i-1]. So we have to | |
44 | compute the pre-InvMixColumn words of the expanded key; it is not | |
45 | sufficient to have the post-InvMixColumn words. | |
46 | */ | |
47 | ||
48 | ||
49 | /* Routine: | |
50 | ||
51 | _aes_decrypt_key. | |
52 | ||
53 | _aes_decrypt_key128, _aes_decrypt_key192, and _aes_decrypt_key256. | |
54 | ||
55 | Function: | |
56 | ||
57 | Expand the user's cipher key into the key schedule, as defined in | |
58 | Federal Information Processing Standards Publication 197 (FIPS-197), | |
59 | November 26, 2001. | |
60 | ||
61 | For decryption, the key is modified as shown in Figure 15 in FIPS-197, | |
62 | to support the Equivalent Inverse Cipher. | |
63 | ||
64 | Input: | |
65 | ||
66 | Constant data: | |
67 | ||
68 | The following names must be locally defined so the assembler | |
69 | can calculate certain offsets. | |
70 | ||
71 | static const Word _AESSubBytesWordTable[4][256]. | |
72 | ||
73 | _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where | |
74 | SubBytes is defined in FIPS-197. _AESSubBytesWordTable | |
75 | differs from _AESEncryptTable in that it does not include | |
76 | the MixColumn operation. It is used in performing the last | |
77 | round, which differs from the previous rounds in that it | |
78 | does not include the MixColumn operation. | |
79 | ||
80 | static const Word _AESInvMixColumnTable[4][256]. | |
81 | ||
82 | _AESInvMixColumnTable[i][j] contains the contribution of byte | |
83 | j to element i of the InvMixColumn operation. | |
84 | ||
85 | The four bytes of the word _AESInvMixColumnTable[0][j] are: | |
86 | ||
87 | {0xe}*{j}, {0x9}*{j}, {0xd}*{j}, {0xb}*{j}, | |
88 | ||
89 | listed in increasing address order, where multiplication is | |
90 | performed in the Galois field. {j} designates the element of | |
91 | the Galois field represented by j. _AESInvMixColumnTable[i][j] has | |
92 | the same bytes, rotated right in the order shown above. | |
93 | ||
94 | static const Byte _AESRcon[]. | |
95 | ||
96 | Round constants, beginning with _AESRcon[1] for the first round | |
97 | (_AESRcon[0] is padding). | |
98 | ||
99 | Arguments: | |
100 | ||
101 | const uint8_t *Key | |
102 | ||
103 | Address of user's cipher key. | |
104 | ||
105 | int Length | |
106 | ||
107 | Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in | |
108 | user's cipher key. | |
109 | ||
110 | This argument is used with _aes_decrypt_key. It is not | |
111 | present for the other routines. In those routines, Context | |
112 | is the second argument. | |
113 | ||
114 | aes_decrypt_ctx *Context | |
115 | ||
116 | Structure to contain the expanded key beginning at offset | |
117 | ContextKey and a four-byte "key length" beginning at offset | |
118 | ContextKeyLength. The "key length" is the number of bytes from | |
119 | the start of the first round key to the start of the last round | |
120 | key. That is 16 less than the number of bytes in the entire | |
121 | key. | |
122 | ||
123 | Output: | |
124 | ||
125 | The expanded key and the "key length" are written to *Context. | |
126 | ||
127 | Return: | |
128 | ||
129 | aes_rval // -1 if "key length" is invalid. 0 otherwise. | |
130 | */ | |
131 | /* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */ | |
132 | ||
133 | #ifdef KERNEL | |
134 | #include <i386/cpu_capabilities.h> | |
135 | #else | |
136 | #include <System/i386/cpu_capabilities.h> | |
137 | #endif | |
138 | ||
139 | #define dr r0d // Dissection register: the word whose bytes are being extracted. | |
140 | #define drl r0l // Low 8 bits of dissection register. | |
141 | #define drh r0h // Second-lowest 8 bits of dissection register. | |
142 | ||
143 | #define t0 r1 // Temporary; receives one byte of dr for use as a table index. | |
144 | #define t0d r1d // Low 32 bits of t0. | |
145 | ||
146 | #define STable r2 // Address of SubBytes table. Overlaps Nk. NOTE(review): the definitions of Nk visible in this file are t0d and r9d, not r2 -- confirm whether "Overlaps Nk" is stale. | |
147 | #define ITable r3 // Address of InvMixColumn table. | |
148 | #define offset Arch(r5, r11) // Address offset and loop sentinel. | |
149 | ||
150 | #define R r7 // Address of round constant. | |
151 | #define K r7 // User key pointer. | |
152 | // R and K overlap, so K must not be read once R has been loaded. | |
153 | ||
154 | #define E r6 // Expanded key pointer. | |
155 | ||
156 | #define ve0 %xmm0 // ve0 through ve5 hold untransformed words of the expanded key. | |
157 | #define ve1 %xmm1 | |
158 | #define ve2 %xmm2 | |
159 | #define ve3 %xmm3 | |
160 | #define ve4 %xmm4 | |
161 | #define ve5 %xmm5 | |
162 | #define vt1 %xmm6 // Scratch register that receives each table entry. | |
163 | #define vt0 %xmm7 // Accumulator and result register of InvMixColumn, SubWord, and SubWordRotWord. | |
164 | ||
165 | #define LookupS(table, index) (table)*TableSize(STable, index, 4) // Operand for an entry of SubBytes sub-table "table"; TableSize is not defined in this section. | |
166 | #define LookupI(table, index) (table)*TableSize(ITable, index, 4) // Same, for the InvMixColumn table. | |
167 | ||
168 | ||
169 | /* InvMixColumn puts InvMixColumn(dr) into vt0. This is a non-standard | |
170 | subroutine. It does not conform to the ABI: its input is in dr, its result is in vt0, and it overwrites dr, t0, and vt1. It is an integral part of | |
171 | _ExpandKeyForDecryption (NOTE(review): no such label is visible here; the callers in this file are the _aes_decrypt_key routines) and shares register use with it. | |
172 | */ | |
173 | InvMixColumn: | |
174 | movzx drl, t0 // t0 = byte 0 of dr, zero-extended for use as an index. | |
175 | movd LookupI(0, t0), vt0 // Look up byte 0 in table 0. This starts the result in vt0. | |
176 | movzx drh, t0d | |
177 | movd LookupI(1, t0), vt1 // Look up byte 1 in table 1. | |
178 | pxor vt1, vt0 // Accumulate the contribution of byte 1. | |
179 | shr $16, dr // Bring bytes 2 and 3 down into drl and drh; this destroys dr. | |
180 | movzx drl, t0d | |
181 | movd LookupI(2, t0), vt1 // Look up byte 2 in table 2. | |
182 | pxor vt1, vt0 // Accumulate the contribution of byte 2. | |
183 | movzx drh, t0d | |
184 | movd LookupI(3, t0), vt1 // Look up byte 3 in table 3. | |
185 | pxor vt1, vt0 // vt0 now holds the XOR of all four contributions. | |
186 | ret | |
187 | ||
188 | ||
189 | // SubWordRotWord adds (XORs) SubWord(RotWord(dr)) to vt0. The caller preloads vt0 (the key-expansion code puts the round constant there). Overwrites dr, t0, and vt1. Byte k of dr is looked up in table (k+3) mod 4, which places its substituted value one byte position lower; that is what performs the RotWord. | |
190 | .macro SubWordRotWord | |
191 | movzx drl, t0 | |
192 | movd LookupS(3, t0), vt1 // Look up byte 0 in table 3. | |
193 | pxor vt1, vt0 | |
194 | movzx drh, t0d | |
195 | movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. | |
196 | pxor vt1, vt0 | |
197 | shr $$16, dr // Bring bytes 2 and 3 down into drl and drh; this destroys dr. ($$ presumably yields a literal $ inside a .macro body -- confirm against the assembler manual.) | |
198 | movzx drl, t0d | |
199 | movd LookupS(1, t0), vt1 // Look up byte 2 in table 1. | |
200 | pxor vt1, vt0 | |
201 | movzx drh, t0d | |
202 | movd LookupS(2, t0), vt1 // Look up byte 3 in table 2. | |
203 | pxor vt1, vt0 | |
204 | .endmacro | |
205 | ||
206 | ||
207 | // SubWord puts SubWord(dr) into vt0. Unlike SubWordRotWord, vt0 is overwritten rather than accumulated into, and byte k uses table k, so no rotation occurs. Overwrites dr, t0, and vt1. | |
208 | .macro SubWord | |
209 | movzx drl, t0 | |
210 | movd LookupS(0, t0), vt0 // Look up byte 0 in table 0. This starts the result in vt0. | |
211 | movzx drh, t0d | |
212 | movd LookupS(1, t0), vt1 // Look up byte 1 in table 1. | |
213 | pxor vt1,vt0 | |
214 | shr $$16, dr // Bring bytes 2 and 3 down into drl and drh; this destroys dr. | |
215 | movzx drl, t0d | |
216 | movd LookupS(2, t0), vt1 // Look up byte 2 in table 2. | |
217 | pxor vt1,vt0 | |
218 | movzx drh, t0d | |
219 | movd LookupS(3, t0), vt1 // Look up byte 3 in table 3. | |
220 | pxor vt1,vt0 | |
221 | .endmacro | |
222 | ||
223 | .text | |
224 | .globl _aes_decrypt_key | |
225 | // .private_extern _aes_decrypt_key | |
226 | _aes_decrypt_key: | |
227 | ||
228 | // Detect AES hardware (cclee 3-13-10): load the CPU capability bits into %eax and test kHasAES. | |
229 | #if defined __x86_64__ | |
230 | movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities | |
231 | mov (%rax), %eax // %eax = __cpu_capabilities | |
232 | #else | |
233 | #if defined KERNEL | |
234 | leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities | |
235 | mov (%eax), %eax // %eax = __cpu_capabilities | |
236 | #else | |
237 | mov _COMM_PAGE_CPU_CAPABILITIES, %eax // %eax = capability bits (non-KERNEL build) | |
238 | #endif | |
239 | ||
240 | #endif | |
241 | test $(kHasAES), %eax // __cpu_capabilities & kHasAES | |
242 | jne _aes_decrypt_key_hw // If AES HW detected, branch to _aes_decrypt_key_hw. Nothing has been pushed yet, so that routine sees the caller's stack unchanged. | |
243 | /* Save registers and set SaveSize to the number of bytes pushed onto the | |
244 | stack so far, including the caller's return address. | |
245 | */ | |
246 | push r3 | |
247 | #if defined __i386__ | |
248 | push r5 | |
249 | push r6 | |
250 | push r7 | |
251 | #define SaveSize (5*4) // Return address plus the four registers pushed above. | |
252 | #else | |
253 | #define SaveSize (2*8) // Return address plus the one register pushed above. | |
254 | #endif | |
255 | ||
256 | /* Number of bytes used for local variables: | |
257 | ||
258 | 8 16-byte spaces to save XMM registers. | |
259 | ||
260 | 8 four-byte spaces for work. | |
261 | */ | |
262 | #define LocalsSize (8*16 + 8*4) | |
263 | ||
264 | // Define stack offset to storage space for local data (it follows the XMM save area). | |
265 | #define Local (8*16) | |
266 | ||
267 | #if 0 < LocalsSize | |
268 | // Padding to position stack pointer at a multiple of 16 bytes. | |
269 | #define Padding (15 & -(SaveSize + LocalsSize)) | |
270 | sub $Padding + LocalsSize, r4 // Allocate space on stack. | |
271 | #else | |
272 | #define Padding 0 | |
273 | #endif | |
274 | ||
275 | /* StackFrame is the number of bytes in our stack frame, from caller's | |
276 | stack pointer to ours (so it includes the return address). | |
277 | */ | |
278 | #define StackFrame (SaveSize + Padding + LocalsSize) | |
279 | ||
280 | // Save xmm registers. They are restored by the epilogue at label 9. | |
281 | movaps %xmm0, 0*16(r4) | |
282 | movaps %xmm1, 1*16(r4) | |
283 | movaps %xmm2, 2*16(r4) | |
284 | movaps %xmm3, 3*16(r4) | |
285 | movaps %xmm4, 4*16(r4) | |
286 | movaps %xmm5, 5*16(r4) | |
287 | movaps %xmm6, 6*16(r4) | |
288 | movaps %xmm7, 7*16(r4) | |
289 | ||
290 | #if defined __i386__ | |
291 | ||
292 | // Define location of argument i. | |
293 | #define Argument(i) StackFrame+4*(i)(r4) | |
294 | ||
295 | #define Nk t0d // Holds the Length argument (bytes or bits), despite the name. | |
296 | ||
297 | // Load arguments. | |
298 | mov Argument(2), E // Context. | |
299 | mov Argument(1), Nk // Length. | |
300 | mov Argument(0), K // Key. | |
301 | ||
302 | #elif defined __x86_64__ | |
303 | ||
304 | #define Nk r9d // Holds the Length argument (bytes or bits); compared with 128, 192, and 256 below. | |
305 | mov r6d, Nk // Move Nk argument out of way. | |
306 | mov r2, E // Move E argument to common register. | |
307 | ||
308 | #endif | |
309 | ||
310 | // Dispatch on key length. Values of 128 or more are taken as bit counts; smaller values are taken as byte counts. | |
311 | cmp $128, Nk | |
312 | jge 2f // Already in bits; the flags from this comparison are reused at label 2. | |
313 | shl $3, Nk // Convert from bytes to bits. | |
314 | cmp $128, Nk | |
315 | 2: | |
316 | je DKeyHas4Words // 128-bit key. | |
317 | cmp $192, Nk | |
318 | je DKeyHas6Words // 192-bit key. | |
319 | cmp $256, Nk | |
320 | je DKeyHas8Words // 256-bit key. | |
321 | mov $-1, r0 // Return error. | |
322 | jmp 9f // Leave through the epilogue at the next label 9, which restores the saved registers. | |
323 | ||
324 | ||
325 | .globl _aes_decrypt_key128 | |
326 | // .private_extern _aes_decrypt_key128 | |
327 | _aes_decrypt_key128: | |
328 | ||
329 | /* Save registers and set SaveSize to the number of bytes pushed onto the | |
330 | stack so far, including the caller's return address. | |
331 | */ | |
332 | push r3 | |
333 | #if defined __i386__ | |
334 | push r5 | |
335 | push r6 | |
336 | push r7 | |
337 | #define SaveSize (5*4) | |
338 | #else | |
339 | #define SaveSize (2*8) | |
340 | #endif | |
341 | ||
342 | /* Number of bytes used for local variables: | |
343 | ||
344 | 8 16-byte spaces to save XMM registers. | |
345 | ||
346 | 8 four-byte spaces for work. | |
347 | */ | |
348 | #define LocalsSize (8*16 + 8*4) | |
349 | ||
350 | // Define stack offset to storage space for local data. | |
351 | #define Local (8*16) | |
352 | ||
353 | #if 0 < LocalsSize | |
354 | // Padding to position stack pointer at a multiple of 16 bytes. | |
355 | #define Padding (15 & -(SaveSize + LocalsSize)) | |
356 | sub $Padding + LocalsSize, r4 // Allocate space on stack. | |
357 | #else | |
358 | #define Padding 0 | |
359 | #endif | |
360 | ||
361 | /* StackFrame is the number of bytes in our stack frame, from caller's | |
362 | stack pointer to ours (so it includes the return address). | |
363 | */ | |
364 | #define StackFrame (SaveSize + Padding + LocalsSize) | |
365 | ||
366 | // Save xmm registers. | |
367 | movaps %xmm0, 0*16(r4) | |
368 | movaps %xmm1, 1*16(r4) | |
369 | movaps %xmm2, 2*16(r4) | |
370 | movaps %xmm3, 3*16(r4) | |
371 | movaps %xmm4, 4*16(r4) | |
372 | movaps %xmm5, 5*16(r4) | |
373 | movaps %xmm6, 6*16(r4) | |
374 | movaps %xmm7, 7*16(r4) | |
375 | ||
376 | #if defined __i386__ | |
377 | ||
378 | // Load arguments. This entry point has no Length argument, so Context is argument 1. | |
379 | #define Argument(i) StackFrame+4*(i)(r4) | |
380 | mov Argument(1), E | |
381 | mov Argument(0), K | |
382 | ||
383 | #endif | |
384 | ||
385 | // Merge point for _aes_decrypt_key and _aes_decrypt_key128. | |
386 | DKeyHas4Words: | |
387 | ||
388 | // First words of expanded key are copied from user key. | |
389 | movd 0*4(K), ve0 | |
390 | movd 1*4(K), ve1 | |
391 | movd 2*4(K), ve2 | |
392 | movd 3*4(K), ve3 | |
393 | ||
394 | movl $10*16, ContextKeyLength(E) // Set "key length." | |
395 | ||
396 | #if 0 != ContextKey | |
397 | add $ContextKey, E | |
398 | #endif | |
399 | ||
400 | // K cannot be used after we write to R, since they use the same register. | |
401 | ||
402 | #if defined __i386__ | |
403 | ||
404 | lea _AESRcon, R | |
405 | lea _AESInvMixColumnTable, ITable | |
406 | lea _AESSubBytesWordTable, STable | |
407 | ||
408 | #elif defined __x86_64__ | |
409 | ||
410 | lea _AESRcon(%rip), R | |
411 | lea _AESInvMixColumnTable(%rip), ITable | |
412 | lea _AESSubBytesWordTable(%rip), STable | |
413 | ||
414 | #endif | |
415 | ||
416 | /* With a four-word key, there are ten rounds (eleven 16-byte key blocks), | |
417 | nine of which have InvMixColumn applied. | |
418 | */ | |
419 | mov $-9*4*4, offset // offset counts up from minus nine blocks to zero. | |
420 | sub offset, E // Bias E so that E+offset still addresses the start of the expanded key. | |
421 | ||
422 | // Store initial words of expanded key, which are copies of user's key. | |
423 | movd ve0, 0*4(E, offset) | |
424 | movd ve1, 1*4(E, offset) | |
425 | movd ve2, 2*4(E, offset) | |
426 | movd ve3, 3*4(E, offset) | |
427 | ||
428 | /* Here is the first iteration of the key expansion. It is separate from the | |
429 | main loop below because we need to apply InvMixColumn to each of the | |
430 | outputs, in ve0 through ve3. In the main loop, the technique described at | |
431 | the top of this file is used to compute the proper outputs while using | |
432 | InvMixColumn only once. | |
433 | */ | |
434 | add $1, R // Advance pointer. | |
435 | movd ve3, dr // Put previous word into work register. | |
436 | movzx (R), t0d // Get round constant. | |
437 | movd t0d, vt0 // Seed vt0 with the round constant; SubWordRotWord XORs into it. | |
438 | ||
439 | SubWordRotWord | |
440 | pxor vt0, ve0 | |
441 | ||
442 | // Chain to successive words. | |
443 | pxor ve0, ve1 | |
444 | pxor ve1, ve2 | |
445 | pxor ve2, ve3 | |
446 | ||
447 | add $4*4, offset // Advance to the next 16-byte key block. | |
448 | ||
449 | /* Apply InvMixColumn to each word. The transformed values are stored in | |
450 | the expanded key. The original values are retained in registers for | |
451 | further computation. | |
452 | */ | |
453 | movd ve0, dr | |
454 | call InvMixColumn | |
455 | movd vt0, 0*4(E, offset) | |
456 | ||
457 | movd ve1, dr | |
458 | call InvMixColumn | |
459 | movd vt0, 1*4(E, offset) | |
460 | ||
461 | movd ve2, dr | |
462 | call InvMixColumn | |
463 | movd vt0, 2*4(E, offset) | |
464 | ||
465 | movd ve3, dr | |
466 | call InvMixColumn | |
467 | movd vt0, 3*4(E, offset) | |
468 | ||
469 | // Here is the main loop. | |
470 | 1: | |
471 | add $1, R // Advance pointer. | |
472 | movd ve3, dr // Put previous word into work register. | |
473 | movzx (R), t0d // Get round constant. | |
474 | movd t0d, vt0 | |
475 | ||
476 | SubWordRotWord | |
477 | pxor vt0, ve0 | |
478 | ||
479 | // Chain to successive words. | |
480 | pxor ve0, ve1 | |
481 | pxor ve1, ve2 | |
482 | pxor ve2, ve3 | |
483 | /* Dr. Brian Gladman uses a technique with a single XOR here instead | |
484 | of the previous four. There is some periodic behavior in the key | |
485 | expansion, and Gladman maintains E[4*i+3] for the latest four | |
486 | values of i. XORing the value in vt0 with one of these yields its | |
487 | replacement. However, using this technique requires additional | |
488 | instructions before the loop (to initialize the values) and after | |
489 | it (to extract the final values to be stored) and either some way | |
490 | to rotate or index four values in the loop or a four-fold unrolling | |
491 | of the loop to provide the indexing. Experiment suggests the | |
492 | former is not worthwhile. Unrolling the loop might give a small | |
493 | gain, at the cost of increased use of instruction cache, increased | |
494 | instruction loads the first time the routine is executed, and | |
495 | increased code complexity, so I decided against it. | |
496 | */ | |
497 | ||
498 | // Apply InvMixColumn to the difference. | |
499 | movd vt0, dr | |
500 | call InvMixColumn | |
501 | ||
502 | add $4*4, offset // Advance to the next key block; this sets the flags tested by the jl at the bottom of the loop. | |
503 | ||
504 | // Chain the transformed difference to previously transformed outputs. | |
505 | movd (0-4)*4(E, offset), vt1 | |
506 | pxor vt1, vt0 | |
507 | movd vt0, 0*4(E, offset) | |
508 | ||
509 | movd (1-4)*4(E, offset), vt1 | |
510 | pxor vt1, vt0 | |
511 | movd vt0, 1*4(E, offset) | |
512 | ||
513 | movd (2-4)*4(E, offset), vt1 | |
514 | pxor vt1, vt0 | |
515 | movd vt0, 2*4(E, offset) | |
516 | ||
517 | movd (3-4)*4(E, offset), vt1 | |
518 | pxor vt1, vt0 | |
519 | movd vt0, 3*4(E, offset) | |
520 | ||
521 | jl 1b // Repeat while offset is negative. The flags come from the add to offset above; the movd and pxor instructions in between do not change them. | |
522 | ||
523 | // Here is the final iteration, which does not perform InvMixColumn. | |
524 | ||
525 | movd ve3, dr // Put previous word into work register. | |
526 | movzx 1(R), t0d // Get the next round constant; R itself is not advanced here. | |
527 | movd t0d, vt0 | |
528 | ||
529 | SubWordRotWord | |
530 | pxor vt0, ve0 | |
531 | ||
532 | // Chain to successive words. The stores use indices 4 through 7 because offset was not advanced for this block. | |
533 | movd ve0, 4*4(E, offset) | |
534 | pxor ve0, ve1 | |
535 | movd ve1, 5*4(E, offset) | |
536 | pxor ve1, ve2 | |
537 | movd ve2, 6*4(E, offset) | |
538 | pxor ve2, ve3 | |
539 | movd ve3, 7*4(E, offset) | |
540 | ||
541 | xor r0, r0 // Return success. | |
542 | ||
543 | 9: | |
544 | // Pop stack and restore registers. _aes_decrypt_key also jumps here to return its error code. | |
545 | movaps 7*16(r4), %xmm7 | |
546 | movaps 6*16(r4), %xmm6 | |
547 | movaps 5*16(r4), %xmm5 | |
548 | movaps 4*16(r4), %xmm4 | |
549 | movaps 3*16(r4), %xmm3 | |
550 | movaps 2*16(r4), %xmm2 | |
551 | movaps 1*16(r4), %xmm1 | |
552 | movaps 0*16(r4), %xmm0 | |
553 | #if 0 < LocalsSize | |
554 | add $Padding + LocalsSize, r4 | |
555 | #endif | |
556 | #if defined __i386__ | |
557 | pop r7 | |
558 | pop r6 | |
559 | pop r5 | |
560 | #endif | |
561 | pop r3 | |
562 | ||
563 | ret | |
564 | ||
565 | ||
566 | .globl _aes_decrypt_key192 | |
567 | // .private_extern _aes_decrypt_key192 | |
568 | _aes_decrypt_key192: | |
569 | ||
570 | /* Save registers and set SaveSize to the number of bytes pushed onto the | |
571 | stack so far, including the caller's return address. | |
572 | */ | |
573 | push r3 | |
574 | #if defined __i386__ | |
575 | push r5 | |
576 | push r6 | |
577 | push r7 | |
578 | #define SaveSize (5*4) | |
579 | #else | |
580 | #define SaveSize (2*8) | |
581 | #endif | |
582 | ||
583 | /* Number of bytes used for local variables: | |
584 | ||
585 | 8 16-byte spaces to save XMM registers. | |
586 | ||
587 | 8 four-byte spaces for work. | |
588 | */ | |
589 | #define LocalsSize (8*16 + 8*4) | |
590 | ||
591 | // Define stack offset to storage space for local data. | |
592 | #define Local (8*16) | |
593 | ||
594 | #if 0 < LocalsSize | |
595 | // Padding to position stack pointer at a multiple of 16 bytes. | |
596 | #define Padding (15 & -(SaveSize + LocalsSize)) | |
597 | sub $Padding + LocalsSize, r4 // Allocate space on stack. | |
598 | #else | |
599 | #define Padding 0 | |
600 | #endif | |
601 | ||
602 | /* StackFrame is the number of bytes in our stack frame, from caller's | |
603 | stack pointer to ours (so it includes the return address). | |
604 | */ | |
605 | #define StackFrame (SaveSize + Padding + LocalsSize) | |
606 | ||
607 | // Save xmm registers. | |
608 | movaps %xmm0, 0*16(r4) | |
609 | movaps %xmm1, 1*16(r4) | |
610 | movaps %xmm2, 2*16(r4) | |
611 | movaps %xmm3, 3*16(r4) | |
612 | movaps %xmm4, 4*16(r4) | |
613 | movaps %xmm5, 5*16(r4) | |
614 | movaps %xmm6, 6*16(r4) | |
615 | movaps %xmm7, 7*16(r4) | |
616 | ||
617 | #if defined __i386__ | |
618 | ||
619 | // Load arguments. This entry point has no Length argument, so Context is argument 1. | |
620 | #define Argument(i) StackFrame+4*(i)(r4) | |
621 | mov Argument(1), E | |
622 | mov Argument(0), K | |
623 | ||
624 | #endif | |
625 | ||
626 | // Merge point for _aes_decrypt_key and _aes_decrypt_key192. | |
627 | DKeyHas6Words: | |
628 | ||
629 | // First words of expanded key are copied from user key. | |
630 | movd 0*4(K), ve0 | |
631 | movd 1*4(K), ve1 | |
632 | movd 2*4(K), ve2 | |
633 | movd 3*4(K), ve3 | |
634 | ||
635 | movl $12*16, ContextKeyLength(E) // Set "key length." | |
636 | ||
637 | #if 0 != ContextKey | |
638 | add $ContextKey, E | |
639 | #endif | |
640 | ||
641 | movd 4*4(K), ve4 | |
642 | movd 5*4(K), ve5 | |
643 | ||
644 | // K cannot be used after we write to R, since they use the same register. | |
645 | ||
646 | #if defined __i386__ | |
647 | ||
648 | lea _AESRcon, R | |
649 | lea _AESInvMixColumnTable, ITable | |
650 | lea _AESSubBytesWordTable, STable | |
651 | ||
652 | #elif defined __x86_64__ | |
653 | ||
654 | lea _AESRcon(%rip), R | |
655 | lea _AESInvMixColumnTable(%rip), ITable | |
656 | lea _AESSubBytesWordTable(%rip), STable | |
657 | ||
658 | #endif | |
659 | ||
660 | /* With a six-word key, there are twelve rounds (thirteen 16-byte key | |
661 | blocks), eleven of which have InvMixColumn applied. The key expansion | |
662 | proceeds in iterations of six four-byte words, so the termination | |
663 | condition is a bit complicated. We set offset to the negative of 10 | |
664 | four four-byte words, and the loop branch does another iteration if | |
665 | offset is less than or equal to zero, meaning the number of iterations | |
666 | performed so far is less than or equal to 10. Thus, after ten | |
667 | iterations, it branches again. After the eleventh iteration, it | |
668 | stops. Code after the end of the loop computes the twelfth key block, | |
669 | which does not have InvMixColumn applied. | |
670 | */ | |
671 | mov $-10*4*4, offset | |
672 | sub offset, E // Bias E so that E+offset still addresses the start of the expanded key. | |
673 | ||
674 | // Store initial words of expanded key, which are copies of user's key. | |
675 | movd ve0, 0*4(E, offset) | |
676 | movd ve1, 1*4(E, offset) | |
677 | movd ve2, 2*4(E, offset) | |
678 | movd ve3, 3*4(E, offset) | |
679 | ||
680 | /* The first four words are stored untransformed. After that, words in | |
681 | the expanded key are transformed by InvMixColumn. | |
682 | */ | |
683 | movd ve4, dr | |
684 | call InvMixColumn | |
685 | movd vt0, 4*4(E, offset) | |
686 | ||
687 | movd ve5, dr | |
688 | call InvMixColumn | |
689 | movd vt0, 5*4(E, offset) | |
690 | ||
691 | /* Here is the first iteration of the key expansion. It is separate from the | |
692 | main loop below because we need to apply InvMixColumn to each of the | |
693 | outputs, in ve0 through ve5. In the main loop, the technique described at | |
694 | the top of this file is used to compute the proper outputs while using | |
695 | InvMixColumn only once. | |
696 | */ | |
697 | add $1, R // Advance pointer. | |
698 | movd ve5, dr // Put previous word into work register. | |
699 | movzx (R), t0d // Get round constant. | |
700 | movd t0d, vt0 // Seed vt0 with the round constant; SubWordRotWord XORs into it. | |
701 | ||
702 | SubWordRotWord | |
703 | pxor vt0, ve0 | |
704 | ||
705 | // Chain to successive words. | |
706 | pxor ve0, ve1 | |
707 | pxor ve1, ve2 | |
708 | pxor ve2, ve3 | |
709 | pxor ve3, ve4 | |
710 | pxor ve4, ve5 | |
711 | ||
712 | add $6*4, offset // Advance by the six words produced per iteration. | |
713 | ||
714 | /* Apply InvMixColumn to each word. The transformed values are stored in | |
715 | the expanded key. The original values are retained in registers for | |
716 | further computation. | |
717 | */ | |
718 | movd ve0, dr | |
719 | call InvMixColumn | |
720 | movd vt0, 0*4(E, offset) | |
721 | ||
722 | movd ve1, dr | |
723 | call InvMixColumn | |
724 | movd vt0, 1*4(E, offset) | |
725 | ||
726 | movd ve2, dr | |
727 | call InvMixColumn | |
728 | movd vt0, 2*4(E, offset) | |
729 | ||
730 | movd ve3, dr | |
731 | call InvMixColumn | |
732 | movd vt0, 3*4(E, offset) | |
733 | ||
734 | movd (4-6)*4(E, offset), vt1 // Words 4 and 5 need no further InvMixColumn calls: vt0 holds transformed word 3, which is chained with the transformed words stored six positions back. | |
735 | pxor vt1, vt0 | |
736 | movd vt0, 4*4(E, offset) | |
737 | ||
738 | movd (5-6)*4(E, offset), vt1 | |
739 | pxor vt1, vt0 | |
740 | movd vt0, 5*4(E, offset) | |
741 | ||
742 | // Here is the main loop. | |
743 | 1: | |
744 | add $1, R // Advance pointer. | |
745 | movd ve5, dr // Put previous word into work register. | |
746 | movzx (R), t0d // Get round constant. | |
747 | movd t0d, vt0 | |
748 | ||
749 | SubWordRotWord | |
750 | pxor vt0, ve0 | |
751 | ||
752 | // Chain to successive words. | |
753 | pxor ve0, ve1 | |
754 | pxor ve1, ve2 | |
755 | pxor ve2, ve3 | |
756 | pxor ve3, ve4 | |
757 | pxor ve4, ve5 | |
758 | ||
759 | // Apply InvMixColumn to the difference. | |
760 | movd vt0, dr | |
761 | call InvMixColumn | |
762 | ||
763 | add $6*4, offset // Advance by six words; this sets the flags tested by the jle at the bottom of the loop. | |
764 | ||
765 | // Chain the transformed difference to previously transformed outputs. | |
766 | movd (0-6)*4(E, offset), vt1 | |
767 | pxor vt1, vt0 | |
768 | movd vt0, 0*4(E, offset) | |
769 | ||
770 | movd (1-6)*4(E, offset), vt1 | |
771 | pxor vt1, vt0 | |
772 | movd vt0, 1*4(E, offset) | |
773 | ||
774 | movd (2-6)*4(E, offset), vt1 | |
775 | pxor vt1, vt0 | |
776 | movd vt0, 2*4(E, offset) | |
777 | ||
778 | movd (3-6)*4(E, offset), vt1 | |
779 | pxor vt1, vt0 | |
780 | movd vt0, 3*4(E, offset) | |
781 | ||
782 | movd (4-6)*4(E, offset), vt1 | |
783 | pxor vt1, vt0 | |
784 | movd vt0, 4*4(E, offset) | |
785 | ||
786 | movd (5-6)*4(E, offset), vt1 | |
787 | pxor vt1, vt0 | |
788 | movd vt0, 5*4(E, offset) | |
789 | ||
790 | jle 1b // Repeat while offset is not positive. The flags come from the add to offset above; the movd and pxor instructions in between do not change them. | |
791 | ||
792 | // Here is the final iteration, which does not perform InvMixColumn. | |
793 | ||
794 | movd ve5, dr // Put previous word into work register. | |
795 | movzx 1(R), t0d // Get the next round constant; R itself is not advanced here. | |
796 | movd t0d, vt0 | |
797 | ||
798 | SubWordRotWord | |
799 | pxor vt0, ve0 | |
800 | ||
801 | // Chain to successive words. Only four words are stored, at indices 6 through 9, because offset was not advanced. | |
802 | movd ve0, 6*4(E, offset) | |
803 | pxor ve0, ve1 | |
804 | movd ve1, 7*4(E, offset) | |
805 | pxor ve1, ve2 | |
806 | movd ve2, 8*4(E, offset) | |
807 | pxor ve2, ve3 | |
808 | movd ve3, 9*4(E, offset) | |
809 | ||
810 | xor r0, r0 // Return success. | |
811 | ||
812 | // Pop stack and restore registers. | |
813 | movaps 7*16(r4), %xmm7 | |
814 | movaps 6*16(r4), %xmm6 | |
815 | movaps 5*16(r4), %xmm5 | |
816 | movaps 4*16(r4), %xmm4 | |
817 | movaps 3*16(r4), %xmm3 | |
818 | movaps 2*16(r4), %xmm2 | |
819 | movaps 1*16(r4), %xmm1 | |
820 | movaps 0*16(r4), %xmm0 | |
821 | #if 0 < LocalsSize | |
822 | add $Padding + LocalsSize, r4 | |
823 | #endif | |
824 | #if defined __i386__ | |
825 | pop r7 | |
826 | pop r6 | |
827 | pop r5 | |
828 | #endif | |
829 | pop r3 | |
830 | ||
831 | ret | |
832 | ||
833 | ||
834 | .globl _aes_decrypt_key256 | |
835 | // .private_extern _aes_decrypt_key256 | |
836 | _aes_decrypt_key256: | |
837 | ||
838 | /* Save registers and set SaveSize to the number of bytes pushed onto the | |
839 | stack so far, including the caller's return address. | |
840 | */ | |
841 | push r3 | |
842 | #if defined __i386__ | |
843 | push r5 | |
844 | push r6 | |
845 | push r7 | |
846 | #define SaveSize (5*4) | |
847 | #else | |
848 | #define SaveSize (2*8) | |
849 | #endif | |
850 | ||
851 | /* Number of bytes used for local variables: | |
852 | ||
853 | 8 16-byte spaces to save XMM registers. | |
854 | ||
855 | 8 four-byte spaces for work. | |
856 | */ | |
857 | #define LocalsSize (8*16 + 8*4) | |
858 | ||
859 | // Define stack offset to storage space for local data. | |
860 | #define Local (8*16) | |
861 | ||
862 | #if 0 < LocalsSize | |
863 | // Padding to position stack pointer at a multiple of 16 bytes. | |
864 | #define Padding (15 & -(SaveSize + LocalsSize)) | |
865 | sub $Padding + LocalsSize, r4 // Allocate space on stack. | |
866 | #else | |
867 | #define Padding 0 | |
868 | #endif | |
869 | ||
870 | /* StackFrame is the number of bytes in our stack frame, from caller's | |
871 | stack pointer to ours (so it includes the return address). | |
872 | */ | |
873 | #define StackFrame (SaveSize + Padding + LocalsSize) | |
874 | ||
875 | // Save xmm registers. | |
876 | movaps %xmm0, 0*16(r4) | |
877 | movaps %xmm1, 1*16(r4) | |
878 | movaps %xmm2, 2*16(r4) | |
879 | movaps %xmm3, 3*16(r4) | |
880 | movaps %xmm4, 4*16(r4) | |
881 | movaps %xmm5, 5*16(r4) | |
882 | movaps %xmm6, 6*16(r4) | |
883 | movaps %xmm7, 7*16(r4) | |
884 | ||
885 | #if defined __i386__ | |
886 | ||
887 | // Load arguments. | |
888 | #define Argument(i) StackFrame+4*(i)(r4) | |
889 | mov Argument(1), E | |
890 | mov Argument(0), K | |
891 | ||
892 | #endif | |
893 | ||
894 | // Merge point for _aes_decrypt_key and _aes_decrypt_key256: expands an eight-word user key for decryption. | |
895 | DKeyHas8Words: | |
896 | ||
897 | // Load words 0 through 3 of the user key; they are stored below as the first words of the expanded key, unchanged. | |
898 | movd 0*4(K), ve0 | |
899 | movd 1*4(K), ve1 | |
900 | movd 2*4(K), ve2 | |
901 | movd 3*4(K), ve3 | |
902 | ||
903 | movl $14*16, ContextKeyLength(E) // Set "key length." | |
904 | ||
905 | #if 0 != ContextKey | |
906 | add $ContextKey, E | |
907 | #endif | |
908 | ||
909 | // Store words 0 through 3 of expanded key, which are copies of user's key, then load user key words 4 through 7 into ve0 through ve3. | |
910 | movd ve0, 0*4(E) | |
911 | movd ve1, 1*4(E) | |
912 | movd ve2, 2*4(E) | |
913 | movd ve3, 3*4(E) | |
914 | movd 4*4(K), ve0 | |
915 | movd 5*4(K), ve1 | |
916 | movd 6*4(K), ve2 | |
917 | movd 7*4(K), ve3 | |
918 | ||
919 | // K cannot be used after we write to R, since they use the same register. All eight words of the user key have been read by this point. | |
920 | ||
921 | #if defined __i386__ | |
922 | ||
923 | lea _AESRcon, R | |
924 | lea _AESInvMixColumnTable, ITable | |
925 | lea _AESSubBytesWordTable, STable | |
926 | ||
927 | #elif defined __x86_64__ | |
928 | ||
929 | lea _AESRcon(%rip), R | |
930 | lea _AESInvMixColumnTable(%rip), ITable | |
931 | lea _AESSubBytesWordTable(%rip), STable | |
932 | ||
933 | #endif | |
934 | ||
935 | /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key | |
936 | blocks), thirteen of which have InvMixColumn applied. The code below sets offset to -12*4*4 and advances E by the same amount, so (E, offset) initially addresses word 0 of the expanded key; offset is then stepped up by 8*4 per group of eight words, and the main loop ends when it reaches zero. | |
937 | */ | |
938 | mov $-12*4*4, offset | |
939 | sub offset, E | |
940 | ||
941 | // Save untransformed values of words 4 through 7 in stack area (slots 4 through 7 of Local). | |
942 | movd ve0, 4*4+Local(r4) | |
943 | movd ve1, 5*4+Local(r4) | |
944 | movd ve2, 6*4+Local(r4) | |
945 | movd ve3, 7*4+Local(r4) | |
946 | ||
947 | /* Apply InvMixColumn to words 4 through 7. The transformed values are | |
948 | stored in the expanded key. The original values are saved in the stack | |
949 | area for further computation. | |
950 | */ | |
951 | movd ve0, dr | |
952 | call InvMixColumn | |
953 | movd vt0, 4*4(E, offset) | |
954 | ||
955 | movd ve1, dr | |
956 | call InvMixColumn | |
957 | movd vt0, 5*4(E, offset) | |
958 | ||
959 | movd ve2, dr | |
960 | call InvMixColumn | |
961 | movd vt0, 6*4(E, offset) | |
962 | ||
963 | movd ve3, dr | |
964 | call InvMixColumn | |
965 | movd vt0, 7*4(E, offset) | |
966 | ||
967 | /* Here is the first iteration of the key expansion. It is separate from the | |
968 | main loop below because we need to apply InvMixColumn to each of the | |
969 | outputs, in ve0 through ve3. In the main loop, the technique described at | |
970 | the top of this file is used to compute the proper outputs while using | |
971 | InvMixColumn only once. | |
972 | */ | |
973 | add $1, R // Advance pointer to next round constant. | |
974 | movd ve3, dr // Put previous word into work register. | |
975 | movzx (R), t0d // Get round constant. | |
976 | movd t0d, vt0 | |
977 | ||
978 | SubWordRotWord | |
979 | ||
980 | add $8*4, offset | |
981 | ||
982 | movd (0-8)*4(E, offset), ve0 // Get old word (eight words back in expanded key). | |
983 | pxor vt0, ve0 | |
984 | movd ve0, 0*4+Local(r4) // Save untransformed value on stack. | |
985 | movd ve0, dr | |
986 | call InvMixColumn | |
987 | movd vt0, 0*4(E, offset) // Write transformed value to expanded key. | |
988 | ||
989 | /* Chain to successive words and apply InvMixColumn to each word. The | |
990 | transformed values are stored in the expanded key. The original | |
991 | values are retained in local data for further computation. | |
992 | */ | |
993 | movd (1-8)*4(E, offset), ve1 // Get old word. | |
994 | pxor ve0, ve1 // Chain. | |
995 | movd ve1, 1*4+Local(r4) // Save on stack. | |
996 | movd ve1, dr | |
997 | call InvMixColumn | |
998 | movd vt0, 1*4(E, offset) // Write to expanded key. | |
999 | ||
1000 | movd (2-8)*4(E, offset), ve2 // Get old word. | |
1001 | pxor ve1, ve2 // Chain. | |
1002 | movd ve2, 2*4+Local(r4) // Save on stack. | |
1003 | movd ve2, dr | |
1004 | call InvMixColumn | |
1005 | movd vt0, 2*4(E, offset) // Write to expanded key. | |
1006 | ||
1007 | movd (3-8)*4(E, offset), ve3 // Get old word. | |
1008 | pxor ve2, ve3 // Chain. | |
1009 | movd ve3, 3*4+Local(r4) // Save on stack. | |
1010 | movd ve3, dr | |
1011 | call InvMixColumn | |
1012 | movd vt0, 3*4(E, offset) // Write to expanded key. | |
1013 | ||
1014 | movd ve3, dr // Put previous word into work register. | |
1015 | SubWord | |
1016 | ||
1017 | movd 4*4+Local(r4), ve0 // Get old untransformed word from stack. | |
1018 | pxor vt0, ve0 // Chain. | |
1019 | movd ve0, 4*4+Local(r4) // Save on stack. | |
1020 | ||
1021 | movd 5*4+Local(r4), ve1 // Get old word. | |
1022 | pxor ve0, ve1 // Chain. | |
1023 | movd ve1, 5*4+Local(r4) // Save on stack. | |
1024 | ||
1025 | movd 6*4+Local(r4), ve2 // Get old word. | |
1026 | pxor ve1, ve2 // Chain. | |
1027 | movd ve2, 6*4+Local(r4) // Save on stack. | |
1028 | ||
1029 | movd 7*4+Local(r4), ve3 // Get old word. | |
1030 | pxor ve2, ve3 // Chain. | |
1031 | movd ve3, 7*4+Local(r4) // Save on stack. | |
1032 | ||
1033 | movd vt0, dr // Move change (the value XORed into ve0 above) to work register. | |
1034 | call InvMixColumn | |
1035 | ||
1036 | movd (4-8)*4(E, offset), vt1 // Get old word (transformed value from expanded key). | |
1037 | pxor vt1, vt0 // Chain. | |
1038 | movd vt0, 4*4(E, offset) // Write new word to expanded key. | |
1039 | ||
1040 | movd (5-8)*4(E, offset), vt1 // Get old word. | |
1041 | pxor vt1, vt0 // Chain. | |
1042 | movd vt0, 5*4(E, offset) // Write new word to expanded key. | |
1043 | ||
1044 | movd (6-8)*4(E, offset), vt1 // Get old word. | |
1045 | pxor vt1, vt0 // Chain. | |
1046 | movd vt0, 6*4(E, offset) // Write new word to expanded key. | |
1047 | ||
1048 | movd (7-8)*4(E, offset), vt1 // Get old word. | |
1049 | pxor vt1, vt0 // Chain. | |
1050 | movd vt0, 7*4(E, offset) // Write new word to expanded key. | |
1051 | ||
1052 | // Here is the main loop. Each pass writes eight words of expanded key, calling InvMixColumn once per group of four words, and repeats while offset is negative. | |
1053 | 1: | |
1054 | add $1, R // Advance pointer to next round constant. | |
1055 | movd ve3, dr // Put previous word into work register. | |
1056 | movzx (R), t0d // Get round constant. | |
1057 | movd t0d, vt0 | |
1058 | ||
1059 | SubWordRotWord | |
1060 | ||
1061 | movd 0*4+Local(r4), ve0 // Get old untransformed word from stack. | |
1062 | pxor vt0, ve0 | |
1063 | movd ve0, 0*4+Local(r4) // Save on stack. | |
1064 | ||
1065 | // Chain to successive untransformed words, which are kept on the stack. | |
1066 | movd 1*4+Local(r4), ve1 // Get old word. | |
1067 | pxor ve0, ve1 // Chain. | |
1068 | movd ve1, 1*4+Local(r4) // Save on stack. | |
1069 | ||
1070 | movd 2*4+Local(r4), ve2 // Get old word. | |
1071 | pxor ve1, ve2 // Chain. | |
1072 | movd ve2, 2*4+Local(r4) // Save on stack. | |
1073 | ||
1074 | movd 3*4+Local(r4), ve3 // Get old word. | |
1075 | pxor ve2, ve3 // Chain. | |
1076 | movd ve3, 3*4+Local(r4) // Save on stack. | |
1077 | ||
1078 | movd vt0, dr // Move change (the value XORed into ve0 above) to work register. | |
1079 | call InvMixColumn | |
1080 | ||
1081 | movd 0*4(E, offset), vt1 // Get old word (transformed value from expanded key). | |
1082 | pxor vt1, vt0 // Chain. | |
1083 | movd vt0, (0+8)*4(E, offset) // Write new word to expanded key. | |
1084 | ||
1085 | movd 1*4(E, offset), vt1 // Get old word. | |
1086 | pxor vt1, vt0 // Chain. | |
1087 | movd vt0, (1+8)*4(E, offset) // Write new word to expanded key. | |
1088 | ||
1089 | movd 2*4(E, offset), vt1 // Get old word. | |
1090 | pxor vt1, vt0 // Chain. | |
1091 | movd vt0, (2+8)*4(E, offset) // Write new word to expanded key. | |
1092 | ||
1093 | movd 3*4(E, offset), vt1 // Get old word. | |
1094 | pxor vt1, vt0 // Chain. | |
1095 | movd vt0, (3+8)*4(E, offset) // Write new word to expanded key. | |
1096 | ||
1097 | movd ve3, dr // Put previous word into work register. | |
1098 | SubWord | |
1099 | ||
1100 | movd 4*4+Local(r4), ve0 // Get old untransformed word from stack. | |
1101 | pxor vt0, ve0 // Chain. | |
1102 | movd ve0, 4*4+Local(r4) // Save on stack. | |
1103 | ||
1104 | movd 5*4+Local(r4), ve1 // Get old word. | |
1105 | pxor ve0, ve1 // Chain. | |
1106 | movd ve1, 5*4+Local(r4) // Save on stack. | |
1107 | ||
1108 | movd 6*4+Local(r4), ve2 // Get old word. | |
1109 | pxor ve1, ve2 // Chain. | |
1110 | movd ve2, 6*4+Local(r4) // Save on stack. | |
1111 | ||
1112 | movd 7*4+Local(r4), ve3 // Get old word. | |
1113 | pxor ve2, ve3 // Chain. | |
1114 | movd ve3, 7*4+Local(r4) // Save on stack. | |
1115 | ||
1116 | movd vt0, dr // Move change (the value XORed into ve0 above) to work register. | |
1117 | call InvMixColumn | |
1118 | ||
1119 | movd 4*4(E, offset), vt1 // Get old word (transformed value from expanded key). | |
1120 | pxor vt1, vt0 // Chain. | |
1121 | movd vt0, (4+8)*4(E, offset) // Write new word to expanded key. | |
1122 | ||
1123 | movd 5*4(E, offset), vt1 // Get old word. | |
1124 | pxor vt1, vt0 // Chain. | |
1125 | movd vt0, (5+8)*4(E, offset) // Write new word to expanded key. | |
1126 | ||
1127 | movd 6*4(E, offset), vt1 // Get old word. | |
1128 | pxor vt1, vt0 // Chain. | |
1129 | movd vt0, (6+8)*4(E, offset) // Write new word to expanded key. | |
1130 | ||
1131 | movd 7*4(E, offset), vt1 // Get old word. | |
1132 | pxor vt1, vt0 // Chain. | |
1133 | movd vt0, (7+8)*4(E, offset) // Write new word to expanded key. | |
1134 | ||
1135 | add $8*4, offset | |
1136 | ||
1137 | jl 1b | |
1138 | ||
1139 | movd ve3, dr // Put previous word into work register. | |
1140 | movzx 1(R), t0d // Get next round constant; R itself is not advanced here. | |
1141 | movd t0d, vt0 | |
1142 | ||
1143 | SubWordRotWord | |
1144 | ||
1145 | movd 0*4+Local(r4), ve0 // Get old untransformed word from stack. | |
1146 | pxor vt0, ve0 // Chain. | |
1147 | movd ve0, (0+8)*4(E, offset) | |
1148 | ||
1149 | // Chain to successive words. These final four words are written to the expanded key without InvMixColumn. | |
1150 | movd 1*4+Local(r4), ve1 // Get old word. | |
1151 | pxor ve0, ve1 // Chain. | |
1152 | movd ve1, (1+8)*4(E, offset) | |
1153 | ||
1154 | movd 2*4+Local(r4), ve2 // Get old word. | |
1155 | pxor ve1, ve2 // Chain. | |
1156 | movd ve2, (2+8)*4(E, offset) | |
1157 | ||
1158 | movd 3*4+Local(r4), ve3 // Get old word. | |
1159 | pxor ve2, ve3 // Chain. | |
1160 | movd ve3, (3+8)*4(E, offset) | |
1161 | ||
1162 | xor r0, r0 // Return success (zero in r0). | |
1163 | ||
1164 | // Reload %xmm7 through %xmm0 from the stack area, then pop stack and restore registers. | |
1165 | movaps 7*16(r4), %xmm7 | |
1166 | movaps 6*16(r4), %xmm6 | |
1167 | movaps 5*16(r4), %xmm5 | |
1168 | movaps 4*16(r4), %xmm4 | |
1169 | movaps 3*16(r4), %xmm3 | |
1170 | movaps 2*16(r4), %xmm2 | |
1171 | movaps 1*16(r4), %xmm1 | |
1172 | movaps 0*16(r4), %xmm0 | |
1173 | #if 0 < LocalsSize | |
1174 | add $Padding + LocalsSize, r4 | |
1175 | #endif | |
1176 | #if defined __i386__ | |
1177 | pop r7 | |
1178 | pop r6 | |
1179 | pop r5 | |
1180 | #endif | |
1181 | pop r3 | |
1182 | ||
1183 | ret | |
1184 | ||
1185 | ||
1186 | #undef Address | |
1187 | #undef Argument | |
1188 | #undef E | |
1189 | #undef ITable | |
1190 | #undef K | |
1191 | #undef Local | |
1192 | #undef LocalsSize | |
1193 | #undef LookupI | |
1194 | #undef LookupS | |
1195 | #undef Nk | |
1196 | #undef Padding | |
1197 | #undef R | |
1198 | #undef SaveSize | |
1199 | #undef STable | |
1200 | #undef StackFrame | |
1201 | #undef dr | |
1202 | #undef drh | |
1203 | #undef drl | |
1204 | #undef offset | |
1205 | #undef t0 | |
1206 | #undef t0d | |
1207 | #undef ve0 | |
1208 | #undef ve1 | |
1209 | #undef ve2 | |
1210 | #undef ve3 | |
1211 | #undef ve4 | |
1212 | #undef ve5 | |
1213 | #undef vt0 | |
1214 | #undef vt1 |