]> git.saurik.com Git - apple/xnu.git/blame - bsd/crypto/aes/i386/ExpandKeyForDecryption.s
xnu-1699.22.73.tar.gz
[apple/xnu.git] / bsd / crypto / aes / i386 / ExpandKeyForDecryption.s
CommitLineData
6d2010ae
A
1/* This file defines _aes_decrypt_key, _aes_decrypt_key128,
2 _aes_decrypt_key192, and _aes_decrypt_key256. It is designed to be
3 included in another assembly file with the preprocessor #include directive,
4 to benefit from some assembly-time calculations.
5
6 Written by Eric Postpischil, January 2008.
7
8 The comments here do not say much about the algorithm; the code just
9 follows the FIPS-197 specification. I recommend reading the specification
10 before working with this code or examining the C code in the parent
11 directory that illustrates key expansion.
12
13 One complication is that this routine both expands the key and applies
14 InvMixColumn to most of the words in the expanded key. This modifies the
15 key for use with the Equivalent Inverse Cipher.
16
17 During key expansion, there are sequences of four or six words that are
18 produced like this:
19
20 E[i+0] = E[i+0-Nk] ^ f(E[i-1]), where f is some function.
21 E[i+1] = E[i+1-Nk] ^ E[i+0].
22 E[i+2] = E[i+2-Nk] ^ E[i+1].
23 E[i+3] = E[i+3-Nk] ^ E[i+2].
24
25 When Nk is four or eight, the sequence stops there. When it is six, it
26 goes on for two more words. Let I be the InvMixColumn function. For the
27 Equivalent Inverse Cipher, we want to store I(E[i+0]), I(E[i+1]),
28 I(E[i+2]), I(E[i+3]) (and two more when Nk is six). However, we do not
29 need to calculate I four times. In AES' finite field, I is a linear
30 combination of the four bytes of its input. The ^ operation on the bits
31 that represent field elements is an addition in the Galois field. So
32 I(a ^ b) = I(a) ^ I(b). Then we have:
33
34 I(E[i+0]) = I(E[i+0-Nk] ^ f(E[i-1])) = I(E[i+0-Nk]) ^ I(f(E[i-1])).
35 I(E[i+1]) = I(E[i+1-Nk]) ^ I(E[i+0]).
36 I(E[i+2]) = I(E[i+2-Nk]) ^ I(E[i+1]).
37 I(E[i+3]) = I(E[i+3-Nk]) ^ I(E[i+2]).
38
39 To compute this, we compute I(f(E[i-1])) and XOR it with the previously
40 stored I(E[i+0-Nk]) to get I(E[i+0]). Then we XOR that with the previously
41 stored I(E[i+1-Nk]) to get I(E[i+1]), and so on.
42
43 Note that to compute I(f(E[i-1])), we need to have E[i-1]. So we have to
44 compute the pre-InvMixColumn words of the expanded key; it is not
45 sufficient to have the post-InvMixColumn words.
46*/
47
48
49/* Routine:
50
51 _aes_decrypt_key.
52
53 _aes_decrypt_key128, _aes_decrypt_key192, and _aes_decrypt_key256.
54
55 Function:
56
57 Expand the user's cipher key into the key schedule, as defined in
58 Federal Information Processing Standards Publication 197 (FIPS-197),
59 November 26, 2001.
60
61 For decryption, the key is modified as shown in Figure 15 in FIPS-197,
62 to support the Equivalent Inverse Cipher.
63
64 Input:
65
66 Constant data:
67
68 The following names must be locally defined so the assembler
69 can calculate certain offsets.
70
71 static const Word _AESSubBytesWordTable[4][256].
72
73 _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
74 SubBytes is defined in FIPS-197. _AESSubBytesWordTable
75 differs from _AESEncryptTable in that it does not include
76 the MixColumn operation. It is used in performing the last
77 round, which differs from the previous rounds in that it
78 does not include the MixColumn operation.
79
80 static const Word _AESInvMixColumnTable[4][256].
81
82 _AESInvMixColumnTable[i][j] contains the contribution of byte
83 j to element i of the InvMixColumn operation.
84
85 The four bytes of the word _AESInvMixColumnTable[0][j] are:
86
87 {0xe}*{j}, {0x9}*{j}, {0xd}*{j}, {0xb}*{j},
88
89 listed in increasing address order, where multiplication is
90 performed in the Galois field. {j} designates the element of
91 the Galois field represented by j. _AESInvMixColumnTable[i][j] has
92 the same bytes, rotated right in the order shown above.
93
94 static const Byte _AESRcon[].
95
96 Round constants, beginning with AESRcon[1] for the first round
97 (AESRcon[0] is padding.)
98
99 Arguments:
100
101 const uint8_t *Key
102
103 Address of user's cipher key.
104
105 int Length
106
107 Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in
108 user's cipher key.
109
110 This argument is used with _aes_decrypt_key. It is not
111 present for the other routines. In those routines, Context
112 is the second argument.
113
114 aes_decrypt_ctx *Context
115
116 Structure to contain the expanded key beginning at offset
117 ContextKey and a four-byte "key length" beginning at offset
118 ContextKeyLength. The "key length" is the number of bytes from
119 the start of the first round key to the start of the last round
120 key. That is 16 less than the number of bytes in the entire
121 key.
122
123 Output:
124
125 The expanded key and the "key length" are written to *Context.
126
127 Return:
128
129 aes_rval // -1 if "key length" is invalid. 0 otherwise.
130*/
131/* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */
132
133#ifdef KERNEL
134#include <i386/cpu_capabilities.h>
135#else
136#include <System/i386/cpu_capabilities.h>
137#endif
138
/* Register and operand aliases shared by every routine in this file.

   The generic register names (r0..r7, r9, r11 and their d/l/h sub-register
   forms) as well as Arch() and TableSize are not defined in this file; they
   are expected to be supplied by the assembly file that includes this one.
*/
139#define dr r0d // Dissection register. r0 is also used for the return value.
140#define drl r0l // Low 8 bits of dissection register.
141#define drh r0h // Second-lowest 8 bits of dissection register.
142
143#define t0 r1 // Scratch register; used as the table index in lookups.
144#define t0d r1d // Low 32 bits of t0.
145
146#define STable r2 // Address of SubBytes table. NOTE(review): an earlier remark said this overlaps Nk, but Nk is defined in this file as t0d (i386) or r9d (x86_64) -- confirm.
147#define ITable r3 // Address of InvMixColumn table.
148#define offset Arch(r5, r11) // Address offset and loop sentinel. Presumably r5 on i386 and r11 on x86_64 -- confirm against the definition of Arch.
149
150#define R r7 // Address of round constant.
151#define K r7 // User key pointer.
152 // R and K overlap, so K must not be read once R has been loaded.
153
154#define E r6 // Expanded key pointer.
155
156#define ve0 %xmm0 // ve0..ve5 hold untransformed expanded-key words.
157#define ve1 %xmm1
158#define ve2 %xmm2
159#define ve3 %xmm3
160#define ve4 %xmm4 // ve4 and ve5 are only used by the six-word (192-bit) path.
161#define ve5 %xmm5
162#define vt1 %xmm6 // Temporary: most recent table lookup or previously stored word.
163#define vt0 %xmm7 // Temporary: accumulator / result of the lookup helpers.
164
// LookupS / LookupI expand to a memory operand of the form
// displacement(base, index, 4): entry [index] (four bytes per entry) of
// sub-table number `table`, with the table base in STable or ITable.
// TableSize is presumably the size in bytes of one 256-entry sub-table --
// confirm where it is defined.
165#define LookupS(table, index) (table)*TableSize(STable, index, 4)
166#define LookupI(table, index) (table)*TableSize(ITable, index, 4)
167
168
169/* InvMixColumn puts InvMixColumn(dr) into vt0. This is a non-standard
170 subroutine. It does not conform to the ABI. It is an integral part of
171 _ExpandKeyForDecryption and shares register use with it.

 Input: dr holds the 32-bit word to transform; ITable must already hold
 the address of the InvMixColumn table.

 Output: vt0 holds the XOR of the four table entries selected by the
 four bytes of dr (byte k is looked up in sub-table k).

 Modifies: dr (shifted right by 16), t0, vt1, and the processor flags
 (through shr). Callers that branch on flags must set them after calling
 this routine.
172*/
173InvMixColumn:
174 movzx drl, t0 // t0 = byte 0 of dr, zero-extended.
175 movd LookupI(0, t0), vt0 // Look up byte 0 in table 0.
176 movzx drh, t0d // t0 = byte 1 of dr.
177 movd LookupI(1, t0), vt1 // Look up byte 1 in table 1.
178 pxor vt1, vt0
179 shr $16, dr // Bring bytes 2 and 3 down into drl and drh.
180 movzx drl, t0d // t0 = original byte 2.
181 movd LookupI(2, t0), vt1 // Look up byte 2 in table 2.
182 pxor vt1, vt0
183 movzx drh, t0d // t0 = original byte 3.
184 movd LookupI(3, t0), vt1 // Look up byte 3 in table 3.
185 pxor vt1, vt0
186 ret
187
188
189 // SubWordRotWord adds (XORs) SubWord(RotWord(dr)) to vt0.
 //
 // vt0 is an accumulator here: it is not cleared, so the caller loads it
 // first (the callers in this file put the round constant in it). STable
 // must hold the address of the SubBytes word table.
 //
 // Byte k of dr is looked up in sub-table (k+3) mod 4, so each substituted
 // byte lands one byte position lower than where it came from and byte 0
 // wraps to the top; that placement is the RotWord step.
 //
 // Modifies dr (shifted right by 16), t0, vt1 and the flags (through shr).
 // NOTE(review): "$$16" is presumably how a literal "$16" is written inside
 // a .macro body for this assembler -- confirm.
190 .macro SubWordRotWord
191 movzx drl, t0
192 movd LookupS(3, t0), vt1 // Look up byte 0 in table 3.
193 pxor vt1, vt0
194 movzx drh, t0d
195 movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
196 pxor vt1, vt0
197 shr $$16, dr // Bring bytes 2 and 3 down into drl and drh.
198 movzx drl, t0d
199 movd LookupS(1, t0), vt1 // Look up byte 2 in table 1.
200 pxor vt1, vt0
201 movzx drh, t0d
202 movd LookupS(2, t0), vt1 // Look up byte 3 in table 2.
203 pxor vt1, vt0
204 .endmacro
205
206
207 // SubWord puts SubWord(dr) into vt0.
 //
 // Unlike SubWordRotWord, the first lookup overwrites vt0, so no prior
 // value of vt0 is needed, and byte k of dr is looked up in sub-table k
 // (no rotation). STable must hold the address of the SubBytes word table.
 //
 // Modifies dr (shifted right by 16), t0, vt1 and the flags (through shr).
208 .macro SubWord
209 movzx drl, t0
210 movd LookupS(0, t0), vt0 // Look up byte 0 in table 0.
211 movzx drh, t0d
212 movd LookupS(1, t0), vt1 // Look up byte 1 in table 1.
213 pxor vt1,vt0
214 shr $$16, dr // Bring bytes 2 and 3 down into drl and drh.
215 movzx drl, t0d
216 movd LookupS(2, t0), vt1 // Look up byte 2 in table 2.
217 pxor vt1,vt0
218 movzx drh, t0d
219 movd LookupS(3, t0), vt1 // Look up byte 3 in table 3.
220 pxor vt1,vt0
221 .endmacro
222
/* _aes_decrypt_key: entry point that takes an explicit key length.

   1. Reads the CPU capability word and, if kHasAES is set, jumps to
      _aes_decrypt_key_hw (not defined in this file). The jump is taken
      before anything is pushed, so the stack and argument registers are
      still as the caller left them.
   2. Otherwise builds the stack frame shared by all routines in this file:
      pushed registers, then Padding + LocalsSize bytes holding the eight
      saved XMM registers at 0..7*16(r4) and eight four-byte work slots
      starting at Local(r4).
   3. Moves the arguments into K, Nk and E.
   4. Normalizes the length (values below 128 are multiplied by 8) and
      jumps to DKeyHas4Words, DKeyHas6Words or DKeyHas8Words. Any other
      value puts -1 in r0 and jumps to local label 9, the epilogue at the
      end of the four-word path.
*/
223 .text
224 .globl _aes_decrypt_key
225// .private_extern _aes_decrypt_key
226_aes_decrypt_key:
227
228 // detect AES HW, cclee 3-13-10
229#if defined __x86_64__
230 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
231 mov (%rax), %eax // %eax = __cpu_capabilities
232#else
233#if defined KERNEL
234 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
235 mov (%eax), %eax // %eax = __cpu_capabilities
236#else
237 mov _COMM_PAGE_CPU_CAPABILITIES, %eax // %eax = capability word loaded from the address _COMM_PAGE_CPU_CAPABILITIES
238#endif
239
240#endif
241 test $(kHasAES), %eax // __cpu_capabilities & kHasAES
242 jne _aes_decrypt_key_hw // if AES HW detected, branch to _aes_decrypt_key_hw
243 /* Save registers and set SaveSize to the number of bytes pushed onto the
244 stack so far, including the caller's return address.
245 */
246 push r3
247 #if defined __i386__
248 push r5
249 push r6
250 push r7
251 #define SaveSize (5*4) // Return address plus four pushed registers.
252 #else
253 #define SaveSize (2*8) // Return address plus one pushed register.
254 #endif
255
256 /* Number of bytes used for local variables:
257
258 8 16-byte spaces to save XMM registers.
259
260 8 four-byte spaces for work.
261 */
262 #define LocalsSize (8*16 + 8*4)
263
264 // Define stack offset to storage space for local data.
265 #define Local (8*16)
266
267 #if 0 < LocalsSize
268 // Padding to position stack pointer at a multiple of 16 bytes.
 // It rounds SaveSize + LocalsSize up to a multiple of 16, so the result
 // is 16-byte aligned only if the caller's stack pointer was aligned at
 // the call (assumed here; the movaps stores below need that alignment).
269 #define Padding (15 & -(SaveSize + LocalsSize))
270 sub $Padding + LocalsSize, r4 // Allocate space on stack.
271 #else
272 #define Padding 0
273 #endif
274
275 /* StackFrame is the number of bytes in our stack frame, from caller's
276 stack pointer to ours (so it includes the return address).
277 */
278 #define StackFrame (SaveSize + Padding + LocalsSize)
279
280 // Save xmm registers.
281 movaps %xmm0, 0*16(r4)
282 movaps %xmm1, 1*16(r4)
283 movaps %xmm2, 2*16(r4)
284 movaps %xmm3, 3*16(r4)
285 movaps %xmm4, 4*16(r4)
286 movaps %xmm5, 5*16(r4)
287 movaps %xmm6, 6*16(r4)
288 movaps %xmm7, 7*16(r4)
289
290#if defined __i386__
291
292 // Define location of argument i.
293 #define Argument(i) StackFrame+4*(i)(r4)
294
295 #define Nk t0d // Holds the Length argument (bytes or bits).
296
297 // Load arguments.
298 mov Argument(2), E // Context.
299 mov Argument(1), Nk // Length.
300 mov Argument(0), K // Key.
301
302#elif defined __x86_64__
303
304 #define Nk r9d // Holds the Length argument (bytes or bits), despite the name.
305 mov r6d, Nk // Move Nk argument out of way: r6 is E and is overwritten next.
306 mov r2, E // Move E argument to common register.
307
308#endif
309
310 // Dispatch on key length.
311 cmp $128, Nk
312 jge 2f // Already expressed in bits; keep the flags from this compare.
313 shl $3, Nk // Convert from bytes to bits.
314 cmp $128, Nk
3152:
316 je DKeyHas4Words
317 cmp $192, Nk
318 je DKeyHas6Words
319 cmp $256, Nk
320 je DKeyHas8Words
321 mov $-1, r0 // Return error.
322 jmp 9f // Shared epilogue at the end of the four-word path.
323
324
/* _aes_decrypt_key128 and the shared four-word (128-bit) path.

   The entry point builds the same stack frame as _aes_decrypt_key and, on
   i386, loads E (second argument) and K (first argument) from the stack;
   no argument moves are made here for x86_64. Execution then falls into
   DKeyHas4Words, which _aes_decrypt_key also jumps to.

   DKeyHas4Words writes 10*16 to ContextKeyLength(E), stores the four user
   key words unchanged as the first block of the expanded key, and then
   produces ten more 16-byte blocks:
     - one in the unrolled first iteration (InvMixColumn applied to each
       of the four words),
     - eight in the loop at local label 1 (InvMixColumn applied once per
       block, then chained through the previously stored words),
     - one in the final iteration, stored without InvMixColumn.
   It returns 0 in r0. Local label 9 starts the epilogue, which is also the
   target of the error path in _aes_decrypt_key.
*/
325 .globl _aes_decrypt_key128
326// .private_extern _aes_decrypt_key128
327_aes_decrypt_key128:
328
329 /* Save registers and set SaveSize to the number of bytes pushed onto the
330 stack so far, including the caller's return address.
331 */
332 push r3
333 #if defined __i386__
334 push r5
335 push r6
336 push r7
337 #define SaveSize (5*4)
338 #else
339 #define SaveSize (2*8)
340 #endif
341
342 /* Number of bytes used for local variables:
343
344 8 16-byte spaces to save XMM registers.
345
346 8 four-byte spaces for work.
347 */
348 #define LocalsSize (8*16 + 8*4)
349
350 // Define stack offset to storage space for local data.
351 #define Local (8*16)
352
353 #if 0 < LocalsSize
354 // Padding to position stack pointer at a multiple of 16 bytes.
355 #define Padding (15 & -(SaveSize + LocalsSize))
356 sub $Padding + LocalsSize, r4 // Allocate space on stack.
357 #else
358 #define Padding 0
359 #endif
360
361 /* StackFrame is the number of bytes in our stack frame, from caller's
362 stack pointer to ours (so it includes the return address).
363 */
364 #define StackFrame (SaveSize + Padding + LocalsSize)
365
366 // Save xmm registers.
367 movaps %xmm0, 0*16(r4)
368 movaps %xmm1, 1*16(r4)
369 movaps %xmm2, 2*16(r4)
370 movaps %xmm3, 3*16(r4)
371 movaps %xmm4, 4*16(r4)
372 movaps %xmm5, 5*16(r4)
373 movaps %xmm6, 6*16(r4)
374 movaps %xmm7, 7*16(r4)
375
376#if defined __i386__
377
378 // Load arguments.
379 #define Argument(i) StackFrame+4*(i)(r4)
380 mov Argument(1), E // Context (there is no Length argument here).
381 mov Argument(0), K // Key.
382
383#endif
384
385// Merge point for _aes_decrypt_key and _aes_decrypt_key128.
386DKeyHas4Words:
387
388 // First words of expanded key are copied from user key.
389 movd 0*4(K), ve0
390 movd 1*4(K), ve1
391 movd 2*4(K), ve2
392 movd 3*4(K), ve3
393
394 movl $10*16, ContextKeyLength(E) // Set "key length."
395
396 #if 0 != ContextKey
397 add $ContextKey, E // E now addresses the expanded key itself.
398 #endif
399
400 // K cannot be used after we write to R, since they use the same register.
401
402 #if defined __i386__
403
404 lea _AESRcon, R
405 lea _AESInvMixColumnTable, ITable
406 lea _AESSubBytesWordTable, STable
407
408 #elif defined __x86_64__
409
410 lea _AESRcon(%rip), R
411 lea _AESInvMixColumnTable(%rip), ITable
412 lea _AESSubBytesWordTable(%rip), STable
413
414 #endif
415
416 /* With a four-word key, there are ten rounds (eleven 16-byte key blocks),
417 nine of which have InvMixColumn applied.
418 */
419 mov $-9*4*4, offset // Negative count of bytes that get InvMixColumn.
420 sub offset, E // Bias E so that (E, offset) addresses the first block.
421
422 // Store initial words of expanded key, which are copies of user's key.
423 movd ve0, 0*4(E, offset)
424 movd ve1, 1*4(E, offset)
425 movd ve2, 2*4(E, offset)
426 movd ve3, 3*4(E, offset)
427
428/* Here is the first iteration of the key expansion. It is separate from the
429 main loop below because we need to apply InvMixColumn to each of the
430 outputs, in ve0 through ve3. In the main loop, the technique described at
431 the top of this file is used to compute the proper outputs while using
432 InvMixColumn only once.
433*/
434 add $1, R // Advance pointer.
435 movd ve3, dr // Put previous word into work register.
436 movzx (R), t0d // Get round constant.
437 movd t0d, vt0 // Seed the accumulator that SubWordRotWord XORs into.
438
439 SubWordRotWord
440 pxor vt0, ve0
441
442 // Chain to successive words.
443 pxor ve0, ve1
444 pxor ve1, ve2
445 pxor ve2, ve3
446
447 add $4*4, offset
448
449 /* Apply InvMixColumn to each word. The transformed values are stored in
450 the expanded key. The original values are retained in registers for
451 further computation.
452 */
453 movd ve0, dr
454 call InvMixColumn
455 movd vt0, 0*4(E, offset)
456
457 movd ve1, dr
458 call InvMixColumn
459 movd vt0, 1*4(E, offset)
460
461 movd ve2, dr
462 call InvMixColumn
463 movd vt0, 2*4(E, offset)
464
465 movd ve3, dr
466 call InvMixColumn
467 movd vt0, 3*4(E, offset)
468
469// Here is the main loop.
4701:
471 add $1, R // Advance pointer.
472 movd ve3, dr // Put previous word into work register.
473 movzx (R), t0d // Get round constant.
474 movd t0d, vt0
475
476 SubWordRotWord
477 pxor vt0, ve0
478
479 // Chain to successive words.
480 pxor ve0, ve1
481 pxor ve1, ve2
482 pxor ve2, ve3
483 /* Dr. Brian Gladman uses a technique with a single XOR here instead
484 of the previous four. There is some periodic behavior in the key
485 expansion, and Gladman maintains E[4*i+3] for the latest four
486 values of i. XORing the value in vt0 with one of these yields its
487 replacement. However, using this technique requires additional
488 instructions before the loop (to initialize the values) and after
489 it (to extract the final values to be stored) and either some way
490 to rotate or index four values in the loop or a four-fold unrolling
491 of the loop to provide the indexing. Experiment suggests the
492 former is not worthwhile. Unrolling the loop might give a small
493 gain, at the cost of increased use of instruction cache, increased
494 instructions loads the first time the routine is executed, and
495 increased code complexity, so I decided against it.
496 */
497
498 // Apply InvMixColumn to the difference.
499 movd vt0, dr
500 call InvMixColumn
501
502 add $4*4, offset // Placed after the call: shr inside InvMixColumn changes the flags tested by jl below.
503
504 // Chain the transformed difference to previously transformed outputs.
505 movd (0-4)*4(E, offset), vt1
506 pxor vt1, vt0
507 movd vt0, 0*4(E, offset)
508
509 movd (1-4)*4(E, offset), vt1
510 pxor vt1, vt0
511 movd vt0, 1*4(E, offset)
512
513 movd (2-4)*4(E, offset), vt1
514 pxor vt1, vt0
515 movd vt0, 2*4(E, offset)
516
517 movd (3-4)*4(E, offset), vt1
518 pxor vt1, vt0
519 movd vt0, 3*4(E, offset)
520
521 jl 1b // Flags come from the add to offset above (movd/pxor leave them alone); loop until offset reaches zero.
522
523// Here is the final iteration, which does not perform InvMixColumn.
524
525 movd ve3, dr // Put previous word into work register.
526 movzx 1(R), t0d // Get round constant.
527 movd t0d, vt0
528
529 SubWordRotWord
530 pxor vt0, ve0
531
532 // Chain to successive words.
533 movd ve0, 4*4(E, offset) // offset is zero here, so 4*4 addresses the block after the last one written.
534 pxor ve0, ve1
535 movd ve1, 5*4(E, offset)
536 pxor ve1, ve2
537 movd ve2, 6*4(E, offset)
538 pxor ve2, ve3
539 movd ve3, 7*4(E, offset)
540
541 xor r0, r0 // Return success.
542
5439:
544 // Pop stack and restore registers.
 // Also reached from the invalid-length path of _aes_decrypt_key with r0
 // already set to -1; nothing below changes r0.
545 movaps 7*16(r4), %xmm7
546 movaps 6*16(r4), %xmm6
547 movaps 5*16(r4), %xmm5
548 movaps 4*16(r4), %xmm4
549 movaps 3*16(r4), %xmm3
550 movaps 2*16(r4), %xmm2
551 movaps 1*16(r4), %xmm1
552 movaps 0*16(r4), %xmm0
553 #if 0 < LocalsSize
554 add $Padding + LocalsSize, r4
555 #endif
556 #if defined __i386__
557 pop r7
558 pop r6
559 pop r5
560 #endif
561 pop r3
562
563 ret
564
565
/* _aes_decrypt_key192 and the shared six-word (192-bit) path.

   The entry point builds the same stack frame as _aes_decrypt_key and, on
   i386, loads E (second argument) and K (first argument) from the stack.
   Execution then falls into DKeyHas6Words, which _aes_decrypt_key also
   jumps to.

   DKeyHas6Words writes 12*16 to ContextKeyLength(E). Key words 0-3 are
   stored unchanged; key words 4 and 5 are stored after InvMixColumn. The
   expansion then proceeds six words at a time: one unrolled first
   iteration, the loop at local label 1, and a final iteration that stores
   four words without InvMixColumn. It returns 0 in r0 and has its own copy
   of the epilogue.
*/
566 .globl _aes_decrypt_key192
567// .private_extern _aes_decrypt_key192
568_aes_decrypt_key192:
569
570 /* Save registers and set SaveSize to the number of bytes pushed onto the
571 stack so far, including the caller's return address.
572 */
573 push r3
574 #if defined __i386__
575 push r5
576 push r6
577 push r7
578 #define SaveSize (5*4)
579 #else
580 #define SaveSize (2*8)
581 #endif
582
583 /* Number of bytes used for local variables:
584
585 8 16-byte spaces to save XMM registers.
586
587 8 four-byte spaces for work.
588 */
589 #define LocalsSize (8*16 + 8*4)
590
591 // Define stack offset to storage space for local data.
592 #define Local (8*16)
593
594 #if 0 < LocalsSize
595 // Padding to position stack pointer at a multiple of 16 bytes.
596 #define Padding (15 & -(SaveSize + LocalsSize))
597 sub $Padding + LocalsSize, r4 // Allocate space on stack.
598 #else
599 #define Padding 0
600 #endif
601
602 /* StackFrame is the number of bytes in our stack frame, from caller's
603 stack pointer to ours (so it includes the return address).
604 */
605 #define StackFrame (SaveSize + Padding + LocalsSize)
606
607 // Save xmm registers.
608 movaps %xmm0, 0*16(r4)
609 movaps %xmm1, 1*16(r4)
610 movaps %xmm2, 2*16(r4)
611 movaps %xmm3, 3*16(r4)
612 movaps %xmm4, 4*16(r4)
613 movaps %xmm5, 5*16(r4)
614 movaps %xmm6, 6*16(r4)
615 movaps %xmm7, 7*16(r4)
616
617#if defined __i386__
618
619 // Load arguments.
620 #define Argument(i) StackFrame+4*(i)(r4)
621 mov Argument(1), E // Context (there is no Length argument here).
622 mov Argument(0), K // Key.
623
624#endif
625
626// Merge point for _aes_decrypt_key and _aes_decrypt_key192.
627DKeyHas6Words:
628
629 // First words of expanded key are copied from user key.
630 movd 0*4(K), ve0
631 movd 1*4(K), ve1
632 movd 2*4(K), ve2
633 movd 3*4(K), ve3
634
635 movl $12*16, ContextKeyLength(E) // Set "key length."
636
637 #if 0 != ContextKey
638 add $ContextKey, E // E now addresses the expanded key itself.
639 #endif
640
641 movd 4*4(K), ve4 // Last two user key words; read before R overwrites K.
642 movd 5*4(K), ve5
643
644 // K cannot be used after we write to R, since they use the same register.
645
646 #if defined __i386__
647
648 lea _AESRcon, R
649 lea _AESInvMixColumnTable, ITable
650 lea _AESSubBytesWordTable, STable
651
652 #elif defined __x86_64__
653
654 lea _AESRcon(%rip), R
655 lea _AESInvMixColumnTable(%rip), ITable
656 lea _AESSubBytesWordTable(%rip), STable
657
658 #endif
659
660 /* With a six-word key, there are twelve rounds (thirteen 16-byte key
661 blocks), eleven of which have InvMixColumn applied. The key expansion
662 proceeds in iterations of six four-byte words, so the termination
663 condition is a bit complicated. We set offset to the negative of 10
664 four four-byte words, and the loop branch does another iteration if
665 offset is less than or equal to zero, meaning the number of iterations
666 performed so far is less than or equal to 10. Thus, after ten
667 iterations, it branches again. After the eleventh iteration, it
668 stops. Code after the end of the loop computes the twelfth key block,
669 which does not have InvMixColumn applied.
670 */
671 mov $-10*4*4, offset
672 sub offset, E // Bias E so that (E, offset) addresses the first block.
673
674 // Store initial words of expanded key, which are copies of user's key.
675 movd ve0, 0*4(E, offset)
676 movd ve1, 1*4(E, offset)
677 movd ve2, 2*4(E, offset)
678 movd ve3, 3*4(E, offset)
679
680 /* The first four words are stored untransformed. After that, words in
681 the expanded key are transformed by InvMixColumn.
682 */
683 movd ve4, dr
684 call InvMixColumn
685 movd vt0, 4*4(E, offset)
686
687 movd ve5, dr
688 call InvMixColumn
689 movd vt0, 5*4(E, offset)
690
691/* Here is the first iteration of the key expansion. It is separate from the
692 main loop below because we need to apply InvMixColumn to each of the
693 outputs, in ve0 through ve5. In the main loop, the technique described at
694 the top of this file is used to compute the proper outputs while using
695 InvMixColumn only once.
696*/
697 add $1, R // Advance pointer.
698 movd ve5, dr // Put previous word into work register.
699 movzx (R), t0d // Get round constant.
700 movd t0d, vt0 // Seed the accumulator that SubWordRotWord XORs into.
701
702 SubWordRotWord
703 pxor vt0, ve0
704
705 // Chain to successive words.
706 pxor ve0, ve1
707 pxor ve1, ve2
708 pxor ve2, ve3
709 pxor ve3, ve4
710 pxor ve4, ve5
711
712 add $6*4, offset
713
714 /* Apply InvMixColumn to each word. The transformed values are stored in
715 the expanded key. The original values are retained in registers for
716 further computation.
717 */
718 movd ve0, dr
719 call InvMixColumn
720 movd vt0, 0*4(E, offset)
721
722 movd ve1, dr
723 call InvMixColumn
724 movd vt0, 1*4(E, offset)
725
726 movd ve2, dr
727 call InvMixColumn
728 movd vt0, 2*4(E, offset)
729
730 movd ve3, dr
731 call InvMixColumn
732 movd vt0, 3*4(E, offset)
733
 // Words 4 and 5 need no further InvMixColumn calls: the transformed
 // words six positions back are already stored, so XOR chaining from the
 // transformed word 3 (still in vt0) gives the transformed results.
734 movd (4-6)*4(E, offset), vt1
735 pxor vt1, vt0
736 movd vt0, 4*4(E, offset)
737
738 movd (5-6)*4(E, offset), vt1
739 pxor vt1, vt0
740 movd vt0, 5*4(E, offset)
741
742// Here is the main loop.
7431:
744 add $1, R // Advance pointer.
745 movd ve5, dr // Put previous word into work register.
746 movzx (R), t0d // Get round constant.
747 movd t0d, vt0
748
749 SubWordRotWord
750 pxor vt0, ve0
751
752 // Chain to successive words.
753 pxor ve0, ve1
754 pxor ve1, ve2
755 pxor ve2, ve3
756 pxor ve3, ve4
757 pxor ve4, ve5
758
759 // Apply InvMixColumn to the difference.
760 movd vt0, dr
761 call InvMixColumn
762
763 add $6*4, offset // Placed after the call: shr inside InvMixColumn changes the flags tested by jle below.
764
765 // Chain the transformed difference to previously transformed outputs.
766 movd (0-6)*4(E, offset), vt1
767 pxor vt1, vt0
768 movd vt0, 0*4(E, offset)
769
770 movd (1-6)*4(E, offset), vt1
771 pxor vt1, vt0
772 movd vt0, 1*4(E, offset)
773
774 movd (2-6)*4(E, offset), vt1
775 pxor vt1, vt0
776 movd vt0, 2*4(E, offset)
777
778 movd (3-6)*4(E, offset), vt1
779 pxor vt1, vt0
780 movd vt0, 3*4(E, offset)
781
782 movd (4-6)*4(E, offset), vt1
783 pxor vt1, vt0
784 movd vt0, 4*4(E, offset)
785
786 movd (5-6)*4(E, offset), vt1
787 pxor vt1, vt0
788 movd vt0, 5*4(E, offset)
789
790 jle 1b // Flags come from the add to offset above (movd/pxor leave them alone).
791
792// Here is the final iteration, which does not perform InvMixColumn.
793
794 movd ve5, dr // Put previous word into work register.
795 movzx 1(R), t0d // Get round constant.
796 movd t0d, vt0
797
798 SubWordRotWord
799 pxor vt0, ve0
800
801 // Chain to successive words.
 // Only four words are stored here; ve4 and ve5 are not advanced.
802 movd ve0, 6*4(E, offset)
803 pxor ve0, ve1
804 movd ve1, 7*4(E, offset)
805 pxor ve1, ve2
806 movd ve2, 8*4(E, offset)
807 pxor ve2, ve3
808 movd ve3, 9*4(E, offset)
809
810 xor r0, r0 // Return success.
811
812 // Pop stack and restore registers.
813 movaps 7*16(r4), %xmm7
814 movaps 6*16(r4), %xmm6
815 movaps 5*16(r4), %xmm5
816 movaps 4*16(r4), %xmm4
817 movaps 3*16(r4), %xmm3
818 movaps 2*16(r4), %xmm2
819 movaps 1*16(r4), %xmm1
820 movaps 0*16(r4), %xmm0
821 #if 0 < LocalsSize
822 add $Padding + LocalsSize, r4
823 #endif
824 #if defined __i386__
825 pop r7
826 pop r6
827 pop r5
828 #endif
829 pop r3
830
831 ret
832
833
/* _aes_decrypt_key256 and the shared eight-word (256-bit) path.

   The entry point builds the same stack frame as _aes_decrypt_key and, on
   i386, loads E (second argument) and K (first argument) from the stack.
   Execution then falls into DKeyHas8Words, which _aes_decrypt_key also
   jumps to.

   DKeyHas8Words writes 14*16 to ContextKeyLength(E). Key words 0-3 are
   stored unchanged; key words 4-7 are stored after InvMixColumn. Because
   only ve0..ve3 are used for the working words, the eight most recent
   untransformed words are kept in the four-byte work slots at
   0*4+Local(r4) .. 7*4+Local(r4) and reloaded as needed.

   Each eight-word step has two halves: the first uses SubWordRotWord with
   a round constant, the second uses SubWord with no round constant. After
   one unrolled first iteration and the loop at local label 1, a final
   half-step stores four words without InvMixColumn. It returns 0 in r0 and
   has its own copy of the epilogue.
*/
834 .globl _aes_decrypt_key256
835// .private_extern _aes_decrypt_key256
836_aes_decrypt_key256:
837
838 /* Save registers and set SaveSize to the number of bytes pushed onto the
839 stack so far, including the caller's return address.
840 */
841 push r3
842 #if defined __i386__
843 push r5
844 push r6
845 push r7
846 #define SaveSize (5*4)
847 #else
848 #define SaveSize (2*8)
849 #endif
850
851 /* Number of bytes used for local variables:
852
853 8 16-byte spaces to save XMM registers.
854
855 8 four-byte spaces for work.
856 */
857 #define LocalsSize (8*16 + 8*4)
858
859 // Define stack offset to storage space for local data.
860 #define Local (8*16)
861
862 #if 0 < LocalsSize
863 // Padding to position stack pointer at a multiple of 16 bytes.
864 #define Padding (15 & -(SaveSize + LocalsSize))
865 sub $Padding + LocalsSize, r4 // Allocate space on stack.
866 #else
867 #define Padding 0
868 #endif
869
870 /* StackFrame is the number of bytes in our stack frame, from caller's
871 stack pointer to ours (so it includes the return address).
872 */
873 #define StackFrame (SaveSize + Padding + LocalsSize)
874
875 // Save xmm registers.
876 movaps %xmm0, 0*16(r4)
877 movaps %xmm1, 1*16(r4)
878 movaps %xmm2, 2*16(r4)
879 movaps %xmm3, 3*16(r4)
880 movaps %xmm4, 4*16(r4)
881 movaps %xmm5, 5*16(r4)
882 movaps %xmm6, 6*16(r4)
883 movaps %xmm7, 7*16(r4)
884
885#if defined __i386__
886
887 // Load arguments.
888 #define Argument(i) StackFrame+4*(i)(r4)
889 mov Argument(1), E // Context (there is no Length argument here).
890 mov Argument(0), K // Key.
891
892#endif
893
894// Merge point for _aes_decrypt_key and _aes_decrypt_key256.
895DKeyHas8Words:
896
897 // First words of expanded key are copied from user key.
898 movd 0*4(K), ve0
899 movd 1*4(K), ve1
900 movd 2*4(K), ve2
901 movd 3*4(K), ve3
902
903 movl $14*16, ContextKeyLength(E) // Set "key length."
904
905 #if 0 != ContextKey
906 add $ContextKey, E // E now addresses the expanded key itself.
907 #endif
908
909 // Store initial words of expanded key, which are copies of user's key.
910 movd ve0, 0*4(E)
911 movd ve1, 1*4(E)
912 movd ve2, 2*4(E)
913 movd ve3, 3*4(E)
914 movd 4*4(K), ve0 // Reuse ve0..ve3 for key words 4-7; read before R overwrites K.
915 movd 5*4(K), ve1
916 movd 6*4(K), ve2
917 movd 7*4(K), ve3
918
919 // K cannot be used after we write to R, since they use the same register.
920
921 #if defined __i386__
922
923 lea _AESRcon, R
924 lea _AESInvMixColumnTable, ITable
925 lea _AESSubBytesWordTable, STable
926
927 #elif defined __x86_64__
928
929 lea _AESRcon(%rip), R
930 lea _AESInvMixColumnTable(%rip), ITable
931 lea _AESSubBytesWordTable(%rip), STable
932
933 #endif
934
935 /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key
936 blocks), thirteen of which have InvMixColumn applied.
937 */
938 mov $-12*4*4, offset
939 sub offset, E // Bias E so that (E, offset) addresses the first block.
940
941 // Save untransformed values in stack area.
942 movd ve0, 4*4+Local(r4)
943 movd ve1, 5*4+Local(r4)
944 movd ve2, 6*4+Local(r4)
945 movd ve3, 7*4+Local(r4)
946
947 /* Apply InvMixColumn to words 4 through 7. The transformed values are
948 stored in the expanded key. The original values are saved in the stack
949 area for further computation.
950 */
951 movd ve0, dr
952 call InvMixColumn
953 movd vt0, 4*4(E, offset)
954
955 movd ve1, dr
956 call InvMixColumn
957 movd vt0, 5*4(E, offset)
958
959 movd ve2, dr
960 call InvMixColumn
961 movd vt0, 6*4(E, offset)
962
963 movd ve3, dr
964 call InvMixColumn
965 movd vt0, 7*4(E, offset)
966
967/* Here is the first iteration of the key expansion. It is separate from the
968 main loop below because we need to apply InvMixColumn to each of the
969 outputs, in ve0 through ve3. In the main loop, the technique described at
970 the top of this file is used to compute the proper outputs while using
971 InvMixColumn only once.
972*/
973 add $1, R // Advance pointer.
974 movd ve3, dr // Put previous word into work register.
975 movzx (R), t0d // Get round constant.
976 movd t0d, vt0 // Seed the accumulator that SubWordRotWord XORs into.
977
978 SubWordRotWord
979
980 add $8*4, offset
981
982 movd (0-8)*4(E, offset), ve0 // Get old word.
983 pxor vt0, ve0
984 movd ve0, 0*4+Local(r4) // Save on stack.
985 movd ve0, dr
986 call InvMixColumn
987 movd vt0, 0*4(E, offset) // Write to expanded key.
988
989 /* Chain to successive words and apply InvMixColumn to each word. The
990 transformed values are stored in the expanded key. The original
991 values are retained in local data for further computation.
992 */
993 movd (1-8)*4(E, offset), ve1 // Get old word.
994 pxor ve0, ve1 // Chain.
995 movd ve1, 1*4+Local(r4) // Save on stack.
996 movd ve1, dr
997 call InvMixColumn
998 movd vt0, 1*4(E, offset) // Write to expanded key.
999
1000 movd (2-8)*4(E, offset), ve2 // Get old word.
1001 pxor ve1, ve2 // Chain.
1002 movd ve2, 2*4+Local(r4) // Save on stack.
1003 movd ve2, dr
1004 call InvMixColumn
1005 movd vt0, 2*4(E, offset) // Write to expanded key.
1006
1007 movd (3-8)*4(E, offset), ve3 // Get old word.
1008 pxor ve2, ve3 // Chain.
1009 movd ve3, 3*4+Local(r4) // Save on stack.
1010 movd ve3, dr
1011 call InvMixColumn
1012 movd vt0, 3*4(E, offset) // Write to expanded key.
1013
 // Second half of the step: SubWord only, with no rotation and no round
 // constant.
1014 movd ve3, dr // Put previous word into work register.
1015 SubWord
1016
1017 movd 4*4+Local(r4), ve0 // Get old word.
1018 pxor vt0, ve0 // Chain.
1019 movd ve0, 4*4+Local(r4) // Save on stack.
1020
1021 movd 5*4+Local(r4), ve1 // Get old word.
1022 pxor ve0, ve1 // Chain.
1023 movd ve1, 5*4+Local(r4) // Save on stack.
1024
1025 movd 6*4+Local(r4), ve2 // Get old word.
1026 pxor ve1, ve2 // Chain.
1027 movd ve2, 6*4+Local(r4) // Save on stack.
1028
1029 movd 7*4+Local(r4), ve3 // Get old word.
1030 pxor ve2, ve3 // Chain.
1031 movd ve3, 7*4+Local(r4) // Save on stack.
1032
1033 movd vt0, dr // Move change to work register.
1034 call InvMixColumn
1035
1036 movd (4-8)*4(E, offset), vt1 // Get old word.
1037 pxor vt1, vt0 // Chain.
1038 movd vt0, 4*4(E, offset) // Write new word to expanded key.
1039
1040 movd (5-8)*4(E, offset), vt1 // Get old word.
1041 pxor vt1, vt0 // Chain.
1042 movd vt0, 5*4(E, offset) // Write new word to expanded key.
1043
1044 movd (6-8)*4(E, offset), vt1 // Get old word.
1045 pxor vt1, vt0 // Chain.
1046 movd vt0, 6*4(E, offset) // Write new word to expanded key.
1047
1048 movd (7-8)*4(E, offset), vt1 // Get old word.
1049 pxor vt1, vt0 // Chain.
1050 movd vt0, 7*4(E, offset) // Write new word to expanded key.
1051
1052// Here is the main loop.
 // Unlike the first iteration, offset is advanced at the bottom of the
 // loop, so reads use n*4(E, offset) and writes use (n+8)*4(E, offset).
10531:
1054 add $1, R // Advance pointer.
1055 movd ve3, dr // Put previous word into work register.
1056 movzx (R), t0d // Get round constant.
1057 movd t0d, vt0
1058
1059 SubWordRotWord
1060
1061 movd 0*4+Local(r4), ve0 // Get old word.
1062 pxor vt0, ve0
1063 movd ve0, 0*4+Local(r4) // Save on stack.
1064
1065 // Chain to successive words.
1066 movd 1*4+Local(r4), ve1 // Get old word.
1067 pxor ve0, ve1 // Chain.
1068 movd ve1, 1*4+Local(r4) // Save on stack.
1069
1070 movd 2*4+Local(r4), ve2 // Get old word.
1071 pxor ve1, ve2 // Chain.
1072 movd ve2, 2*4+Local(r4) // Save on stack.
1073
1074 movd 3*4+Local(r4), ve3 // Get old word.
1075 pxor ve2, ve3 // Chain.
1076 movd ve3, 3*4+Local(r4) // Save on stack.
1077
1078 movd vt0, dr // Move change to work register.
1079 call InvMixColumn
1080
1081 movd 0*4(E, offset), vt1 // Get old word.
1082 pxor vt1, vt0 // Chain.
1083 movd vt0, (0+8)*4(E, offset) // Write new word to expanded key.
1084
1085 movd 1*4(E, offset), vt1 // Get old word.
1086 pxor vt1, vt0 // Chain.
1087 movd vt0, (1+8)*4(E, offset) // Write new word to expanded key.
1088
1089 movd 2*4(E, offset), vt1 // Get old word.
1090 pxor vt1, vt0 // Chain.
1091 movd vt0, (2+8)*4(E, offset) // Write new word to expanded key.
1092
1093 movd 3*4(E, offset), vt1 // Get old word.
1094 pxor vt1, vt0 // Chain.
1095 movd vt0, (3+8)*4(E, offset) // Write new word to expanded key.
1096
1097 movd ve3, dr // Put previous word into work register.
1098 SubWord
1099
1100 movd 4*4+Local(r4), ve0 // Get old word.
1101 pxor vt0, ve0 // Chain.
1102 movd ve0, 4*4+Local(r4) // Save on stack.
1103
1104 movd 5*4+Local(r4), ve1 // Get old word.
1105 pxor ve0, ve1 // Chain.
1106 movd ve1, 5*4+Local(r4) // Save on stack.
1107
1108 movd 6*4+Local(r4), ve2 // Get old word.
1109 pxor ve1, ve2 // Chain.
1110 movd ve2, 6*4+Local(r4) // Save on stack.
1111
1112 movd 7*4+Local(r4), ve3 // Get old word.
1113 pxor ve2, ve3 // Chain.
1114 movd ve3, 7*4+Local(r4) // Save on stack.
1115
1116 movd vt0, dr // Move change to work register.
1117 call InvMixColumn
1118
1119 movd 4*4(E, offset), vt1 // Get old word.
1120 pxor vt1, vt0 // Chain.
1121 movd vt0, (4+8)*4(E, offset) // Write new word to expanded key.
1122
1123 movd 5*4(E, offset), vt1 // Get old word.
1124 pxor vt1, vt0 // Chain.
1125 movd vt0, (5+8)*4(E, offset) // Write new word to expanded key.
1126
1127 movd 6*4(E, offset), vt1 // Get old word.
1128 pxor vt1, vt0 // Chain.
1129 movd vt0, (6+8)*4(E, offset) // Write new word to expanded key.
1130
1131 movd 7*4(E, offset), vt1 // Get old word.
1132 pxor vt1, vt0 // Chain.
1133 movd vt0, (7+8)*4(E, offset) // Write new word to expanded key.
1134
1135 add $8*4, offset // Last flag-setting instruction before the branch; the shr in the helpers above would otherwise disturb the flags.
1136
1137 jl 1b // Loop until offset reaches zero.
1138
 // Final half-step: the last four words, stored without InvMixColumn.
1139 movd ve3, dr // Put previous word into work register.
1140 movzx 1(R), t0d // Get round constant.
1141 movd t0d, vt0
1142
1143 SubWordRotWord
1144
1145 movd 0*4+Local(r4), ve0 // Get old word.
1146 pxor vt0, ve0 // Chain.
1147 movd ve0, (0+8)*4(E, offset)
1148
1149 // Chain to successive words.
1150 movd 1*4+Local(r4), ve1 // Get old word.
1151 pxor ve0, ve1 // Chain.
1152 movd ve1, (1+8)*4(E, offset)
1153
1154 movd 2*4+Local(r4), ve2 // Get old word.
1155 pxor ve1, ve2 // Chain.
1156 movd ve2, (2+8)*4(E, offset)
1157
1158 movd 3*4+Local(r4), ve3 // Get old word.
1159 pxor ve2, ve3 // Chain.
1160 movd ve3, (3+8)*4(E, offset)
1161
1162 xor r0, r0 // Return success.
1163
1164 // Pop stack and restore registers.
1165 movaps 7*16(r4), %xmm7
1166 movaps 6*16(r4), %xmm6
1167 movaps 5*16(r4), %xmm5
1168 movaps 4*16(r4), %xmm4
1169 movaps 3*16(r4), %xmm3
1170 movaps 2*16(r4), %xmm2
1171 movaps 1*16(r4), %xmm1
1172 movaps 0*16(r4), %xmm0
1173 #if 0 < LocalsSize
1174 add $Padding + LocalsSize, r4
1175 #endif
1176 #if defined __i386__
1177 pop r7
1178 pop r6
1179 pop r5
1180 #endif
1181 pop r3
1182
1183 ret
1184
1185
1186#undef Address
1187#undef Argument
1188#undef E
1189#undef ITable
1190#undef K
1191#undef Local
1192#undef LocalsSize
1193#undef LookupI
1194#undef LookupS
1195#undef Nk
1196#undef Padding
1197#undef R
1198#undef SaveSize
1199#undef STable
1200#undef StackFrame
1201#undef dr
1202#undef drh
1203#undef drl
1204#undef offset
1205#undef t0
1206#undef t0d
1207#undef ve0
1208#undef ve1
1209#undef ve2
1210#undef ve3
1211#undef ve4
1212#undef ve5
1213#undef vt0
1214#undef vt1