1 /* This file defines _aes_decrypt_key, _aes_decrypt_key128,
2 _aes_decrypt_key192, and _aes_decrypt_key256. It is designed to be
3 included in another assembly file with the preprocessor #include directive,
4 to benefit from some assembly-time calculations.
6 Written by Eric Postpischil, January 2008.
8 The comments here do not say much about the algorithm; the code just
9 follows the FIPS-197 specification. I recommend reading the specification
10 before working with this code or examining the C code in the parent
11 directory that illustrates key expansion.
13 One complication is that this routine both expands the key and applies
14 InvMixColumn to most of the words in the expanded key. This modifies the
15 key for use with the Equivalent Inverse Cipher.
17 During key expansion, sequences of four or six words are computed as follows:
20 E[i+0] = E[i+0-Nk] ^ f(E[i-1]), where f is some function.
21 E[i+1] = E[i+1-Nk] ^ E[i+0].
22 E[i+2] = E[i+2-Nk] ^ E[i+1].
23 E[i+3] = E[i+3-Nk] ^ E[i+2].
25 When Nk is four or eight, the sequence stops there. When it is six, it
26 goes on for two more words. Let I be the InvMixColumn function. For the
27 Equivalent Inverse Cipher, we want to store I(E[i+0]), I(E[i+1]),
28 I(E[i+2]), I(E[i+3]) (and two more when Nk is six). However, we do not
29 need to calculate I four times. In AES' finite field, I is a linear
30 combination of the four bytes of its input. The ^ operation on the bits
31 that represent field elements is an addition in the Galois field. So
32 I(a ^ b) = I(a) ^ I(b). Then we have:
34 I(E[i+0]) = I(E[i+0-Nk] ^ f(E[i-1])) = I(E[i+0-Nk]) ^ I(f(E[i-1])).
35 I(E[i+1]) = I(E[i+1-Nk]) ^ I(E[i+0]).
36 I(E[i+2]) = I(E[i+2-Nk]) ^ I(E[i+1]).
37 I(E[i+3]) = I(E[i+3-Nk]) ^ I(E[i+2]).
39 To compute this, we compute I(f(E[i-1])) and XOR it with the previously
40 stored I(E[i+0-Nk]) to get I(E[i+0]). Then we XOR that with the previously
41 stored I(E[i+1-Nk]) to get I(E[i+1]), and so on.
43 Note that to compute I(f(E[i-1])), we need to have E[i-1]. So we have to
44 compute the pre-InvMixColumn words of the expanded key; it is not
45 sufficient to have the post-InvMixColumn words.
53 _aes_decrypt_key128, _aes_decrypt_key192, and _aes_decrypt_key256.
57 Expand the user's cipher key into the key schedule, as defined in
58 Federal Information Processing Standards Publication 197 (FIPS-197),
61 For decryption, the key is modified as shown in Figure 15 in FIPS-197,
62 to support the Equivalent Inverse Cipher.
68 The following names must be locally defined so the assembler
69 can calculate certain offsets.
71 static const Word _AESSubBytesWordTable[4][256].
73 _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
74 SubBytes is defined in FIPS-197. _AESSubBytesWordTable
75 differs from _AESEncryptTable in that it does not include
76 the MixColumn operation. It is used in performing the last
77 round, which differs from the previous rounds in that it
78 does not include the MixColumn operation.
80 static const Word _AESInvMixColumnTable[4][256].
82 _AESInvMixColumnTable[i][j] contains the contribution of byte
83 j to element i of the InvMixColumn operation.
85 The four bytes of the word _AESInvMixColumnTable[0][j] are:
87 {0xe}*{j}, {0x9}*{j}, {0xd}*{j}, {0xb}*{j},
89 listed in increasing address order, where multiplication is
90 performed in the Galois field. {j} designates the element of
91 the Galois field represented by j. _AESInvMixColumnTable[i][j] has
92 the same bytes, rotated right in the order shown above.
94 static const Byte _AESRcon[].
96 Round constants, beginning with _AESRcon[1] for the first round
97 (_AESRcon[0] is padding).
103 Address of user's cipher key.
107 Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in the user's cipher key.
110 This argument is used with _aes_decrypt_key. It is not
111 present for the other routines. In those routines, Context
112 is the second argument.
114 aes_decrypt_ctx *Context
116 Structure to contain the expanded key beginning at offset
117 ContextKey and a four-byte "key length" beginning at offset
118 ContextKeyLength. The "key length" is the number of bytes from
119 the start of the first round key to the startof the last rond
120 key. That is 16 less than the number of bytes in the entire
125 The expanded key and the "key length" are written to *Context.
129 aes_rval // -1 if "key length" is invalid. 0 otherwise.
131 /* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */
134 #include <i386/cpu_capabilities.h>
136 #include <System/i386/cpu_capabilities.h>
// Register roles used by the key-expansion code in this file. The r0d, r1d,
// r2, ... names and the Arch() and TableSize() macros are not defined in this
// section; presumably the including file provides them (this file is meant
// to be brought in with #include).
139 #define dr r0d // Dissection register.
140 #define drl r0l // Low 8 bits of dissection register.
141 #define drh r0h // Second-lowest 8 bits of dissection register.
144 #define t0d r1d // Low 32 bits of t0.
146 #define STable r2 // Address of SubBytes table. Overlaps Nk.
147 #define ITable r3 // Address of InvMixColumn table.
148 #define offset Arch(r5, r11) // Address offset and loop sentinel.
// R and K are both r7. The user key pointer is lost as soon as R is loaded,
// so every read through K has to happen before R is written.
150 #define R r7 // Address of round constant.
151 #define K r7 // User key pointer.
154 #define E r6 // Expanded key pointer.
// LookupS and LookupI build an AT&T memory operand of the form
// displacement(base, index, 4): the base is the table address, the index
// register is scaled by 4 to select a four-byte entry, and the displacement
// (table)*TableSize selects one of the sub-tables. TableSize is presumably
// the size in bytes of one sub-table -- confirm against its definition.
165 #define LookupS(table, index) (table)*TableSize(STable, index, 4)
166 #define LookupI(table, index) (table)*TableSize(ITable, index, 4)
169 /* InvMixColumn puts InvMixColumn(dr) into vt0. This is a non-standard
170 subroutine. It does not conform to the ABI. It is an integral part of
171 _ExpandKeyForDecryption and shares register use with it.
175 movd LookupI(0, t0), vt0 // Look up byte 0 in table 0.
177 movd LookupI(1, t0), vt1 // Look up byte 1 in table 1.
181 movd LookupI(2, t0), vt1 // Look up byte 2 in table 2.
184 movd LookupI(3, t0), vt1 // Look up byte 3 in table 3.
189 // SubWordRotWord adds (XORs) SubWord(RotWord(dr)) to vt0.
190 .macro SubWordRotWord
192 movd LookupS(3, t0), vt1 // Look up byte 0 in table 3.
195 movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
199 movd LookupS(1, t0), vt1 // Look up byte 2 in table 1.
202 movd LookupS(2, t0), vt1 // Look up byte 3 in table 2.
207 // SubWord puts SubWord(dr) into vt0.
210 movd LookupS(0, t0), vt0 // Look up byte 0 in table 0.
212 movd LookupS(1, t0), vt1 // Look up byte 1 in table 1.
216 movd LookupS(2, t0), vt1 // Look up byte 2 in table 2.
219 movd LookupS(3, t0), vt1 // Look up byte 3 in table 3.
224 .globl _aes_decrypt_key
225 // .private_extern _aes_decrypt_key
228 // detect AES HW, cclee 3-13-10
229 #if defined __x86_64__
230 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
231 mov (%rax), %eax // %eax = __cpu_capabilities
234 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
235 mov (%eax), %eax // %eax = __cpu_capabilities
237 mov _COMM_PAGE_CPU_CAPABILITIES, %eax
241 test $(kHasAES), %eax // __cpu_capabilities & kHasAES
242 jne _aes_decrypt_key_hw // if AES HW detected, branch to _aes_decrypt_key_hw
243 /* Save registers and set SaveSize to the number of bytes pushed onto the
244 stack so far, including the caller's return address.
251 #define SaveSize (5*4)
253 #define SaveSize (2*8)
256 /* Number of bytes used for local variables:
258 8 16-byte spaces to save XMM registers.
260 8 four-byte spaces for work.
262 #define LocalsSize (8*16 + 8*4)
264 // Define stack offset to storage space for local data.
268 // Padding to position stack pointer at a multiple of 16 bytes.
269 #define Padding (15 & -(SaveSize + LocalsSize))
270 sub $Padding + LocalsSize, r4 // Allocate space on stack.
275 /* StackFrame is the number of bytes in our stack frame, from caller's
276 stack pointer to ours (so it includes the return address).
278 #define StackFrame (SaveSize + Padding + LocalsSize)
280 // Save xmm registers.
281 movaps %xmm0, 0*16(r4)
282 movaps %xmm1, 1*16(r4)
283 movaps %xmm2, 2*16(r4)
284 movaps %xmm3, 3*16(r4)
285 movaps %xmm4, 4*16(r4)
286 movaps %xmm5, 5*16(r4)
287 movaps %xmm6, 6*16(r4)
288 movaps %xmm7, 7*16(r4)
292 // Define location of argument i.
293 #define Argument(i) StackFrame+4*(i)(r4)
302 #elif defined __x86_64__
304 #define Nk r9d // Number of words in key.
305 mov r6d, Nk // Move Nk argument out of way.
306 mov r2, E // Move E argument to common register.
310 // Dispatch on key length.
313 shl $3, Nk // Convert from bytes to bits.
321 mov $-1, r0 // Return error.
325 .globl _aes_decrypt_key128
326 // .private_extern _aes_decrypt_key128
329 /* Save registers and set SaveSize to the number of bytes pushed onto the
330 stack so far, including the caller's return address.
337 #define SaveSize (5*4)
339 #define SaveSize (2*8)
342 /* Number of bytes used for local variables:
344 8 16-byte spaces to save XMM registers.
346 8 four-byte spaces for work.
348 #define LocalsSize (8*16 + 8*4)
350 // Define stack offset to storage space for local data.
354 // Padding to position stack pointer at a multiple of 16 bytes.
355 #define Padding (15 & -(SaveSize + LocalsSize))
356 sub $Padding + LocalsSize, r4 // Allocate space on stack.
361 /* StackFrame is the number of bytes in our stack frame, from caller's
362 stack pointer to ours (so it includes the return address).
364 #define StackFrame (SaveSize + Padding + LocalsSize)
366 // Save xmm registers.
367 movaps %xmm0, 0*16(r4)
368 movaps %xmm1, 1*16(r4)
369 movaps %xmm2, 2*16(r4)
370 movaps %xmm3, 3*16(r4)
371 movaps %xmm4, 4*16(r4)
372 movaps %xmm5, 5*16(r4)
373 movaps %xmm6, 6*16(r4)
374 movaps %xmm7, 7*16(r4)
379 #define Argument(i) StackFrame+4*(i)(r4)
385 // Merge point for _aes_decrypt_key and _aes_decrypt_key128.
388 // First words of expanded key are copied from user key.
394 movl $10*16, ContextKeyLength(E) // Set "key length."
400 // K cannot be used after we write to R, since they use the same register.
405 lea _AESInvMixColumnTable, ITable
406 lea _AESSubBytesWordTable, STable
408 #elif defined __x86_64__
410 lea _AESRcon(%rip), R
411 lea _AESInvMixColumnTable(%rip), ITable
412 lea _AESSubBytesWordTable(%rip), STable
416 /* With a four-word key, there are ten rounds (eleven 16-byte key blocks),
417 nine of which have InvMixColumn applied.
422 // Store initial words of expanded key, which are copies of user's key.
423 movd ve0, 0*4(E, offset)
424 movd ve1, 1*4(E, offset)
425 movd ve2, 2*4(E, offset)
426 movd ve3, 3*4(E, offset)
428 /* Here is the first iteration of the key expansion. It is separate from the
429 main loop below because we need to apply InvMixColumn to each of the
430 outputs, in ve0 through ve3. In the main loop, the technique described at
431 the top of this file is used to compute the proper outputs while using
432 InvMixColumn only once.
434 add $1, R // Advance pointer.
435 movd ve3, dr // Put previous word into work register.
436 movzx (R), t0d // Get round constant.
442 // Chain to successive words.
449 /* Apply InvMixColumn to each word. The transformed values are stored in
450 the expanded key. The original values are retained in registers for
455 movd vt0, 0*4(E, offset)
459 movd vt0, 1*4(E, offset)
463 movd vt0, 2*4(E, offset)
467 movd vt0, 3*4(E, offset)
469 // Here is the main loop.
471 add $1, R // Advance pointer.
472 movd ve3, dr // Put previous word into work register.
473 movzx (R), t0d // Get round constant.
479 // Chain to successive words.
483 /* Dr. Brian Gladman uses a technique with a single XOR here instead
484 of the previous four. There is some periodic behavior in the key
485 expansion, and Gladman maintains E[4*i+3] for the latest four
486 values of i. XORing the value in vt0 with one of these yields its
487 replacement. However, using this technique requires additional
488 instructions before the loop (to initialize the values) and after
489 it (to extract the final values to be stored) and either some way
490 to rotate or index four values in the loop or a four-fold unrolling
491 of the loop to provide the indexing. Experiment suggests the
492 former is not worthwhile. Unrolling the loop might give a small
493 gain, at the cost of increased use of instruction cache, increased
494 instruction loads the first time the routine is executed, and
495 increased code complexity, so I decided against it.
498 // Apply InvMixColumn to the difference.
504 // Chain the transformed difference to previously transformed outputs.
505 movd (0-4)*4(E, offset), vt1
507 movd vt0, 0*4(E, offset)
509 movd (1-4)*4(E, offset), vt1
511 movd vt0, 1*4(E, offset)
513 movd (2-4)*4(E, offset), vt1
515 movd vt0, 2*4(E, offset)
517 movd (3-4)*4(E, offset), vt1
519 movd vt0, 3*4(E, offset)
523 // Here is the final iteration, which does not perform InvMixColumn.
525 movd ve3, dr // Put previous word into work register.
526 movzx 1(R), t0d // Get round constant.
532 // Chain to successive words.
533 movd ve0, 4*4(E, offset)
535 movd ve1, 5*4(E, offset)
537 movd ve2, 6*4(E, offset)
539 movd ve3, 7*4(E, offset)
541 xor r0, r0 // Return success.
544 // Pop stack and restore registers.
545 movaps 7*16(r4), %xmm7
546 movaps 6*16(r4), %xmm6
547 movaps 5*16(r4), %xmm5
548 movaps 4*16(r4), %xmm4
549 movaps 3*16(r4), %xmm3
550 movaps 2*16(r4), %xmm2
551 movaps 1*16(r4), %xmm1
552 movaps 0*16(r4), %xmm0
554 add $Padding + LocalsSize, r4
566 .globl _aes_decrypt_key192
567 // .private_extern _aes_decrypt_key192
570 /* Save registers and set SaveSize to the number of bytes pushed onto the
571 stack so far, including the caller's return address.
578 #define SaveSize (5*4)
580 #define SaveSize (2*8)
583 /* Number of bytes used for local variables:
585 8 16-byte spaces to save XMM registers.
587 8 four-byte spaces for work.
589 #define LocalsSize (8*16 + 8*4)
591 // Define stack offset to storage space for local data.
595 // Padding to position stack pointer at a multiple of 16 bytes.
596 #define Padding (15 & -(SaveSize + LocalsSize))
597 sub $Padding + LocalsSize, r4 // Allocate space on stack.
602 /* StackFrame is the number of bytes in our stack frame, from caller's
603 stack pointer to ours (so it includes the return address).
605 #define StackFrame (SaveSize + Padding + LocalsSize)
607 // Save xmm registers.
608 movaps %xmm0, 0*16(r4)
609 movaps %xmm1, 1*16(r4)
610 movaps %xmm2, 2*16(r4)
611 movaps %xmm3, 3*16(r4)
612 movaps %xmm4, 4*16(r4)
613 movaps %xmm5, 5*16(r4)
614 movaps %xmm6, 6*16(r4)
615 movaps %xmm7, 7*16(r4)
620 #define Argument(i) StackFrame+4*(i)(r4)
626 // Merge point for _aes_decrypt_key and _aes_decrypt_key192.
629 // First words of expanded key are copied from user key.
635 movl $12*16, ContextKeyLength(E) // Set "key length."
644 // K cannot be used after we write to R, since they use the same register.
649 lea _AESInvMixColumnTable, ITable
650 lea _AESSubBytesWordTable, STable
652 #elif defined __x86_64__
654 lea _AESRcon(%rip), R
655 lea _AESInvMixColumnTable(%rip), ITable
656 lea _AESSubBytesWordTable(%rip), STable
660 /* With a six-word key, there are twelve rounds (thirteen 16-byte key
661 blocks), eleven of which have InvMixColumn applied. The key expansion
662 proceeds in iterations of six four-byte words, so the termination
663 condition is a bit complicated. We set offset to the negative of 10
664 four four-byte words, and the loop branch does another iteration if
665 offset is less than or equal to zero, meaning the number of iterations
666 performed so far is less than or equal to 10. Thus, after ten
667 iterations, it branches again. After the eleventh iteration, it
668 stops. Code after the end of the loop computes the twelfth key block,
669 which does not have InvMixColumn applied.
674 // Store initial words of expanded key, which are copies of user's key.
675 movd ve0, 0*4(E, offset)
676 movd ve1, 1*4(E, offset)
677 movd ve2, 2*4(E, offset)
678 movd ve3, 3*4(E, offset)
680 /* The first four words are stored untransformed. After that, words in
681 the expanded key are transformed by InvMixColumn.
685 movd vt0, 4*4(E, offset)
689 movd vt0, 5*4(E, offset)
691 /* Here is the first iteration of the key expansion. It is separate from the
692 main loop below because we need to apply InvMixColumn to each of the
693 outputs, in ve0 through ve5. In the main loop, the technique described at
694 the top of this file is used to compute the proper outputs while using
695 InvMixColumn only once.
697 add $1, R // Advance pointer.
698 movd ve5, dr // Put previous word into work register.
699 movzx (R), t0d // Get round constant.
705 // Chain to successive words.
714 /* Apply InvMixColumn to each word. The transformed values are stored in
715 the expanded key. The original values are retained in registers for
720 movd vt0, 0*4(E, offset)
724 movd vt0, 1*4(E, offset)
728 movd vt0, 2*4(E, offset)
732 movd vt0, 3*4(E, offset)
734 movd (4-6)*4(E, offset), vt1
736 movd vt0, 4*4(E, offset)
738 movd (5-6)*4(E, offset), vt1
740 movd vt0, 5*4(E, offset)
742 // Here is the main loop.
744 add $1, R // Advance pointer.
745 movd ve5, dr // Put previous word into work register.
746 movzx (R), t0d // Get round constant.
752 // Chain to successive words.
759 // Apply InvMixColumn to the difference.
765 // Chain the transformed difference to previously transformed outputs.
766 movd (0-6)*4(E, offset), vt1
768 movd vt0, 0*4(E, offset)
770 movd (1-6)*4(E, offset), vt1
772 movd vt0, 1*4(E, offset)
774 movd (2-6)*4(E, offset), vt1
776 movd vt0, 2*4(E, offset)
778 movd (3-6)*4(E, offset), vt1
780 movd vt0, 3*4(E, offset)
782 movd (4-6)*4(E, offset), vt1
784 movd vt0, 4*4(E, offset)
786 movd (5-6)*4(E, offset), vt1
788 movd vt0, 5*4(E, offset)
792 // Here is the final iteration, which does not perform InvMixColumn.
794 movd ve5, dr // Put previous word into work register.
795 movzx 1(R), t0d // Get round constant.
801 // Chain to successive words.
802 movd ve0, 6*4(E, offset)
804 movd ve1, 7*4(E, offset)
806 movd ve2, 8*4(E, offset)
808 movd ve3, 9*4(E, offset)
810 xor r0, r0 // Return success.
812 // Pop stack and restore registers.
813 movaps 7*16(r4), %xmm7
814 movaps 6*16(r4), %xmm6
815 movaps 5*16(r4), %xmm5
816 movaps 4*16(r4), %xmm4
817 movaps 3*16(r4), %xmm3
818 movaps 2*16(r4), %xmm2
819 movaps 1*16(r4), %xmm1
820 movaps 0*16(r4), %xmm0
822 add $Padding + LocalsSize, r4
834 .globl _aes_decrypt_key256
835 // .private_extern _aes_decrypt_key256
838 /* Save registers and set SaveSize to the number of bytes pushed onto the
839 stack so far, including the caller's return address.
846 #define SaveSize (5*4)
848 #define SaveSize (2*8)
851 /* Number of bytes used for local variables:
853 8 16-byte spaces to save XMM registers.
855 8 four-byte spaces for work.
857 #define LocalsSize (8*16 + 8*4)
859 // Define stack offset to storage space for local data.
863 // Padding to position stack pointer at a multiple of 16 bytes.
864 #define Padding (15 & -(SaveSize + LocalsSize))
865 sub $Padding + LocalsSize, r4 // Allocate space on stack.
870 /* StackFrame is the number of bytes in our stack frame, from caller's
871 stack pointer to ours (so it includes the return address).
873 #define StackFrame (SaveSize + Padding + LocalsSize)
875 // Save xmm registers.
876 movaps %xmm0, 0*16(r4)
877 movaps %xmm1, 1*16(r4)
878 movaps %xmm2, 2*16(r4)
879 movaps %xmm3, 3*16(r4)
880 movaps %xmm4, 4*16(r4)
881 movaps %xmm5, 5*16(r4)
882 movaps %xmm6, 6*16(r4)
883 movaps %xmm7, 7*16(r4)
888 #define Argument(i) StackFrame+4*(i)(r4)
894 // Merge point for _aes_decrypt_key and _aes_decrypt_key256.
897 // First words of expanded key are copied from user key.
903 movl $14*16, ContextKeyLength(E) // Set "key length."
909 // Store initial words of expanded key, which are copies of user's key.
919 // K cannot be used after we write to R, since they use the same register.
924 lea _AESInvMixColumnTable, ITable
925 lea _AESSubBytesWordTable, STable
927 #elif defined __x86_64__
929 lea _AESRcon(%rip), R
930 lea _AESInvMixColumnTable(%rip), ITable
931 lea _AESSubBytesWordTable(%rip), STable
935 /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key
936 blocks), thirteen of which have InvMixColumn applied.
941 // Save untransformed values in stack area.
942 movd ve0, 4*4+Local(r4)
943 movd ve1, 5*4+Local(r4)
944 movd ve2, 6*4+Local(r4)
945 movd ve3, 7*4+Local(r4)
947 /* Apply InvMixColumn to words 4 through 7. The transformed values are
948 stored in the expanded key. The original values are saved in the stack
949 area for further computation.
953 movd vt0, 4*4(E, offset)
957 movd vt0, 5*4(E, offset)
961 movd vt0, 6*4(E, offset)
965 movd vt0, 7*4(E, offset)
967 /* Here is the first iteration of the key expansion. It is separate from the
968 main loop below because we need to apply InvMixColumn to each of the
969 outputs, in ve0 through ve3. In the main loop, the technique described at
970 the top of this file is used to compute the proper outputs while using
971 InvMixColumn only once.
973 add $1, R // Advance pointer.
974 movd ve3, dr // Put previous word into work register.
975 movzx (R), t0d // Get round constant.
982 movd (0-8)*4(E, offset), ve0 // Get old word.
984 movd ve0, 0*4+Local(r4) // Save on stack.
987 movd vt0, 0*4(E, offset) // Write to expanded key.
989 /* Chain to successive words and apply InvMixColumn to each word. The
990 transformed values are stored in the expanded key. The original
991 values are retained in local data for further computation.
993 movd (1-8)*4(E, offset), ve1 // Get old word.
994 pxor ve0, ve1 // Chain.
995 movd ve1, 1*4+Local(r4) // Save on stack.
998 movd vt0, 1*4(E, offset) // Write to expanded key.
1000 movd (2-8)*4(E, offset), ve2 // Get old word.
1001 pxor ve1, ve2 // Chain.
1002 movd ve2, 2*4+Local(r4) // Save on stack.
1005 movd vt0, 2*4(E, offset) // Write to expanded key.
1007 movd (3-8)*4(E, offset), ve3 // Get old word.
1008 pxor ve2, ve3 // Chain.
1009 movd ve3, 3*4+Local(r4) // Save on stack.
1012 movd vt0, 3*4(E, offset) // Write to expanded key.
1014 movd ve3, dr // Put previous word into work register.
1017 movd 4*4+Local(r4), ve0 // Get old word.
1018 pxor vt0, ve0 // Chain.
1019 movd ve0, 4*4+Local(r4) // Save on stack.
1021 movd 5*4+Local(r4), ve1 // Get old word.
1022 pxor ve0, ve1 // Chain.
1023 movd ve1, 5*4+Local(r4) // Save on stack.
1025 movd 6*4+Local(r4), ve2 // Get old word.
1026 pxor ve1, ve2 // Chain.
1027 movd ve2, 6*4+Local(r4) // Save on stack.
1029 movd 7*4+Local(r4), ve3 // Get old word.
1030 pxor ve2, ve3 // Chain.
1031 movd ve3, 7*4+Local(r4) // Save on stack.
1033 movd vt0, dr // Move change to work register.
1036 movd (4-8)*4(E, offset), vt1 // Get old word.
1037 pxor vt1, vt0 // Chain.
1038 movd vt0, 4*4(E, offset) // Write new word to expanded key.
1040 movd (5-8)*4(E, offset), vt1 // Get old word.
1041 pxor vt1, vt0 // Chain.
1042 movd vt0, 5*4(E, offset) // Write new word to expanded key.
1044 movd (6-8)*4(E, offset), vt1 // Get old word.
1045 pxor vt1, vt0 // Chain.
1046 movd vt0, 6*4(E, offset) // Write new word to expanded key.
1048 movd (7-8)*4(E, offset), vt1 // Get old word.
1049 pxor vt1, vt0 // Chain.
1050 movd vt0, 7*4(E, offset) // Write new word to expanded key.
1052 // Here is the main loop.
1054 add $1, R // Advance pointer.
1055 movd ve3, dr // Put previous word into work register.
1056 movzx (R), t0d // Get round constant.
1061 movd 0*4+Local(r4), ve0 // Get old word.
1063 movd ve0, 0*4+Local(r4) // Save on stack.
1065 // Chain to successive words.
1066 movd 1*4+Local(r4), ve1 // Get old word.
1067 pxor ve0, ve1 // Chain.
1068 movd ve1, 1*4+Local(r4) // Save on stack.
1070 movd 2*4+Local(r4), ve2 // Get old word.
1071 pxor ve1, ve2 // Chain.
1072 movd ve2, 2*4+Local(r4) // Save on stack.
1074 movd 3*4+Local(r4), ve3 // Get old word.
1075 pxor ve2, ve3 // Chain.
1076 movd ve3, 3*4+Local(r4) // Save on stack.
1078 movd vt0, dr // Move change to work register.
1081 movd 0*4(E, offset), vt1 // Get old word.
1082 pxor vt1, vt0 // Chain.
1083 movd vt0, (0+8)*4(E, offset) // Write new word to expanded key.
1085 movd 1*4(E, offset), vt1 // Get old word.
1086 pxor vt1, vt0 // Chain.
1087 movd vt0, (1+8)*4(E, offset) // Write new word to expanded key.
1089 movd 2*4(E, offset), vt1 // Get old word.
1090 pxor vt1, vt0 // Chain.
1091 movd vt0, (2+8)*4(E, offset) // Write new word to expanded key.
1093 movd 3*4(E, offset), vt1 // Get old word.
1094 pxor vt1, vt0 // Chain.
1095 movd vt0, (3+8)*4(E, offset) // Write new word to expanded key.
1097 movd ve3, dr // Put previous word into work register.
1100 movd 4*4+Local(r4), ve0 // Get old word.
1101 pxor vt0, ve0 // Chain.
1102 movd ve0, 4*4+Local(r4) // Save on stack.
1104 movd 5*4+Local(r4), ve1 // Get old word.
1105 pxor ve0, ve1 // Chain.
1106 movd ve1, 5*4+Local(r4) // Save on stack.
1108 movd 6*4+Local(r4), ve2 // Get old word.
1109 pxor ve1, ve2 // Chain.
1110 movd ve2, 6*4+Local(r4) // Save on stack.
1112 movd 7*4+Local(r4), ve3 // Get old word.
1113 pxor ve2, ve3 // Chain.
1114 movd ve3, 7*4+Local(r4) // Save on stack.
1116 movd vt0, dr // Move change to work register.
1119 movd 4*4(E, offset), vt1 // Get old word.
1120 pxor vt1, vt0 // Chain.
1121 movd vt0, (4+8)*4(E, offset) // Write new word to expanded key.
1123 movd 5*4(E, offset), vt1 // Get old word.
1124 pxor vt1, vt0 // Chain.
1125 movd vt0, (5+8)*4(E, offset) // Write new word to expanded key.
1127 movd 6*4(E, offset), vt1 // Get old word.
1128 pxor vt1, vt0 // Chain.
1129 movd vt0, (6+8)*4(E, offset) // Write new word to expanded key.
1131 movd 7*4(E, offset), vt1 // Get old word.
1132 pxor vt1, vt0 // Chain.
1133 movd vt0, (7+8)*4(E, offset) // Write new word to expanded key.
1139 movd ve3, dr // Put previous word into work register.
1140 movzx 1(R), t0d // Get round constant.
1145 movd 0*4+Local(r4), ve0 // Get old word.
1146 pxor vt0, ve0 // Chain.
1147 movd ve0, (0+8)*4(E, offset)
1149 // Chain to successive words.
1150 movd 1*4+Local(r4), ve1 // Get old word.
1151 pxor ve0, ve1 // Chain.
1152 movd ve1, (1+8)*4(E, offset)
1154 movd 2*4+Local(r4), ve2 // Get old word.
1155 pxor ve1, ve2 // Chain.
1156 movd ve2, (2+8)*4(E, offset)
1158 movd 3*4+Local(r4), ve3 // Get old word.
1159 pxor ve2, ve3 // Chain.
1160 movd ve3, (3+8)*4(E, offset)
1162 xor r0, r0 // Return success.
1164 // Pop stack and restore registers.
1165 movaps 7*16(r4), %xmm7
1166 movaps 6*16(r4), %xmm6
1167 movaps 5*16(r4), %xmm5
1168 movaps 4*16(r4), %xmm4
1169 movaps 3*16(r4), %xmm3
1170 movaps 2*16(r4), %xmm2
1171 movaps 1*16(r4), %xmm1
1172 movaps 0*16(r4), %xmm0
1174 add $Padding + LocalsSize, r4
1176 #if defined __i386__