--- /dev/null
+/* This file defines _aes_decrypt_key, _aes_decrypt_key128,
+ _aes_decrypt_key192, and _aes_decrypt_key256. It is designed to be
+ included in another assembly file with the preprocessor #include directive,
+ to benefit from some assembly-time calculations.
+
+ Written by Eric Postpischil, January 2008.
+
+ The comments here do not say much about the algorithm; the code just
+ follows the FIPS-197 specification. I recommend reading the specification
+ before working with this code or examining the C code in the parent
+ directory that illustrates key expansion.
+
+ One complication is that this routine both expands the key and applies
+ InvMixColumn to most of the words in the expanded key. This modifies the
+ key for use with the Equivalent Inverse Cipher.
+
+ During key expansion, there are sequences of four or six words that are
+ produced like this:
+
+ E[i+0] = E[i+0-Nk] ^ f(E[i-1]), where f is some function.
+ E[i+1] = E[i+1-Nk] ^ E[i+0].
+ E[i+2] = E[i+2-Nk] ^ E[i+1].
+ E[i+3] = E[i+3-Nk] ^ E[i+2].
+
+ When Nk is four or eight, the sequence stops there. When it is six, it
+ goes on for two more words. Let I be the InvMixColumn function. For the
+ Equivalent Inverse Cipher, we want to store I(E[i+0]), I(E[i+1]),
+ I(E[i+2]), I(E[i+3]) (and two more when Nk is six). However, we do not
+ need to calculate I four times. In AES' finite field, each byte of I's
+ output is a linear combination of the four bytes of its input, and the ^
+ operation on the bits that represent field elements is addition in the
+ Galois field. So I(a ^ b) = I(a) ^ I(b). Then we have:
+
+ I(E[i+0]) = I(E[i+0-Nk] ^ f(E[i-1])) = I(E[i+0-Nk]) ^ I(f(E[i-1])).
+ I(E[i+1]) = I(E[i+1-Nk]) ^ I(E[i+0]).
+ I(E[i+2]) = I(E[i+2-Nk]) ^ I(E[i+1]).
+ I(E[i+3]) = I(E[i+3-Nk]) ^ I(E[i+2]).
+
+ To compute this, we compute I(f(E[i-1])) and XOR it with the previously
+ stored I(E[i+0-Nk]) to get I(E[i+0]). Then we XOR that with the previously
+ stored I(E[i+1-Nk]) to get I(E[i+1]), and so on.
+
+ Note that to compute I(f(E[i-1])), we need to have E[i-1]. So we have to
+ compute the pre-InvMixColumn words of the expanded key; it is not
+ sufficient to have the post-InvMixColumn words.
+*/
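+
+/* As an illustrative sketch only (C, not part of this file; InvMixColumnWord,
+   SubWordRotWord, and Rcon stand for the table-driven helpers and the round
+   constant, and Nk here is four), one iteration of this scheme is:
+
+	Word f = SubWordRotWord(E[i-1]) ^ Rcon;	// f(E[i-1]).
+	E[i+0] = E[i+0-Nk] ^ f;			// Untransformed words, kept
+	E[i+1] = E[i+1-Nk] ^ E[i+0];		// so the next iteration can
+	E[i+2] = E[i+2-Nk] ^ E[i+1];		// form its f(E[i-1]).
+	E[i+3] = E[i+3-Nk] ^ E[i+2];
+
+	Word d = InvMixColumnWord(f);		// One InvMixColumn suffices.
+	d = D[i+0] = D[i+0-Nk] ^ d;		// I(E[i+0]).
+	d = D[i+1] = D[i+1-Nk] ^ d;		// I(E[i+1]).
+	d = D[i+2] = D[i+2-Nk] ^ d;		// I(E[i+2]).
+	d = D[i+3] = D[i+3-Nk] ^ d;		// I(E[i+3]).
+
+   where D[k] holds I(E[k]), the word actually stored in the expanded key for
+   the Equivalent Inverse Cipher.
+*/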
+
+
+/* Routine:
+
+ _aes_decrypt_key.
+
+ _aes_decrypt_key128, _aes_decrypt_key192, and _aes_decrypt_key256.
+
+ Function:
+
+ Expand the user's cipher key into the key schedule, as defined in
+ Federal Information Processing Standards Publication 197 (FIPS-197),
+ November 26, 2001.
+
+ For decryption, the key is modified as shown in Figure 15 in FIPS-197,
+ to support the Equivalent Inverse Cipher.
+
+ Input:
+
+ Constant data:
+
+ The following names must be locally defined so the assembler
+ can calculate certain offsets.
+
+ static const Word _AESSubBytesWordTable[4][256].
+
+ _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
+ SubBytes is defined in FIPS-197. _AESSubBytesWordTable
+ differs from _AESEncryptTable in that it does not include
+ the MixColumn operation. It is used in performing the last
+ round, which differs from the previous rounds in that it
+ does not include the MixColumn operation.
+
+ static const Word _AESInvMixColumnTable[4][256].
+
+ _AESInvMixColumnTable[i][j] contains the contribution of a byte
+ with value j in position i of a word to the InvMixColumn of that word.
+
+ The four bytes of the word _AESInvMixColumnTable[0][j] are:
+
+ {0xe}*{j}, {0x9}*{j}, {0xd}*{j}, {0xb}*{j},
+
+ listed in increasing address order, where multiplication is
+ performed in the Galois field. {j} designates the element of
+ the Galois field represented by j. _AESInvMixColumnTable[i][j] has
+ the same bytes, rotated right by i positions in the order shown above.
+
+ static const Byte _AESRcon[].
+
+ Round constants, beginning with _AESRcon[1] for the first round
+ (_AESRcon[0] is padding).
+
+ Arguments:
+
+ const uint8_t *Key
+
+ Address of user's cipher key.
+
+ int Length
+
+ Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in
+ user's cipher key.
+
+ This argument is used with _aes_decrypt_key. It is not
+ present for the other routines. In those routines, Context
+ is the second argument.
+
+ aes_decrypt_ctx *Context
+
+ Structure to contain the expanded key beginning at offset
+ ContextKey and a four-byte "key length" beginning at offset
+ ContextKeyLength. The "key length" is the number of bytes from
+ the start of the first round key to the start of the last round
+ key. That is 16 bytes less than the size of the entire expanded
+ key.
+
+ Output:
+
+ The expanded key and the "key length" are written to *Context.
+
+ Return:
+
+ aes_rval: -1 if the key length is invalid, 0 otherwise.
+*/
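+
+/* An illustrative usage sketch in C (assuming a Gladman-style aes.h declares
+   these routines and the aes_decrypt_ctx type; the header name and the error
+   handling shown here are placeholders):
+
+	#include "aes.h"
+
+	aes_decrypt_ctx ctx;
+	uint8_t key[32];			// User's 256-bit cipher key.
+
+	// Length may be given in bytes (32) or bits (256).
+	if (aes_decrypt_key(key, 32, &ctx) != 0)
+		return -1;			// -1 means the length was invalid.
+
+	// Or call the length-specific entry point, which takes no length:
+	aes_decrypt_key256(key, &ctx);
+*/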
+/* AES HW detection, with a branch to the HW routine when AES HW is present, added by cclee 3-12-10. */
+
+#ifdef KERNEL
+#include <i386/cpu_capabilities.h>
+#else
+#include <System/i386/cpu_capabilities.h>
+#endif
+
+#define dr r0d // Dissection register.
+#define drl r0l // Low 8 bits of dissection register.
+#define drh r0h // Second-lowest 8 bits of dissection register.
+
+#define t0 r1
+#define t0d r1d // Low 32 bits of t0.
+
+#define STable r2 // Address of SubBytes table. Overlaps Nk.
+#define ITable r3 // Address of InvMixColumn table.
+#define offset Arch(r5, r11) // Address offset and loop sentinel.
+
+#define R r7 // Address of round constant.
+#define K r7 // User key pointer.
+ // R and K overlap.
+
+#define E r6 // Expanded key pointer.
+
+#define ve0 %xmm0
+#define ve1 %xmm1
+#define ve2 %xmm2
+#define ve3 %xmm3
+#define ve4 %xmm4
+#define ve5 %xmm5
+#define vt1 %xmm6
+#define vt0 %xmm7
+
+#define LookupS(table, index) (table)*TableSize(STable, index, 4)
+#define LookupI(table, index) (table)*TableSize(ITable, index, 4)
+
+
+/* InvMixColumn puts InvMixColumn(dr) into vt0. This is a non-standard
+ subroutine. It does not conform to the ABI. It is an integral part of
+ _ExpandKeyForDecryption and shares register use with it.
+*/
+InvMixColumn:
+ movzx drl, t0
+ movd LookupI(0, t0), vt0 // Look up byte 0 in table 0.
+ movzx drh, t0d
+ movd LookupI(1, t0), vt1 // Look up byte 1 in table 1.
+ pxor vt1, vt0
+ shr $16, dr
+ movzx drl, t0d
+ movd LookupI(2, t0), vt1 // Look up byte 2 in table 2.
+ pxor vt1, vt0
+ movzx drh, t0d
+ movd LookupI(3, t0), vt1 // Look up byte 3 in table 3.
+ pxor vt1, vt0
+ ret
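+
+/* For reference, the same table-driven InvMixColumn in C (illustrative only;
+   Word is the includer's 32-bit word type, and the table is the
+   _AESInvMixColumnTable described above):
+
+	static Word InvMixColumnWord(Word w)
+	{
+		return _AESInvMixColumnTable[0][(w      ) & 0xff]
+		     ^ _AESInvMixColumnTable[1][(w >>  8) & 0xff]
+		     ^ _AESInvMixColumnTable[2][(w >> 16) & 0xff]
+		     ^ _AESInvMixColumnTable[3][(w >> 24) & 0xff];
+	}
+*/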
+
+
+ // SubWordRotWord adds (XORs) SubWord(RotWord(dr)) to vt0.
+ .macro SubWordRotWord
+ movzx drl, t0
+ movd LookupS(3, t0), vt1 // Look up byte 0 in table 3.
+ pxor vt1, vt0
+ movzx drh, t0d
+ movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
+ pxor vt1, vt0
+ shr $$16, dr
+ movzx drl, t0d
+ movd LookupS(1, t0), vt1 // Look up byte 2 in table 1.
+ pxor vt1, vt0
+ movzx drh, t0d
+ movd LookupS(2, t0), vt1 // Look up byte 3 in table 2.
+ pxor vt1, vt0
+ .endmacro
+
+
+ // SubWord puts SubWord(dr) into vt0.
+ .macro SubWord
+ movzx drl, t0
+ movd LookupS(0, t0), vt0 // Look up byte 0 in table 0.
+ movzx drh, t0d
+ movd LookupS(1, t0), vt1 // Look up byte 1 in table 1.
+ pxor vt1,vt0
+ shr $$16, dr
+ movzx drl, t0d
+ movd LookupS(2, t0), vt1 // Look up byte 2 in table 2.
+ pxor vt1,vt0
+ movzx drh, t0d
+ movd LookupS(3, t0), vt1 // Look up byte 3 in table 3.
+ pxor vt1,vt0
+ .endmacro
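+
+/* A C sketch of SubWordRotWord (illustrative only). RotWord moves the byte in
+   position i to position (i+3)&3, so byte i of the input is looked up in
+   table (i+3)&3; the macro XORs the result into vt0 rather than returning it:
+
+	static Word SubWordRotWordC(Word w)
+	{
+		return _AESSubBytesWordTable[3][(w      ) & 0xff]
+		     ^ _AESSubBytesWordTable[0][(w >>  8) & 0xff]
+		     ^ _AESSubBytesWordTable[1][(w >> 16) & 0xff]
+		     ^ _AESSubBytesWordTable[2][(w >> 24) & 0xff];
+	}
+
+   SubWord is the same except that byte i is looked up in table i, with no
+   rotation.
+*/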
+
+ .text
+ .globl _aes_decrypt_key
+// .private_extern _aes_decrypt_key
+_aes_decrypt_key:
+
+ // detect AES HW, cclee 3-13-10
+#if defined __x86_64__
+ movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
+ mov (%rax), %eax // %eax = __cpu_capabilities
+#else
+#if defined KERNEL
+ leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
+ mov (%eax), %eax // %eax = __cpu_capabilities
+#else
+ mov _COMM_PAGE_CPU_CAPABILITIES, %eax
+#endif
+
+#endif
+ test $(kHasAES), %eax // __cpu_capabilities & kHasAES
+ jne _aes_decrypt_key_hw // if AES HW detected, branch to _aes_decrypt_key_hw
+ /* Save registers and set SaveSize to the number of bytes pushed onto the
+ stack so far, including the caller's return address.
+ */
+ push r3
+ #if defined __i386__
+ push r5
+ push r6
+ push r7
+ #define SaveSize (5*4)
+ #else
+ #define SaveSize (2*8)
+ #endif
+
+ /* Number of bytes used for local variables:
+
+ 8 16-byte spaces to save XMM registers.
+
+ 8 four-byte spaces for work.
+ */
+ #define LocalsSize (8*16 + 8*4)
+
+ // Define stack offset to storage space for local data.
+ #define Local (8*16)
+
+ #if 0 < LocalsSize
+ // Padding to position stack pointer at a multiple of 16 bytes.
+ #define Padding (15 & -(SaveSize + LocalsSize))
+ sub $Padding + LocalsSize, r4 // Allocate space on stack.
+ #else
+ #define Padding 0
+ #endif
+
+ /* StackFrame is the number of bytes in our stack frame, from caller's
+ stack pointer to ours (so it includes the return address).
+ */
+ #define StackFrame (SaveSize + Padding + LocalsSize)
+
+ // Save xmm registers.
+ movaps %xmm0, 0*16(r4)
+ movaps %xmm1, 1*16(r4)
+ movaps %xmm2, 2*16(r4)
+ movaps %xmm3, 3*16(r4)
+ movaps %xmm4, 4*16(r4)
+ movaps %xmm5, 5*16(r4)
+ movaps %xmm6, 6*16(r4)
+ movaps %xmm7, 7*16(r4)
+
+#if defined __i386__
+
+ // Define location of argument i.
+ #define Argument(i) StackFrame+4*(i)(r4)
+
+ #define Nk t0d
+
+ // Load arguments.
+ mov Argument(2), E
+ mov Argument(1), Nk
+ mov Argument(0), K
+
+#elif defined __x86_64__
+
+ #define Nk r9d // Number of words in key.
+ mov r6d, Nk // Move Nk argument out of way.
+ mov r2, E // Move E argument to common register.
+
+#endif
+
+ // Dispatch on key length.
+ cmp $128, Nk
+ jge 2f
+ shl $3, Nk // Convert from bytes to bits.
+ cmp $128, Nk
+2:
+ je DKeyHas4Words
+ cmp $192, Nk
+ je DKeyHas6Words
+ cmp $256, Nk
+ je DKeyHas8Words
+ mov $-1, r0 // Return error.
+ jmp 9f
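+
+ /* A C sketch of the dispatch above (illustrative only):
+
+	if (Nk < 128)
+		Nk *= 8;		// Byte count (16, 24, 32) to bits.
+	switch (Nk)
+	{
+		case 128:	goto DKeyHas4Words;
+		case 192:	goto DKeyHas6Words;
+		case 256:	goto DKeyHas8Words;
+		default:	return -1;	// Invalid key length.
+	}
+ */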
+
+
+ .globl _aes_decrypt_key128
+// .private_extern _aes_decrypt_key128
+_aes_decrypt_key128:
+
+ /* Save registers and set SaveSize to the number of bytes pushed onto the
+ stack so far, including the caller's return address.
+ */
+ push r3
+ #if defined __i386__
+ push r5
+ push r6
+ push r7
+ #define SaveSize (5*4)
+ #else
+ #define SaveSize (2*8)
+ #endif
+
+ /* Number of bytes used for local variables:
+
+ 8 16-byte spaces to save XMM registers.
+
+ 8 four-byte spaces for work.
+ */
+ #define LocalsSize (8*16 + 8*4)
+
+ // Define stack offset to storage space for local data.
+ #define Local (8*16)
+
+ #if 0 < LocalsSize
+ // Padding to position stack pointer at a multiple of 16 bytes.
+ #define Padding (15 & -(SaveSize + LocalsSize))
+ sub $Padding + LocalsSize, r4 // Allocate space on stack.
+ #else
+ #define Padding 0
+ #endif
+
+ /* StackFrame is the number of bytes in our stack frame, from caller's
+ stack pointer to ours (so it includes the return address).
+ */
+ #define StackFrame (SaveSize + Padding + LocalsSize)
+
+ // Save xmm registers.
+ movaps %xmm0, 0*16(r4)
+ movaps %xmm1, 1*16(r4)
+ movaps %xmm2, 2*16(r4)
+ movaps %xmm3, 3*16(r4)
+ movaps %xmm4, 4*16(r4)
+ movaps %xmm5, 5*16(r4)
+ movaps %xmm6, 6*16(r4)
+ movaps %xmm7, 7*16(r4)
+
+#if defined __i386__
+
+ // Load arguments.
+ #define Argument(i) StackFrame+4*(i)(r4)
+ mov Argument(1), E
+ mov Argument(0), K
+
+#endif
+
+// Merge point for _aes_decrypt_key and _aes_decrypt_key128.
+DKeyHas4Words:
+
+ // First words of expanded key are copied from user key.
+ movd 0*4(K), ve0
+ movd 1*4(K), ve1
+ movd 2*4(K), ve2
+ movd 3*4(K), ve3
+
+ movl $10*16, ContextKeyLength(E) // Set "key length."
+
+ #if 0 != ContextKey
+ add $ContextKey, E
+ #endif
+
+ // K cannot be used after we write to R, since they use the same register.
+
+ #if defined __i386__
+
+ lea _AESRcon, R
+ lea _AESInvMixColumnTable, ITable
+ lea _AESSubBytesWordTable, STable
+
+ #elif defined __x86_64__
+
+ lea _AESRcon(%rip), R
+ lea _AESInvMixColumnTable(%rip), ITable
+ lea _AESSubBytesWordTable(%rip), STable
+
+ #endif
+
+ /* With a four-word key, there are ten rounds (eleven 16-byte key blocks),
+ nine of which have InvMixColumn applied.
+ */
+ mov $-9*4*4, offset
+ sub offset, E
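+
+ /* offset runs from -9*4*4 = -144 up to 0 in steps of 4*4 bytes, and E has
+    been advanced so that (E, offset) addresses the four-word block being
+    written; the loop's "jl" falls through once offset reaches zero, leaving
+    the final, untransformed block to the code after the loop.
+ */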
+
+ // Store initial words of expanded key, which are copies of user's key.
+ movd ve0, 0*4(E, offset)
+ movd ve1, 1*4(E, offset)
+ movd ve2, 2*4(E, offset)
+ movd ve3, 3*4(E, offset)
+
+/* Here is the first iteration of the key expansion. It is separate from the
+ main loop below because we need to apply InvMixColumn to each of the
+ outputs, in ve0 through ve3. In the main loop, the technique described at
+ the top of this file is used to compute the proper outputs while using
+ InvMixColumn only once.
+*/
+ add $1, R // Advance pointer.
+ movd ve3, dr // Put previous word into work register.
+ movzx (R), t0d // Get round constant.
+ movd t0d, vt0
+
+ SubWordRotWord
+ pxor vt0, ve0
+
+ // Chain to successive words.
+ pxor ve0, ve1
+ pxor ve1, ve2
+ pxor ve2, ve3
+
+ add $4*4, offset
+
+ /* Apply InvMixColumn to each word. The transformed values are stored in
+ the expanded key. The original values are retained in registers for
+ further computation.
+ */
+ movd ve0, dr
+ call InvMixColumn
+ movd vt0, 0*4(E, offset)
+
+ movd ve1, dr
+ call InvMixColumn
+ movd vt0, 1*4(E, offset)
+
+ movd ve2, dr
+ call InvMixColumn
+ movd vt0, 2*4(E, offset)
+
+ movd ve3, dr
+ call InvMixColumn
+ movd vt0, 3*4(E, offset)
+
+// Here is the main loop.
+1:
+ add $1, R // Advance pointer.
+ movd ve3, dr // Put previous word into work register.
+ movzx (R), t0d // Get round constant.
+ movd t0d, vt0
+
+ SubWordRotWord
+ pxor vt0, ve0
+
+ // Chain to successive words.
+ pxor ve0, ve1
+ pxor ve1, ve2
+ pxor ve2, ve3
+ /* Dr. Brian Gladman uses a technique with a single XOR here instead
+ of the previous four. There is some periodic behavior in the key
+ expansion, and Gladman maintains E[4*i+3] for the latest four
+ values of i. XORing the value in vt0 with one of these yields its
+ replacement. However, using this technique requires additional
+ instructions before the loop (to initialize the values) and after
+ it (to extract the final values to be stored) and either some way
+ to rotate or index four values in the loop or a four-fold unrolling
+ of the loop to provide the indexing. Experiment suggests the
+ former is not worthwhile. Unrolling the loop might give a small
+ gain, at the cost of increased use of instruction cache, increased
+ instructions loads the first time the routine is executed, and
+ increased code complexity, so I decided against it.
+ */
+
+ // Apply InvMixColumn to the difference.
+ movd vt0, dr
+ call InvMixColumn
+
+ add $4*4, offset
+
+ // Chain the transformed difference to previously transformed outputs.
+ movd (0-4)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 0*4(E, offset)
+
+ movd (1-4)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 1*4(E, offset)
+
+ movd (2-4)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 2*4(E, offset)
+
+ movd (3-4)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 3*4(E, offset)
+
+ jl 1b
+
+// Here is the final iteration, which does not perform InvMixColumn.
+
+ movd ve3, dr // Put previous word into work register.
+ movzx 1(R), t0d // Get round constant.
+ movd t0d, vt0
+
+ SubWordRotWord
+ pxor vt0, ve0
+
+ // Chain to successive words.
+ movd ve0, 4*4(E, offset)
+ pxor ve0, ve1
+ movd ve1, 5*4(E, offset)
+ pxor ve1, ve2
+ movd ve2, 6*4(E, offset)
+ pxor ve2, ve3
+ movd ve3, 7*4(E, offset)
+
+ xor r0, r0 // Return success.
+
+9:
+ // Pop stack and restore registers.
+ movaps 7*16(r4), %xmm7
+ movaps 6*16(r4), %xmm6
+ movaps 5*16(r4), %xmm5
+ movaps 4*16(r4), %xmm4
+ movaps 3*16(r4), %xmm3
+ movaps 2*16(r4), %xmm2
+ movaps 1*16(r4), %xmm1
+ movaps 0*16(r4), %xmm0
+ #if 0 < LocalsSize
+ add $Padding + LocalsSize, r4
+ #endif
+ #if defined __i386__
+ pop r7
+ pop r6
+ pop r5
+ #endif
+ pop r3
+
+ ret
+
+
+ .globl _aes_decrypt_key192
+// .private_extern _aes_decrypt_key192
+_aes_decrypt_key192:
+
+ /* Save registers and set SaveSize to the number of bytes pushed onto the
+ stack so far, including the caller's return address.
+ */
+ push r3
+ #if defined __i386__
+ push r5
+ push r6
+ push r7
+ #define SaveSize (5*4)
+ #else
+ #define SaveSize (2*8)
+ #endif
+
+ /* Number of bytes used for local variables:
+
+ 8 16-byte spaces to save XMM registers.
+
+ 8 four-byte spaces for work.
+ */
+ #define LocalsSize (8*16 + 8*4)
+
+ // Define stack offset to storage space for local data.
+ #define Local (8*16)
+
+ #if 0 < LocalsSize
+ // Padding to position stack pointer at a multiple of 16 bytes.
+ #define Padding (15 & -(SaveSize + LocalsSize))
+ sub $Padding + LocalsSize, r4 // Allocate space on stack.
+ #else
+ #define Padding 0
+ #endif
+
+ /* StackFrame is the number of bytes in our stack frame, from caller's
+ stack pointer to ours (so it includes the return address).
+ */
+ #define StackFrame (SaveSize + Padding + LocalsSize)
+
+ // Save xmm registers.
+ movaps %xmm0, 0*16(r4)
+ movaps %xmm1, 1*16(r4)
+ movaps %xmm2, 2*16(r4)
+ movaps %xmm3, 3*16(r4)
+ movaps %xmm4, 4*16(r4)
+ movaps %xmm5, 5*16(r4)
+ movaps %xmm6, 6*16(r4)
+ movaps %xmm7, 7*16(r4)
+
+#if defined __i386__
+
+ // Load arguments.
+ #define Argument(i) StackFrame+4*(i)(r4)
+ mov Argument(1), E
+ mov Argument(0), K
+
+#endif
+
+// Merge point for _aes_decrypt_key and _aes_decrypt_key192.
+DKeyHas6Words:
+
+ // First words of expanded key are copied from user key.
+ movd 0*4(K), ve0
+ movd 1*4(K), ve1
+ movd 2*4(K), ve2
+ movd 3*4(K), ve3
+
+ movl $12*16, ContextKeyLength(E) // Set "key length."
+
+ #if 0 != ContextKey
+ add $ContextKey, E
+ #endif
+
+ movd 4*4(K), ve4
+ movd 5*4(K), ve5
+
+ // K cannot be used after we write to R, since they use the same register.
+
+ #if defined __i386__
+
+ lea _AESRcon, R
+ lea _AESInvMixColumnTable, ITable
+ lea _AESSubBytesWordTable, STable
+
+ #elif defined __x86_64__
+
+ lea _AESRcon(%rip), R
+ lea _AESInvMixColumnTable(%rip), ITable
+ lea _AESSubBytesWordTable(%rip), STable
+
+ #endif
+
+ /* With a six-word key, there are twelve rounds (thirteen 16-byte key
+ blocks), eleven of which have InvMixColumn applied. The key expansion
+ proceeds in iterations of six four-byte words, so the termination
+ condition is a bit complicated. We set offset to the negative of ten
+ blocks of four four-byte words (-10*4*4 bytes) and advance it by six
+ words (6*4 bytes) per iteration; the loop branch does another iteration
+ while offset is less than or equal to zero. One iteration is performed
+ before the loop and the loop performs six more; together they produce
+ words 6 through 47 of the expanded key. Code after the end of the loop
+ computes the final key block (words 48 through 51), which does not have
+ InvMixColumn applied.
+ */
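+ /* Concretely: offset is -160 when the words copied from the user's key
+    are stored, -136 during the separate first iteration's stores, and
+    then -112, -88, -64, -40, -16, and +8 on successive trips through the
+    loop; "jle" falls through at +8, and the code below the loop writes
+    the final, untransformed key block.
+ */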
+ mov $-10*4*4, offset
+ sub offset, E
+
+ // Store initial words of expanded key, which are copies of user's key.
+ movd ve0, 0*4(E, offset)
+ movd ve1, 1*4(E, offset)
+ movd ve2, 2*4(E, offset)
+ movd ve3, 3*4(E, offset)
+
+ /* The first four words are stored untransformed. After that, words in
+ the expanded key are transformed by InvMixColumn.
+ */
+ movd ve4, dr
+ call InvMixColumn
+ movd vt0, 4*4(E, offset)
+
+ movd ve5, dr
+ call InvMixColumn
+ movd vt0, 5*4(E, offset)
+
+/* Here is the first iteration of the key expansion. It is separate from the
+ main loop below because we need to apply InvMixColumn to each of the
+ outputs, in ve0 through ve5. In the main loop, the technique described at
+ the top of this file is used to compute the proper outputs while using
+ InvMixColumn only once.
+*/
+ add $1, R // Advance pointer.
+ movd ve5, dr // Put previous word into work register.
+ movzx (R), t0d // Get round constant.
+ movd t0d, vt0
+
+ SubWordRotWord
+ pxor vt0, ve0
+
+ // Chain to successive words.
+ pxor ve0, ve1
+ pxor ve1, ve2
+ pxor ve2, ve3
+ pxor ve3, ve4
+ pxor ve4, ve5
+
+ add $6*4, offset
+
+ /* Apply InvMixColumn to each word. The transformed values are stored in
+ the expanded key. The original values are retained in registers for
+ further computation.
+ */
+ movd ve0, dr
+ call InvMixColumn
+ movd vt0, 0*4(E, offset)
+
+ movd ve1, dr
+ call InvMixColumn
+ movd vt0, 1*4(E, offset)
+
+ movd ve2, dr
+ call InvMixColumn
+ movd vt0, 2*4(E, offset)
+
+ movd ve3, dr
+ call InvMixColumn
+ movd vt0, 3*4(E, offset)
+
+ movd (4-6)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 4*4(E, offset)
+
+ movd (5-6)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 5*4(E, offset)
+
+// Here is the main loop.
+1:
+ add $1, R // Advance pointer.
+ movd ve5, dr // Put previous word into work register.
+ movzx (R), t0d // Get round constant.
+ movd t0d, vt0
+
+ SubWordRotWord
+ pxor vt0, ve0
+
+ // Chain to successive words.
+ pxor ve0, ve1
+ pxor ve1, ve2
+ pxor ve2, ve3
+ pxor ve3, ve4
+ pxor ve4, ve5
+
+ // Apply InvMixColumn to the difference.
+ movd vt0, dr
+ call InvMixColumn
+
+ add $6*4, offset
+
+ // Chain the transformed difference to previously transformed outputs.
+ movd (0-6)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 0*4(E, offset)
+
+ movd (1-6)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 1*4(E, offset)
+
+ movd (2-6)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 2*4(E, offset)
+
+ movd (3-6)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 3*4(E, offset)
+
+ movd (4-6)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 4*4(E, offset)
+
+ movd (5-6)*4(E, offset), vt1
+ pxor vt1, vt0
+ movd vt0, 5*4(E, offset)
+
+ jle 1b
+
+// Here is the final iteration, which does not perform InvMixColumn.
+
+ movd ve5, dr // Put previous word into work register.
+ movzx 1(R), t0d // Get round constant.
+ movd t0d, vt0
+
+ SubWordRotWord
+ pxor vt0, ve0
+
+ // Chain to successive words.
+ movd ve0, 6*4(E, offset)
+ pxor ve0, ve1
+ movd ve1, 7*4(E, offset)
+ pxor ve1, ve2
+ movd ve2, 8*4(E, offset)
+ pxor ve2, ve3
+ movd ve3, 9*4(E, offset)
+
+ xor r0, r0 // Return success.
+
+ // Pop stack and restore registers.
+ movaps 7*16(r4), %xmm7
+ movaps 6*16(r4), %xmm6
+ movaps 5*16(r4), %xmm5
+ movaps 4*16(r4), %xmm4
+ movaps 3*16(r4), %xmm3
+ movaps 2*16(r4), %xmm2
+ movaps 1*16(r4), %xmm1
+ movaps 0*16(r4), %xmm0
+ #if 0 < LocalsSize
+ add $Padding + LocalsSize, r4
+ #endif
+ #if defined __i386__
+ pop r7
+ pop r6
+ pop r5
+ #endif
+ pop r3
+
+ ret
+
+
+ .globl _aes_decrypt_key256
+// .private_extern _aes_decrypt_key256
+_aes_decrypt_key256:
+
+ /* Save registers and set SaveSize to the number of bytes pushed onto the
+ stack so far, including the caller's return address.
+ */
+ push r3
+ #if defined __i386__
+ push r5
+ push r6
+ push r7
+ #define SaveSize (5*4)
+ #else
+ #define SaveSize (2*8)
+ #endif
+
+ /* Number of bytes used for local variables:
+
+ 8 16-byte spaces to save XMM registers.
+
+ 8 four-byte spaces for work.
+ */
+ #define LocalsSize (8*16 + 8*4)
+
+ // Define stack offset to storage space for local data.
+ #define Local (8*16)
+
+ #if 0 < LocalsSize
+ // Padding to position stack pointer at a multiple of 16 bytes.
+ #define Padding (15 & -(SaveSize + LocalsSize))
+ sub $Padding + LocalsSize, r4 // Allocate space on stack.
+ #else
+ #define Padding 0
+ #endif
+
+ /* StackFrame is the number of bytes in our stack frame, from caller's
+ stack pointer to ours (so it includes the return address).
+ */
+ #define StackFrame (SaveSize + Padding + LocalsSize)
+
+ // Save xmm registers.
+ movaps %xmm0, 0*16(r4)
+ movaps %xmm1, 1*16(r4)
+ movaps %xmm2, 2*16(r4)
+ movaps %xmm3, 3*16(r4)
+ movaps %xmm4, 4*16(r4)
+ movaps %xmm5, 5*16(r4)
+ movaps %xmm6, 6*16(r4)
+ movaps %xmm7, 7*16(r4)
+
+#if defined __i386__
+
+ // Load arguments.
+ #define Argument(i) StackFrame+4*(i)(r4)
+ mov Argument(1), E
+ mov Argument(0), K
+
+#endif
+
+// Merge point for _aes_decrypt_key and _aes_decrypt_key256.
+DKeyHas8Words:
+
+ // First words of expanded key are copied from user key.
+ movd 0*4(K), ve0
+ movd 1*4(K), ve1
+ movd 2*4(K), ve2
+ movd 3*4(K), ve3
+
+ movl $14*16, ContextKeyLength(E) // Set "key length."
+
+ #if 0 != ContextKey
+ add $ContextKey, E
+ #endif
+
+ // Store initial words of expanded key, which are copies of user's key.
+ movd ve0, 0*4(E)
+ movd ve1, 1*4(E)
+ movd ve2, 2*4(E)
+ movd ve3, 3*4(E)
+ movd 4*4(K), ve0
+ movd 5*4(K), ve1
+ movd 6*4(K), ve2
+ movd 7*4(K), ve3
+
+ // K cannot be used after we write to R, since they use the same register.
+
+ #if defined __i386__
+
+ lea _AESRcon, R
+ lea _AESInvMixColumnTable, ITable
+ lea _AESSubBytesWordTable, STable
+
+ #elif defined __x86_64__
+
+ lea _AESRcon(%rip), R
+ lea _AESInvMixColumnTable(%rip), ITable
+ lea _AESSubBytesWordTable(%rip), STable
+
+ #endif
+
+ /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key
+ blocks), thirteen of which have InvMixColumn applied.
+ */
+ mov $-12*4*4, offset
+ sub offset, E
+
+ // Save untransformed values in stack area.
+ movd ve0, 4*4+Local(r4)
+ movd ve1, 5*4+Local(r4)
+ movd ve2, 6*4+Local(r4)
+ movd ve3, 7*4+Local(r4)
+
+ /* Apply InvMixColumn to words 4 through 7. The transformed values are
+ stored in the expanded key. The original values are saved in the stack
+ area for further computation.
+ */
+ movd ve0, dr
+ call InvMixColumn
+ movd vt0, 4*4(E, offset)
+
+ movd ve1, dr
+ call InvMixColumn
+ movd vt0, 5*4(E, offset)
+
+ movd ve2, dr
+ call InvMixColumn
+ movd vt0, 6*4(E, offset)
+
+ movd ve3, dr
+ call InvMixColumn
+ movd vt0, 7*4(E, offset)
+
+/* Here is the first iteration of the key expansion. It is separate from the
+ main loop below because we need to apply InvMixColumn to each of the
+ outputs, in ve0 through ve3. In the main loop, the technique described at
+ the top of this file is used to compute the proper outputs while using
+ InvMixColumn only once.
+*/
+ add $1, R // Advance pointer.
+ movd ve3, dr // Put previous word into work register.
+ movzx (R), t0d // Get round constant.
+ movd t0d, vt0
+
+ SubWordRotWord
+
+ add $8*4, offset
+
+ movd (0-8)*4(E, offset), ve0 // Get old word.
+ pxor vt0, ve0
+ movd ve0, 0*4+Local(r4) // Save on stack.
+ movd ve0, dr
+ call InvMixColumn
+ movd vt0, 0*4(E, offset) // Write to expanded key.
+
+ /* Chain to successive words and apply InvMixColumn to each word. The
+ transformed values are stored in the expanded key. The original
+ values are retained in local data for further computation.
+ */
+ movd (1-8)*4(E, offset), ve1 // Get old word.
+ pxor ve0, ve1 // Chain.
+ movd ve1, 1*4+Local(r4) // Save on stack.
+ movd ve1, dr
+ call InvMixColumn
+ movd vt0, 1*4(E, offset) // Write to expanded key.
+
+ movd (2-8)*4(E, offset), ve2 // Get old word.
+ pxor ve1, ve2 // Chain.
+ movd ve2, 2*4+Local(r4) // Save on stack.
+ movd ve2, dr
+ call InvMixColumn
+ movd vt0, 2*4(E, offset) // Write to expanded key.
+
+ movd (3-8)*4(E, offset), ve3 // Get old word.
+ pxor ve2, ve3 // Chain.
+ movd ve3, 3*4+Local(r4) // Save on stack.
+ movd ve3, dr
+ call InvMixColumn
+ movd vt0, 3*4(E, offset) // Write to expanded key.
+
+ movd ve3, dr // Put previous word into work register.
+ SubWord
+
+ movd 4*4+Local(r4), ve0 // Get old word.
+ pxor vt0, ve0 // Chain.
+ movd ve0, 4*4+Local(r4) // Save on stack.
+
+ movd 5*4+Local(r4), ve1 // Get old word.
+ pxor ve0, ve1 // Chain.
+ movd ve1, 5*4+Local(r4) // Save on stack.
+
+ movd 6*4+Local(r4), ve2 // Get old word.
+ pxor ve1, ve2 // Chain.
+ movd ve2, 6*4+Local(r4) // Save on stack.
+
+ movd 7*4+Local(r4), ve3 // Get old word.
+ pxor ve2, ve3 // Chain.
+ movd ve3, 7*4+Local(r4) // Save on stack.
+
+ movd vt0, dr // Move change to work register.
+ call InvMixColumn
+
+ movd (4-8)*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, 4*4(E, offset) // Write new word to expanded key.
+
+ movd (5-8)*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, 5*4(E, offset) // Write new word to expanded key.
+
+ movd (6-8)*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, 6*4(E, offset) // Write new word to expanded key.
+
+ movd (7-8)*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, 7*4(E, offset) // Write new word to expanded key.
+
+// Here is the main loop.
+1:
+ add $1, R // Advance pointer.
+ movd ve3, dr // Put previous word into work register.
+ movzx (R), t0d // Get round constant.
+ movd t0d, vt0
+
+ SubWordRotWord
+
+ movd 0*4+Local(r4), ve0 // Get old word.
+ pxor vt0, ve0
+ movd ve0, 0*4+Local(r4) // Save on stack.
+
+ // Chain to successive words.
+ movd 1*4+Local(r4), ve1 // Get old word.
+ pxor ve0, ve1 // Chain.
+ movd ve1, 1*4+Local(r4) // Save on stack.
+
+ movd 2*4+Local(r4), ve2 // Get old word.
+ pxor ve1, ve2 // Chain.
+ movd ve2, 2*4+Local(r4) // Save on stack.
+
+ movd 3*4+Local(r4), ve3 // Get old word.
+ pxor ve2, ve3 // Chain.
+ movd ve3, 3*4+Local(r4) // Save on stack.
+
+ movd vt0, dr // Move change to work register.
+ call InvMixColumn
+
+ movd 0*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, (0+8)*4(E, offset) // Write new word to expanded key.
+
+ movd 1*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, (1+8)*4(E, offset) // Write new word to expanded key.
+
+ movd 2*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, (2+8)*4(E, offset) // Write new word to expanded key.
+
+ movd 3*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, (3+8)*4(E, offset) // Write new word to expanded key.
+
+ movd ve3, dr // Put previous word into work register.
+ SubWord
+
+ movd 4*4+Local(r4), ve0 // Get old word.
+ pxor vt0, ve0 // Chain.
+ movd ve0, 4*4+Local(r4) // Save on stack.
+
+ movd 5*4+Local(r4), ve1 // Get old word.
+ pxor ve0, ve1 // Chain.
+ movd ve1, 5*4+Local(r4) // Save on stack.
+
+ movd 6*4+Local(r4), ve2 // Get old word.
+ pxor ve1, ve2 // Chain.
+ movd ve2, 6*4+Local(r4) // Save on stack.
+
+ movd 7*4+Local(r4), ve3 // Get old word.
+ pxor ve2, ve3 // Chain.
+ movd ve3, 7*4+Local(r4) // Save on stack.
+
+ movd vt0, dr // Move change to work register.
+ call InvMixColumn
+
+ movd 4*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, (4+8)*4(E, offset) // Write new word to expanded key.
+
+ movd 5*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, (5+8)*4(E, offset) // Write new word to expanded key.
+
+ movd 6*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, (6+8)*4(E, offset) // Write new word to expanded key.
+
+ movd 7*4(E, offset), vt1 // Get old word.
+ pxor vt1, vt0 // Chain.
+ movd vt0, (7+8)*4(E, offset) // Write new word to expanded key.
+
+ add $8*4, offset
+
+ jl 1b
+
+ movd ve3, dr // Put previous word into work register.
+ movzx 1(R), t0d // Get round constant.
+ movd t0d, vt0
+
+ SubWordRotWord
+
+ movd 0*4+Local(r4), ve0 // Get old word.
+ pxor vt0, ve0 // Chain.
+ movd ve0, (0+8)*4(E, offset)
+
+ // Chain to successive words.
+ movd 1*4+Local(r4), ve1 // Get old word.
+ pxor ve0, ve1 // Chain.
+ movd ve1, (1+8)*4(E, offset)
+
+ movd 2*4+Local(r4), ve2 // Get old word.
+ pxor ve1, ve2 // Chain.
+ movd ve2, (2+8)*4(E, offset)
+
+ movd 3*4+Local(r4), ve3 // Get old word.
+ pxor ve2, ve3 // Chain.
+ movd ve3, (3+8)*4(E, offset)
+
+ xor r0, r0 // Return success.
+
+ // Pop stack and restore registers.
+ movaps 7*16(r4), %xmm7
+ movaps 6*16(r4), %xmm6
+ movaps 5*16(r4), %xmm5
+ movaps 4*16(r4), %xmm4
+ movaps 3*16(r4), %xmm3
+ movaps 2*16(r4), %xmm2
+ movaps 1*16(r4), %xmm1
+ movaps 0*16(r4), %xmm0
+ #if 0 < LocalsSize
+ add $Padding + LocalsSize, r4
+ #endif
+ #if defined __i386__
+ pop r7
+ pop r6
+ pop r5
+ #endif
+ pop r3
+
+ ret
+
+
+#undef Address
+#undef Argument
+#undef E
+#undef ITable
+#undef K
+#undef Local
+#undef LocalsSize
+#undef LookupI
+#undef LookupS
+#undef Nk
+#undef Padding
+#undef R
+#undef SaveSize
+#undef STable
+#undef StackFrame
+#undef dr
+#undef drh
+#undef drl
+#undef offset
+#undef t0
+#undef t0d
+#undef ve0
+#undef ve1
+#undef ve2
+#undef ve3
+#undef ve4
+#undef ve5
+#undef vt0
+#undef vt1