--- /dev/null
+/* This file defines _aes_encrypt_key, _aes_encrypt_key128,
+ _aes_encrypt_key192, and _aes_encrypt_key256. It is designed to be
+ included in another assembly file with the preprocessor #include directive,
+ to benefit from some assembly-time calculations.
+
+ Written by Eric Postpischil, January 2008.
+
+ The comments here do not say much about the algorithm; the code just
+ follows the FIPS-197 specification. I recommend reading the specification
+ before working with this code or examining the C code in the parent
+ directory that illustrates key expansion.
+*/
+
+
+/* Routines:
+
+ _aes_encrypt_key.
+
+ _aes_encrypt_key128, _aes_encrypt_key192, and _aes_encrypt_key256.
+
+ Function:
+
+ Expand the user's cipher key into the key schedule, as defined in
+ Federal Information Processing Standards Publication 197 (FIPS-197),
+ November 26, 2001.
+
+ Input:
+
+ Constant data:
+
+ The following names must be locally defined so the assembler
+ can calculate certain offsets.
+
+ static const Word _AESSubBytesWordTable[4][256].
+
+ _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
+ SubBytes is defined in FIPS-197. _AESSubBytesWordTable
+ differs from _AESEncryptTable in that it does not include
+ the MixColumn operation. It is used in performing the last
+ round, which differs from the previous rounds in that it
+ does not include the MixColumn operation.
+
+ static const Byte _AESRcon[].
+
+ Round constants, beginning with _AESRcon[1] for the first round
+ (_AESRcon[0] is padding).
+
+ Arguments:
+
+ const uint8_t *Key
+
+ Address of user's cipher key.
+
+ int Length
+
+ Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in
+ user's cipher key.
+
+ This argument is used with _aes_encrypt_key. It is not
+ present for the other routines. In those routines, Context
+ is the second argument.
+
+ aes_encrypt_ctx *Context
+
+ Structure to contain the expanded key beginning at offset
+ ContextKey and a four-byte "key length" beginning at offset
+ ContextKeyLength. The "key length" is the number of bytes from
+ the start of the first round key to the start of the last round
+ key. That is 16 less than the number of bytes in the entire
+ key.
+
+ Output:
+
+ The expanded key and the "key length" are written to *Context.
+
+ Return:
+
+ aes_rval // -1 if "key length" is invalid. 0 otherwise.
+*/
+
+/* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */
+#ifdef KERNEL
+#include <i386/cpu_capabilities.h>
+#else
+#include <System/i386/cpu_capabilities.h>
+#endif
+
+/* _aes_encrypt_key(Key, Length, Context)
+
+ General entry point: the key size is given at run time by Length. The
+ routine first tests the kHasAES bit of the CPU capabilities word and, if
+ it is set, branches to _aes_encrypt_key_hw (not defined in this file)
+ before touching the stack or any argument. Otherwise it builds the common
+ stack frame, converts Length to bits if it was given in bytes, and jumps
+ to the merge point of the matching fixed-size routine below
+ (EKeyHas4Words, EKeyHas6Words, or EKeyHas8Words). For any other Length it
+ puts -1 in r0 and leaves through the shared epilogue at local label 9.
+
+ The register names r0 through r7 (with d, l, and h suffixes) and the
+ Arch() selector are not defined in this file; presumably the including
+ file supplies them.
+*/
+ .text
+ .globl _aes_encrypt_key
+// .private_extern _aes_encrypt_key
+_aes_encrypt_key:
+
+ // detect AES HW, cclee-3-13-10
+ // Only %eax/%rax is modified here, so the branch target still sees the
+ // original arguments and the original stack pointer.
+#if defined __x86_64__
+ movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
+ mov (%rax), %eax // %eax = __cpu_capabilities
+#else
+#if defined KERNEL
+ leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
+ mov (%eax), %eax // %eax = __cpu_capabilities
+#else
+ mov _COMM_PAGE_CPU_CAPABILITIES, %eax // %eax = word at address _COMM_PAGE_CPU_CAPABILITIES
+#endif
+#endif
+ test $(kHasAES), %eax // __cpu_capabilities & kHasAES
+ jne _aes_encrypt_key_hw // if AES HW detected, branch to _aes_encrypt_key_hw
+
+/* Register and macro names used by all four routines in this file. They
+ stay defined until the #undef list at the end of the file.
+*/
+#define dr r0d // Dissection register.
+#define drl r0l // Low 8 bits of dissection register.
+#define drh r0h // Second-lowest 8 bits of dissection register.
+
+#define t0 r1 // Scratch; holds a table index or a round constant.
+#define t0d r1d // Low 32 bits of t0.
+
+#define offset Arch(r5, r11) // Address offset and loop sentinel.
+
+#define R r7 // Address of round constant.
+#define K r7 // User key pointer.
+ // R and K overlap.
+
+#define E r6 // Expanded key pointer.
+
+// ve* hold words of the expanded key; vt* are temporaries for table lookups.
+#define ve0 %xmm0
+#define ve1 %xmm1
+#define ve2 %xmm2
+#define ve3 %xmm3
+#define vt3 %xmm4
+#define vt2 %xmm5
+#define vt1 %xmm6
+#define vt0 %xmm7
+
+/* LookupS(table, index) forms the memory operand for entry index of
+ sub-table table in _AESSubBytesWordTable. Entries are four bytes (scale
+ 4) and consecutive sub-tables are TableSize bytes apart (TableSize is not
+ defined in this file). The i386 form uses the table symbol as an absolute
+ displacement; the x86_64 form uses the base register STable, which each
+ expansion path loads with the table address before its first lookup.
+*/
+#if defined __i386__
+ #define LookupS(table, index) \
+ _AESSubBytesWordTable+(table)*TableSize(, index, 4)
+#elif defined __x86_64__
+ #define LookupS(table, index) (table)*TableSize(STable, index, 4)
+#endif
+
+ /* Save registers and set SaveSize to the number of bytes pushed onto the
+ stack so far, including the caller's return address.
+ */
+ push r3
+ #if defined __i386__
+ push r5
+ push r6
+ push r7
+ #define SaveSize (5*4) // Return address plus r3, r5, r6, r7.
+ #else
+ #define SaveSize (2*8) // Return address plus r3.
+ #endif
+
+ /* Number of bytes used for local variables:
+
+ 8 16-byte spaces to save XMM registers.
+ */
+ #define LocalsSize (8*16)
+
+ #if 0 < LocalsSize
+ // Padding to position stack pointer at a multiple of 16 bytes.
+ // (15 & -x) is the byte count that rounds x up to a multiple of 16.
+ #define Padding (15 & -(SaveSize + LocalsSize))
+ sub $Padding + LocalsSize, r4 // Allocate space on stack.
+ #else
+ #define Padding 0
+ #endif
+
+ /* StackFrame is the number of bytes in our stack frame, from caller's
+ stack pointer to ours (so it includes the return address).
+ */
+ #define StackFrame (SaveSize + Padding + LocalsSize)
+
+ // Save xmm registers. They are reloaded by the epilogue before return.
+ // movaps needs a 16-byte-aligned address; the Padding above provides that
+ // if the caller's stack pointer was 16-byte aligned at the call (assumed,
+ // not checked here).
+ movaps %xmm0, 0*16(r4)
+ movaps %xmm1, 1*16(r4)
+ movaps %xmm2, 2*16(r4)
+ movaps %xmm3, 3*16(r4)
+ movaps %xmm4, 4*16(r4)
+ movaps %xmm5, 5*16(r4)
+ movaps %xmm6, 6*16(r4)
+ movaps %xmm7, 7*16(r4)
+
+#if defined __i386__
+
+ // Define location of argument i.
+ // Arguments are 4-byte stack slots starting StackFrame bytes above r4.
+ #define Argument(i) StackFrame+4*(i)(r4)
+
+ #define Nk t0d // Holds the Length argument (bytes or bits, not words).
+
+ // Load arguments.
+ mov Argument(2), E // Context.
+ mov Argument(1), Nk // Length.
+ mov Argument(0), K // Key.
+
+#elif defined __x86_64__
+
+ // Key is used where it arrived (K); only Length and Context are moved.
+ // Length must be copied out first because E is the register it arrives in.
+ #define Nk r9d // Holds the Length argument (bytes or bits, not words).
+ mov r6d, Nk // Move Nk argument out of way.
+ mov r2, E // Move E argument to common register.
+
+#endif
+
+ // Dispatch on key length.
+ // Values of 128 or more (signed compare) are taken to be a bit count
+ // already. Smaller values are taken to be a byte count and multiplied by 8.
+ cmp $128, Nk
+ jge 2f
+ shl $3, Nk // Convert from bytes to bits.
+ cmp $128, Nk
+2:
+ // Flags here come from whichever "cmp $128, Nk" executed last.
+ je EKeyHas4Words
+ cmp $192, Nk
+ je EKeyHas6Words
+ cmp $256, Nk
+ je EKeyHas8Words
+ mov $-1, r0 // Return error.
+ jmp 9f // Shared epilogue inside _aes_encrypt_key128 below.
+
+// Stop using Nk.
+#undef Nk
+
+/* _aes_encrypt_key128(Key, Context)
+
+ Expands a 16-byte key into eleven 16-byte round keys stored at offset
+ ContextKey in *Context, and stores 10*16 at offset ContextKeyLength.
+ Returns 0 in r0.
+
+ This path works in general registers only. XMM0-7 are still saved and
+ restored, because the epilogue at local label 9 is also reached from
+ _aes_encrypt_key (through EKeyHas4Words and through its error exit), and
+ it reloads them from the frame.
+*/
+ .globl _aes_encrypt_key128
+// .private_extern _aes_encrypt_key128
+_aes_encrypt_key128:
+
+ /* Save registers and set SaveSize to the number of bytes pushed onto the
+ stack so far, including the caller's return address.
+ */
+ push r3
+ #if defined __i386__
+ push r5
+ push r6
+ push r7
+ #define SaveSize (5*4) // Return address plus r3, r5, r6, r7.
+ #else
+ #define SaveSize (2*8) // Return address plus r3.
+ #endif
+
+ /* Number of bytes used for local variables:
+
+ 8 16-byte spaces to save XMM registers.
+ */
+ #define LocalsSize (8*16)
+
+ #if 0 < LocalsSize
+ // Padding to position stack pointer at a multiple of 16 bytes.
+ #define Padding (15 & -(SaveSize + LocalsSize))
+ sub $Padding + LocalsSize, r4 // Allocate space on stack.
+ #else
+ #define Padding 0
+ #endif
+
+ /* StackFrame is the number of bytes in our stack frame, from caller's
+ stack pointer to ours (so it includes the return address).
+ */
+ #define StackFrame (SaveSize + Padding + LocalsSize)
+
+ // Save xmm registers (same frame layout as in _aes_encrypt_key).
+ movaps %xmm0, 0*16(r4)
+ movaps %xmm1, 1*16(r4)
+ movaps %xmm2, 2*16(r4)
+ movaps %xmm3, 3*16(r4)
+ movaps %xmm4, 4*16(r4)
+ movaps %xmm5, 5*16(r4)
+ movaps %xmm6, 6*16(r4)
+ movaps %xmm7, 7*16(r4)
+
+ #if defined __i386__
+
+ // Load arguments.
+ // There is no Length argument here, so Context is argument 1.
+ #define Argument(i) StackFrame+4*(i)(r4)
+ mov Argument(1), E
+ mov Argument(0), K
+
+ #endif
+ // On x86_64 nothing is loaded; presumably Key and Context already arrive
+ // in the registers named K and E.
+
+// Merge point for _aes_encrypt_key and _aes_encrypt_key128.
+// On entry: K -> user key, E -> context, frame built as above.
+EKeyHas4Words:
+
+// e0..e3 hold the four most recently generated words of the key schedule.
+#define e0 r2d
+#define e1 r3d
+#define e2 Arch(r5d, r11d)
+#define e3 r7d
+
+ // First words of expanded key are copied from user key.
+ // e3 is the low half of the register that holds K, so it is loaded last.
+ mov 0*4(K), e0
+ mov 1*4(K), e1
+ mov 2*4(K), e2
+ mov 3*4(K), e3
+
+ movl $10*16, ContextKeyLength(E) // Set "key length."
+
+ #if 0 != ContextKey
+ add $ContextKey, E // E -> first round key.
+ #endif
+
+ // K cannot be used after we write to R, since they use the same register.
+ // (On this path R is not used, but the load of e3 above already
+ // overwrote that register.)
+
+ // Cache round constants in output buffer. The last is a sentinel.
+ // Each constant occupies the first byte of a round-key slot that has not
+ // been written yet. The loop reads it just before storing that round key
+ // over it.
+ movb $0x01, 1*16(E)
+ movb $0x02, 2*16(E)
+ movb $0x04, 3*16(E)
+ movb $0x08, 4*16(E)
+ movb $0x10, 5*16(E)
+ movb $0x20, 6*16(E)
+ movb $0x40, 7*16(E)
+ movb $0x80, 8*16(E)
+ movb $0x1b, 9*16(E)
+ movb $0x36, 10*16(E)
+
+ #if defined __x86_64__
+
+ // Base register used by LookupS on x86_64.
+ #define STable r8
+ lea _AESSubBytesWordTable(%rip), STable
+
+ #endif
+
+ // Store initial words of expanded key, which are copies of user's key.
+ mov e0, 0*4(E)
+ mov e1, 1*4(E)
+ mov e2, 2*4(E)
+ mov e3, 3*4(E)
+
+// One pass per round key; ten passes in total.
+1:
+ mov e3, dr // Put previous word into dissection register.
+
+ // Perform SubWord(RotWord(dr)).
+ // Table i yields SubBytes(byte) << 8*i, so sending byte k of dr to table
+ // (k+3) mod 4 applies the rotation and the substitution together. Each
+ // result is XORed straight into e0, the word from four words back.
+ movzx drl, t0
+ xor LookupS(3, t0), e0 // Look up byte 0 in table 3.
+ movzx drh, t0d
+ xor LookupS(0, t0), e0 // Look up byte 1 in table 0.
+ shr $16, dr // Bring bytes 2 and 3 down into drl and drh.
+ movzx drl, t0d
+ xor LookupS(1, t0), e0 // Look up byte 2 in table 1.
+ movzx drh, t0d
+ xor LookupS(2, t0), e0 // Look up byte 3 in table 2.
+
+ add $4*4, E // E -> slot for the round key being generated.
+
+ // NOTE(review): memory source without a size suffix; a one-byte load is
+ // intended (the constants were stored with movb). Confirm the assembler
+ // in use picks the byte form.
+ movzx (E), t0d // Get cached round constant.
+ xor t0d, e0 // XOR with word from four words back.
+
+ // Chain to successive words.
+ mov e0, 0*4(E) // This store overwrites the cached constant.
+ xor e0, e1
+ mov e1, 1*4(E)
+ xor e1, e2
+ mov e2, 2*4(E)
+ xor e2, e3
+ mov e3, 3*4(E)
+
+ cmp $0x36, t0d // Was this the last round constant?
+
+ jne 1b
+
+ xor r0, r0 // Return success.
+
+// Shared epilogue. Also the target of the error exit in _aes_encrypt_key,
+// which arrives with -1 in r0.
+9:
+ // Pop stack and restore registers.
+ movaps 7*16(r4), %xmm7
+ movaps 6*16(r4), %xmm6
+ movaps 5*16(r4), %xmm5
+ movaps 4*16(r4), %xmm4
+ movaps 3*16(r4), %xmm3
+ movaps 2*16(r4), %xmm2
+ movaps 1*16(r4), %xmm1
+ movaps 0*16(r4), %xmm0
+ #if 0 < LocalsSize
+ add $Padding + LocalsSize, r4
+ #endif
+ #if defined __i386__
+ pop r7
+ pop r6
+ pop r5
+ #endif
+ pop r3
+
+ ret
+
+
+// Reset definitions for next case.
+// The general-register word names are used only on the 128-bit path.
+#undef e0
+#undef e1
+#undef e2
+#undef e3
+
+// The 192-bit path keeps six words in XMM registers and uses only vt0 and
+// vt1 as temporaries, so %xmm4 and %xmm5 are renamed from vt3/vt2 to ve4/ve5.
+#undef vt3
+#undef vt2
+#define ve4 %xmm4
+#define ve5 %xmm5
+
+
+/* _aes_encrypt_key192(Key, Context)
+
+ Expands a 24-byte key into thirteen 16-byte round keys (52 words) stored
+ at offset ContextKey in *Context, and stores 12*16 at offset
+ ContextKeyLength. Returns 0 in r0.
+
+ The low 32 bits of ve0..ve5 hold the six most recently generated words.
+ E is biased so that E+offset addresses the current six-word group, and
+ offset counts up from -12*16 to zero in steps of 6*4, which ends the loop.
+*/
+ .globl _aes_encrypt_key192
+// .private_extern _aes_encrypt_key192
+_aes_encrypt_key192:
+
+ /* Save registers and set SaveSize to the number of bytes pushed onto the
+ stack so far, including the caller's return address.
+ */
+ push r3
+ #if defined __i386__
+ push r5
+ push r6
+ push r7
+ #define SaveSize (5*4) // Return address plus r3, r5, r6, r7.
+ #else
+ #define SaveSize (2*8) // Return address plus r3.
+ #endif
+
+ /* Number of bytes used for local variables:
+
+ 8 16-byte spaces to save XMM registers.
+ */
+ #define LocalsSize (8*16)
+
+ #if 0 < LocalsSize
+ // Padding to position stack pointer at a multiple of 16 bytes.
+ #define Padding (15 & -(SaveSize + LocalsSize))
+ sub $Padding + LocalsSize, r4 // Allocate space on stack.
+ #else
+ #define Padding 0
+ #endif
+
+ /* StackFrame is the number of bytes in our stack frame, from caller's
+ stack pointer to ours (so it includes the return address).
+ */
+ #define StackFrame (SaveSize + Padding + LocalsSize)
+
+ // Save xmm registers (same frame layout as in _aes_encrypt_key).
+ movaps %xmm0, 0*16(r4)
+ movaps %xmm1, 1*16(r4)
+ movaps %xmm2, 2*16(r4)
+ movaps %xmm3, 3*16(r4)
+ movaps %xmm4, 4*16(r4)
+ movaps %xmm5, 5*16(r4)
+ movaps %xmm6, 6*16(r4)
+ movaps %xmm7, 7*16(r4)
+
+ #if defined __i386__
+
+ // Load arguments.
+ // There is no Length argument here, so Context is argument 1.
+ #define Argument(i) StackFrame+4*(i)(r4)
+ mov Argument(1), E
+ mov Argument(0), K
+
+ #endif
+ // On x86_64 nothing is loaded; presumably Key and Context already arrive
+ // in the registers named K and E.
+
+// Merge point for _aes_encrypt_key and _aes_encrypt_key192.
+// On entry: K -> user key, E -> context, frame built as above.
+EKeyHas6Words:
+
+ // First words of expanded key are copied from user key.
+ movd 0*4(K), ve0
+ movd 1*4(K), ve1
+ movd 2*4(K), ve2
+ movd 3*4(K), ve3
+
+ movl $12*16, ContextKeyLength(E) // Set "key length."
+
+ #if 0 != ContextKey
+ add $ContextKey, E // E -> first round key.
+ #endif
+
+ movd 4*4(K), ve4
+ movd 5*4(K), ve5
+
+ // K cannot be used after we write to R, since they use the same register.
+
+ #if defined __i386__
+
+ lea _AESRcon, R // R -> _AESRcon[0]; it is advanced before each read.
+
+ #elif defined __x86_64__
+
+ lea _AESRcon(%rip), R // R -> _AESRcon[0]; it is advanced before each read.
+ lea _AESSubBytesWordTable(%rip), STable // Base register for LookupS.
+
+ #endif
+
+ /* With a six-word key, there are twelve rounds (thirteen 16-byte key
+ blocks).
+ */
+ mov $-12*4*4, offset
+ sub offset, E // E += 12*16, so E+offset is the start of the schedule.
+
+ // Store initial words of expanded key, which are copies of user's key.
+ movd ve0, 0*4(E, offset)
+ movd ve1, 1*4(E, offset)
+ movd ve2, 2*4(E, offset)
+ movd ve3, 3*4(E, offset)
+ movd ve4, 4*4(E, offset)
+ movd ve5, 5*4(E, offset)
+
+/* Jump into loop body. The key expansion processes six four-byte words per
+ iteration. 52 are needed in the key. So only four are needed in the last
+ iteration.
+*/
+ jmp 2f
+1:
+ // Continue chaining to successive words.
+ // These are words 4 and 5 of the current group. This part is skipped on
+ // entry and is not reached after the final pass.
+ pxor ve3, ve4
+ movd ve4, 4*4(E, offset)
+ pxor ve4, ve5
+ movd ve5, 5*4(E, offset)
+2:
+ add $1, R // Advance pointer.
+ movd ve5, dr // Put previous word into dissection register.
+ // NOTE(review): memory source without a size suffix; a one-byte load is
+ // intended (_AESRcon is documented as a Byte array). Confirm the
+ // assembler in use picks the byte form.
+ movzx (R), t0 // Get round constant.
+ movd t0d, vt1
+ pxor vt1, ve0 // XOR with word from six words back.
+
+ // Perform SubWord(RotWord(dr)).
+ // Table i yields SubBytes(byte) << 8*i, so sending byte k of dr to table
+ // (k+3) mod 4 applies the rotation and the substitution together.
+ movzx drl, t0d
+ movd LookupS(3, t0), vt0 // Look up byte 0 in table 3.
+ movzx drh, t0d
+ movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
+ shr $16, dr // Bring bytes 2 and 3 down into drl and drh.
+ movzx drl, t0d
+ pxor vt1, vt0
+ pxor vt0, ve0
+ movd LookupS(1, t0), vt0 // Look up byte 2 in table 1.
+ movzx drh, t0d
+ movd LookupS(2, t0), vt1 // Look up byte 3 in table 2.
+ pxor vt1, vt0
+ pxor vt0, ve0
+
+ // Step to the next six-word group. The zero flag set here is what the
+ // jne below tests; the movd and pxor instructions in between leave the
+ // flags alone.
+ add $6*4, offset
+
+ // Chain to successive words.
+ movd ve0, 0*4(E, offset)
+ pxor ve0, ve1
+ movd ve1, 1*4(E, offset)
+ pxor ve1, ve2
+ movd ve2, 2*4(E, offset)
+ pxor ve2, ve3
+ movd ve3, 3*4(E, offset)
+
+ jne 1b // Loop until offset has reached zero.
+
+ xor r0, r0 // Return success.
+
+ // Pop stack and restore registers.
+ // Same sequence as the epilogue at label 9 in _aes_encrypt_key128.
+ movaps 7*16(r4), %xmm7
+ movaps 6*16(r4), %xmm6
+ movaps 5*16(r4), %xmm5
+ movaps 4*16(r4), %xmm4
+ movaps 3*16(r4), %xmm3
+ movaps 2*16(r4), %xmm2
+ movaps 1*16(r4), %xmm1
+ movaps 0*16(r4), %xmm0
+ #if 0 < LocalsSize
+ add $Padding + LocalsSize, r4
+ #endif
+ #if defined __i386__
+ pop r7
+ pop r6
+ pop r5
+ #endif
+ pop r3
+
+ ret
+
+
+// Reset definitions for next case.
+// The 256-bit path keeps only four words in registers and needs four
+// temporaries, so %xmm4 and %xmm5 go back to being vt3 and vt2.
+#undef ve4
+#undef ve5
+#define vt3 %xmm4
+#define vt2 %xmm5
+
+
+/* _aes_encrypt_key256(Key, Context)
+
+ Expands a 32-byte key into fifteen 16-byte round keys (60 words) stored
+ at offset ContextKey in *Context, and stores 14*16 at offset
+ ContextKeyLength. Returns 0 in r0.
+
+ Only four words are kept in registers (ve0..ve3); words from eight back
+ are reloaded from the schedule already written. E is biased so that
+ E+offset addresses the current eight-word group, and offset counts up
+ from -14*16 to zero in steps of 8*4, which ends the loop.
+*/
+ .globl _aes_encrypt_key256
+// .private_extern _aes_encrypt_key256
+_aes_encrypt_key256:
+
+ /* Save registers and set SaveSize to the number of bytes pushed onto the
+ stack so far, including the caller's return address.
+ */
+ push r3
+ #if defined __i386__
+ push r5
+ push r6
+ push r7
+ #define SaveSize (5*4) // Return address plus r3, r5, r6, r7.
+ #else
+ #define SaveSize (2*8) // Return address plus r3.
+ #endif
+
+ /* Number of bytes used for local variables:
+
+ 8 16-byte spaces to save XMM registers.
+ */
+ #define LocalsSize (8*16)
+
+ #if 0 < LocalsSize
+ // Padding to position stack pointer at a multiple of 16 bytes.
+ #define Padding (15 & -(SaveSize + LocalsSize))
+ sub $Padding + LocalsSize, r4 // Allocate space on stack.
+ #else
+ #define Padding 0
+ #endif
+
+ /* StackFrame is the number of bytes in our stack frame, from caller's
+ stack pointer to ours (so it includes the return address).
+ */
+ #define StackFrame (SaveSize + Padding + LocalsSize)
+
+ // Save xmm registers (same frame layout as in _aes_encrypt_key).
+ movaps %xmm0, 0*16(r4)
+ movaps %xmm1, 1*16(r4)
+ movaps %xmm2, 2*16(r4)
+ movaps %xmm3, 3*16(r4)
+ movaps %xmm4, 4*16(r4)
+ movaps %xmm5, 5*16(r4)
+ movaps %xmm6, 6*16(r4)
+ movaps %xmm7, 7*16(r4)
+
+ #if defined __i386__
+
+ // Load arguments.
+ // There is no Length argument here, so Context is argument 1.
+ #define Argument(i) StackFrame+4*(i)(r4)
+ mov Argument(1), E
+ mov Argument(0), K
+
+ #endif
+ // On x86_64 nothing is loaded; presumably Key and Context already arrive
+ // in the registers named K and E.
+
+// Merge point for _aes_encrypt_key and _aes_encrypt_key256.
+// On entry: K -> user key, E -> context, frame built as above.
+EKeyHas8Words:
+
+ // First words of expanded key are copied from user key.
+ movd 0*4(K), ve0
+ movd 1*4(K), ve1
+ movd 2*4(K), ve2
+ movd 3*4(K), ve3
+
+ movl $14*16, ContextKeyLength(E) // Set "key length."
+
+ #if 0 != ContextKey
+ add $ContextKey, E // E -> first round key.
+ #endif
+
+ // Store initial words of expanded key, which are copies of user's key.
+ // Words 0-3 are stored first so ve0..ve3 can be reused for words 4-7,
+ // which must be read before K is overwritten by R below.
+ movd ve0, 0*4(E)
+ movd ve1, 1*4(E)
+ movd ve2, 2*4(E)
+ movd ve3, 3*4(E)
+ movd 4*4(K), ve0
+ movd 5*4(K), ve1
+ movd 6*4(K), ve2
+ movd 7*4(K), ve3
+
+ // K cannot be used after we write to R, since they use the same register.
+
+ #if defined __i386__
+
+ lea _AESRcon, R // R -> _AESRcon[0]; it is advanced before each read.
+
+ #elif defined __x86_64__
+
+ lea _AESRcon(%rip), R // R -> _AESRcon[0]; it is advanced before each read.
+ lea _AESSubBytesWordTable(%rip), STable // Base register for LookupS.
+
+ #endif
+
+ /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key
+ blocks).
+ */
+ mov $-14*4*4, offset
+ sub offset, E // E += 14*16, so E+offset is the start of the schedule.
+
+ // Store initial words of expanded key, which are copies of user's key.
+ // These are words 4-7; ve3 is left holding word 7 for the loop entry.
+ movd ve0, 4*4(E, offset)
+ movd ve1, 5*4(E, offset)
+ movd ve2, 6*4(E, offset)
+ movd ve3, 7*4(E, offset)
+
+/* Jump into loop body. The key expansion processes eight four-byte words per
+ iteration. 60 are needed in the key. So only four are needed in the last
+ iteration.
+*/
+ jmp 2f
+1:
+ // Second half of a group: words 4-7. SubWord is applied without
+ // RotWord and without a round constant. This part is skipped on entry
+ // and is not reached after the final pass.
+ movd ve3, dr // Put previous word into dissection register.
+
+ /* Get word from eight words back (it is four words back from where E
+ currently points, and we use it to prepare the value to be stored
+ four words beyond where E currently points).
+ */
+ movd -4*4(E, offset), ve0
+
+ // Perform SubWord(dr).
+ // No rotation here: byte k of dr goes to table k.
+ movzx drl, t0
+ movd LookupS(0, t0), vt0 // Look up byte 0 in table 0.
+ movzx drh, t0d
+ movd LookupS(1, t0), vt1 // Look up byte 1 in table 1.
+ shr $16, dr // Bring bytes 2 and 3 down into drl and drh.
+ movzx drl, t0d
+ movd LookupS(2, t0), vt2 // Look up byte 2 in table 2.
+ movzx drh, t0d
+ movd LookupS(3, t0), vt3 // Look up byte 3 in table 3.
+ pxor vt1, vt0
+ pxor vt3, vt2
+ pxor vt0, ve0
+ pxor vt2, ve0
+
+ movd -3*4(E, offset), ve1 // Get words from eight words back.
+ movd -2*4(E, offset), ve2
+ movd -1*4(E, offset), ve3
+
+ // Chain to successive words.
+ movd ve0, 4*4(E, offset)
+ pxor ve0, ve1
+ movd ve1, 5*4(E, offset)
+ pxor ve1, ve2
+ movd ve2, 6*4(E, offset)
+ pxor ve2, ve3
+ movd ve3, 7*4(E, offset)
+
+2:
+ // First half of a group: words 0-3, using RotWord, SubWord, and the
+ // next round constant.
+ add $1, R // Advance pointer.
+ movd ve3, dr // Put previous word into dissection register.
+ // NOTE(review): memory source without a size suffix; a one-byte load is
+ // intended (_AESRcon is documented as a Byte array). Confirm the
+ // assembler in use picks the byte form.
+ movzx (R), t0d // Get round constant.
+ movd t0d, vt1
+ movd 0*4(E, offset), ve0 // Get word from eight words back.
+ pxor vt1, ve0
+
+ // Perform SubWord(RotWord(dr)).
+ // Table i yields SubBytes(byte) << 8*i, so sending byte k of dr to table
+ // (k+3) mod 4 applies the rotation and the substitution together.
+ movzx drl, t0
+ movd LookupS(3, t0), vt0 // Look up byte 0 in table 3.
+ movzx drh, t0d
+ movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
+ shr $16, dr // Bring bytes 2 and 3 down into drl and drh.
+ movzx drl, t0d
+ movd LookupS(1, t0), vt2 // Look up byte 2 in table 1.
+ movzx drh, t0d
+ movd LookupS(2, t0), vt3 // Look up byte 3 in table 2.
+ pxor vt1, vt0
+ pxor vt3, vt2
+ pxor vt0, ve0
+ pxor vt2, ve0
+
+ // Remaining words from eight back, read before offset moves on.
+ movd 1*4(E, offset), ve1
+ movd 2*4(E, offset), ve2
+ movd 3*4(E, offset), ve3
+
+ // Step to the next eight-word group. The zero flag set here is what the
+ // jne below tests; the movd and pxor instructions in between leave the
+ // flags alone.
+ add $8*4, offset
+
+ // Chain to successive words.
+ movd ve0, 0*4(E, offset)
+ pxor ve0, ve1
+ movd ve1, 1*4(E, offset)
+ pxor ve1, ve2
+ movd ve2, 2*4(E, offset)
+ pxor ve2, ve3
+ movd ve3, 3*4(E, offset)
+
+ jne 1b // Loop until offset has reached zero.
+
+ xor r0, r0 // Return success.
+
+ // Pop stack and restore registers.
+ // Same sequence as the epilogue at label 9 in _aes_encrypt_key128.
+ movaps 7*16(r4), %xmm7
+ movaps 6*16(r4), %xmm6
+ movaps 5*16(r4), %xmm5
+ movaps 4*16(r4), %xmm4
+ movaps 3*16(r4), %xmm3
+ movaps 2*16(r4), %xmm2
+ movaps 1*16(r4), %xmm1
+ movaps 0*16(r4), %xmm0
+ #if 0 < LocalsSize
+ add $Padding + LocalsSize, r4
+ #endif
+ #if defined __i386__
+ pop r7
+ pop r6
+ pop r5
+ #endif
+ pop r3
+
+ ret
+
+
+/* Remove the preprocessor names defined in this file, so that they do not
+ remain defined in the file that includes this one. Address is not defined
+ anywhere in this file, so its #undef has no effect here. e0..e3 and
+ ve4/ve5 were already undefined after the routines that use them.
+*/
+#undef Address
+#undef Argument
+#undef E
+#undef K
+#undef LocalsSize
+#undef LookupS
+#undef Padding
+#undef R
+#undef SaveSize
+#undef STable
+#undef StackFrame
+#undef dr
+#undef drh
+#undef drl
+#undef offset
+#undef t0
+#undef t0d
+#undef ve0
+#undef ve1
+#undef ve2
+#undef ve3
+#undef vt0
+#undef vt1
+#undef vt2
+#undef vt3