1 /* This file defines _aes_encrypt_key, _aes_encrypt_key128,
2 _aes_encrypt_key192, and _aes_encrypt_key256. It is designed to be
3 included in another assembly file with the preprocessor #include directive,
4 to benefit from some assembly-time calculations.
6 Written by Eric Postpischil, January 2008.
8 The comments here do not say much about the algorithm; the code just
9 follows the FIPS-197 specification. I recommend reading the specification
10 before working with this code or examining the C code in the parent
11 directory that illustrates key expansion.
19 _aes_encrypt_key128, _aes_encrypt_key192, and _aes_encrypt_key256.
23 Expand the user's cipher key into the key schedule, as defined in
24 Federal Information Processing Standards Publication 197 (FIPS-197),
31 The following names must be locally defined so the assembler
32 can calculate certain offsets.
34 static const Word _AESSubBytesWordTable[4][256].
36 _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
37 SubBytes is defined in FIPS-197. _AESSubBytesWordTable
38 differs from _AESEncryptTable in that it does not include
39 the MixColumn operation. It is used in performing the last
40 round, which differs from the previous rounds in that it
41 does not include the MixColumn operation.
43 static const Byte _AESRcon[].
45 Round constants, beginning with AESRcon[1] for the first round
46 (AESRcon[0] is padding.)
52 Address of user's cipher key.
56 Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in
59 This argument is used with _aes_encrypt_key. It is not
60 present for the other routines. In those routines, Context
61 is the second argument.
63 aes_encrypt_ctx *Context
65 Structure to contain the expanded key beginning at offset
66 ContextKey and a four-byte "key length" beginning at offset
67 ContextKeyLength. The "key length" is the number of bytes from
68 the start of the first round key to the start of the last round
69 key. That is 16 less than the number of bytes in the entire
74 The expanded key and the "key length" are written to *Context.
78 aes_rval // -1 if "key length" is invalid. 0 otherwise.
81 /* Added AES hardware detection: branch to the hardware implementation when AES HW is detected. cclee 3-12-10 */
83 #include <i386/cpu_capabilities.h>
85 #include <System/i386/cpu_capabilities.h>
// _aes_encrypt_key: entry point that receives the key length as an argument
// (see the file header) and dispatches on it.
89 .globl _aes_encrypt_key
90 // .private_extern _aes_encrypt_key
// Test the kHasAES bit of the CPU capabilities word. When the bit is set,
// control leaves this routine for _aes_encrypt_key_hw and the software key
// expansion below is not executed.
93 // detect AES HW, cclee-3-13-10
94 #if defined __x86_64__
95 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
96 mov (%rax), %eax // %eax = __cpu_capabilities
99 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
100 mov (%eax), %eax // %eax = __cpu_capabilities
// NOTE(review): presumably reads the capabilities word through its commpage
// address -- confirm which build configuration selects this form.
102 mov _COMM_PAGE_CPU_CAPABILITIES, %eax
105 test $(kHasAES), %eax // __cpu_capabilities & kHasAES
106 jne _aes_encrypt_key_hw // if AES HW detected, branch to _aes_encrypt_key_hw
// Register roles for the software key expansion.
108 #define dr r0d // Dissection register.
109 #define drl r0l // Low 8 bits of dissection register.
110 #define drh r0h // Second-lowest 8 bits of dissection register.
113 #define t0d r1d // Low 32 bits of t0.
115 #define offset Arch(r5, r11) // Address offset and loop sentinel.
// R and K are both r7: the user key pointer is gone once R is written.
117 #define R r7 // Address of round constant.
118 #define K r7 // User key pointer.
121 #define E r6 // Expanded key pointer.
// LookupS(table, index) forms the memory operand for 4-byte entry index of
// sub-table number table in _AESSubBytesWordTable. One variant names the
// table symbol directly; the x86_64 variant indexes from the STable register.
// NOTE(review): TableSize is defined outside this block; presumably the byte
// size of one 256-entry sub-table -- confirm.
133 #define LookupS(table, index) \
134 _AESSubBytesWordTable+(table)*TableSize(, index, 4)
135 #elif defined __x86_64__
136 #define LookupS(table, index) (table)*TableSize(STable, index, 4)
139 /* Save registers and set SaveSize to the number of bytes pushed onto the
140 stack so far, including the caller's return address.
147 #define SaveSize (5*4)
149 #define SaveSize (2*8)
152 /* Number of bytes used for local variables:
154 8 16-byte spaces to save XMM registers.
156 #define LocalsSize (8*16)
159 // Padding to position stack pointer at a multiple of 16 bytes.
// (15 & -n) is the byte count, 0 through 15, that rounds n up to a multiple
// of 16.
160 #define Padding (15 & -(SaveSize + LocalsSize))
161 sub $Padding + LocalsSize, r4 // Allocate space on stack.
166 /* StackFrame is the number of bytes in our stack frame, from caller's
167 stack pointer to ours (so it includes the return address).
169 #define StackFrame (SaveSize + Padding + LocalsSize)
171 // Save xmm registers.
// The eight 16-byte slots start at the new stack pointer (r4). movaps needs a
// 16-byte-aligned memory operand, which is what Padding is for.
172 movaps %xmm0, 0*16(r4)
173 movaps %xmm1, 1*16(r4)
174 movaps %xmm2, 2*16(r4)
175 movaps %xmm3, 3*16(r4)
176 movaps %xmm4, 4*16(r4)
177 movaps %xmm5, 5*16(r4)
178 movaps %xmm6, 6*16(r4)
179 movaps %xmm7, 7*16(r4)
183 // Define location of argument i.
// Arguments are addressed as 4-byte slots that begin StackFrame bytes above r4.
184 #define Argument(i) StackFrame+4*(i)(r4)
193 #elif defined __x86_64__
195 #define Nk r9d // Number of words in key.
// E is r6, so the value arriving in r6d has to be copied to Nk before r2 is
// moved into E by the following instruction.
196 mov r6d, Nk // Move Nk argument out of way.
197 mov r2, E // Move E argument to common register.
201 // Dispatch on key length.
204 shl $3, Nk // Convert from bytes to bits (multiply by 8).
212 mov $-1, r0 // Return error: r0 = -1, the aes_rval for an invalid key length.
// _aes_encrypt_key128: entry point for a 128-bit key. Per the file header it
// takes no key-length argument; Context is the second argument.
218 .globl _aes_encrypt_key128
219 // .private_extern _aes_encrypt_key128
222 /* Save registers and set SaveSize to the number of bytes pushed onto the
223 stack so far, including the caller's return address.
230 #define SaveSize (5*4)
232 #define SaveSize (2*8)
235 /* Number of bytes used for local variables:
237 8 16-byte spaces to save XMM registers.
239 #define LocalsSize (8*16)
242 // Padding to position stack pointer at a multiple of 16 bytes.
// (15 & -n) is the byte count, 0 through 15, that rounds n up to a multiple
// of 16.
243 #define Padding (15 & -(SaveSize + LocalsSize))
244 sub $Padding + LocalsSize, r4 // Allocate space on stack.
249 /* StackFrame is the number of bytes in our stack frame, from caller's
250 stack pointer to ours (so it includes the return address).
252 #define StackFrame (SaveSize + Padding + LocalsSize)
254 // Save xmm registers.
// movaps needs a 16-byte-aligned memory operand, which is what Padding is for.
255 movaps %xmm0, 0*16(r4)
256 movaps %xmm1, 1*16(r4)
257 movaps %xmm2, 2*16(r4)
258 movaps %xmm3, 3*16(r4)
259 movaps %xmm4, 4*16(r4)
260 movaps %xmm5, 5*16(r4)
261 movaps %xmm6, 6*16(r4)
262 movaps %xmm7, 7*16(r4)
// Arguments are addressed as 4-byte slots that begin StackFrame bytes above r4.
267 #define Argument(i) StackFrame+4*(i)(r4)
273 // Merge point for _aes_encrypt_key and _aes_encrypt_key128.
278 #define e2 Arch(r5d, r11d)
281 // First words of expanded key are copied from user key.
// 10*16 = 160: byte distance from the first round key to the last one, which
// is how the file header defines the stored "key length".
287 movl $10*16, ContextKeyLength(E) // Set "key length."
293 // K cannot be used after we write to R, since they use the same register.
295 // Cache round constants in output buffer. The last is a sentinel.
307 #if defined __x86_64__
310 lea _AESSubBytesWordTable(%rip), STable
314 // Store initial words of expanded key, which are copies of user's key.
321 mov e3, dr // Put previous word into dissection register.
323 // Perform SubWord(RotWord(dr)).
// Table i holds SubBytes(j) << 8*i (see file header), so sending byte n of dr
// to table (n+3) mod 4 performs the substitution and the byte rotation in one
// lookup. Each result is XORed straight into e0.
325 xor LookupS(3, t0), e0 // Look up byte 0 in table 3.
327 xor LookupS(0, t0), e0 // Look up byte 1 in table 0.
330 xor LookupS(1, t0), e0 // Look up byte 2 in table 1.
332 xor LookupS(2, t0), e0 // Look up byte 3 in table 2.
336 movzx (E), t0d // Get cached round constant.
337 xor t0d, e0 // XOR the round constant (in t0d) into e0.
339 // Chain to successive words.
// 0x36 is the tenth round constant in FIPS-197, the last one a 128-bit key
// uses; t0d still holds the constant loaded above.
348 cmp $0x36, t0d // Was this the last round constant?
352 xor r0, r0 // Return success (r0 = 0).
355 // Pop stack and restore registers.
// Reload each xmm register from the slot it was saved to on entry.
356 movaps 7*16(r4), %xmm7
357 movaps 6*16(r4), %xmm6
358 movaps 5*16(r4), %xmm5
359 movaps 4*16(r4), %xmm4
360 movaps 3*16(r4), %xmm3
361 movaps 2*16(r4), %xmm2
362 movaps 1*16(r4), %xmm1
363 movaps 0*16(r4), %xmm0
365 add $Padding + LocalsSize, r4 // Release the space allocated on entry.
377 // Reset definitions for next case.
// _aes_encrypt_key192: entry point for a 192-bit key. Per the file header it
// takes no key-length argument; Context is the second argument.
389 .globl _aes_encrypt_key192
390 // .private_extern _aes_encrypt_key192
393 /* Save registers and set SaveSize to the number of bytes pushed onto the
394 stack so far, including the caller's return address.
401 #define SaveSize (5*4)
403 #define SaveSize (2*8)
406 /* Number of bytes used for local variables:
408 8 16-byte spaces to save XMM registers.
410 #define LocalsSize (8*16)
413 // Padding to position stack pointer at a multiple of 16 bytes.
// (15 & -n) is the byte count, 0 through 15, that rounds n up to a multiple
// of 16.
414 #define Padding (15 & -(SaveSize + LocalsSize))
415 sub $Padding + LocalsSize, r4 // Allocate space on stack.
420 /* StackFrame is the number of bytes in our stack frame, from caller's
421 stack pointer to ours (so it includes the return address).
423 #define StackFrame (SaveSize + Padding + LocalsSize)
425 // Save xmm registers.
// movaps needs a 16-byte-aligned memory operand, which is what Padding is for.
426 movaps %xmm0, 0*16(r4)
427 movaps %xmm1, 1*16(r4)
428 movaps %xmm2, 2*16(r4)
429 movaps %xmm3, 3*16(r4)
430 movaps %xmm4, 4*16(r4)
431 movaps %xmm5, 5*16(r4)
432 movaps %xmm6, 6*16(r4)
433 movaps %xmm7, 7*16(r4)
// Arguments are addressed as 4-byte slots that begin StackFrame bytes above r4.
438 #define Argument(i) StackFrame+4*(i)(r4)
444 // Merge point for _aes_encrypt_key and _aes_encrypt_key192.
447 // First words of expanded key are copied from user key.
// 12*16 = 192: byte distance from the first round key to the last one, which
// is how the file header defines the stored "key length".
453 movl $12*16, ContextKeyLength(E) // Set "key length."
462 // K cannot be used after we write to R, since they use the same register.
468 #elif defined __x86_64__
// x86_64: take the table addresses RIP-relative into the R and STable registers.
470 lea _AESRcon(%rip), R
471 lea _AESSubBytesWordTable(%rip), STable
475 /* With a six-word key, there are twelve rounds (thirteen 16-byte key
481 // Store initial words of expanded key, which are copies of user's key.
// Six 4-byte words, ve0 through ve5, go to consecutive slots at E + offset.
482 movd ve0, 0*4(E, offset)
483 movd ve1, 1*4(E, offset)
484 movd ve2, 2*4(E, offset)
485 movd ve3, 3*4(E, offset)
486 movd ve4, 4*4(E, offset)
487 movd ve5, 5*4(E, offset)
489 /* Jump into loop body. The key expansion processes six four-byte words per
490 iteration. 52 are needed in the key. So only four are needed in the last
495 // Continue chaining to successive words.
497 movd ve4, 4*4(E, offset)
499 movd ve5, 5*4(E, offset)
// R is stepped by one byte before the load, so each pass reads the next
// entry of the byte-sized round constant table.
501 add $1, R // Advance pointer.
502 movd ve5, dr // Put previous word into dissection register.
503 movzx (R), t0 // Get round constant.
// NOTE(review): vt1 appears to carry the round constant just loaded through
// t0. If so, this XORs the round constant into ve0 (ve0 being the word from
// six words back) -- confirm and reword the comment below.
505 pxor vt1, ve0 // XOR with word from six words back.
507 // Perform SubWord(RotWord(dr)).
// Table i holds SubBytes(j) << 8*i (see file header), so sending byte n of dr
// to table (n+3) mod 4 performs the substitution and the byte rotation in one
// lookup.
509 movd LookupS(3, t0), vt0 // Look up byte 0 in table 3.
511 movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
516 movd LookupS(1, t0), vt0 // Look up byte 2 in table 1.
518 movd LookupS(2, t0), vt1 // Look up byte 3 in table 2.
524 // Chain to successive words.
525 movd ve0, 0*4(E, offset)
527 movd ve1, 1*4(E, offset)
529 movd ve2, 2*4(E, offset)
531 movd ve3, 3*4(E, offset)
535 xor r0, r0 // Return success (r0 = 0).
537 // Pop stack and restore registers.
// Reload each xmm register from the slot it was saved to on entry.
538 movaps 7*16(r4), %xmm7
539 movaps 6*16(r4), %xmm6
540 movaps 5*16(r4), %xmm5
541 movaps 4*16(r4), %xmm4
542 movaps 3*16(r4), %xmm3
543 movaps 2*16(r4), %xmm2
544 movaps 1*16(r4), %xmm1
545 movaps 0*16(r4), %xmm0
547 add $Padding + LocalsSize, r4 // Release the space allocated on entry.
559 // Reset definitions for next case.
// _aes_encrypt_key256: entry point for a 256-bit key. Per the file header it
// takes no key-length argument; Context is the second argument.
566 .globl _aes_encrypt_key256
567 // .private_extern _aes_encrypt_key256
570 /* Save registers and set SaveSize to the number of bytes pushed onto the
571 stack so far, including the caller's return address.
578 #define SaveSize (5*4)
580 #define SaveSize (2*8)
583 /* Number of bytes used for local variables:
585 8 16-byte spaces to save XMM registers.
587 #define LocalsSize (8*16)
590 // Padding to position stack pointer at a multiple of 16 bytes.
// (15 & -n) is the byte count, 0 through 15, that rounds n up to a multiple
// of 16.
591 #define Padding (15 & -(SaveSize + LocalsSize))
592 sub $Padding + LocalsSize, r4 // Allocate space on stack.
597 /* StackFrame is the number of bytes in our stack frame, from caller's
598 stack pointer to ours (so it includes the return address).
600 #define StackFrame (SaveSize + Padding + LocalsSize)
602 // Save xmm registers.
// movaps needs a 16-byte-aligned memory operand, which is what Padding is for.
603 movaps %xmm0, 0*16(r4)
604 movaps %xmm1, 1*16(r4)
605 movaps %xmm2, 2*16(r4)
606 movaps %xmm3, 3*16(r4)
607 movaps %xmm4, 4*16(r4)
608 movaps %xmm5, 5*16(r4)
609 movaps %xmm6, 6*16(r4)
610 movaps %xmm7, 7*16(r4)
// Arguments are addressed as 4-byte slots that begin StackFrame bytes above r4.
615 #define Argument(i) StackFrame+4*(i)(r4)
621 // Merge point for _aes_encrypt_key and _aes_encrypt_key256.
624 // First words of expanded key are copied from user key.
// 14*16 = 224: byte distance from the first round key to the last one, which
// is how the file header defines the stored "key length".
630 movl $14*16, ContextKeyLength(E) // Set "key length."
636 // Store initial words of expanded key, which are copies of user's key.
646 // K cannot be used after we write to R, since they use the same register.
652 #elif defined __x86_64__
// x86_64: take the table addresses RIP-relative into the R and STable registers.
654 lea _AESRcon(%rip), R
655 lea _AESSubBytesWordTable(%rip), STable
659 /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key
665 // Store initial words of expanded key, which are copies of user's key.
// ve0 through ve3 go to word slots 4 through 7 relative to E + offset.
666 movd ve0, 4*4(E, offset)
667 movd ve1, 5*4(E, offset)
668 movd ve2, 6*4(E, offset)
669 movd ve3, 7*4(E, offset)
671 /* Jump into loop body. The key expansion processes eight four-byte words per
672 iteration. 60 are needed in the key. So only four are needed in the last
677 movd ve3, dr // Put previous word into dissection register.
679 /* Get word from eight words back (it is four words back from where E
680 currently points, and we use it to prepare the value to be stored
681 four words beyond where E currently points).
683 movd -4*4(E, offset), ve0
685 // Perform SubWord(dr).
// No rotation in this half of the iteration: byte n of dr is looked up in
// table n, so each substituted byte stays in its own byte position.
687 movd LookupS(0, t0), vt0 // Look up byte 0 in table 0.
689 movd LookupS(1, t0), vt1 // Look up byte 1 in table 1.
692 movd LookupS(2, t0), vt2 // Look up byte 2 in table 2.
694 movd LookupS(3, t0), vt3 // Look up byte 3 in table 3.
700 movd -3*4(E, offset), ve1 // Get words from eight words back.
701 movd -2*4(E, offset), ve2
702 movd -1*4(E, offset), ve3
704 // Chain to successive words.
705 movd ve0, 4*4(E, offset)
707 movd ve1, 5*4(E, offset)
709 movd ve2, 6*4(E, offset)
711 movd ve3, 7*4(E, offset)
// R is stepped by one byte before the load, so each pass reads the next
// entry of the byte-sized round constant table.
714 add $1, R // Advance pointer.
715 movd ve3, dr // Put previous word into dissection register.
716 movzx (R), t0d // Get round constant.
718 movd 0*4(E, offset), ve0 // Get word from eight words back.
721 // Perform SubWord(RotWord(dr)).
// Table i holds SubBytes(j) << 8*i (see file header), so sending byte n of dr
// to table (n+3) mod 4 performs the substitution and the byte rotation in one
// lookup.
723 movd LookupS(3, t0), vt0 // Look up byte 0 in table 3.
725 movd LookupS(0, t0), vt1 // Look up byte 1 in table 0.
728 movd LookupS(1, t0), vt2 // Look up byte 2 in table 1.
730 movd LookupS(2, t0), vt3 // Look up byte 3 in table 2.
736 movd 1*4(E, offset), ve1
737 movd 2*4(E, offset), ve2
738 movd 3*4(E, offset), ve3
742 // Chain to successive words.
743 movd ve0, 0*4(E, offset)
745 movd ve1, 1*4(E, offset)
747 movd ve2, 2*4(E, offset)
749 movd ve3, 3*4(E, offset)
753 xor r0, r0 // Return success (r0 = 0).
755 // Pop stack and restore registers.
// Reload each xmm register from the slot it was saved to on entry.
756 movaps 7*16(r4), %xmm7
757 movaps 6*16(r4), %xmm6
758 movaps 5*16(r4), %xmm5
759 movaps 4*16(r4), %xmm4
760 movaps 3*16(r4), %xmm3
761 movaps 2*16(r4), %xmm2
762 movaps 1*16(r4), %xmm1
763 movaps 0*16(r4), %xmm0
765 add $Padding + LocalsSize, r4 // Release the space allocated on entry.