1 /* This file defines _aes_encrypt or _aes_decrypt, according to the value of
2 the Select preprocessor symbol. This file is designed to be included in
3 another assembly file using the preprocessor #include directive, to benefit
4 from some assembly-time calculations.
6 These two routines are nearly identical. They differ only in the tables
7 they use, the direction they iterate through the key, and the permutation
8 performed on part of the state.
10 Written by Eric Postpischil, January 2008.
13 /* add AES HW detection and HW-specific program branch cclee 3-12-10 */
15 #include <i386/cpu_capabilities.h>
17 #include <System/i386/cpu_capabilities.h>
21 #define Name _aes_encrypt // Routine name.
22 #define MTable _AESEncryptTable // Main table.
23 #define FTable _AESSubBytesWordTable // Final table.
24 #define P0 S0 // State permutation.
28 #define Increment +16 // ExpandedKey increment.
30 #define Name _aes_decrypt // Routine name.
31 #define MTable _AESDecryptTable // Main table.
32 #define FTable _AESInvSubBytesWordTable // Final table.
33 #define P0 S2 // State permutation.
37 #define Increment -16 // ExpandedKey increment.
39 #define Name _aes_encrypt_xmm_no_save // Routine name.
40 #define MTable _AESEncryptTable // Main table.
41 #define FTable _AESSubBytesWordTable // Final table.
42 #define P0 S0 // State permutation.
46 #define Increment +16 // ExpandedKey increment.
48 #define Name _aes_decrypt_xmm_no_save // Routine name.
49 #define MTable _AESDecryptTable // Main table.
50 #define FTable _AESInvSubBytesWordTable // Final table.
51 #define P0 S2 // State permutation.
55 #define Increment -16 // ExpandedKey increment.
61 _aes_encrypt (if Select is 0), _aes_decrypt (if Select is 1),
62 _aes_encrypt_xmm_no_save (if Select is 2), or _aes_decrypt_xmm_no_save (if Select is 3).
66 Perform the AES cipher or its inverse as defined in Federal Information
67 Processing Standards Publication 197 (FIPS-197), November 26, 2001.
69 The inverse cipher here is the "Equivalent Inverse Cipher" in FIPS-197.
75 The following names must be locally defined so the assembler
76 can calculate certain offsets.
80 static const Word _AESEncryptTable[4][256].
82 _AESEncryptTable[i] contains the tables T[i] defined in AES
83 Proposal: Rijndael, version 2, 03/09/99, by Joan Daemen and
84 Vincent Rijmen, section 5.2.1, page 18. These tables
85 combine the SubBytes and MixColumns operations.
87 static const Word _AESSubBytesWordTable[4][256].
89 _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
90 SubBytes is defined in FIPS-197. _AESSubBytesWordTable
91 differs from _AESEncryptTable in that it does not include
92 the MixColumn operation. It is used in performing the last
93 round, which differs from the previous rounds in that it
94 does not include the MixColumn operation.
98 static const Word _AESDecryptTable[4][256].
100 The analog of _AESEncryptTable for decryption.
102 static const Word _AESInvSubBytesWordTable[4][256].
104 _AESInvSubBytesWordTable[i][j] = InvSubBytes(j) << 8*i,
105 where InvSubBytes is defined in FIPS-197.
106 _AESInvSubBytesWordTable differs from _AESDecryptTable in
107 that it does not include the InvMixColumn operation. It is
108 used in performing the last round, which differs from the
109 previous rounds in that it does not include the
110 InvMixColumn operation.
114 const Byte *InputText.
116 Address of input, 16 bytes. Best if four-byte aligned.
120 Address of output, 16 bytes. Best if four-byte aligned.
122 aes_encrypt_ctx *Context or aes_decrypt_ctx *Context
124 aes_encrypt_ctx and aes_decrypt_ctx are identical except the
125 former is used for encryption and the latter for decryption.
127 Each is a structure containing the expanded key beginning at
128 offset ContextKey and a four-byte "key length" beginning at
129 offset ContextKeyLength. The "key length" is the number of
130 bytes from the start of the first round key to the start of the
131 last round key. That is 16 less than the number of bytes in
136 Encrypted or decrypted data is written to *OutputText.
140 aes_rval // -1 if "key length" is invalid. 0 otherwise.
147 // detect AES HW, cclee 3-13-10
148 #if Select < 2 // only for aes_encrypt/aes_decrypt
149 #if defined __x86_64__
150 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
151 mov (%rax), %eax // %eax = __cpu_capabilities
154 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
155 mov (%eax), %eax // %eax = __cpu_capabilities
157 mov _COMM_PAGE_CPU_CAPABILITIES, %eax
160 test $(kHasAES), %eax // __cpu_capabilities & kHasAES
162 jne _aes_encrypt_hw // if AES HW detected, branch to HW specific code
164 jne _aes_decrypt_hw // if AES HW detected, branch to HW specific code
168 // Push new stack frame.
171 /* Save registers and set SaveSize to the number of bytes pushed onto the
172 stack so far, including the caller's return address.
178 #define SaveSize (5*4)
180 #define SaveSize (3*8)
183 /* Number of bytes used for local variables:
185 4 (i386) or 0 (x86_64) bytes for ExpandedKeyEnd.
187 5 (i386) or 3 (x86_64) 16-byte spaces to save XMM registers.
189 #define LocalsSize (Arch(4, 0) + Arch(5, 3)*16)
192 // Padding to position stack pointer at a multiple of 16 bytes.
193 #define Padding (15 & -(SaveSize + LocalsSize))
194 sub $Padding + LocalsSize, r4 // Allocate space on stack.
201 // Save XMM registers. movaps requires 16-byte-aligned addresses, so these stores rely on Padding having placed the stack pointer at a multiple of 16 bytes.
202 movaps %xmm0, 0*16(r4) // Slot 0 of the XMM save area, which starts at the stack pointer.
203 movaps %xmm1, 1*16(r4)
204 movaps %xmm2, 2*16(r4)
206 movaps %xmm3, 3*16(r4) // NOTE(review): LocalsSize reserves 5 slots on i386 but only 3 on x86_64 -- confirm the stores to slots 3 and 4 are i386-only.
207 movaps %xmm4, 4*16(r4)
214 // Number of bytes from caller's stack pointer to ours.
215 #define StackFrame (SaveSize + Padding + LocalsSize)
217 // Define location of argument i (presuming 4-byte arguments).
218 #define Argument(i) StackFrame+4*(i)(%esp)
220 #define ArgInputText Argument(0)
221 #define ArgOutputText Argument(1)
222 #define ArgContext Argument(2)
224 #elif defined __x86_64__
227 #define InputText r7 // Used early then overwritten for other use.
228 #define OutputText r6 // Needed near end of routine.
229 #define ArgContext r2
230 /* The argument passed in r2 overlaps registers we need for other
231 work, so it must be moved early in the routine.
236 #define BaseP Arch(r6, r9) // Base pointer for addressing global data.
237 #define ExpandedKey Arch(t0, r10) // Address of expanded key.
239 /* The Work registers defined below are used to hold parts of the AES state
240 while we dissect or assemble it. They must be assigned to the A, B, C, and
241 D registers so that we can access the bytes in %al, %ah, and so on.
257 #define t0d r5d // Low 32 bits of t0.
258 #define t0l r5l // Low byte of t0.
262 /* S0, S1, S2, and S3 are where we assemble the new AES state when computing
263 a regular round. S1, S2, and S3 are assigned to the Work registers, but
264 S0 needs to go somewhere else because Work0 holds part of the old state.
266 #define S0 Arch(t1, r8d)
271 /* These XMM registers are used as holding space, because it is faster to
272 spill to these registers than to the stack. (On x86_64, we do not need
273 to spill, because there are additional general registers available.
274 However, using more general registers requires saving them to the stack
275 and restoring them. I timed it, and no time was saved.)
281 #define vExpandedKey %xmm3
282 #define vIncrement %xmm4
285 // Get address of expanded key.
286 mov ArgContext, ExpandedKey
288 add $ContextKey, ExpandedKey
291 /* Store sentinel value of ExpandedKey on the stack on i386, a register on
294 #define ExpandedKeyEnd Arch(5*16(r4), r11)
296 // Get and check "key length". It is addressed relative to ExpandedKey, which by now holds the context address plus ContextKey.
297 movzx ContextKeyLength(ExpandedKey), r0 // NOTE(review): movzx with a memory source and no size suffix leaves the load width implicit, and it reads less than the four-byte field the header describes -- confirm the intended width.
304 mov $-1, r0 // Return error. r0 carries the aes_rval result: -1 for an invalid "key length".
308 #if (Select == 0 || Select == 2)
309 // For encryption, prepare to iterate forward through expanded key.
311 mov r0, ExpandedKeyEnd
313 // For decryption, prepare to iterate backward through expanded key.
314 mov ExpandedKey, ExpandedKeyEnd
318 // Initialize State from input text.
320 mov ArgInputText, BaseP
321 #define InputText BaseP
323 mov 0*4(InputText), Work0d
324 mov 1*4(InputText), S1
325 mov 2*4(InputText), S2
326 mov 3*4(InputText), S3
327 #undef InputText // Register is reused after this for other purposes.
329 // Add round key and save results.
330 xor 0*4(ExpandedKey), Work0d // S0 is in dissection register.
331 xor 1*4(ExpandedKey), S1
332 movd S1, vS1 // Save S1 to S3 in vector registers.
333 xor 2*4(ExpandedKey), S2
335 xor 3*4(ExpandedKey), S3
338 add $Increment, ExpandedKey // Advance to next round key.
341 // Save expanded key address and increment in vector registers.
343 movp ExpandedKey, vExpandedKey
347 // Set up relative addressing.
350 // Get address of 0 in BaseP.
351 call 0f // Push program counter onto stack.
353 pop BaseP // Get program counter.
355 // Define macros to help address data.
356 #define LookupM(table, index) MTable-0b+(table)*TableSize(BaseP, index, 4)
357 #define LookupF(table, index) FTable-0b+(table)*TableSize(BaseP, index, 4)
359 #elif defined __x86_64__
361 lea MTable(%rip), BaseP
363 // Define macros to help address data.
364 #define LookupM(table, index) (table)*TableSize(BaseP, index, 4)
365 #define LookupF(table, index) (table)*TableSize(BaseP, index, 4)
367 /* With these definitions of LookupM and LookupF, BaseP must be loaded with
368 the address of the table at the point where it is used. So we need an
369 instruction to change BaseP after we are done with MTable and before we
370 start using FTable. I would prefer to use something like:
372 .set FMinusM, FTable - MTable
373 #define LookupF(table, index) \
374 FMinusM+(table)*TableSize(BaseP, index, 4)
376 Then BaseP would not need to change. However, this fails due to an
377 assembler/linker bug, <rdar://problem/5683882>.
383 mov 0*4(ExpandedKey), S0
384 mov 1*4(ExpandedKey), S1
385 mov 2*4(ExpandedKey), S2
386 mov 3*4(ExpandedKey), S3
389 /* Word 0 of the current state must be in Work0 now, and the next round
390 key must be in S0 to S3.
393 // Process previous S0.
395 xor LookupM(0, t0), S0
397 xor LookupM(1, t0), P3
400 xor LookupM(2, t0), S2
402 xor LookupM(3, t0), P1
404 // Process previous S1.
407 xor LookupM(0, t0), S1
409 xor LookupM(1, t0), P0
412 xor LookupM(2, t0), S3
414 xor LookupM(3, t0), P2
416 // Process previous S2.
419 xor LookupM(0, t0), S2
421 xor LookupM(1, t0), P1
424 xor LookupM(2, t0), S0
426 xor LookupM(3, t0), P3
428 // Process previous S3.
431 xor LookupM(0, t0), S3
433 xor LookupM(1, t0), P2
436 xor LookupM(2, t0), S1
438 xor LookupM(3, t0), P0
441 paddd vIncrement, vExpandedKey
442 movp vExpandedKey, ExpandedKey
444 add $Increment, ExpandedKey
447 // Save state for next iteration and load next round key.
449 mov 0*4(ExpandedKey), S0
451 mov 1*4(ExpandedKey), S1
453 mov 2*4(ExpandedKey), S2
455 mov 3*4(ExpandedKey), S3
457 cmp ExpandedKeyEnd, ExpandedKey
460 /* Word 0 of the current state must be in Work0 now, and the next round
461 key must be in S0 to S3.
464 // Work around assembler bug. See comments above about Radar 5683882.
465 #if defined __x86_64__
466 lea FTable(%rip), BaseP
469 // Process previous S0.
471 xor LookupF(0, t0), S0
473 xor LookupF(1, t0), P3
476 xor LookupF(2, t0), S2
478 xor LookupF(3, t0), P1
480 // Process previous S1.
483 xor LookupF(0, t0), S1
485 xor LookupF(1, t0), P0
488 xor LookupF(2, t0), S3
490 xor LookupF(3, t0), P2
492 // Process previous S2.
495 xor LookupF(0, t0), S2
497 xor LookupF(1, t0), P1
500 xor LookupF(2, t0), S0
502 xor LookupF(3, t0), P3
504 // Process previous S3.
507 xor LookupF(0, t0), S3
509 xor LookupF(1, t0), P2
512 xor LookupF(2, t0), S1
514 xor LookupF(3, t0), P0
516 #if defined __i386__ // Architecture.
517 // Get OutputText address.
518 #define OutputText BaseP
519 mov ArgOutputText, OutputText
520 #endif // Architecture.
523 mov S0, 0*4(OutputText)
524 mov S1, 1*4(OutputText)
525 mov S2, 2*4(OutputText)
526 mov S3, 3*4(OutputText)
528 xor r0, r0 // Return success.
531 // Pop stack and restore registers. The XMM slots are reloaded from the same offsets the prologue stored them to, before the frame is released.
535 movaps 4*16(r4), %xmm4 // NOTE(review): slots 3 and 4 exist only in the 5-slot i386 layout described for LocalsSize -- confirm these two loads are i386-only.
536 movaps 3*16(r4), %xmm3
538 movaps 2*16(r4), %xmm2
539 movaps 1*16(r4), %xmm1
540 movaps 0*16(r4), %xmm0
544 add $Padding + LocalsSize, r4 // Undo the matching sub of Padding + LocalsSize that allocated the frame.
549 #elif defined __x86_64__
557 #undef ArgExpandedKey
564 #undef ExpandedKeyEnd