]> git.saurik.com Git - apple/xnu.git/blob - bsd/crypto/aes/i386/EncryptDecrypt.s
xnu-1699.24.23.tar.gz
[apple/xnu.git] / bsd / crypto / aes / i386 / EncryptDecrypt.s
1 /* This file defines _aes_encrypt, _aes_decrypt, _aes_encrypt_xmm_no_save, or
2 _aes_decrypt_xmm_no_save, according to the value of the Select preprocessor
3 symbol (0 to 3). It is designed to be included in another assembly file using
4 the preprocessor #include directive, to benefit from assembly-time calculations.
5
6 These two routines are nearly identical. They differ only in the tables
7 they use, the direction they iterate through the key, and the permutation
8 performed on part of the state.
9
10 Written by Eric Postpischil, January 2008.
11 */
12
13 /* add AES HW detection and HW-specific program branch cclee 3-12-10 */
14 #ifdef KERNEL
15 #include <i386/cpu_capabilities.h>
16 #else
17 #include <System/i386/cpu_capabilities.h>
18 #endif
19
20 // Entry-point name, one per value of Select.
21 #if Select == 0
22 #define Name _aes_encrypt
23 #elif Select == 1
24 #define Name _aes_decrypt
25 #elif Select == 2
26 #define Name _aes_encrypt_xmm_no_save
27 #elif Select == 3
28 #define Name _aes_decrypt_xmm_no_save
29 #endif // Name
30
31 /* Cipher direction: Select 0 and 2 encrypt, Select 1 and 3 decrypt. The
32 direction chooses the lookup tables, the state permutation P0 to P3, and
33 the sign of the step taken through the expanded key.
34 */
35 #if Select == 0 || Select == 2
36
37 // Encryption.
38 #define MTable _AESEncryptTable // Table for the regular rounds.
39 #define FTable _AESSubBytesWordTable // Table for the last round.
40 #define Increment +16 // Step forward through ExpandedKey.
41 #define P0 S0 // Identity state permutation.
42 #define P1 S1
43 #define P2 S2
44 #define P3 S3
45
46 #elif Select == 1 || Select == 3
47
48 // Decryption.
49 #define MTable _AESDecryptTable // Table for the regular rounds.
50 #define FTable _AESInvSubBytesWordTable // Table for the last round.
51 #define Increment -16 // Step backward through ExpandedKey.
52 #define P0 S2 // State permutation swaps word pairs.
53 #define P1 S3
54 #define P2 S0
55 #define P3 S1
56 #endif // Select
57
58
59 /* Routine:
60
61 _aes_encrypt (Select 0), _aes_decrypt (Select 1), _aes_encrypt_xmm_no_save
62 (Select 2), or _aes_decrypt_xmm_no_save (Select 3).
63
64 Function:
65
66 Perform the AES cipher or its inverse as defined in Federal Information
67 Processing Standards Publication 197 (FIPS-197), November 26, 2001.
68
69 The inverse cipher here is the "Equivalent Inverse Cipher" in FIPS-197.
70
71 Input:
72
73 Constant data:
74
75 The following names must be locally defined so the assembler
76 can calculate certain offsets.
77
78 For encryption:
79
80 static const Word _AESEncryptTable[4][256].
81
82 _AESEncryptTable[i] contains the tables T[i] defined in AES
83 Proposal: Rijndael, version 2, 03/09/99, by Joan Daemen and
84 Vincent Rijmen, section 5.2.1, page 18. These tables
85 combine the SubBytes and MixColumns operations.
86
87 static const Word _AESSubBytesWordTable[4][256].
88
89 _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
90 SubBytes is defined in FIPS-197. _AESSubBytesWordTable
91 differs from _AESEncryptTable in that it does not include
92 the MixColumns operation. It is used in performing the last
93 round, which differs from the previous rounds in that it
94 does not include the MixColumns operation.
95
96 For decryption:
97
98 static const Word _AESDecryptTable[4][256].
99
100 The analog of _AESEncryptTable for decryption.
101
102 static const Word _AESInvSubBytesWordTable[4][256].
103
104 _AESInvSubBytesWordTable[i][j] = InvSubBytes(j) << 8*i,
105 where InvSubBytes is defined in FIPS-197.
106 _AESInvSubBytesWordTable differs from _AESDecryptTable in
107 that it does not include the InvMixColumn operation. It is
108 used in performing the last round, which differs from the
109 previous rounds in that it does not include the
110 InvMixColumn operation.
111
112 Arguments:
113
114 const Byte *InputText.
115
116 Address of input, 16 bytes. Best if four-byte aligned.
117
118 Byte *OutputText.
119
120 Address of output, 16 bytes. Best if four-byte aligned.
121
122 aes_encrypt_ctx *Context or aes_decrypt_ctx *Context
123
124 aes_encrypt_ctx and aes_decrypt_ctx are identical except the
125 former is used for encryption and the latter for decryption.
126
127 Each is a structure containing the expanded key beginning at
128 offset ContextKey and a four-byte "key length" beginning at
129 offset ContextKeyLength. The "key length" is the number of
130 bytes from the start of the first round key to the start of the
131 last round key. That is 16 less than the number of bytes in
132 the entire key.
133
134 Output:
135
136 Encrypted or decrypted data is written to *OutputText.
137
138 Return:
139
140 aes_rval // -1 if "key length" is invalid. 0 otherwise.
141 */
142
143 .text
144 .globl Name
145 Name:
146 // Entry point. Name expands to the routine symbol chosen by Select.
147 // detect AES HW, cclee 3-13-10
148 #if Select < 2 // only for aes_encrypt/aes_decrypt
149 #if defined __x86_64__
150 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
151 mov (%rax), %eax // %eax = __cpu_capabilities
152 #else // not __x86_64__
153 #if defined KERNEL
154 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
155 mov (%eax), %eax // %eax = __cpu_capabilities
156 #else // not KERNEL
157 mov _COMM_PAGE_CPU_CAPABILITIES, %eax // %eax = word loaded from _COMM_PAGE_CPU_CAPABILITIES
158 #endif // KERNEL
159 #endif // __x86_64__
160 test $(kHasAES), %eax // __cpu_capabilities & kHasAES
161 #if Select == 0
162 jne _aes_encrypt_hw // kHasAES bit set: branch to HW specific code. Nothing has been pushed yet, so that code starts with the caller's stack unchanged.
163 #else
164 jne _aes_decrypt_hw // kHasAES bit set: branch to HW specific code. Nothing has been pushed yet, so that code starts with the caller's stack unchanged.
165 #endif
166 #endif // Select
167
168 // Save r5. No frame pointer is set up here; r5 is used later as the scratch register t0.
169 push r5
170
171 /* Save registers and set SaveSize to the number of bytes pushed onto the
172 stack so far, including the caller's return address.
173 */
174 push r3
175 #if defined __i386__
176 push r6
177 push r7
178 #define SaveSize (5*4) // Return address, r5, r3, r6, r7.
179 #else
180 #define SaveSize (3*8) // Return address, r5, r3.
181 #endif
182
183 /* Number of bytes used for local variables:
184
185 4 (i386) or 0 (x86_64) bytes for ExpandedKeyEnd.
186
187 5 (i386) or 3 (x86_64) 16-byte spaces to save XMM registers.
188 */
189 #define LocalsSize (Arch(4, 0) + Arch(5, 3)*16) // Arch(a, b) is defined outside this file; presumably a on i386 and b on x86_64.
190
191 #if 0 < LocalsSize
192 // Padding to position stack pointer at a multiple of 16 bytes.
193 #define Padding (15 & -(SaveSize + LocalsSize)) // Bytes needed to round SaveSize + LocalsSize up to a multiple of 16.
194 sub $Padding + LocalsSize, r4 // Allocate space on stack.
195 #else
196 #define Padding 0
197 #endif
198
199 #ifdef KERNEL
200 #if Select < 2
201 // Save XMM registers. Only KERNEL builds of the Select 0 and 1 routines do this; the stack slots are reserved in every build.
202 movaps %xmm0, 0*16(r4) // movaps needs a 16-byte-aligned address; assumes the caller's stack pointer was 16-byte aligned at the call -- TODO confirm.
203 movaps %xmm1, 1*16(r4)
204 movaps %xmm2, 2*16(r4)
205 #if defined __i386__
206 movaps %xmm3, 3*16(r4) // %xmm3 and %xmm4 are used only on i386 (vExpandedKey, vIncrement).
207 movaps %xmm4, 4*16(r4)
208 #endif
209 #endif // Select
210 #endif // KERNEL
211
212 #if defined __i386__
213
214 // Number of bytes from caller's stack pointer to ours.
215 #define StackFrame (SaveSize + Padding + LocalsSize)
216
217 // Define location of argument i (presuming 4-byte arguments).
218 #define Argument(i) StackFrame+4*(i)(%esp) // SaveSize counts the return address, so Argument(0) is the first word above it.
219
220 #define ArgInputText Argument(0)
221 #define ArgOutputText Argument(1)
222 #define ArgContext Argument(2)
223
224 #elif defined __x86_64__
225
226 // Arguments.
227 #define InputText r7 // Used early then overwritten for other use.
228 #define OutputText r6 // Needed near end of routine.
229 #define ArgContext r2 // r2d is also Work3d (S3), defined below.
230 /* The argument passed in r2 overlaps registers we need for other
231 work, so it must be moved early in the routine.
232 */
233
234 #endif
235
236 #define BaseP Arch(r6, r9) // Base pointer for addressing global data.
237 #define ExpandedKey Arch(t0, r10) // Address of expanded key. On i386 this is t0, which the rounds also use as a table index, so the address is kept in vExpandedKey and reloaded every round.
238
239 /* The Work registers defined below are used to hold parts of the AES state
240 while we dissect or assemble it. They must be assigned to the A, B, C, and
241 D registers so that we can access the bytes in %al, %ah, and so on.
242 */
243 #define Work0d r0d // Old state word being dissected.
244 #define Work0l r0l // Its low byte.
245 #define Work0h r0h // Its second byte.
246 #define Work1d r3d
247 #define Work1l r3l
248 #define Work1h r3h
249 #define Work2d r1d
250 #define Work2l r1l
251 #define Work2h r1h
252 #define Work3d r2d
253 #define Work3l r2l
254 #define Work3h r2h
255
256 #define t0 r5 // Scratch: holds the byte used as a table index.
257 #define t0d r5d // Low 32 bits of t0.
258 #define t0l r5l // Low byte of t0.
259
260 #define t1 r7 // On i386: S0, and the temporary used to load vIncrement.
261
262 /* S0, S1, S2, and S3 are where we assemble the new AES state when computing
263 a regular round. S1, S2, and S3 are assigned to the Work registers, but
264 S0 needs to go somewhere else because Work0 holds part of the old state.
265 */
266 #define S0 Arch(t1, r8d)
267 #define S1 Work1d
268 #define S2 Work2d
269 #define S3 Work3d
270
271 /* These XMM registers are used as holding space, because it is faster to
272 spill to these registers than to the stack. (On x86_64, we do not need
273 to spill, because there are additional general registers available.
274 However, using more general registers requires saving them to the stack
275 and restoring them. I timed it, and no time was saved.)
276 */
277 #define vS1 %xmm0 // State word 1 between rounds.
278 #define vS2 %xmm1 // State word 2 between rounds.
279 #define vS3 %xmm2 // State word 3 between rounds.
280 #if defined __i386__
281 #define vExpandedKey %xmm3 // Copy of the ExpandedKey pointer.
282 #define vIncrement %xmm4 // Increment, added to vExpandedKey each round.
283 #endif
284
285 // Get address of expanded key.
286 mov ArgContext, ExpandedKey // On x86_64 this also frees r2, which is Work3/S3.
287 #if 0 != ContextKey
288 add $ContextKey, ExpandedKey
289 #endif
290
291 /* Store sentinel value of ExpandedKey on the stack on i386, a register on
292 x86_64. The round loop stops when ExpandedKey equals this value.
293 */
294 #define ExpandedKeyEnd Arch(5*16(r4), r11) // The i386 slot sits just above the five 16-byte XMM save slots.
295
296 // Get and check "key length". NOTE(review): ExpandedKey already includes ContextKey here, so the load addresses Context + ContextKey + ContextKeyLength; that matches the header comment only if ContextKey is 0 -- confirm.
297 movzx ContextKeyLength(ExpandedKey), r0 // NOTE(review): no size suffix, so the assembler picks the source width; every accepted value fits in one byte.
298 cmp $160, r0 // 160 = 16 * 10.
299 je 2f
300 cmp $192, r0 // 192 = 16 * 12.
301 je 2f
302 cmp $224, r0 // 224 = 16 * 14.
303 je 2f
304 mov $-1, r0 // Return error.
305 jmp 9f // Shared exit: restores registers and returns with r0 = -1.
306 2: // Key length accepted; r0 = byte offset of the last round key.
307
308 #if (Select == 0 || Select == 2)
309 // For encryption, prepare to iterate forward through expanded key.
310 add ExpandedKey, r0 // r0 = address of the last round key.
311 mov r0, ExpandedKeyEnd // Sentinel = last round key; ExpandedKey stays at the first.
312 #else
313 // For decryption, prepare to iterate backward through expanded key.
314 mov ExpandedKey, ExpandedKeyEnd // Sentinel = start of the expanded key.
315 add r0, ExpandedKey // Start at the other end of the expanded key.
316 #endif
317
318 // Initialize State from input text.
319 #if defined __i386__
320 mov ArgInputText, BaseP // BaseP is free until the table base is loaded later.
321 #define InputText BaseP
322 #endif
323 mov 0*4(InputText), Work0d // Word 0 goes straight to the dissection register.
324 mov 1*4(InputText), S1
325 mov 2*4(InputText), S2
326 mov 3*4(InputText), S3
327 #undef InputText // Register is reused after this for other purposes.
328
329 // Add round key and save results.
330 xor 0*4(ExpandedKey), Work0d // S0 is in dissection register.
331 xor 1*4(ExpandedKey), S1
332 movd S1, vS1 // Save S1 to S3 in vector registers.
333 xor 2*4(ExpandedKey), S2
334 movd S2, vS2
335 xor 3*4(ExpandedKey), S3
336 movd S3, vS3
337
338 add $Increment, ExpandedKey // Advance to next round key.
339
340 #if defined __i386__
341 // Save expanded key address and increment in vector registers. movp is defined outside this file; presumably a pointer-sized move between general and XMM registers -- confirm.
342 mov $Increment, t1 // t1 is only a temporary here; it is loaded as S0 afterwards.
343 movp ExpandedKey, vExpandedKey
344 movp t1, vIncrement
345 #endif
346
347 // Set up relative addressing.
348 #if defined __i386__
349
350 // Get address of 0 in BaseP.
351 call 0f // Push program counter onto stack.
352 0:
353 pop BaseP // Get program counter.
354
355 // Define macros to help address data. TableSize is defined outside this file; presumably the size in bytes of one 256-entry table -- confirm.
356 #define LookupM(table, index) MTable-0b+(table)*TableSize(BaseP, index, 4) // Displacement is relative to label 0, whose address is in BaseP; index is scaled by 4.
357 #define LookupF(table, index) FTable-0b+(table)*TableSize(BaseP, index, 4) // Same, for the final-round table.
358
359 #elif defined __x86_64__
360
361 lea MTable(%rip), BaseP // BaseP = address of MTable.
362
363 // Define macros to help address data.
364 #define LookupM(table, index) (table)*TableSize(BaseP, index, 4) // Valid while BaseP holds the address of MTable.
365 #define LookupF(table, index) (table)*TableSize(BaseP, index, 4) // Valid only after BaseP is reloaded with the address of FTable.
366
367 /* With these definitions of LookupM and LookupF, BaseP must be loaded with
368 the address of the table at the point where it is used. So we need an
369 instruction to change BaseP after we are done with MTable and before we
370 start using FTable. I would prefer to use something like:
371
372 .set FMinusM, FTable - MTable
373 #define LookupF(table, index) \
374 FMinusM+(table)*TableSize(BaseP, index, 4)
375
376 Then BaseP would not need to change. However, this fails due to an
377 assembler/linker bug, <rdar://problem/5683882>.
378 */
379
380 #endif
381
382 // Get round key. It seeds S0 to S3, into which the round then XORs table entries.
383 mov 0*4(ExpandedKey), S0
384 mov 1*4(ExpandedKey), S1
385 mov 2*4(ExpandedKey), S2
386 mov 3*4(ExpandedKey), S3
387
388 1: // Top of the loop: one regular round per iteration.
389 /* Word 0 of the current state must be in Work0 now, and the next round
390 key must be in S0 to S3. Words 1 to 3 of the current state are in vS1 to vS3.
391 */
392 // Each old state word is split into bytes; byte k indexes table k of MTable and the entry is XORed into one new-state word.
393 // Process previous S0.
394 movzx Work0l, t0 // t0 = byte 0 of the old word.
395 xor LookupM(0, t0), S0
396 movzx Work0h, t0d // t0 = byte 1.
397 xor LookupM(1, t0), P3 // P names select the target word according to the direction.
398 shr $16, Work0d // Bring bytes 2 and 3 down into Work0l and Work0h.
399 movzx Work0l, t0d // t0 = byte 2.
400 xor LookupM(2, t0), S2
401 movzx Work0h, t0d // t0 = byte 3.
402 xor LookupM(3, t0), P1
403
404 // Process previous S1.
405 movd vS1, Work0d // Fetch old word 1 for dissection.
406 movzx Work0l, t0d
407 xor LookupM(0, t0), S1
408 movzx Work0h, t0d
409 xor LookupM(1, t0), P0
410 shr $16, Work0d
411 movzx Work0l, t0d
412 xor LookupM(2, t0), S3
413 movzx Work0h, t0d
414 xor LookupM(3, t0), P2
415
416 // Process previous S2.
417 movd vS2, Work0d // Fetch old word 2 for dissection.
418 movzx Work0l, t0d
419 xor LookupM(0, t0), S2
420 movzx Work0h, t0d
421 xor LookupM(1, t0), P1
422 shr $16, Work0d
423 movzx Work0l, t0d
424 xor LookupM(2, t0), S0
425 movzx Work0h, t0d
426 xor LookupM(3, t0), P3
427
428 // Process previous S3.
429 movd vS3, Work0d // Fetch old word 3 for dissection.
430 movzx Work0l, t0d
431 xor LookupM(0, t0), S3
432 movzx Work0h, t0d
433 xor LookupM(1, t0), P2
434 shr $16, Work0d
435 movzx Work0l, t0d
436 xor LookupM(2, t0), S1
437 movzx Work0h, t0d
438 xor LookupM(3, t0), P0
439
440 #if defined __i386__
441 paddd vIncrement, vExpandedKey // Advance the saved key pointer by Increment.
442 movp vExpandedKey, ExpandedKey // Reload it; ExpandedKey is t0 on i386 and was overwritten by the table indices above.
443 #else
444 add $Increment, ExpandedKey
445 #endif
446
447 // Save state for next iteration and load next round key.
448 mov S0, Work0d
449 mov 0*4(ExpandedKey), S0
450 movd S1, vS1
451 mov 1*4(ExpandedKey), S1
452 movd S2, vS2
453 mov 2*4(ExpandedKey), S2
454 movd S3, vS3
455 mov 3*4(ExpandedKey), S3
456
457 cmp ExpandedKeyEnd, ExpandedKey // At the sentinel, the key just loaded is the one for the final round.
458 jne 1b
459
460 /* Word 0 of the current state must be in Work0 now, and the next round
461 key must be in S0 to S3. This is the final round: it has the same shape as a
462 regular round but looks up FTable instead of MTable. */
463
464 // Work around assembler bug. See comments above about Radar 5683882.
465 #if defined __x86_64__
466 lea FTable(%rip), BaseP // LookupF on x86_64 addresses relative to BaseP, so repoint it at FTable.
467 #endif
468
469 // Process previous S0.
470 movzx Work0l, t0 // t0 = byte 0 of the old word.
471 xor LookupF(0, t0), S0
472 movzx Work0h, t0d // t0 = byte 1.
473 xor LookupF(1, t0), P3
474 shr $16, Work0d // Bring bytes 2 and 3 down into Work0l and Work0h.
475 movzx Work0l, t0d // t0 = byte 2.
476 xor LookupF(2, t0), S2
477 movzx Work0h, t0d // t0 = byte 3.
478 xor LookupF(3, t0), P1
479
480 // Process previous S1.
481 movd vS1, Work0d // Fetch old word 1 for dissection.
482 movzx Work0l, t0d
483 xor LookupF(0, t0), S1
484 movzx Work0h, t0d
485 xor LookupF(1, t0), P0
486 shr $16, Work0d
487 movzx Work0l, t0d
488 xor LookupF(2, t0), S3
489 movzx Work0h, t0d
490 xor LookupF(3, t0), P2
491
492 // Process previous S2.
493 movd vS2, Work0d // Fetch old word 2 for dissection.
494 movzx Work0l, t0d
495 xor LookupF(0, t0), S2
496 movzx Work0h, t0d
497 xor LookupF(1, t0), P1
498 shr $16, Work0d
499 movzx Work0l, t0d
500 xor LookupF(2, t0), S0
501 movzx Work0h, t0d
502 xor LookupF(3, t0), P3
503
504 // Process previous S3.
505 movd vS3, Work0d // Fetch old word 3 for dissection.
506 movzx Work0l, t0d
507 xor LookupF(0, t0), S3
508 movzx Work0h, t0d
509 xor LookupF(1, t0), P2
510 shr $16, Work0d
511 movzx Work0l, t0d
512 xor LookupF(2, t0), S1
513 movzx Work0h, t0d
514 xor LookupF(3, t0), P0 // S0 to S3 now hold the output block.
515
516 #if defined __i386__ // Architecture.
517 // Get OutputText address. BaseP can be reused because the table lookups are finished.
518 #define OutputText BaseP
519 mov ArgOutputText, OutputText
520 #endif // Architecture.
521
522 // Write output.
523 mov S0, 0*4(OutputText)
524 mov S1, 1*4(OutputText)
525 mov S2, 2*4(OutputText)
526 mov S3, 3*4(OutputText)
527
528 xor r0, r0 // Return success.
529
530 9: // Common exit; also reached from the key-length check with r0 = -1.
531 // Pop stack and restore registers.
532 #ifdef KERNEL
533 #if Select < 2
534 #if defined __i386__
535 movaps 4*16(r4), %xmm4 // Reload from the slots written in the prologue, under the same conditions.
536 movaps 3*16(r4), %xmm3
537 #endif
538 movaps 2*16(r4), %xmm2
539 movaps 1*16(r4), %xmm1
540 movaps 0*16(r4), %xmm0
541 #endif // Select
542 #endif // KERNEL
543 #if 0 < LocalsSize
544 add $Padding + LocalsSize, r4 // Release the space allocated in the prologue.
545 #endif
546 #if defined __i386__
547 pop r7 // Pops mirror the prologue pushes in reverse order.
548 pop r6
549 #elif defined __x86_64__
550 #endif
551 pop r3
552 pop r5
553
554 ret
555
556
557 // Undefine the macros defined above so that none of them leaks into the including file.
558 #undef ArgContext
559 #undef ArgExpandedKey
560 #undef ArgInputText
561 #undef ArgNr
562 #undef ArgOutputText
563 #undef Argument
564 #undef BaseP
565 #undef ExpandedKey
566 #undef ExpandedKeyEnd
567 #undef FTable
568 #undef InputText
569 #undef LocalsSize
570 #undef LookupM
571 #undef LookupF
572 #undef MTable
573 #undef OutputText
574 #undef Padding
575 #undef SaveSize
576 #undef S0
577 #undef S1
578 #undef S2
579 #undef S3
580 #undef StackFrame
581 #undef Work0d
582 #undef Work0h
583 #undef Work0l
584 #undef Work1d
585 #undef Work1h
586 #undef Work1l
587 #undef Work2d
588 #undef Work2h
589 #undef Work2l
590 #undef Work3d
591 #undef Work3h
592 #undef Work3l
593 #undef t0
594 #undef t0d
595 #undef t0l
596 #undef t1
597 #undef vExpandedKey
598 #undef vIncrement
599 #undef vS1
600 #undef vS2
601 #undef vS3
602
603 #undef Name
604 #undef MTable
605 #undef FTable
606 #undef P0
607 #undef P1
608 #undef P2
609 #undef P3
610 #undef Increment