1 /*
2 This file "aesxts.s" provides x86_64 / i386 optimized implementations of the following functions
3
4 0. xts_mult_x_on_xmm7 : a code macro that is used throughout all other functions
5 1. void xts_mult_x(uint8_t *I);
6 2. int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx);
7 3. int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim);
8 4. int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx);
9 5. int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim);
10
11 This file should be compiled together with xtsClearC.c
12
13 functions 1, 2, and 4 are intended to replace the corresponding C functions in xtsClearC.c on x86_64/i386 architectures
14 functions 3 and 5 are given only here (no C code is available); they are called from xts_encrypt/xts_decrypt (xtsClearC.c)
15 - we can possibly add C code for functions 3 and 5 for future porting to other architectures
16
17 cclee 4-29-10
18
19 */
20
21 #ifdef KERNEL
22 #include <i386/cpu_capabilities.h>
23 #else
24 #include <System/i386/cpu_capabilities.h>
25 #endif
26 #define CRYPT_OK 0 // cannot include "crypt.h", in which CRYPT_OK is defined in an enum
27
28 /*
29 The following macro is used throughout the functions in this file.
30 It is the core function within the function xts_mult_x defined in (xtsClearC.c)
31
32 upon entry, %xmm7 = the input tweak (128-bit),
33 on return, %xmm7 = the updated tweak (128-bit)
34 the macro uses %xmm1/%xmm2/%ecx in the computation
35 the operation can be described as follows :
36 0. let x = %xmm7; // 128-bit little-endian input
37 1. x = rotate_left(x,1); // rotate left by 1 bit
38 2. if (x&1) x ^= 0x0000...0086; // if least significant bit = 1, least significant byte ^= 0x86;
39 3. return x;
40
41 It's a pity that SSE does not support shifting of a whole 128-bit xmm register.
42 The workaround is
43 1. use two parallel quad-word (8-byte) shifts : a 1-bit left shift for the low 63 bits of each quad word, and a 63-bit right shift to extract the 2 leading bits
44 2. recombine the shifted quad words to form the 128-bit rotated result.
45
46 Input : %xmm7
47 Output : %xmm7
48 Used : %xmm1/%xmm2/%ecx
49
50 The macro is good for both x86_64 and i386.
51
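
For reference, a minimal C sketch of the same tweak update (an illustration only, not the xtsClearC.c code;
it assumes the 16-byte tweak is stored little-endian as two 64-bit words):

    #include <stdint.h>
    #include <string.h>

    static void xts_mult_x_sketch(uint8_t *I)
    {
        uint64_t lo, hi;
        memcpy(&lo, I, 8);
        memcpy(&hi, I + 8, 8);
        uint64_t carry = hi >> 63;          // the bit that rotates around
        hi = (hi << 1) | (lo >> 63);
        lo = (lo << 1) | carry;             // x = rotate_left(x,1)
        if (carry)
            lo ^= 0x86;                     // low byte ^= 0x86 when the rotated-in bit is 1
        memcpy(I, &lo, 8);
        memcpy(I + 8, &hi, 8);
    }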
52 */
53
54 .macro xts_mult_x_on_xmm7 // input : x = %xmm7, MS = most significant, LS = least significant
55 movaps %xmm7, %xmm1 // %xmm1 = a copy of x
56 movaps %xmm7, %xmm2 // %xmm2 = a copy of x
57 psllq $$1, %xmm7 // 1-bit left shift of 2 quad words (x1<<1, x0<<1), zero-filled
58 psrlq $$63, %xmm1 // 2 leading bits, each in the least significant bit of a quad word
59 psrad $$31, %xmm2 // the MS 32 bits will be either 0 or -1, depending on the MS bit of x
60 pshufd $$0xc6, %xmm1, %xmm1 // switch the positions of the 2 leading bits
61 pshufd $$0x03, %xmm2, %xmm2 // the LS 32 bits will be either 0 or -1, depending on the MS bit of x
62 por %xmm1, %xmm7 // we finally have %xmm7 = rotate_left(x,1);
63 movl $$0x86, %ecx // a potential byte to xor the bottom byte
64 movd %ecx, %xmm1 // copy it to %xmm1, the other is 0
65 pand %xmm2, %xmm1 // %xmm1 = 0 or 0x86, depending on the MS bit of x
66 pxor %xmm1, %xmm7 // rotate_left(x,1) ^= 0 or 0x86 depending on the MS bit of x
67 .endm
68
69
70 /*
71 function : void xts_mult_x(uint8_t *I);
72
73 1. load (__m128*) (I) into xmm7
74 2. macro xts_mult_x_on_xmm7 (i/o @ xmm7, used xmm1/xmm2/ecx)
75 3. save output (%xmm7) to the memory pointed to by I
76
77 input : 16-byte memory block pointed to by I
78 output : the same 16-byte memory block pointed to by I
79
80 if kernel code, xmm1/xmm2/xmm7 saved and restored
81 other used registers : eax/ecx
82
83 */
84 .text
85 .align 4,0x90
86 .globl _xts_mult_x
87 _xts_mult_x:
88
89 #if defined __x86_64__
90 #define I %rdi // 1st argument at %rdi for x86_64
91 #define sp %rsp
92 #else
93 mov 4(%esp), %eax // 1st argument at stack, offset 4 for ret_addr for i386
94 #define I %eax
95 #define sp %esp
96 #endif
97
98 // if KERNEL code, allocate memory and save xmm1/xmm2/xmm7
99 #ifdef KERNEL
100 #if defined __x86_64__
101 sub $0x38, sp // 8-bytes alignment + 3 * 16 bytes
102 #else
103 sub $0x3c, sp // 12-bytes alignment + 3 * 16 bytes
104 #endif
105 movaps %xmm1, (sp)
106 movaps %xmm2, 16(sp)
107 movaps %xmm7, 32(sp)
108 #endif
109
110 // load, compute, and save
111 movups (I), %xmm7 // load input tweak 128-bit into %xmm7
112 xts_mult_x_on_xmm7 // the macro (also used elsewhere) will update %xmm7 as the output
113 movups %xmm7, (I) // save the xts_mult_x output
114
115 // if KERNEL code, restore xmm1/xmm2/xmm7 and deallocate stack memory
116 #ifdef KERNEL
117 movaps (sp), %xmm1
118 movaps 16(sp), %xmm2
119 movaps 32(sp), %xmm7
120 #if defined __x86_64__
121 add $0x38, sp // 8-bytes alignment + 3 * 16 bytes
122 #else
123 add $0x3c, sp // 12-bytes alignment + 3 * 16 bytes
124 #endif
125 #endif
126
127 ret // return
128
129 #undef I
130 #undef sp
131
132 /*
133 The following is x86_64/i386 assembly implementation of
134
135 int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx);
136
137 Its C code implementation is given in xtsClearC.c
138
139 all pointers P/C/T point to a block of 16 bytes. In the following description, P/C/T represent 128-bit data.
140
141 The operation of tweak_crypt
142
143 1. C = P ^ T
144 2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err;
145 3. C = C ^ T
146 4. xts_mult_x(T)
147 5. return CRYPT_OK;
148
149 The following is the assembly implementation flow
150
151 1. save used xmm registers (xmm1/xmm7) if kernel code
152 2. load xmm1 = P, xmm7 = T
153 3. xmm1 = C = P ^ T
154 4. write xmm1 to C
155 5. call aes_encrypt(C,C,ctx); note that it will use aesni if available, and the xmm registers are returned intact
156 6. load xmm1 = C
157 7. xmm1 = C = C^T = xmm1 ^ xmm7
158 8. write xmm1 to C
159 9. update T (in xmm7) via xts_mult_x macro
160 a. restore xmm registers (xmm1/xmm7) if kernel code
161 b. return CRYPT_OK (in eax)
162
163 Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro
164
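A hedged C sketch of the flow above (illustrative only; it assumes the aesedp_encrypt_ctx type,
the int-returning aes_encrypt(), and xts_mult_x() referenced elsewhere in this file -- the
authoritative C version lives in xtsClearC.c):

    static int tweak_crypt_sketch(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx)
    {
        int i, err;
        for (i = 0; i < 16; i++) C[i] = P[i] ^ T[i];    // C = P ^ T
        err = aes_encrypt(C, C, ctx);
        if (err != CRYPT_OK) return err;
        for (i = 0; i < 16; i++) C[i] ^= T[i];          // C ^= T
        xts_mult_x(T);                                  // advance the tweak
        return CRYPT_OK;
    }
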
165 */
166
167 .text
168 .align 4,0x90
169 .globl _tweak_crypt
170 _tweak_crypt:
171 #if defined __i386__
172
173 // push into stack for local use
174 push %ebp
175 mov %esp, %ebp
176 push %ebx
177 push %edi
178 push %esi
179
180 // allocate stack memory for local use
181 sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
182
183 // load with called arguments
184 mov 8(%ebp), %eax // P, we need this only briefly, so eax is fine
185 mov 12(%ebp), %edi // C
186 mov 16(%ebp), %ebx // T
187 mov 20(%ebp), %esi // ctx
188
189 #define P %eax
190 #define C %edi
191 #define T %ebx
192 #define ctx %esi
193 #define sp %esp
194
195 #else
196 // x86_64 calling argument order : rdi/rsi/rdx/rcx/r8
197
198 // push into stack for local use
199 push %rbp
200 mov %rsp, %rbp
201 push %r12
202 push %r13
203 push %r14
204 push %r15
205
206 // allocate stack memory for local use; if kernel code, we need to save/restore xmm registers
207 #ifdef KERNEL
208 sub $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386
209 #endif
210
211 // load the calling arguments, releasing rdi/rsi/rdx/rcx/r8, as we need to call aes_encrypt
212 mov %rsi, %r13
213 mov %rdx, %r14
214 mov %rcx, %r15
215
216 #define P %rdi
217 #define C %r13
218 #define T %r14
219 #define ctx %r15
220 #define sp %rsp
221
222 #endif
223
224 // if kernel, save used xmm registers
225 #ifdef KERNEL
226 movaps %xmm1, 16(sp)
227 movaps %xmm2, 32(sp)
228 movaps %xmm7, 48(sp)
229 #endif
230
231 movups (P), %xmm1 // P
232 movups (T), %xmm7 // T
233
234 // set up calling arguments for aes_encrypt
235 #if defined __i386__
236 mov C, (%esp) // C
237 mov C, 4(%esp) // C
238 mov ctx, 8(%esp) // ctx
239 #else
240 mov C, %rdi // C
241 mov C, %rsi // C
242 mov ctx, %rdx // ctx
243 #endif
244
245 pxor %xmm7, %xmm1 // C = P ^ T
246 movups %xmm1, (C) // save C into memory
247
248 call _aes_encrypt // err = aes_encrypt(C,C,ctx);
249
250 cmp $CRYPT_OK, %eax // check err == CRYPT_OK
251 jne 9f // if err != CRYPT_OK, exit
252
253 movups (C), %xmm1 // load xmm1 = C
254 pxor %xmm7, %xmm1 // C ^= T
255 movups %xmm1, (C) // write C with xmm1, xmm1 is freed now, will be changed in the following macro
256
257 xts_mult_x_on_xmm7 // update T (on xmm7)
258
259 movups %xmm7, (T) // write xmm7 to T
260 9:
261
262 // restore used xmm registers if this is for kernel
263 #ifdef KERNEL
264 movaps 16(sp), %xmm1
265 movaps 32(sp), %xmm2
266 movaps 48(sp), %xmm7
267 #endif
268
269 // free stack memory and restore callee registers
270 #if defined __i386__
271 add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
272 pop %esi
273 pop %edi
274 pop %ebx
275 #else
276 #ifdef KERNEL
277 add $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386
278 #endif
279 pop %r15
280 pop %r14
281 pop %r13
282 pop %r12
283 #endif
284
285 // return, eax/rax already has the return val
286 leave
287 ret
288
289 #undef P
290 #undef C
291 #undef T
292 #undef ctx
293 #undef sp
294
295 /*
296 The following is x86_64/i386 assembly implementation of
297
298 int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim);
299
300 TODO : Its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs); a hedged C sketch is given at the end of this comment.
301 This function is a grouped version of the above function tweak_crypt(), so the xmm register save/restore only needs
302 to happen once for all grouped blocks.
303
304 The implementation here probes __cpu_capabilities to detect whether aesni (hw-aes instructions) is available.
305 If aesni is available, the code branches to optimized code that uses aesni.
306
307 The optimized aesni code operates as follows:
308
309 while (at least 4 consecutive blocks are available) {
310
311 do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned)
312
313 perform 4 C = P ^ T; // T is on 16-byte aligned stack
314
315 perform 4 aes_encrypt (all aes_encrypt instructions interleaved to achieve better throughput)
316
317 perform 4 C = C ^ T // T is on 16-byte aligned stack
318
319 }
320
321 The code then falls through to the scalar code, which sequentially performs what tweak_crypt does
322
323 1. C = P ^ T
324 2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err;
325 3. C = C ^ T
326 4. xts_mult_x(T)
327
328 Note: used xmm registers :
329 xmm0-xmm5, xmm7 if aesni is available
330 xmm0-xmm4, xmm7 if aesni is not available.
331
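Toward the TODO above, a possible C reference for the grouped behaviour (a sketch only, processing one
block at a time on top of tweak_crypt(), without the 4-block aesni interleaving):

    static int tweak_crypt_group_sketch(const uint8_t *P, uint8_t *C, uint8_t *T,
                                        aesedp_encrypt_ctx *ctx, uint32_t lim)
    {
        int err;
        while (lim--) {
            err = tweak_crypt(P, C, T, ctx);    // C = E(P ^ T) ^ T, then T is advanced
            if (err != CRYPT_OK) return err;
            P += 16;
            C += 16;
        }
        return CRYPT_OK;
    }
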
332 */
333
334 .text
335 .align 4,0x90
336 .globl _tweak_crypt_group
337 _tweak_crypt_group:
338
339 #if defined __i386__
340
341 // push callee-saved registers for local use
342 push %ebp
343 mov %esp, %ebp
344 push %ebx
345 push %edi
346 push %esi
347
348 // allocate stack memory for local use and/or xmm register save for kernel code
349 sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni
350 // 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_encrypt) no aesni
351 // transfer calling arguments
352 mov 20(%ebp), %eax // ctx
353 mov 12(%ebp), %edi // C
354 mov 16(%ebp), %ebx // T
355 mov 8(%ebp), %esi // P
356 mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_encrypt
357
358 #define P %esi
359 #define C %edi
360 #define T %ebx
361 #define lim 24(%ebp)
362 #define sp %esp
363
364 #else
365
366 // push callee-saved registers for local use
367 push %rbp
368 mov %rsp, %rbp
369 push %rbx
370 push %r12
371 push %r13
372 push %r14
373 push %r15
374
375 // allocate stack memory for local use and/or xmm register save for kernel code
376 sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386)
377
378 // rdi/rsi/rdx/rcx/r8
379 // transfer calling arguments
380 mov %rdi, %r12
381 mov %rsi, %r13
382 mov %rdx, %r14
383 mov %rcx, %r15
384 mov %r8, %rbx
385
386 #define P %r12
387 #define C %r13
388 #define T %r14
389 #define ctx %r15
390 #define lim %ebx
391 #define sp %rsp
392 #endif
393
394 #ifdef KERNEL
395 movaps %xmm0, 0x50(sp)
396 movaps %xmm1, 0x60(sp)
397 movaps %xmm2, 0x70(sp)
398 movaps %xmm3, 0x80(sp)
399 movaps %xmm4, 0x90(sp)
400 movaps %xmm7, 0xa0(sp)
401 #endif
402
403 // probe __cpu_capabilities to detect aesni
404 #if defined __x86_64__
405 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
406 mov (%rax), %eax // %eax = __cpu_capabilities
407 #else // i386
408 #if defined KERNEL
409 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
410 mov (%eax), %eax // %eax = __cpu_capabilities
411 #else
412 movl _COMM_PAGE_CPU_CAPABILITIES, %eax
413 #endif
414 #endif
415 test $(kHasAES), %eax
416 je L_crypt_group_sw // if aesni not available, jump to sw-based implementation
417
418 // aesni-based implementation
419
420 sub $4, lim // pre-decrement lim by 4
421 jl 9f // if lim < 4, skip the following code
422
423 movups (T), %xmm7 // xmm7 is the tweak before encrypting every 4 blocks
424 #ifdef KERNEL
425 movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5
426 #endif
427
428 0:
429 // derive 4 tweaks using xts_mult_x macro, and save on aligned stack space
430 // xmm7 will be the tweak for next 4-blocks iteration
431
432 #define tweak1 16(sp)
433 #define tweak2 32(sp)
434 #define tweak3 48(sp)
435 #define tweak4 64(sp)
436
437 movaps %xmm7, tweak1 // save 1st tweak on stack
438 xts_mult_x_on_xmm7 // compute 2nd tweak
439 movaps %xmm7, tweak2 // save 2nd tweak on stack
440 xts_mult_x_on_xmm7 // compute 3rd tweak
441 movaps %xmm7, tweak3 // save 3rd tweak on stack
442 xts_mult_x_on_xmm7 // compute 4th tweak
443 movaps %xmm7, tweak4 // save 4th tweak on stack
444 xts_mult_x_on_xmm7 // compute 1st tweak for next iteration
445
446 // read 4 Ps
447 movups (P), %xmm0
448 movups 16(P), %xmm1
449 movups 32(P), %xmm2
450 movups 48(P), %xmm3
451
452 // 4 C = P ^ T
453 pxor tweak1, %xmm0
454 pxor tweak2, %xmm1
455 pxor tweak3, %xmm2
456 pxor tweak4, %xmm3
457
458 // 4 interleaved aes_encrypt
459
460 #if defined __i386__
461 mov 8(sp), %ecx // ctx
462 #undef ctx
463 #define ctx %ecx
464 #endif
465
466 mov 240(ctx), %eax // aes length : 160/192/224 (16 * number of rounds) for AES-128/192/256
467
468 cmp $160, %eax // AES-128 ?
469 je 160f
470 cmp $192, %eax // AES-192 ?
471 je 192f
472 cmp $224, %eax // AES-256 ?
473 je 224f
474 mov $-1, %eax // error : unsupported aes length
475 #ifdef KERNEL
476 movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
477 #endif
478 jmp L_error_crypt
479
480 // definitions, macros, and constructs for the 4-block hw-aes-encrypt
481
482 // the following key definitions will also be used in tweak_uncrypt_group
483 #define key0 0(ctx)
484 #define key1 16(ctx)
485 #define key2 32(ctx)
486 #define key3 48(ctx)
487 #define key4 64(ctx)
488 #define key5 80(ctx)
489 #define key6 96(ctx)
490 #define key7 112(ctx)
491 #define key8 128(ctx)
492 #define key9 144(ctx)
493 #define keyA 160(ctx)
494 #define keyB 176(ctx)
495 #define keyC 192(ctx)
496 #define keyD 208(ctx)
497 #define keyE 224(ctx)
498
499 #define aes aesenc
500 #define aeslast aesenclast
501
502 // all aes encrypt operations start with the following sequence
503 .macro aes_common_part
504 movups key0, %xmm4
505 movups key1, %xmm5
506 pxor %xmm4, %xmm0
507 pxor %xmm4, %xmm1
508 pxor %xmm4, %xmm2
509 pxor %xmm4, %xmm3
510 movups key2, %xmm4
511 aes %xmm5, %xmm0
512 aes %xmm5, %xmm1
513 aes %xmm5, %xmm2
514 aes %xmm5, %xmm3
515 movups key3, %xmm5
516 aes %xmm4, %xmm0
517 aes %xmm4, %xmm1
518 aes %xmm4, %xmm2
519 aes %xmm4, %xmm3
520 movups key4, %xmm4
521 aes %xmm5, %xmm0
522 aes %xmm5, %xmm1
523 aes %xmm5, %xmm2
524 aes %xmm5, %xmm3
525 movups key5, %xmm5
526 aes %xmm4, %xmm0
527 aes %xmm4, %xmm1
528 aes %xmm4, %xmm2
529 aes %xmm4, %xmm3
530 movups key6, %xmm4
531 aes %xmm5, %xmm0
532 aes %xmm5, %xmm1
533 aes %xmm5, %xmm2
534 aes %xmm5, %xmm3
535 movups key7, %xmm5
536 aes %xmm4, %xmm0
537 aes %xmm4, %xmm1
538 aes %xmm4, %xmm2
539 aes %xmm4, %xmm3
540 movups key8, %xmm4
541 aes %xmm5, %xmm0
542 aes %xmm5, %xmm1
543 aes %xmm5, %xmm2
544 aes %xmm5, %xmm3
545 movups key9, %xmm5
546 aes %xmm4, %xmm0
547 aes %xmm4, %xmm1
548 aes %xmm4, %xmm2
549 aes %xmm4, %xmm3
550 movups keyA, %xmm4
551 aes %xmm5, %xmm0
552 aes %xmm5, %xmm1
553 aes %xmm5, %xmm2
554 aes %xmm5, %xmm3
555 .endm
556
557 // all aes encrypt operations end with the following 4 instructions
558 .macro aes_last
559 aeslast %xmm4, %xmm0
560 aeslast %xmm4, %xmm1
561 aeslast %xmm4, %xmm2
562 aeslast %xmm4, %xmm3
563 .endm
564
565 .macro aes_128
566 aes_common_part // encrypt common part
567 aes_last // encrypt ending part
568 .endm
569
570 .macro aes_192
571 aes_common_part // encrypt common part
572
573 // 10 extra instructions in between common and ending
574 movups keyB, %xmm5
575 aes %xmm4, %xmm0
576 aes %xmm4, %xmm1
577 aes %xmm4, %xmm2
578 aes %xmm4, %xmm3
579 movups keyC, %xmm4
580 aes %xmm5, %xmm0
581 aes %xmm5, %xmm1
582 aes %xmm5, %xmm2
583 aes %xmm5, %xmm3
584
585 aes_last // encrypt ending part
586 .endm
587
588 .macro aes_256
589 aes_common_part // encrypt common part
590
591 // 20 extra instructions in between common and ending
592 movups keyB, %xmm5
593 aes %xmm4, %xmm0
594 aes %xmm4, %xmm1
595 aes %xmm4, %xmm2
596 aes %xmm4, %xmm3
597 movups keyC, %xmm4
598 aes %xmm5, %xmm0
599 aes %xmm5, %xmm1
600 aes %xmm5, %xmm2
601 aes %xmm5, %xmm3
602 movups keyD, %xmm5
603 aes %xmm4, %xmm0
604 aes %xmm4, %xmm1
605 aes %xmm4, %xmm2
606 aes %xmm4, %xmm3
607 movups keyE, %xmm4
608 aes %xmm5, %xmm0
609 aes %xmm5, %xmm1
610 aes %xmm5, %xmm2
611 aes %xmm5, %xmm3
612
613 aes_last // encrypt ending part
614 .endm
615
616 160: // AES-128 encrypt
617 aes_128
618 jmp 8f
619
620 192: // AES-192 encrypt
621 aes_192
622 jmp 8f
623
624 224: // AES-256 encrypt
625 aes_256
626
627 8:
628
629 // 4 C = C ^ T
630 pxor tweak1, %xmm0
631 pxor tweak2, %xmm1
632 pxor tweak3, %xmm2
633 pxor tweak4, %xmm3
634
635 // write 4 Cs
636 movups %xmm0, (C)
637 movups %xmm1, 16(C)
638 movups %xmm2, 32(C)
639 movups %xmm3, 48(C)
640
641 add $64, P
642 add $64, C
643
644 sub $4, lim
645 jge 0b
646
647 #ifdef KERNEL
648 movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
649 #endif
650 movups %xmm7, (T)
651
652 9:
653 xor %eax, %eax // to return CRYPT_OK
654 add $4, lim // post-increment lim by 4
655 je 9f // if lim==0, branch to prepare to return
656
657 L_crypt_group_sw:
658
659 movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop
660
661 sub $1, lim // pre-decrement lim by 1
662 jl 1f // if lim < 1, branch to prepare to return
663 0:
664 movups (P), %xmm0 // P
665
666 // prepare for calling aes_encrypt
667 #if defined __i386__
668 mov C, (%esp) // C
669 mov C, 4(%esp) // C
670 // ctx was prepared previously in preamble
671 #else
672 mov C, %rdi // C
673 mov C, %rsi // C
674 mov ctx, %rdx // ctx
675 #endif
676
677 pxor %xmm7, %xmm0 // C = P ^ T
678 movups %xmm0, (C) // save C into memory
679
680 call _aes_encrypt_xmm_no_save // err = aes_encrypt(C,C,ctx);
681
682 cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
683 jne 9f // if err != CRYPT_OK, branch to exit with error
684
685 movups (C), %xmm0 // load xmm0 with C
686 pxor %xmm7, %xmm0 // C ^= T
687 movups %xmm0, (C) // save output C
688
689 xts_mult_x_on_xmm7
690
691 add $16, C // next C
692 add $16, P // next P
693 sub $1, lim // lim--
694 jge 0b // if (lim>0) repeat the scalar loop
695
696 1: movups %xmm7, (T) // save final tweak
697 L_error_crypt:
698 9:
699 // if kernel, restore used xmm registers
700 #ifdef KERNEL
701 movaps 0x50(sp), %xmm0
702 movaps 0x60(sp), %xmm1
703 movaps 0x70(sp), %xmm2
704 movaps 0x80(sp), %xmm3
705 movaps 0x90(sp), %xmm4
706 movaps 0xa0(sp), %xmm7
707 #endif
708
709 #if defined __i386__
710 add $(12+16*8+16*4), %esp
711 pop %esi
712 pop %edi
713 pop %ebx
714 #else
715 add $(8+16*8+16*5), %rsp
716 pop %r15
717 pop %r14
718 pop %r13
719 pop %r12
720 pop %rbx
721 #endif
722 leave
723 ret
724
725 #undef P
726 #undef C
727 #undef T
728 #undef ctx
729 #undef sp
730
731 /*
732 The following is x86_64/i386 assembly implementation of
733
734 int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx);
735
736 Its C code implementation is given in xtsClearC.c
737
738 all pointers C/P/T point to a block of 16 bytes. In the following description, C/P/T represent 128-bit data.
739
740 The operation of tweak_uncrypt
741
742 1. P = C ^ T
743 2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err;
744 3. P = P ^ T
745 4. xts_mult_x(T)
746 5. return CRYPT_OK;
747
748 The following is the assembly implementation flow
749
750 1. save used xmm registers (xmm1/xmm7) if kernel code
751 2. load xmm1 = C, xmm7 = T
752 3. xmm1 = P = C ^ T
753 4. write xmm1 to P
754 5. call aes_decrypt(P,P,ctx); note that it will use aesni if available, and the xmm registers are returned intact
755 6. load xmm1 = P
756 7. xmm1 = P = P^T = xmm1 ^ xmm7
757 8. write xmm1 to P
758 9. update T (in xmm7) via xts_mult_x macro
759 a. restore xmm registers (xmm1/xmm7) if kernel code
760 b. return CRYPT_OK (in eax)
761
762 Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro
763
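A hedged C sketch, mirroring the tweak_crypt sketch above but calling aes_decrypt (illustrative only;
the authoritative C version lives in xtsClearC.c):

    static int tweak_uncrypt_sketch(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx)
    {
        int i, err;
        for (i = 0; i < 16; i++) P[i] = C[i] ^ T[i];    // P = C ^ T
        err = aes_decrypt(P, P, ctx);
        if (err != CRYPT_OK) return err;
        for (i = 0; i < 16; i++) P[i] ^= T[i];          // P ^= T
        xts_mult_x(T);                                  // advance the tweak
        return CRYPT_OK;
    }
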
764 */
765
766 .text
767 .align 4,0x90
768 .globl _tweak_uncrypt
769 _tweak_uncrypt:
770 #if defined __i386__
771
772 // push into stack for local use
773 push %ebp
774 mov %esp, %ebp
775 push %ebx
776 push %edi
777 push %esi
778
779 // allocate stack memory for local use
780 sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
781
782 // load with called arguments
783 mov 8(%ebp), %eax // C, we need this only briefly, so eax is fine
784 mov 12(%ebp), %edi // P
785 mov 16(%ebp), %ebx // T
786 mov 20(%ebp), %esi // ctx
787
788 #define C %eax
789 #define P %edi
790 #define T %ebx
791 #define ctx %esi
792 #define sp %esp
793
794 #else
795 // x86_64 calling argument order : rdi/rsi/rdx/rcx/r8
796
797 // push into stack for local use
798 push %rbp
799 mov %rsp, %rbp
800 push %r12
801 push %r13
802 push %r14
803 push %r15
804
805 // allocate stack memory for local use; if kernel code, we need to save/restore xmm registers
806 #ifdef KERNEL
807 sub $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386
808 #endif
809
810 // load the calling arguments, releasing rdi/rsi/rdx/rcx/r8, as we need to call aes_decrypt
811 mov %rsi, %r13
812 mov %rdx, %r14
813 mov %rcx, %r15
814
815 #define C %rdi
816 #define P %r13
817 #define T %r14
818 #define ctx %r15
819 #define sp %rsp
820
821 #endif
822
823 // if kernel, save used xmm registers
824 #ifdef KERNEL
825 movaps %xmm1, 16(sp)
826 movaps %xmm2, 32(sp)
827 movaps %xmm7, 48(sp)
828 #endif
829
830 movups (C), %xmm1 // C
831 movups (T), %xmm7 // T
832
833 // set up calling arguments for aes_decrypt
834 #if defined __i386__
835 mov P, (%esp) // P
836 mov P, 4(%esp) // P
837 mov ctx, 8(%esp) // ctx
838 #else
839 mov P, %rdi // P
840 mov P, %rsi // P
841 mov ctx, %rdx // ctx
842 #endif
843
844 pxor %xmm7, %xmm1 // P = C ^ T
845 movups %xmm1, (P) // save P into memory
846
847 call _aes_decrypt // err = aes_decrypt(P,P,ctx);
848
849 cmp $CRYPT_OK, %eax // check err == CRYPT_OK
850 jne 9f // if err != CRYPT_OK, exit
851
852 movups (P), %xmm1 // load xmm1 = P
853 pxor %xmm7, %xmm1 // P ^= T
854 movups %xmm1, (P) // write P with xmm1, xmm1 is freed now, will be changed in the following macro
855
856 xts_mult_x_on_xmm7 // update T (on xmm7)
857
858 movups %xmm7, (T) // write xmm7 to T
859 9:
860
861 // restore used xmm registers if this is for kernel
862 #ifdef KERNEL
863 movaps 16(sp), %xmm1
864 movaps 32(sp), %xmm2
865 movaps 48(sp), %xmm7
866 #endif
867
868 // free stack memory and restore callee registers
869 #if defined __i386__
870 add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
871 pop %esi
872 pop %edi
873 pop %ebx
874 #else
875 #ifdef KERNEL
876 add $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386
877 #endif
878 pop %r15
879 pop %r14
880 pop %r13
881 pop %r12
882 #endif
883
884 // return, eax/rax already has the return val
885 leave
886 ret
887
888 #undef P
889 #undef C
890 #undef T
891 #undef ctx
892 #undef sp
893
894 /*
895 The following is x86_64/i386 assembly implementation of
896
897 int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim);
898
899 TODO : Its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs); a hedged C sketch is given at the end of this comment.
900 This function is a grouped version of the above function tweak_uncrypt(), so the xmm register save/restore only needs
901 to happen once for all grouped blocks.
902
903 The implementation here probes __cpu_capabilities to detect whether aesni (hw-aes instructions) is available.
904 If aesni is available, the code branches to optimized code that uses aesni.
905
906 The optimized aesni code operates as follows:
907
908 while (at least 4 consecutive blocks are available) {
909
910 do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned)
911
912 perform 4 P = C ^ T; // T is on 16-byte aligned stack
913
914 perform 4 aes_decrypt (all aes_decrypt instructions interleaved to achieve better throughput)
915
916 perform 4 P = P ^ T // T is on 16-byte aligned stack
917
918 }
919
920 The code then falls through to the scalar code, which sequentially performs what tweak_uncrypt does
921
922 1. P = C ^ T
923 2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err;
924 3. P = P ^ T
925 4. xts_mult_x(T)
926
927 Note: used xmm registers :
928 xmm0-xmm5, xmm7 if aesni is available
929 xmm0-xmm4, xmm7 if aesni is not available.
930
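As with tweak_crypt_group, a possible C reference toward the TODO above (a sketch only, one block at a
time on top of tweak_uncrypt(), without the 4-block aesni interleaving):

    static int tweak_uncrypt_group_sketch(const uint8_t *C, uint8_t *P, uint8_t *T,
                                          aesedp_decrypt_ctx *ctx, uint32_t lim)
    {
        int err;
        while (lim--) {
            err = tweak_uncrypt(C, P, T, ctx);  // P = D(C ^ T) ^ T, then T is advanced
            if (err != CRYPT_OK) return err;
            C += 16;
            P += 16;
        }
        return CRYPT_OK;
    }
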
931 */
932
933 .text
934 .align 4,0x90
935 .globl _tweak_uncrypt_group
936 _tweak_uncrypt_group:
937
938 #if defined __i386__
939
940 // push callee-saved registers for local use
941 push %ebp
942 mov %esp, %ebp
943 push %ebx
944 push %edi
945 push %esi
946
947 // allocate stack memory for local use and/or xmm register save for kernel code
948 sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni
949 // 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_decrypt) no aesni
950 // transfer calling arguments
951 mov 20(%ebp), %eax // ctx
952 mov 12(%ebp), %edi // P
953 mov 16(%ebp), %ebx // T
954 mov 8(%ebp), %esi // C
955 mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_decrypt
956
957 #define C %esi
958 #define P %edi
959 #define T %ebx
960 #define lim 24(%ebp)
961 #define sp %esp
962
963 #else
964
965 // push callee-saved registers for local use
966 push %rbp
967 mov %rsp, %rbp
968 push %rbx
969 push %r12
970 push %r13
971 push %r14
972 push %r15
973
974 // allocate stack memory for local use and/or xmm register save for kernel code
975 sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386)
976
977 // rdi/rsi/rdx/rcx/r8
978 // transfer calling arguments
979 mov %rdi, %r12
980 mov %rsi, %r13
981 mov %rdx, %r14
982 mov %rcx, %r15
983 mov %r8, %rbx
984
985 #define C %r12
986 #define P %r13
987 #define T %r14
988 #define ctx %r15
989 #define lim %ebx
990 #define sp %rsp
991 #endif
992
993 #ifdef KERNEL
994 movaps %xmm0, 0x50(sp)
995 movaps %xmm1, 0x60(sp)
996 movaps %xmm2, 0x70(sp)
997 movaps %xmm3, 0x80(sp)
998 movaps %xmm4, 0x90(sp)
999 movaps %xmm7, 0xa0(sp)
1000 #endif
1001
1002 // probe __cpu_capabilities to detect aesni
1003 #if defined __x86_64__
1004 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
1005 mov (%rax), %eax // %eax = __cpu_capabilities
1006 #else // i386
1007 #if defined KERNEL
1008 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
1009 mov (%eax), %eax // %eax = __cpu_capabilities
1010 #else
1011 movl _COMM_PAGE_CPU_CAPABILITIES, %eax
1012 #endif
1013 #endif
1014 test $(kHasAES), %eax
1015 je L_uncrypt_group_sw // if aesni not available, jump to sw-based implementation
1016
1017 // aesni-based implementation
1018
1019 sub $4, lim // pre-decrement lim by 4
1020 jl 9f // if lim < 4, skip the following code
1021
1022 movups (T), %xmm7 // xmm7 is the tweak before decrypting every 4 blocks
1023 #ifdef KERNEL
1024 movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5
1025 #endif
1026
1027 0:
1028 // derive 4 tweaks using xts_mult_x macro, and save on aligned stack space
1029 // xmm7 will be the tweak for next 4-blocks iteration
1030
1031 #define tweak1 16(sp)
1032 #define tweak2 32(sp)
1033 #define tweak3 48(sp)
1034 #define tweak4 64(sp)
1035
1036 movaps %xmm7, tweak1 // save 1st tweak on stack
1037 xts_mult_x_on_xmm7 // compute 2nd tweak
1038 movaps %xmm7, tweak2 // save 2nd tweak on stack
1039 xts_mult_x_on_xmm7 // compute 3rd tweak
1040 movaps %xmm7, tweak3 // save 3rd tweak on stack
1041 xts_mult_x_on_xmm7 // compute 4th tweak
1042 movaps %xmm7, tweak4 // save 4th tweak on stack
1043 xts_mult_x_on_xmm7 // compute 1st tweak for next iteration
1044
1045 // read 4 Cs
1046 movups (C), %xmm0
1047 movups 16(C), %xmm1
1048 movups 32(C), %xmm2
1049 movups 48(C), %xmm3
1050
1051 // 4 P = C ^ T
1052 pxor tweak1, %xmm0
1053 pxor tweak2, %xmm1
1054 pxor tweak3, %xmm2
1055 pxor tweak4, %xmm3
1056
1057 // 4 interleaved aes_decrypt
1058
1059 #if defined __i386__
1060 mov 8(sp), %ecx // ctx
1061 #undef ctx
1062 #define ctx %ecx
1063 #endif
1064
1065 mov 240(ctx), %eax // aes length : 160/192/224 (16 * number of rounds) for AES-128/192/256
1066
1067 cmp $160, %eax // AES-128 ?
1068 je 160f
1069 cmp $192, %eax // AES-192 ?
1070 je 192f
1071 cmp $224, %eax // AES-256 ?
1072 je 224f
1073 mov $-1, %eax // error : unsupported aes length
1074 #ifdef KERNEL
1075 movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
1076 #endif
1077 jmp L_error_uncrypt
1078
1079 // definitions, macros to construct hw-aes-decrypt
1080 // will reuse previously defined key0 = (ctx), key1 = 16(ctx), ....
1081 #undef aes
1082 #undef aeslast
1083 #define aes aesdec
1084 #define aeslast aesdeclast
1085
1086 .macro aes_decrypt_common
1087 movups key8, %xmm4
1088 aes %xmm5, %xmm0
1089 aes %xmm5, %xmm1
1090 aes %xmm5, %xmm2
1091 aes %xmm5, %xmm3
1092 movups key7, %xmm5
1093 aes %xmm4, %xmm0
1094 aes %xmm4, %xmm1
1095 aes %xmm4, %xmm2
1096 aes %xmm4, %xmm3
1097 movups key6, %xmm4
1098 aes %xmm5, %xmm0
1099 aes %xmm5, %xmm1
1100 aes %xmm5, %xmm2
1101 aes %xmm5, %xmm3
1102 movups key5, %xmm5
1103 aes %xmm4, %xmm0
1104 aes %xmm4, %xmm1
1105 aes %xmm4, %xmm2
1106 aes %xmm4, %xmm3
1107 movups key4, %xmm4
1108 aes %xmm5, %xmm0
1109 aes %xmm5, %xmm1
1110 aes %xmm5, %xmm2
1111 aes %xmm5, %xmm3
1112 movups key3, %xmm5
1113 aes %xmm4, %xmm0
1114 aes %xmm4, %xmm1
1115 aes %xmm4, %xmm2
1116 aes %xmm4, %xmm3
1117 movups key2, %xmm4
1118 aes %xmm5, %xmm0
1119 aes %xmm5, %xmm1
1120 aes %xmm5, %xmm2
1121 aes %xmm5, %xmm3
1122 movups key1, %xmm5
1123 aes %xmm4, %xmm0
1124 aes %xmm4, %xmm1
1125 aes %xmm4, %xmm2
1126 aes %xmm4, %xmm3
1127 movups key0, %xmm4
1128 aes %xmm5, %xmm0
1129 aes %xmm5, %xmm1
1130 aes %xmm5, %xmm2
1131 aes %xmm5, %xmm3
1132 aeslast %xmm4, %xmm0
1133 aeslast %xmm4, %xmm1
1134 aeslast %xmm4, %xmm2
1135 aeslast %xmm4, %xmm3
1136 .endm
1137
1138 .macro aes_dec_128
1139 movups keyA, %xmm4
1140 movups key9, %xmm5
1141 pxor %xmm4, %xmm0
1142 pxor %xmm4, %xmm1
1143 pxor %xmm4, %xmm2
1144 pxor %xmm4, %xmm3
1145 aes_decrypt_common
1146 .endm
1147
1148 .macro aes_dec_192
1149 movups keyC, %xmm4
1150 movups keyB, %xmm5
1151 pxor %xmm4, %xmm0
1152 pxor %xmm4, %xmm1
1153 pxor %xmm4, %xmm2
1154 pxor %xmm4, %xmm3
1155 movups keyA, %xmm4
1156 aes %xmm5, %xmm0
1157 aes %xmm5, %xmm1
1158 aes %xmm5, %xmm2
1159 aes %xmm5, %xmm3
1160 movups key9, %xmm5
1161 aes %xmm4, %xmm0
1162 aes %xmm4, %xmm1
1163 aes %xmm4, %xmm2
1164 aes %xmm4, %xmm3
1165 aes_decrypt_common
1166 .endm
1167
1168 .macro aes_dec_256
1169 movups keyE, %xmm4
1170 movups keyD, %xmm5
1171 pxor %xmm4, %xmm0
1172 pxor %xmm4, %xmm1
1173 pxor %xmm4, %xmm2
1174 pxor %xmm4, %xmm3
1175 movups keyC, %xmm4
1176 aes %xmm5, %xmm0
1177 aes %xmm5, %xmm1
1178 aes %xmm5, %xmm2
1179 aes %xmm5, %xmm3
1180 movups keyB, %xmm5
1181 aes %xmm4, %xmm0
1182 aes %xmm4, %xmm1
1183 aes %xmm4, %xmm2
1184 aes %xmm4, %xmm3
1185 movups keyA, %xmm4
1186 aes %xmm5, %xmm0
1187 aes %xmm5, %xmm1
1188 aes %xmm5, %xmm2
1189 aes %xmm5, %xmm3
1190 movups key9, %xmm5
1191 aes %xmm4, %xmm0
1192 aes %xmm4, %xmm1
1193 aes %xmm4, %xmm2
1194 aes %xmm4, %xmm3
1195 aes_decrypt_common
1196 .endm
1197
1198 160: // AES-128 decrypt
1199 aes_dec_128
1200 jmp 8f
1201
1202 192: // AES-192 decrypt
1203 aes_dec_192
1204 jmp 8f
1205
1206 224: // AES-256 decrypt
1207 aes_dec_256
1208
1209 8:
1210
1211 // 4 P = P ^ T
1212 pxor tweak1, %xmm0
1213 pxor tweak2, %xmm1
1214 pxor tweak3, %xmm2
1215 pxor tweak4, %xmm3
1216
1217 // write 4 Ps
1218 movups %xmm0, (P)
1219 movups %xmm1, 16(P)
1220 movups %xmm2, 32(P)
1221 movups %xmm3, 48(P)
1222
1223 add $64, C
1224 add $64, P
1225
1226 sub $4, lim
1227 jge 0b
1228
1229 #ifdef KERNEL
1230 movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
1231 #endif
1232 movups %xmm7, (T)
1233
1234 9:
1235 xor %eax, %eax // to return CRYPT_OK
1236 add $4, lim // post-increment lim by 4
1237 je 9f // if lim==0, branch to prepare to return
1238
1239 L_uncrypt_group_sw:
1240
1241 movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop
1242
1243 sub $1, lim // pre-decrement lim by 1
1244 jl 1f // if lim < 1, branch to prepare to return
1245 0:
1246 movups (C), %xmm0 // C
1247
1248 // prepare for calling aes_decrypt
1249 #if defined __i386__
1250 mov P, (%esp) // P
1251 mov P, 4(%esp) // P
1252 // ctx was prepared previously in preamble
1253 #else
1254 mov P, %rdi // P
1255 mov P, %rsi // P
1256 mov ctx, %rdx // ctx
1257 #endif
1258
1259 pxor %xmm7, %xmm0 // P = C ^ T
1260 movups %xmm0, (P) // save P into memory
1261
1262 call _aes_decrypt_xmm_no_save // err = aes_decrypt(P,P,ctx);
1263
1264 cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
1265 jne 9f // if err != CRYPT_OK, branch to exit with error
1266
1267 movups (P), %xmm0 // load xmm0 with P
1268 pxor %xmm7, %xmm0 // P ^= T
1269 movups %xmm0, (P) // save output P
1270
1271 xts_mult_x_on_xmm7
1272
1273 add $16, C // next C
1274 add $16, P // next P
1275 sub $1, lim // lim--
1276 jge 0b // if (lim>0) repeat the scalar loop
1277
1278 1: movups %xmm7, (T) // save final tweak
1279 L_error_uncrypt:
1280 9:
1281 // if kernel, restore used xmm registers
1282 #ifdef KERNEL
1283 movaps 0x50(sp), %xmm0
1284 movaps 0x60(sp), %xmm1
1285 movaps 0x70(sp), %xmm2
1286 movaps 0x80(sp), %xmm3
1287 movaps 0x90(sp), %xmm4
1288 movaps 0xa0(sp), %xmm7
1289 #endif
1290
1291 #if defined __i386__
1292 add $(12+16*8+16*4), %esp
1293 pop %esi
1294 pop %edi
1295 pop %ebx
1296 #else
1297 add $(8+16*8+16*5), %rsp
1298 pop %r15
1299 pop %r14
1300 pop %r13
1301 pop %r12
1302 pop %rbx
1303 #endif
1304 leave
1305 ret