This file "aesxts.s" provides x86_64 / i386 optimizations of the following functions:

0. xts_mult_x_on_xmm7 : a code macro that is used throughout all other functions
1. void xts_mult_x(uint8_t *I);
2. int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx);
3. int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim);
4. int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx);
5. int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim);

This file should be compiled together with xtsClearC.c.

Functions 1, 2, and 4 are meant to replace the corresponding C functions in xtsClearC.c on x86_64/i386.
Functions 3 and 5 are given only here (no C code is available); they are called from xts_encrypt/xts_decrypt (xtsClearC.c).
- C code for functions 3 and 5 could be added later to ease porting to other architectures.
#ifdef KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

#define CRYPT_OK 0 // cannot include "crypt.h", in which CRYPT_OK is defined in an enum
The following macro is used throughout the functions in this file.
It is the core operation of the function xts_mult_x defined in xtsClearC.c.

upon entry, %xmm7 = the input tweak (128-bit),
on return, %xmm7 = the updated tweak (128-bit)
the macro uses %xmm1/%xmm2/%ecx in the computation
the operation can be described as follows :
0. let x = %xmm7; // 128-bit little-endian input
1. x = rotate_left(x,1); // rotate left by 1 bit
2. if (x&1) x ^= 0x0000...0086; // if the least significant bit = 1, least significant byte ^= 0x86;

SSE does not support a shift of a whole 128-bit xmm register, so the rotate is synthesized by:
1. shifting the two quad words in parallel (one shift for the two bottom 63-bit parts, one for the two leading bits), and
2. combining the shifted quad words to form the 128-bit shifted result.

Used : %xmm1/%xmm2/%ecx

The macro works for both x86_64 and i386.
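
For porting or verification, the update this macro performs can also be written in portable C. The following is only an illustrative sketch (the reference C code lives in xtsClearC.c); it assumes a little-endian host, which holds on x86, and folds the rotate-plus-0x86 trick above back into the equivalent shift-left-by-1 followed by a conditional xor with 0x87:

#include <stdint.h>
#include <string.h>

/* sketch of what xts_mult_x_on_xmm7 computes: multiply the 128-bit tweak by x
   in GF(2^128), i.e. shift left by 1 bit and, if the bit shifted out of the top
   was set, xor 0x87 into the least significant byte */
static void xts_mult_x_sketch(uint8_t *I)
{
    uint64_t lo, hi;
    memcpy(&lo, I, 8);                 /* low  quad word (little-endian host assumed) */
    memcpy(&hi, I + 8, 8);             /* high quad word */

    uint64_t carry = hi >> 63;         /* the bit that falls off the top */
    hi = (hi << 1) | (lo >> 63);
    lo = (lo << 1) ^ (carry ? 0x87 : 0);

    memcpy(I, &lo, 8);
    memcpy(I + 8, &hi, 8);
}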
.macro xts_mult_x_on_xmm7 // input : x = %xmm7, MS = most significant, LS = least significant
movaps %xmm7, %xmm1 // %xmm1 = a copy of x
movaps %xmm7, %xmm2 // %xmm2 = a copy of x
psllq $$1, %xmm7 // 1-bit left shift of 2 quad words (x1<<1, x0<<1), zero-filled
psrlq $$63, %xmm1 // 2 leading bits, each in the least significant bit of a quad word
psrad $$31, %xmm2 // the MS 32-bit will be either 0 or -1, depending on the MS bit of x
pshufd $$0xc6, %xmm1, %xmm1 // switch the positions of the 2 leading bits
pshufd $$0x03, %xmm2, %xmm2 // the LS 32-bit will be either 0 or -1, depending on the MS bit of x
por %xmm1, %xmm7 // we finally have %xmm7 = rotate_left(x,1);
movl $$0x86, %ecx // the potential byte to xor into the bottom byte
movd %ecx, %xmm1 // copy it into the low 32 bits of %xmm1, the rest are 0
pand %xmm2, %xmm1 // %xmm1 = 0 or 0x86, depending on the MS bit of x
pxor %xmm1, %xmm7 // rotate_left(x,1) ^= 0 or 0x86 depending on the MS bit of x
function : void xts_mult_x(uint8_t *I);

1. load (__m128*) (I) into xmm7
2. invoke the macro xts_mult_x_on_xmm7 (input/output in xmm7, uses xmm1/xmm2/ecx)
3. save the output (%xmm7) to the memory pointed to by I

input : 16-byte memory pointed to by I
output : same 16-byte memory pointed to by I

if kernel code, xmm1/xmm2/xmm7 are saved and restored
other used registers : eax/ecx
#if defined __x86_64__
#define I %rdi // 1st argument in %rdi for x86_64

mov 4(%esp), %eax // 1st argument on the stack, at offset 4 past the return address, for i386

// if KERNEL code, allocate stack memory and save xmm1/xmm2/xmm7

#if defined __x86_64__
sub $0x38, sp // 8-byte alignment + 3 * 16 bytes

sub $0x3c, sp // 12-byte alignment + 3 * 16 bytes

// load, compute, and save
movups (I), %xmm7 // load the input tweak (128-bit) into %xmm7
xts_mult_x_on_xmm7 // the macro (also used elsewhere) updates %xmm7 with the output
movups %xmm7, (I) // save the xts_mult_x output

// if KERNEL code, restore xmm1/xmm2/xmm7 and deallocate stack memory

#if defined __x86_64__
add $0x38, sp // 8-byte alignment + 3 * 16 bytes

add $0x3c, sp // 12-byte alignment + 3 * 16 bytes
The following is the x86_64/i386 assembly implementation of

int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx);

Its C code implementation is given in xtsClearC.c.

All pointers P/C/T point to a block of 16 bytes. In the following description, P/C/T represent 128-bit data.

The operation of tweak_crypt

2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err;

The following is the assembly implementation flow

1. save used xmm registers (xmm1/xmm7) if kernel code
2. load xmm1 = P, xmm7 = T
5. call aes_encrypt(C,C,ctx); note that it uses aesni if available, and the xmm registers return intact
7. xmm1 = C = C^T = xmm1 ^ xmm7
9. update T (in xmm7) via the xts_mult_x macro
a. restore xmm registers (xmm1/xmm7) if kernel code
b. return CRYPT_OK (in eax)

Note: used xmm registers : xmm1/xmm2/xmm7 (xmm2 is used inside the xts_mult_x macro)
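
Put together, the flow above corresponds roughly to the following C (a sketch for reference only; the authoritative C version is the one in xtsClearC.c, and the function and type names are taken from the prototype above):

int tweak_crypt_sketch(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx)
{
    int err, i;

    for (i = 0; i < 16; i++) C[i] = P[i] ^ T[i];   /* C = P ^ T         */

    err = aes_encrypt(C, C, ctx);                  /* encrypt in place  */
    if (err != CRYPT_OK) return err;

    for (i = 0; i < 16; i++) C[i] ^= T[i];         /* C ^= T            */

    xts_mult_x(T);                                 /* advance the tweak */
    return CRYPT_OK;
}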
// push registers onto the stack for local use

// allocate stack memory for local use
sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)

// load the calling arguments
mov 8(%ebp), %eax // P, we need this only briefly, so eax is fine
mov 12(%ebp), %edi // C
mov 16(%ebp), %ebx // T
mov 20(%ebp), %esi // ctx

// x86_64 calling argument order : rdi/rsi/rdx/rcx/r8

// push registers onto the stack for local use

// allocate stack memory for local use; if kernel code, we need to save/restore xmm registers
sub $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386

// load the calling arguments, releasing rdi/rsi/rdx/rcx/r8, as we need to call aes_encrypt
// if kernel, save used xmm registers

movups (P), %xmm1 // P
movups (T), %xmm7 // T

// set up calling arguments for aes_encrypt
mov ctx, 8(%esp) // ctx

pxor %xmm7, %xmm1 // C = P ^ T
movups %xmm1, (C) // save C into memory

call _aes_encrypt // err = aes_encrypt(C,C,ctx);

cmp $CRYPT_OK, %eax // check err == CRYPT_OK
jne 9f // if err != CRYPT_OK, exit

movups (C), %xmm1 // load xmm1 = C
pxor %xmm7, %xmm1 // C ^= T
movups %xmm1, (C) // write xmm1 to C; xmm1 is now free and will be clobbered by the following macro

xts_mult_x_on_xmm7 // update T (in xmm7)

movups %xmm7, (T) // write xmm7 to T

// restore used xmm registers if this is for kernel

// free stack memory and restore callee-saved registers
add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)

add $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386

// return; eax/rax already holds the return value
The following is the x86_64/i386 assembly implementation of

int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim);

TODO : its C code implementation is yet to be provided in xtsClearC.c (for the benefit of porting to other ISAs)
This function is the grouped version of tweak_crypt() above, so the xmm register save/restore only needs
to happen once for all grouped blocks.

The implementation probes __cpu_capabilities to detect whether aesni (the hw-aes instructions) is available.
If aesni is available, the code branches to an optimized path that uses aesni.

The optimized aesni code operates as follows:

while (at least 4 consecutive blocks remain) {

    apply the xts_mult_x macro 4 times and write the 4 tweaks to the stack (16-byte aligned)

    perform 4 C = P ^ T; // T is on the 16-byte aligned stack

    perform 4 aes_encrypt (all aes_encrypt instructions interleaved to achieve better throughput)

    perform 4 C = C ^ T // T is on the 16-byte aligned stack

}

The code then falls through to the scalar code, which sequentially performs what tweak_crypt does

2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err;

Note: used xmm registers :
xmm0-xmm5, xmm7 if aesni is available
xmm0-xmm4, xmm7 if aesni is not available.
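
As a starting point for the TODO above, a minimal scalar C version could look like the sketch below (it omits the 4-block aesni fast path, which is purely a performance optimization; names follow the prototypes quoted in this file):

int tweak_crypt_group_sketch(const uint8_t *P, uint8_t *C, uint8_t *T,
                             aesedp_encrypt_ctx *ctx, uint32_t lim)
{
    while (lim--) {
        int err = tweak_crypt(P, C, T, ctx);   /* C = P^T, encrypt, C ^= T, advance T */
        if (err != CRYPT_OK) return err;
        P += 16;
        C += 16;
    }
    return CRYPT_OK;
}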
.globl _tweak_crypt_group

// push callee-saved registers for local use

// allocate stack memory for local use and/or xmm register save for kernel code
sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni
// 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_encrypt) no aesni

// transfer calling arguments
mov 20(%ebp), %eax // ctx
mov 12(%ebp), %edi // C
mov 16(%ebp), %ebx // T
mov 8(%ebp), %esi // P
mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_encrypt

// push callee-saved registers for local use

// allocate stack memory for local use and/or xmm register save for kernel code
sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386)
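// Rough map of the frame just allocated, inferred from the offsets used below
// (offsets relative to sp after the sub; this is a reconstruction, not from the original source):
//   0(sp)    - 15(sp)   : scratch / calling arguments for the aes routines (i386 path)
//   16(sp)   - 79(sp)   : tweak1..tweak4, the 4 pre-computed tweaks (16-byte aligned)
//   0x50(sp) - 0xaf(sp) : save area for xmm0-xmm4/xmm7 (kernel code only)
//   0xb0(sp) - 0xbf(sp) : save area for xmm5 (kernel code, aesni path only)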
// rdi/rsi/rdx/rcx/r8
// transfer calling arguments

movaps %xmm0, 0x50(sp)
movaps %xmm1, 0x60(sp)
movaps %xmm2, 0x70(sp)
movaps %xmm3, 0x80(sp)
movaps %xmm4, 0x90(sp)
movaps %xmm7, 0xa0(sp)

// probe __cpu_capabilities to detect aesni
#if defined __x86_64__
movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
mov (%rax), %eax // %eax = __cpu_capabilities

leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
mov (%eax), %eax // %eax = __cpu_capabilities

movl _COMM_PAGE_CPU_CAPABILITIES, %eax

test $(kHasAES), %eax
je L_crypt_group_sw // if aesni not available, jump to sw-based implementation
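
// In C, the probe above amounts to roughly the following check (a sketch only;
// kHasAES comes from cpu_capabilities.h, and referencing the capabilities word
// from C as _cpu_capabilities is an assumption):
//     extern int _cpu_capabilities;
//     if (!(_cpu_capabilities & kHasAES)) { /* take the sw-based path */ }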
// aesni-based implementation

sub $4, lim // pre-decrement lim by 4
jl 9f // if lim < 4, skip the following code

movups (T), %xmm7 // xmm7 is the tweak before encrypting every 4 blocks

movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5

// derive 4 tweaks using the xts_mult_x macro, and save them on aligned stack space
// xmm7 will be the tweak for the next 4-block iteration

#define tweak1 16(sp)
#define tweak2 32(sp)
#define tweak3 48(sp)
#define tweak4 64(sp)

movaps %xmm7, tweak1 // save 1st tweak on stack
xts_mult_x_on_xmm7 // compute 2nd tweak
movaps %xmm7, tweak2 // save 2nd tweak on stack
xts_mult_x_on_xmm7 // compute 3rd tweak
movaps %xmm7, tweak3 // save 3rd tweak on stack
xts_mult_x_on_xmm7 // compute 4th tweak
movaps %xmm7, tweak4 // save 4th tweak on stack
xts_mult_x_on_xmm7 // compute 1st tweak for the next iteration

// 4 interleaved aes_encrypt

mov 8(sp), %ecx // ctx

mov 240(ctx), %eax // aes length field (160/192/224, i.e. 16 * number of rounds)

cmp $160, %eax // AES-128 ?

cmp $192, %eax // AES-192 ?

cmp $224, %eax // AES-256 ?

mov $-1, %eax // error : unsupported aes length
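
// The dispatch above keys off a length field stored at byte offset 240 of the
// context; 160/192/224 bytes correspond to 16 bytes of expanded key per round
// for 10/12/14 rounds (AES-128/192/256). In C terms the idea is roughly
// (field name hypothetical):
//     switch (ctx->key_length) {       /* 16 * number of rounds */
//     case 160: /* AES-128 */ break;
//     case 192: /* AES-192 */ break;
//     case 224: /* AES-256 */ break;
//     default:  return -1;             /* unsupported aes length */
//     }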
movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5

// definitions, macros, and constructs for the 4-block hw-aes-encrypt
// the following key definitions will also be used in tweak_uncrypt_group
#define key7 112(ctx)
#define key8 128(ctx)
#define key9 144(ctx)
#define keyA 160(ctx)
#define keyB 176(ctx)
#define keyC 192(ctx)
#define keyD 208(ctx)
#define keyE 224(ctx)

#define aeslast aesenclast

// all aes encrypt operations start with the following sequence
.macro aes_common_part

// all aes encrypt operations end with the following 4 instructions

aes_common_part // encrypt common part
aes_last // encrypt ending part

aes_common_part // encrypt common part
// 10 extra instructions in between the common and ending parts
aes_last // encrypt ending part

aes_common_part // encrypt common part
// 20 extra instructions in between the common and ending parts
aes_last // encrypt ending part
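
// What the 4-way interleaving buys: the four blocks move through each AES round
// back to back, so the latency of aesenc on one block is hidden behind the other
// three. In intrinsics form one round of the interleave looks roughly like this
// (illustrative sketch only, using _mm_aesenc_si128 from <wmmintrin.h>):
//     b0 = _mm_aesenc_si128(b0, rk);
//     b1 = _mm_aesenc_si128(b1, rk);
//     b2 = _mm_aesenc_si128(b2, rk);
//     b3 = _mm_aesenc_si128(b3, rk);
// with _mm_aesenclast_si128 used for the final round.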
160: // AES-128 encrypt

192: // AES-192 encrypt

224: // AES-256 encrypt

movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5

xor %eax, %eax // to return CRYPT_OK
add $4, lim // post-increment lim by 4
je 9f // if lim==0, branch to prepare to return

movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop

sub $1, lim // pre-decrement lim by 1
jl 1f // if lim < 1, branch to prepare to return

movups (P), %xmm0 // P

// prepare for calling aes_encrypt

// ctx was prepared previously in the preamble

pxor %xmm7, %xmm0 // C = P ^ T
movups %xmm0, (C) // save C into memory

call _aes_encrypt_xmm_no_save // err = aes_encrypt(C,C,ctx);

cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
jne 9f // if err != CRYPT_OK, branch to exit with error

movups (C), %xmm0 // load xmm0 with C
pxor %xmm7, %xmm0 // C ^= T
movups %xmm0, (C) // save output C

jge 0b // if (lim>0) repeat the scalar loop

1: movups %xmm7, (T) // save final tweak

// if kernel, restore used xmm registers
movaps 0x50(sp), %xmm0
movaps 0x60(sp), %xmm1
movaps 0x70(sp), %xmm2
movaps 0x80(sp), %xmm3
movaps 0x90(sp), %xmm4
movaps 0xa0(sp), %xmm7

add $(12+16*8+16*4), %esp

add $(8+16*8+16*5), %rsp
The following is the x86_64/i386 assembly implementation of

int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx);

Its C code implementation is given in xtsClearC.c.

All pointers C/P/T point to a block of 16 bytes. In the following description, C/P/T represent 128-bit data.

The operation of tweak_uncrypt

2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err;

The following is the assembly implementation flow

1. save used xmm registers (xmm1/xmm7) if kernel code
2. load xmm1 = C, xmm7 = T
5. call aes_decrypt(P,P,ctx); note that it uses aesni if available, and the xmm registers return intact
7. xmm1 = P = P^T = xmm1 ^ xmm7
9. update T (in xmm7) via the xts_mult_x macro
a. restore xmm registers (xmm1/xmm7) if kernel code
b. return CRYPT_OK (in eax)

Note: used xmm registers : xmm1/xmm2/xmm7 (xmm2 is used inside the xts_mult_x macro)
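
The decrypt direction mirrors the tweak_crypt sketch given earlier; roughly (again only a sketch, with names taken from the prototype above):

int tweak_uncrypt_sketch(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx)
{
    int err, i;

    for (i = 0; i < 16; i++) P[i] = C[i] ^ T[i];   /* P = C ^ T         */

    err = aes_decrypt(P, P, ctx);                  /* decrypt in place  */
    if (err != CRYPT_OK) return err;

    for (i = 0; i < 16; i++) P[i] ^= T[i];         /* P ^= T            */

    xts_mult_x(T);                                 /* advance the tweak */
    return CRYPT_OK;
}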
.globl _tweak_uncrypt

// push registers onto the stack for local use

// allocate stack memory for local use
sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)

// load the calling arguments
mov 8(%ebp), %eax // C, we need this only briefly, so eax is fine
mov 12(%ebp), %edi // P
mov 16(%ebp), %ebx // T
mov 20(%ebp), %esi // ctx

// x86_64 calling argument order : rdi/rsi/rdx/rcx/r8

// push registers onto the stack for local use

// allocate stack memory for local use; if kernel code, we need to save/restore xmm registers
sub $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386

// load the calling arguments, releasing rdi/rsi/rdx/rcx/r8, as we need to call aes_decrypt

// if kernel, save used xmm registers

movups (C), %xmm1 // C
movups (T), %xmm7 // T

// set up calling arguments for aes_decrypt
mov ctx, 8(%esp) // ctx

pxor %xmm7, %xmm1 // P = C ^ T
movups %xmm1, (P) // save P into memory

call _aes_decrypt // err = aes_decrypt(P,P,ctx);

cmp $CRYPT_OK, %eax // check err == CRYPT_OK
jne 9f // if err != CRYPT_OK, exit

movups (P), %xmm1 // load xmm1 = P
pxor %xmm7, %xmm1 // P ^= T
movups %xmm1, (P) // write xmm1 to P; xmm1 is now free and will be clobbered by the following macro

xts_mult_x_on_xmm7 // update T (in xmm7)

movups %xmm7, (T) // write xmm7 to T

// restore used xmm registers if this is for kernel

// free stack memory and restore callee-saved registers
add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)

add $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386

// return; eax/rax already holds the return value
The following is the x86_64/i386 assembly implementation of

int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim);

TODO : its C code implementation is yet to be provided in xtsClearC.c (for the benefit of porting to other ISAs)
This function is the grouped version of tweak_uncrypt() above, so the xmm register save/restore only needs
to happen once for all grouped blocks.

The implementation probes __cpu_capabilities to detect whether aesni (the hw-aes instructions) is available.
If aesni is available, the code branches to an optimized path that uses aesni.

The optimized aesni code operates as follows:

while (at least 4 consecutive blocks remain) {

    apply the xts_mult_x macro 4 times and write the 4 tweaks to the stack (16-byte aligned)

    perform 4 P = C ^ T; // T is on the 16-byte aligned stack

    perform 4 aes_decrypt (all aes_decrypt instructions interleaved to achieve better throughput)

    perform 4 P = P ^ T // T is on the 16-byte aligned stack

}

The code then falls through to the scalar code, which sequentially performs what tweak_uncrypt does

2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err;

Note: used xmm registers :
xmm0-xmm5, xmm7 if aesni is available
xmm0-xmm4, xmm7 if aesni is not available.
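
As with the encrypt side, a minimal scalar C version of this routine could look like the sketch below (no aesni fast path; names follow the prototypes quoted in this file):

int tweak_uncrypt_group_sketch(const uint8_t *C, uint8_t *P, uint8_t *T,
                               aesedp_decrypt_ctx *ctx, uint32_t lim)
{
    while (lim--) {
        int err = tweak_uncrypt(C, P, T, ctx);   /* P = C^T, decrypt, P ^= T, advance T */
        if (err != CRYPT_OK) return err;
        C += 16;
        P += 16;
    }
    return CRYPT_OK;
}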
.globl _tweak_uncrypt_group
_tweak_uncrypt_group:

// push callee-saved registers for local use

// allocate stack memory for local use and/or xmm register save for kernel code
sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni
// 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_decrypt) no aesni

// transfer calling arguments
mov 20(%ebp), %eax // ctx
mov 12(%ebp), %edi // P
mov 16(%ebp), %ebx // T
mov 8(%ebp), %esi // C
mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_decrypt

// push callee-saved registers for local use

// allocate stack memory for local use and/or xmm register save for kernel code
sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386)

// rdi/rsi/rdx/rcx/r8
// transfer calling arguments

movaps %xmm0, 0x50(sp)
movaps %xmm1, 0x60(sp)
movaps %xmm2, 0x70(sp)
movaps %xmm3, 0x80(sp)
movaps %xmm4, 0x90(sp)
movaps %xmm7, 0xa0(sp)
// probe __cpu_capabilities to detect aesni
#if defined __x86_64__
movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
mov (%rax), %eax // %eax = __cpu_capabilities

leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
mov (%eax), %eax // %eax = __cpu_capabilities

movl _COMM_PAGE_CPU_CAPABILITIES, %eax

test $(kHasAES), %eax
je L_uncrypt_group_sw // if aesni not available, jump to sw-based implementation

// aesni-based implementation

sub $4, lim // pre-decrement lim by 4
jl 9f // if lim < 4, skip the following code

movups (T), %xmm7 // xmm7 is the tweak before decrypting every 4 blocks

movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5

// derive 4 tweaks using the xts_mult_x macro, and save them on aligned stack space
// xmm7 will be the tweak for the next 4-block iteration

#define tweak1 16(sp)
#define tweak2 32(sp)
#define tweak3 48(sp)
#define tweak4 64(sp)

movaps %xmm7, tweak1 // save 1st tweak on stack
xts_mult_x_on_xmm7 // compute 2nd tweak
movaps %xmm7, tweak2 // save 2nd tweak on stack
xts_mult_x_on_xmm7 // compute 3rd tweak
movaps %xmm7, tweak3 // save 3rd tweak on stack
xts_mult_x_on_xmm7 // compute 4th tweak
movaps %xmm7, tweak4 // save 4th tweak on stack
xts_mult_x_on_xmm7 // compute 1st tweak for the next iteration

// 4 interleaved aes_decrypt

#if defined __i386__
mov 8(sp), %ecx // ctx

mov 240(ctx), %eax // aes length field (160/192/224, i.e. 16 * number of rounds)

cmp $160, %eax // AES-128 ?

cmp $192, %eax // AES-192 ?

cmp $224, %eax // AES-256 ?

mov $-1, %eax // error : unsupported aes length

movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
// definitions and macros to construct the hw-aes-decrypt
// will reuse the previously defined key0 = (ctx), key1 = 16(ctx), ....

#define aeslast aesdeclast

.macro aes_decrypt_common

aeslast %xmm4, %xmm0
aeslast %xmm4, %xmm1
aeslast %xmm4, %xmm2
aeslast %xmm4, %xmm3

160: // AES-128 decrypt

192: // AES-192 decrypt

224: // AES-256 decrypt

movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5

xor %eax, %eax // to return CRYPT_OK
add $4, lim // post-increment lim by 4
je 9f // if lim==0, branch to prepare to return

movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop
sub $1, lim // pre-decrement lim by 1
jl 1f // if lim < 1, branch to prepare to return

movups (C), %xmm0 // C

// prepare for calling aes_decrypt
#if defined __i386__

// ctx was prepared previously in the preamble

mov ctx, %rdx // ctx

pxor %xmm7, %xmm0 // P = C ^ T
movups %xmm0, (P) // save P into memory

call _aes_decrypt_xmm_no_save // err = aes_decrypt(P,P,ctx);

cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
jne 9f // if err != CRYPT_OK, branch to exit with error

movups (P), %xmm0 // load xmm0 with P
pxor %xmm7, %xmm0 // P ^= T
movups %xmm0, (P) // save output P

add $16, C // next C
add $16, P // next P
sub $1, lim // lim--
jge 0b // if (lim>0) repeat the scalar loop

1: movups %xmm7, (T) // save final tweak

// if kernel, restore used xmm registers
movaps 0x50(sp), %xmm0
movaps 0x60(sp), %xmm1
movaps 0x70(sp), %xmm2
movaps 0x80(sp), %xmm3
movaps 0x90(sp), %xmm4
movaps 0xa0(sp), %xmm7

#if defined __i386__
add $(12+16*8+16*4), %esp

add $(8+16*8+16*5), %rsp