-/*\r
- ---------------------------------------------------------------------------\r
- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.\r
-\r
- LICENSE TERMS\r
-\r
- The free distribution and use of this software in both source and binary\r
- form is allowed (with or without changes) provided that:\r
-\r
- 1. distributions of this source code include the above copyright\r
- notice, this list of conditions and the following disclaimer;\r
-\r
- 2. distributions in binary form include the above copyright\r
- notice, this list of conditions and the following disclaimer\r
- in the documentation and/or other associated materials;\r
-\r
- 3. the copyright holder's name is not used to endorse products\r
- built using this software without specific written permission.\r
-\r
- ALTERNATIVELY, provided that this notice is retained in full, this product\r
- may be distributed under the terms of the GNU General Public License (GPL),\r
- in which case the provisions of the GPL apply INSTEAD OF those given above.\r
-\r
- DISCLAIMER\r
-\r
- This software is provided 'as is' with no explicit or implied warranties\r
- in respect of its properties, including, but not limited to, correctness\r
- and/or fitness for purpose.\r
- ---------------------------------------------------------------------------\r
- Issue 31/01/2006\r
-\r
- These subroutines implement multiple block AES modes for ECB, CBC, CFB,\r
- OFB and CTR encryption, The code provides support for the VIA Advanced \r
- Cryptography Engine (ACE).\r
-\r
- NOTE: In the following subroutines, the AES contexts (ctx) must be\r
- 16 byte aligned if VIA ACE is being used\r
-*/\r
-\r
-/* modified 3/5/10 cclee */\r
-/* Clean up those related to VIA ACE and hand optimize aes_cbc_encrypt and aes_cbc_decrypt */\r
-/* move the xmm registers save/restore originally inside the callee functions into these 2 caller functions */\r
-\r
-/* HW-AES specific implementation cclee 3-12-10 */\r
-/* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled, \r
- and if kHasAES is detected, branch to the hw-specific functions here */\r
-\r
-\r
-/* \r
- This file defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- the Intel Westmere HW AES-based implementations\r
- of _aes_encrypt_cbc and _aes_decrypt_cbc. \r
-\r
- These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available.\r
- They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.\r
-\r
- The AES HW is detected 1st thing in \r
- _aes_encrypt_cbc (aes_modes_asm.s) \r
- _aes_decrypt_cbc (aes_modes_asm.s)\r
- and, if AES HW is detected, they branch without link (i.e., jump) to the functions here.\r
-\r
- The implementation here follows the examples in an Intel White Paper\r
- "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01\r
-\r
- Note: Rev. 03 Final 2010 01 26 is available. It looks like some code changed from Rev.2 01.\r
-\r
- cclee 3-13-10\r
-*/\r
-\r
-/* \r
- The function _aes_decrypt_cbc_hw previously simply decrypted serially, block by block.\r
- In our group meeting, Eric/Ali suggested that I take a look at combining multiple blocks\r
- in a loop and interleaving multiple aesdec instructions to absorb/hide stalls and improve the decrypt throughput.\r
-\r
- The idea was actually described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) \r
-\r
- This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode.\r
- On a 2.4GHz core-i5/2.66GHz core-i7, the x86_64 decrypt throughput (in xnu-iokit) has been improved\r
- from 1180/1332 to 1667/1858 MBytes/sec. This is approximately a 1.40x speedup in decryption.\r
- The encrypt throughput is not changed. \r
-\r
- I also enhanced the assembly code comments.\r
-\r
- cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.)\r
-\r
-*/\r
-\r
-/* ---------------------------------------------------------------------------------------------------------------- \r
-\r
- aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :\r
-\r
- For simplicity, I am assuming all variables are in 128-bit data type.\r
-\r
- aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)\r
- {\r
- while(num_blk--) {\r
- *iv ^= *ibuf++;\r
- aes_encrypt(iv, iv, ctx);\r
- *obuf++ = *iv;\r
- }\r
- return 0;\r
- }\r
-\r
- The following is an implementation of this function using Intel AESNI.\r
- This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. \r
- Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch\r
- to this aesni-based function should it detect that aesni is available.\r
- Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.\r
-\r
- Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks\r
- are serially chained. This prevents us from arranging several blocks for encryption in parallel.\r
-\r
- ----------------------------------------------------------------------------------------------------------------*/\r
-\r
- .text\r
- .align 4,0x90\r
- .globl _aes_encrypt_cbc_hw\r
-_aes_encrypt_cbc_hw:\r
-\r
- // push/save registers for local use\r
-#if defined __i386__\r
-\r
- push %ebp\r
- movl %esp, %ebp\r
- push %ebx\r
- push %edi\r
-\r
- #define sp %esp\r
-\r
-#else // __x86_64__\r
-\r
- push %rbp\r
- mov %rsp, %rbp\r
- push %rbx\r
- push %r13\r
- push %r14\r
- push %r15\r
-\r
- #define sp %rsp\r
-\r
-#endif\r
-\r
- // if this is kernel code, need to save used xmm registers\r
-#ifdef KERNEL\r
-\r
-#if defined __i386__\r
- sub $(8*16), %esp // for possible xmm0-xmm7 save/restore\r
-#else\r
- sub $(16*16), %rsp // xmm0-xmm15 save/restore \r
-#endif\r
-\r
- movaps %xmm0, (sp)\r
- movaps %xmm1, 16(sp)\r
- movaps %xmm2, 32(sp)\r
- movaps %xmm3, 48(sp)\r
- movaps %xmm4, 64(sp)\r
- movaps %xmm5, 80(sp)\r
- movaps %xmm6, 96(sp)\r
- movaps %xmm7, 112(sp)\r
-#if defined __x86_64__\r
- movaps %xmm8, 16*8(sp)\r
- movaps %xmm9, 16*9(sp)\r
- movaps %xmm10, 16*10(sp)\r
- movaps %xmm11, 16*11(sp)\r
- movaps %xmm12, 16*12(sp)\r
- movaps %xmm13, 16*13(sp)\r
- movaps %xmm14, 16*14(sp)\r
- movaps %xmm15, 16*15(sp)\r
-#endif // __x86_64__\r
-\r
-#endif // KERNEL\r
-\r
- #define iv %xmm0\r
-\r
-#ifdef __i386__\r
-\r
- mov 12(%ebp), %eax // in_iv\r
- mov 24(%ebp), %edx // ctx\r
- movups (%eax), iv // iv = in_iv \r
- mov 8(%ebp), %ebx // ibuf\r
- mov 16(%ebp), %ecx // num_blk\r
- mov 20(%ebp), %edi // obuf\r
-\r
- #define ibuf %ebx\r
- #define obuf %edi\r
- #define num_blk %ecx \r
- #define ctx %edx\r
-\r
-#else\r
-\r
- mov %rdi, %rbx // ibuf\r
- movups (%rsi), iv // iv = in_iv\r
- mov %rdx, %r13 // num_blk\r
- mov %rcx, %r14 // obuf\r
- mov %r8, %r15 // ctx \r
-\r
- #define ibuf %rbx\r
- #define num_blk %r13d\r
- #define obuf %r14 \r
- #define ctx %r15\r
-\r
-#endif\r
-\r
- mov 240(ctx), %eax // aes length\r
- cmp $160, %eax // aes-128 encrypt ?\r
- je L_encrypt_128\r
- cmp $192, %eax // aes-192 encrypt ?\r
- je L_encrypt_192\r
- cmp $224, %eax // aes-256 encrypt ?\r
- je L_encrypt_256\r
- mov $-1, %eax // return error\r
- jmp L_error \r
-\r
- //\r
- // aes-128 encrypt_cbc operation, up to L_HW_cbc_done\r
- //\r
-\r
-L_encrypt_128:\r
-\r
- cmp $1, num_blk // check number of block\r
- jl L_HW_cbc_done // should it be less than 1, nothing to do\r
-\r
- movups (ctx), %xmm2 // key0\r
- movups 16(ctx), %xmm3 // key1\r
- movups 32(ctx), %xmm4 // key2\r
- movups 48(ctx), %xmm5 // key3\r
- movups 64(ctx), %xmm6 // key4\r
- movups 80(ctx), %xmm7 // key5\r
-#if defined __x86_64__\r
- movups 96(ctx), %xmm8 // key6\r
- movups 112(ctx), %xmm9 // key7\r
- movups 128(ctx), %xmm10 // key8\r
- movups 144(ctx), %xmm11 // key9\r
- movups 160(ctx), %xmm12 // keyA\r
-#endif\r
-\r
- // while (num_blk--) {\r
- // *iv ^= *ibuf++;\r
- // aes_encrypt(iv, iv, ctx);\r
- // *obuf++ = *iv;\r
- // }\r
-0:\r
- movups (ibuf), %xmm1 // *ibuf\r
- pxor %xmm2, iv // 1st instruction inside aes_encrypt\r
- pxor %xmm1, iv // *iv ^= *ibuf\r
-\r
- // finishing up the rest of aes_encrypt\r
- aesenc %xmm3, iv\r
- aesenc %xmm4, iv\r
- aesenc %xmm5, iv\r
- aesenc %xmm6, iv\r
- aesenc %xmm7, iv\r
-#if defined __x86_64__\r
- aesenc %xmm8, iv\r
- aesenc %xmm9, iv\r
- aesenc %xmm10, iv\r
- aesenc %xmm11, iv\r
- aesenclast %xmm12, iv\r
-#else\r
- movups 96(ctx), %xmm1 // key6\r
- aesenc %xmm1, iv\r
- movups 112(ctx), %xmm1 // key7\r
- aesenc %xmm1, iv\r
- movups 128(ctx), %xmm1 // key8\r
- aesenc %xmm1, iv\r
- movups 144(ctx), %xmm1 // key9\r
- aesenc %xmm1, iv\r
- movups 160(ctx), %xmm1 // keyA\r
- aesenclast %xmm1, iv\r
-#endif\r
-\r
- movups iv, (obuf) // *obuf = *iv;\r
- add $16, obuf // obuf++;\r
- add $16, ibuf // ibuf++;\r
- sub $1, num_blk // num_blk --\r
- jg 0b // if num_blk > 0, repeat the loop\r
-\r
- // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)\r
-\r
-L_HW_cbc_done:\r
-\r
- xor %eax, %eax // to return CRYPT_OK\r
-\r
-L_error:\r
-\r
- // if kernel, restore xmm registers\r
-#ifdef KERNEL \r
- movaps 0(sp), %xmm0\r
- movaps 16(sp), %xmm1\r
- movaps 32(sp), %xmm2\r
- movaps 48(sp), %xmm3\r
- movaps 64(sp), %xmm4\r
- movaps 80(sp), %xmm5\r
- movaps 96(sp), %xmm6\r
- movaps 112(sp), %xmm7\r
-#if defined __x86_64__\r
- movaps 16*8(sp), %xmm8\r
- movaps 16*9(sp), %xmm9\r
- movaps 16*10(sp), %xmm10\r
- movaps 16*11(sp), %xmm11\r
- movaps 16*12(sp), %xmm12\r
- movaps 16*13(sp), %xmm13\r
- movaps 16*14(sp), %xmm14\r
- movaps 16*15(sp), %xmm15\r
-#endif // __x86_64__\r
-#endif // KERNEL\r
-\r
- // release used stack memory, restore used callee-saved registers, and return \r
-#if defined __i386__\r
-#ifdef KERNEL\r
- add $(8*16), %esp\r
-#endif\r
- pop %edi\r
- pop %ebx\r
-#else\r
-#ifdef KERNEL\r
- add $(16*16), %rsp \r
-#endif\r
- pop %r15\r
- pop %r14\r
- pop %r13\r
- pop %rbx\r
-#endif\r
- leave\r
- ret\r
-\r
- //\r
- // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
- //\r
-\r
-L_encrypt_192:\r
-\r
- cmp $1, num_blk // check number of block\r
- jl L_HW_cbc_done // should it be less than 1, nothing to do\r
-\r
- movups (ctx), %xmm2 // key0\r
- movups 16(ctx), %xmm3 // key1\r
- movups 32(ctx), %xmm4 // key2\r
- movups 48(ctx), %xmm5 // key3\r
- movups 64(ctx), %xmm6 // key4\r
- movups 80(ctx), %xmm7 // key5\r
-#if defined __x86_64__\r
- movups 96(ctx), %xmm8 // key6\r
- movups 112(ctx), %xmm9 // key7\r
- movups 128(ctx), %xmm10 // key8\r
- movups 144(ctx), %xmm11 // key9\r
- movups 160(ctx), %xmm12 // keyA\r
- movups 176(ctx), %xmm13 // keyB\r
- movups 192(ctx), %xmm14 // keyC\r
-#endif\r
- \r
- // while (num_blk--) {\r
- // *iv ^= *ibuf++;\r
- // aes_encrypt(iv, iv, ctx);\r
- // *obuf++ = *iv;\r
- // }\r
-0:\r
- movups (ibuf), %xmm1 // *ibuf\r
- pxor %xmm1, iv // *iv ^= ibuf\r
-\r
- // aes_encrypt(iv, iv, ctx);\r
-\r
- pxor %xmm2, iv\r
- aesenc %xmm3, iv\r
- aesenc %xmm4, iv\r
- aesenc %xmm5, iv\r
- aesenc %xmm6, iv\r
- aesenc %xmm7, iv\r
-#if defined __x86_64__\r
- aesenc %xmm8, iv\r
- aesenc %xmm9, iv\r
- aesenc %xmm10, iv\r
- aesenc %xmm11, iv\r
- aesenc %xmm12, iv\r
- aesenc %xmm13, iv\r
- aesenclast %xmm14, iv\r
-#else\r
- movups 96(ctx), %xmm1\r
- aesenc %xmm1, iv\r
- movups 112(ctx), %xmm1\r
- aesenc %xmm1, iv\r
- movups 128(ctx), %xmm1\r
- aesenc %xmm1, iv\r
- movups 144(ctx), %xmm1\r
- aesenc %xmm1, iv\r
- movups 160(ctx), %xmm1\r
- aesenc %xmm1, iv\r
- movups 176(ctx), %xmm1\r
- aesenc %xmm1, iv\r
- movups 192(ctx), %xmm1\r
- aesenclast %xmm1, iv\r
-#endif\r
-\r
- movups iv, (obuf) // *obuf = *iv;\r
- add $16, ibuf // ibuf++\r
- add $16, obuf // obuf++\r
-\r
- sub $1, num_blk // num_blk --\r
- jg 0b // if num_blk > 0, repeat the loop\r
-\r
- jmp L_HW_cbc_done // share with the common exit code\r
-\r
- //\r
- // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
- //\r
-\r
-L_encrypt_256:\r
-\r
- cmp $1, num_blk // check number of block\r
- jl L_HW_cbc_done // should it be less than 1, nothing to do\r
-\r
- movups (ctx), %xmm2 // key0\r
- movups 16(ctx), %xmm3 // key1\r
- movups 32(ctx), %xmm4 // key2\r
- movups 48(ctx), %xmm5 // key3\r
- movups 64(ctx), %xmm6 // key4\r
- movups 80(ctx), %xmm7 // key5\r
-#if defined __x86_64__\r
- movups 96(ctx), %xmm8 // key6\r
- movups 112(ctx), %xmm9 // key7\r
- movups 128(ctx), %xmm10 // key8\r
- movups 144(ctx), %xmm11 // key9\r
- movups 160(ctx), %xmm12 // keyA\r
- movups 176(ctx), %xmm13 // keyB\r
- movups 192(ctx), %xmm14 // keyC\r
- movups 208(ctx), %xmm15 // keyD\r
- // movups 224(ctx), %xmm1 // keyE\r
-#endif\r
-\r
- // while (num_blk--) {\r
- // *iv ^= *ibuf++;\r
- // aes_encrypt(iv, iv, ctx);\r
- // *obuf++ = *iv;\r
- // }\r
-0:\r
- movups (ibuf), %xmm1 // *ibuf\r
- pxor %xmm1, iv // *iv ^= ibuf\r
- \r
- // aes_encrypt(iv, iv, ctx);\r
- pxor %xmm2, iv\r
- aesenc %xmm3, iv\r
- aesenc %xmm4, iv\r
- aesenc %xmm5, iv\r
- aesenc %xmm6, iv\r
- aesenc %xmm7, iv\r
-#if defined __x86_64__\r
- movups 224(ctx), %xmm1 // keyE\r
- aesenc %xmm8, iv\r
- aesenc %xmm9, iv\r
- aesenc %xmm10, iv\r
- aesenc %xmm11, iv\r
- aesenc %xmm12, iv\r
- aesenc %xmm13, iv\r
- aesenc %xmm14, iv\r
- aesenc %xmm15, iv\r
- aesenclast %xmm1, iv\r
-#else\r
- movups 96(ctx), %xmm1 // key6\r
- aesenc %xmm1, iv\r
- movups 112(ctx), %xmm1 // key7\r
- aesenc %xmm1, iv\r
- movups 128(ctx), %xmm1 // key8\r
- aesenc %xmm1, iv\r
- movups 144(ctx), %xmm1 // key9\r
- aesenc %xmm1, iv\r
- movups 160(ctx), %xmm1 // keyA\r
- aesenc %xmm1, iv\r
- movups 176(ctx), %xmm1 // keyB\r
- aesenc %xmm1, iv\r
- movups 192(ctx), %xmm1 // keyC\r
- aesenc %xmm1, iv\r
- movups 208(ctx), %xmm1 // keyD\r
- aesenc %xmm1, iv\r
- movups 224(ctx), %xmm1 // keyE\r
- aesenclast %xmm1, iv\r
-#endif\r
-\r
- movups iv, (obuf) // *obuf = *iv;\r
- add $16, ibuf // ibuf++\r
- add $16, obuf // obuf++\r
-\r
- sub $1, num_blk // num_blk --\r
- jg 0b // if num_blk > 0, repeat the loop\r
-\r
- jmp L_HW_cbc_done // share with the common exit code\r
-\r
-\r
-\r
- //\r
- // --------- END of aes_encrypt_cbc_hw -------------------\r
- //\r
-\r
-\r
-/* ---------------------------------------------------------------------------------------------------------------- \r
-\r
- aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :\r
-\r
- For simplicity, I am assuming all variables are in 128-bit data type.\r
-\r
- aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)\r
- {\r
- while(num_blk--) {\r
- aes_decrypt(ibuf, obuf, ctx);\r
- *obuf++ ^= *iv;\r
- *iv = *ibuf++;\r
- }\r
- return 0;\r
- }\r
-\r
- The following is an implementation of this function using Intel AESNI.\r
- This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. \r
- Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch\r
- to this aesni-based function should it detect that aesni is available.\r
- Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.\r
-\r
- Note that the decryption operations are independent across blocks.\r
- This gives the opportunity to arrange aes_decrypt operations in parallel to speed up the code.\r
- This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).\r
- The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc.\r
-\r
- Example C code for packing 4 blocks in an iteration is shown as follows:\r
-\r
- while ((num_blk-=4)>=0) {\r
-\r
- // the following 4 functions can be interleaved to exploit parallelism\r
- aes_decrypt(ibuf, obuf, ctx);\r
- aes_decrypt(ibuf+1, obuf+1, ctx);\r
- aes_decrypt(ibuf+2, obuf+2, ctx);\r
- aes_decrypt(ibuf+3, obuf+3, ctx);\r
-\r
-	obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];\r
- *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
- }\r
- num_blk+=4;\r
-\r
- ----------------------------------------------------------------------------------------------------------------*/\r
-\r
- .text\r
- .align 4,0x90\r
- .globl _aes_decrypt_cbc_hw\r
-_aes_decrypt_cbc_hw:\r
-\r
- // push/save registers for local use\r
-#if defined __i386__\r
-\r
- push %ebp\r
- movl %esp, %ebp\r
- push %ebx // ibuf\r
- push %edi // obuf\r
-\r
- #define sp %esp\r
-\r
-#else // __x86_64__\r
-\r
- push %rbp\r
- mov %rsp, %rbp\r
- push %rbx\r
- push %r13\r
- push %r14\r
- push %r15\r
-\r
- #define sp %rsp\r
-\r
-#endif\r
-\r
-\r
- // if kernel, allocate stack space to save xmm registers\r
-#ifdef KERNEL\r
-#if defined __i386__\r
- sub $(8*16), %esp\r
-#else\r
- sub $(16*16), %rsp\r
-#endif\r
- movaps %xmm0, (sp)\r
- movaps %xmm1, 16(sp)\r
- movaps %xmm2, 32(sp)\r
- movaps %xmm3, 48(sp)\r
- movaps %xmm4, 64(sp)\r
- movaps %xmm5, 80(sp)\r
- movaps %xmm6, 96(sp)\r
- movaps %xmm7, 112(sp)\r
-#if defined __x86_64__\r
- movaps %xmm8, 16*8(sp)\r
- movaps %xmm9, 16*9(sp)\r
- movaps %xmm10, 16*10(sp)\r
- movaps %xmm11, 16*11(sp)\r
- movaps %xmm12, 16*12(sp)\r
- movaps %xmm13, 16*13(sp)\r
- movaps %xmm14, 16*14(sp)\r
- movaps %xmm15, 16*15(sp)\r
-#endif // __x86_64__\r
-#endif\r
-\r
- #undef iv\r
- #define iv %xmm0\r
-\r
-#if defined __i386__\r
- mov 12(%ebp), %eax // in_iv\r
- mov 24(%ebp), %edx // ctx\r
- movups (%eax), iv // iv = in_iv \r
- mov 8(%ebp), %ebx // ibuf\r
- mov 16(%ebp), %ecx // num_blk\r
- mov 20(%ebp), %edi // obuf\r
-\r
- #define ibuf %ebx\r
- #define obuf %edi\r
- #define num_blk %ecx \r
- #define ctx %edx\r
-\r
-#else // __x86_64__, rdi/rsi/rdx/rcx/r8\r
-\r
- mov %rdi, %rbx // ibuf\r
- movups (%rsi), iv // iv = in_iv\r
- mov %rdx, %r13 // num_blk\r
- mov %rcx, %r14 // obuf\r
- mov %r8, %r15 // ctx \r
-\r
- #define ibuf %rbx\r
- #define num_blk %r13d\r
- #define obuf %r14 \r
- #define ctx %r15\r
-\r
-#endif\r
-\r
- mov 240(ctx), %eax // aes length\r
- cmp $160, %eax // aes-128 decrypt\r
- je L_decrypt_128\r
- cmp $192, %eax // aes-192 decrypt\r
- je L_decrypt_192\r
- cmp $224, %eax // aes-256 decrypt\r
- je L_decrypt_256\r
-\r
- mov $-1, %eax // wrong aes length, to return -1\r
- jmp L_error // early exit due to wrong aes length\r
-\r
-\r
- //\r
- // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
- //\r
-\r
-L_decrypt_128:\r
-\r
- cmp $1, num_blk\r
- jl L_HW_cbc_done // if num_blk < 1, early return\r
-\r
- // aes-128 decrypt expanded keys\r
- movups 160(ctx), %xmm3\r
- movups 144(ctx), %xmm4\r
- movups 128(ctx), %xmm5\r
- movups 112(ctx), %xmm6\r
- movups 96(ctx), %xmm7\r
-#if defined __x86_64__\r
- movups 80(ctx), %xmm8\r
- movups 64(ctx), %xmm9\r
- movups 48(ctx), %xmm10\r
- movups 32(ctx), %xmm11\r
- movups 16(ctx), %xmm12\r
- movups 0(ctx), %xmm13\r
-#endif\r
-\r
- // performs 4 block decryption in an iteration to exploit decrypt in parallel\r
-\r
- // while ((num_blk-=4)>=0) {\r
- // aes_decrypt(ibuf, obuf, ctx);\r
- // aes_decrypt(ibuf+1, obuf+1, ctx);\r
- // aes_decrypt(ibuf+2, obuf+2, ctx);\r
- // aes_decrypt(ibuf+3, obuf+3, ctx);\r
-	// obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];\r
- // *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
- // }\r
-\r
- sub $4, num_blk // pre decrement num_blk by 4\r
- jl 9f // if num_blk < 4, skip the per-4-blocks processing code\r
-\r
-0:\r
-\r
-\r
-#if defined __x86_64__\r
-\r
- movups (ibuf), %xmm1 // tmp = 1st ibuf\r
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
- movups 32(ibuf), %xmm14 // tmp = 3rd ibuf\r
- movups 48(ibuf), %xmm15 // tmp = 4th ibuf\r
-\r
- // for x86_64, the expanded keys are already stored in xmm3-xmm13\r
-\r
- // aes-128 decrypt round 0 per 4 blocks\r
- pxor %xmm3, %xmm1\r
- pxor %xmm3, %xmm2\r
- pxor %xmm3, %xmm14\r
- pxor %xmm3, %xmm15\r
-\r
- // aes-128 decrypt round 1 per 4 blocks\r
- aesdec %xmm4, %xmm1\r
- aesdec %xmm4, %xmm2\r
- aesdec %xmm4, %xmm14\r
- aesdec %xmm4, %xmm15\r
-\r
- // aes-128 decrypt round 2 per 4 blocks\r
- aesdec %xmm5, %xmm1\r
- aesdec %xmm5, %xmm2\r
- aesdec %xmm5, %xmm14\r
- aesdec %xmm5, %xmm15\r
-\r
- // aes-128 decrypt round 3 per 4 blocks\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm14\r
- aesdec %xmm6, %xmm15\r
-\r
- // aes-128 decrypt round 4 per 4 blocks\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm14\r
- aesdec %xmm7, %xmm15\r
-\r
- // aes-128 decrypt round 5 per 4 blocks\r
- aesdec %xmm8, %xmm1\r
- aesdec %xmm8, %xmm2\r
- aesdec %xmm8, %xmm14\r
- aesdec %xmm8, %xmm15\r
-\r
- // aes-128 decrypt round 6 per 4 blocks\r
- aesdec %xmm9, %xmm1\r
- aesdec %xmm9, %xmm2\r
- aesdec %xmm9, %xmm14\r
- aesdec %xmm9, %xmm15\r
-\r
- // aes-128 decrypt round 7 per 4 blocks\r
- aesdec %xmm10, %xmm1\r
- aesdec %xmm10, %xmm2\r
- aesdec %xmm10, %xmm14\r
- aesdec %xmm10, %xmm15\r
-\r
- // aes-128 decrypt round 8 per 4 blocks\r
- aesdec %xmm11, %xmm1\r
- aesdec %xmm11, %xmm2\r
- aesdec %xmm11, %xmm14\r
- aesdec %xmm11, %xmm15\r
-\r
- // aes-128 decrypt round 9 per 4 blocks\r
- aesdec %xmm12, %xmm1\r
- aesdec %xmm12, %xmm2\r
- aesdec %xmm12, %xmm14\r
- aesdec %xmm12, %xmm15\r
-\r
- // aes-128 decrypt round 10 (last) per 4 blocks\r
- aesdeclast %xmm13, %xmm1\r
- aesdeclast %xmm13, %xmm2\r
- aesdeclast %xmm13, %xmm14\r
- aesdeclast %xmm13, %xmm15\r
-\r
- pxor iv, %xmm1 // obuf[0] ^= *iv; \r
- movups (ibuf), iv // ibuf[0]\r
- pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; \r
- movups 16(ibuf), iv // ibuf[1]\r
- pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; \r
- movups 32(ibuf), iv // ibuf[2] \r
-	pxor	iv, %xmm15			// obuf[3] ^= ibuf[2];\r
- movups 48(ibuf), iv // *iv = ibuf[3]\r
-\r
- movups %xmm1, (obuf) // write 1st obuf\r
- movups %xmm2, 16(obuf) // write 2nd obuf\r
- movups %xmm14, 32(obuf) // write 3rd obuf\r
- movups %xmm15, 48(obuf) // write 4th obuf\r
-\r
-\r
-#else\r
-\r
- // aes_decrypt_cbc per 4 blocks using aes-128 for i386\r
- // xmm1/xmm2/xmm4/xmm5 used for obuf per block\r
- // xmm3 = key0\r
- // xmm0 = iv\r
- // xmm6/xmm7 dynamically load with other expanded keys\r
-\r
- movups (ibuf), %xmm1 // tmp = 1st ibuf\r
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
- movups 32(ibuf), %xmm4 // tmp = 3rd ibuf\r
- movups 48(ibuf), %xmm5 // tmp = 4th ibuf\r
-\r
- // aes_decrypt\r
- // for i386, sequentially load expanded keys into xmm6/xmm7\r
-\r
- movups 144(ctx), %xmm6 // key1\r
-\r
- // aes-128 decrypt round 0 per 4 blocks\r
- pxor %xmm3, %xmm1\r
- pxor %xmm3, %xmm2\r
- pxor %xmm3, %xmm4\r
- pxor %xmm3, %xmm5\r
-\r
- movups 128(ctx), %xmm7 // key2\r
-\r
- // aes-128 decrypt round 1 per 4 blocks\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 112(ctx), %xmm6 // key3\r
-\r
- // aes-128 decrypt round 2 per 4 blocks\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 96(ctx), %xmm7 // key4\r
-\r
- // aes-128 decrypt round 3 per 4 blocks\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 80(ctx), %xmm6 // key5\r
-\r
- // aes-128 decrypt round 4 per 4 blocks\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 64(ctx), %xmm7 // key6\r
-\r
- // aes-128 decrypt round 5 per 4 blocks\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 48(ctx), %xmm6 // key7\r
-\r
- // aes-128 decrypt round 6 per 4 blocks\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 32(ctx), %xmm7 // key8\r
-\r
- // aes-128 decrypt round 7 per 4 blocks\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 16(ctx), %xmm6 // key9\r
-\r
- // aes-128 decrypt round 8 per 4 blocks\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 0(ctx), %xmm7 // keyA\r
-\r
- // aes-128 decrypt round 9 per 4 blocks\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- // aes-128 decrypt round 10 (last) per 4 blocks\r
- aesdeclast %xmm7, %xmm1\r
- aesdeclast %xmm7, %xmm2\r
- aesdeclast %xmm7, %xmm4\r
- aesdeclast %xmm7, %xmm5\r
-\r
- pxor iv, %xmm1 // 1st obuf ^= iv; \r
- movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm2 // 2nd obuf ^= iv; \r
- movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm4 // 3rd obuf ^= iv; \r
- movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm5 // 4th obuf ^= iv; \r
- movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
-\r
- movups %xmm1, (obuf) // write 1st obuf\r
- movups %xmm2, 16(obuf) // write 2nd obuf\r
- movups %xmm4, 32(obuf) // write 3rd obuf\r
- movups %xmm5, 48(obuf) // write 4th obuf\r
-#endif\r
-\r
- add $64, ibuf // ibuf += 4; \r
- add $64, obuf // obuf += 4; \r
-\r
- sub $4, num_blk // num_blk -= 4\r
- jge 0b // if num_blk > 0, repeat the loop\r
-\r
-9:	add	$4, num_blk		// post increment num_blk by 4\r
-	je	L_HW_cbc_done	// if num_blk == 0, no need for further processing code\r
-\r
-#if defined __i386__\r
- // updated as they might be needed as expanded keys in the remaining\r
- movups 144(ctx), %xmm4\r
- movups 128(ctx), %xmm5\r
- movups 112(ctx), %xmm6\r
- movups 96(ctx), %xmm7\r
-#endif\r
-\r
- test $2, num_blk // check whether num_blk has 2 blocks\r
- je 9f // if num_blk & 2 == 0, skip the per-pair processing code\r
-\r
- // do the remaining 2 blocks together\r
-\r
- movups (ibuf), %xmm1 // tmp = 1st ibuf\r
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
-\r
- // aes_decrypt\r
- pxor %xmm3, %xmm1\r
- pxor %xmm3, %xmm2\r
- aesdec %xmm4, %xmm1\r
- aesdec %xmm4, %xmm2\r
- aesdec %xmm5, %xmm1\r
- aesdec %xmm5, %xmm2\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
-#if defined __x86_64__\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm8, %xmm1\r
- aesdec %xmm8, %xmm2\r
- aesdec %xmm9, %xmm1\r
- aesdec %xmm9, %xmm2\r
- aesdec %xmm10, %xmm1\r
- aesdec %xmm10, %xmm2\r
- aesdec %xmm11, %xmm1\r
- aesdec %xmm11, %xmm2\r
- aesdec %xmm12, %xmm1\r
- aesdec %xmm12, %xmm2\r
- aesdeclast %xmm13, %xmm1\r
- aesdeclast %xmm13, %xmm2\r
-#else\r
- movups 80(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- movups 64(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- movups 48(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- movups 32(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- movups 16(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- movups 0(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdeclast %xmm7, %xmm1\r
- aesdeclast %xmm7, %xmm2\r
- movups 112(ctx), %xmm6\r
- movups 96(ctx), %xmm7\r
-#endif\r
-\r
- pxor iv, %xmm1 // obuf[0] ^= *iv; \r
- movups (ibuf), iv // ibuf[0]\r
- pxor iv, %xmm2 // obuf[1] ^= ibuf[0]\r
- movups 16(ibuf), iv // *iv = ibuf[1]\r
-\r
- movups %xmm1, (obuf) // write obuf[0]\r
- movups %xmm2, 16(obuf) // write obuf[1]\r
-\r
- add $32, ibuf // ibuf += 2\r
- add $32, obuf // obuf += 2\r
-\r
-9:\r
- test $1, num_blk // check whether num_blk has residual 1 block\r
- je L_HW_cbc_done // if num_blk == 0, no need for residual processing code\r
- \r
- movups (ibuf), %xmm2 // tmp = ibuf\r
- // aes_decrypt\r
- pxor %xmm3, %xmm2\r
- aesdec %xmm4, %xmm2\r
- aesdec %xmm5, %xmm2\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm7, %xmm2\r
-#if defined __x86_64__\r
- aesdec %xmm8, %xmm2\r
- aesdec %xmm9, %xmm2\r
- aesdec %xmm10, %xmm2\r
- aesdec %xmm11, %xmm2\r
- aesdec %xmm12, %xmm2\r
- aesdeclast %xmm13, %xmm2\r
-#else\r
- movups 80(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 64(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 48(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 32(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 16(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups (ctx), %xmm1\r
- aesdeclast %xmm1, %xmm2\r
-#endif\r
-\r
- pxor iv, %xmm2 // *obuf ^= *iv; \r
- movups (ibuf), iv // *iv = *ibuf;\r
- movups %xmm2, (obuf) // write *obuf\r
-\r
- jmp L_HW_cbc_done\r
-\r
- //\r
- // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
- //\r
-\r
-L_decrypt_192:\r
-\r
- cmp $1, num_blk\r
- jl L_HW_cbc_done // if num_blk < 1, early return\r
-\r
-	// aes-192 decrypt expanded keys\r
- movups 192(ctx), %xmm3\r
- movups 176(ctx), %xmm4\r
- movups 160(ctx), %xmm5\r
- movups 144(ctx), %xmm6\r
- movups 128(ctx), %xmm7\r
-#if defined __x86_64__\r
- movups 112(ctx), %xmm8\r
- movups 96(ctx), %xmm9\r
- movups 80(ctx), %xmm10\r
- movups 64(ctx), %xmm11\r
- movups 48(ctx), %xmm12\r
- movups 32(ctx), %xmm13\r
- movups 16(ctx), %xmm14\r
- movups (ctx), %xmm15\r
-#endif\r
-\r
- // performs 4 block decryption in an iteration to exploit decrypt in parallel\r
-\r
- // while ((num_blk-=4)>=0) {\r
- // aes_decrypt(ibuf, obuf, ctx);\r
- // aes_decrypt(ibuf+1, obuf+1, ctx);\r
- // aes_decrypt(ibuf+2, obuf+2, ctx);\r
- // aes_decrypt(ibuf+3, obuf+3, ctx);\r
-	// obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];\r
- // *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
- // }\r
-\r
- sub $4, num_blk // pre decrement num_blk by 4\r
- jl 9f // if num_blk < 4, skip the per-4-blocks processing code\r
-0:\r
-\r
-#if defined __x86_64__\r
-\r
- movups (ibuf), %xmm1 // tmp = 1st ibuf\r
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
- movups 32(ibuf), %xmm14 // tmp = 3rd ibuf\r
- movups 48(ibuf), %xmm15 // tmp = 4th ibuf\r
-\r
- // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13\r
-	// use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards\r
-\r
- // round 0 for 4 blocks\r
- pxor %xmm3, %xmm1\r
- pxor %xmm3, %xmm2\r
- pxor %xmm3, %xmm14\r
- pxor %xmm3, %xmm15\r
-\r
- // round 1 for 4 blocks\r
- aesdec %xmm4, %xmm1\r
- aesdec %xmm4, %xmm2\r
- aesdec %xmm4, %xmm14\r
- aesdec %xmm4, %xmm15\r
-\r
- // round 2 for 4 blocks\r
- aesdec %xmm5, %xmm1\r
- aesdec %xmm5, %xmm2\r
- aesdec %xmm5, %xmm14\r
- aesdec %xmm5, %xmm15\r
-\r
- // round 3 for 4 blocks\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm14\r
- aesdec %xmm6, %xmm15\r
-\r
- // round 4 for 4 blocks\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm14\r
- aesdec %xmm7, %xmm15\r
-\r
- // round 5 for 4 blocks\r
- aesdec %xmm8, %xmm1\r
- aesdec %xmm8, %xmm2\r
- aesdec %xmm8, %xmm14\r
- aesdec %xmm8, %xmm15\r
-\r
- // round 6 for 4 blocks\r
- aesdec %xmm9, %xmm1\r
- aesdec %xmm9, %xmm2\r
- aesdec %xmm9, %xmm14\r
- aesdec %xmm9, %xmm15\r
-\r
- // round 7 for 4 blocks\r
- aesdec %xmm10, %xmm1\r
- aesdec %xmm10, %xmm2\r
- aesdec %xmm10, %xmm14\r
- aesdec %xmm10, %xmm15\r
-\r
- // round 8 for 4 blocks\r
- aesdec %xmm11, %xmm1\r
- aesdec %xmm11, %xmm2\r
- aesdec %xmm11, %xmm14\r
- aesdec %xmm11, %xmm15\r
-\r
- // round 9 for 4 blocks\r
- aesdec %xmm12, %xmm1\r
- aesdec %xmm12, %xmm2\r
- aesdec %xmm12, %xmm14\r
- aesdec %xmm12, %xmm15\r
-\r
- movups 16(ctx), %xmm12\r
-\r
- // round A for 4 blocks\r
- aesdec %xmm13, %xmm1\r
- aesdec %xmm13, %xmm2\r
- aesdec %xmm13, %xmm14\r
- aesdec %xmm13, %xmm15\r
-\r
- movups (ctx), %xmm13\r
-\r
- // round B for 4 blocks\r
- aesdec %xmm12, %xmm1\r
- aesdec %xmm12, %xmm2\r
- aesdec %xmm12, %xmm14\r
- aesdec %xmm12, %xmm15\r
-\r
- movups 48(ctx), %xmm12 // restore %xmm12 to its original key\r
-\r
- // round C (last) for 4 blocks\r
- aesdeclast %xmm13, %xmm1\r
- aesdeclast %xmm13, %xmm2\r
- aesdeclast %xmm13, %xmm14\r
- aesdeclast %xmm13, %xmm15\r
-\r
- movups 32(ctx), %xmm13 // restore %xmm13 to its original key\r
-\r
- pxor iv, %xmm1 // obuf[0] ^= *iv; \r
- movups (ibuf), iv // ibuf[0]\r
- pxor iv, %xmm2 // obuf[1] ^= ibuf[0] \r
- movups 16(ibuf), iv // ibuf[1]\r
- pxor iv, %xmm14 // obuf[2] ^= ibuf[1] \r
- movups 32(ibuf), iv // ibuf[2] \r
- pxor iv, %xmm15 // obuf[3] ^= ibuf[2] \r
- movups 48(ibuf), iv // *iv = ibuf[3] \r
-\r
- movups %xmm1, (obuf) // write 1st obuf\r
- movups %xmm2, 16(obuf) // write 2nd obuf\r
- movups %xmm14, 32(obuf) // write 3rd obuf\r
- movups %xmm15, 48(obuf) // write 4th obuf\r
-\r
- add $64, ibuf // ibuf += 4; \r
- add $64, obuf // obuf += 4; \r
-\r
- sub $4, num_blk // num_blk -= 4\r
- jge 0b // if num_blk > 0, repeat the loop\r
-\r
-9:	add	$4, num_blk		// post increment num_blk by 4\r
- je L_HW_cbc_done // if num_blk == 0, prepare to return \r
-\r
- movups 16(ctx), %xmm14 // restore %xmm14 to its key\r
- movups (ctx), %xmm15 // restore %xmm15 to its key\r
-\r
-#else\r
-\r
- movups (ibuf), %xmm1 // tmp = 1st ibuf\r
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
- movups 32(ibuf), %xmm4 // tmp = 3rd ibuf\r
- movups 48(ibuf), %xmm5 // tmp = 4th ibuf\r
-\r
- // aes_decrypt\r
- // for i386, sequentially load expanded keys into xmm6/xmm7\r
- movups 176(ctx), %xmm6\r
- pxor %xmm3, %xmm1\r
- pxor %xmm3, %xmm2\r
- pxor %xmm3, %xmm4\r
- pxor %xmm3, %xmm5\r
-\r
- movups 160(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 144(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 128(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 112(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 96(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 80(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 64(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 48(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 32(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 16(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 0(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- aesdeclast %xmm7, %xmm1\r
- aesdeclast %xmm7, %xmm2\r
- aesdeclast %xmm7, %xmm4\r
- aesdeclast %xmm7, %xmm5\r
-\r
- pxor iv, %xmm1 // 1st obuf ^= iv; \r
- movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm2 // 2nd obuf ^= iv; \r
- movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm4 // 3rd obuf ^= iv; \r
- movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm5 // 4th obuf ^= iv; \r
- movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- movups %xmm1, (obuf) // write 1st obuf\r
- movups %xmm2, 16(obuf) // write 2nd obuf\r
- movups %xmm4, 32(obuf) // write 3rd obuf\r
- movups %xmm5, 48(obuf) // write 4th obuf\r
-\r
- add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; \r
- add $64, obuf // obuf += AES_BLOCK_SIZE * 4; \r
-\r
- sub $4, num_blk // num_blk -= 4\r
- jge 0b // if num_blk > 0, repeat the loop\r
-\r
-\r
-9:	add	$4, num_blk		// post increment num_blk by 4\r
-	je	L_HW_cbc_done	// if num_blk == 0, no need for further processing code\r
-\r
- movups 176(ctx), %xmm4\r
- movups 160(ctx), %xmm5\r
- movups 144(ctx), %xmm6\r
- movups 128(ctx), %xmm7\r
-\r
-#endif\r
-\r
- // per-block aes_decrypt_cbc loop\r
-\r
-0:\r
- movups (ibuf), %xmm2 // tmp = ibuf\r
-\r
- // aes_decrypt\r
- pxor %xmm3, %xmm2\r
- aesdec %xmm4, %xmm2\r
- aesdec %xmm5, %xmm2\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm7, %xmm2\r
-#if defined __x86_64__\r
- aesdec %xmm8, %xmm2\r
- aesdec %xmm9, %xmm2\r
- aesdec %xmm10, %xmm2\r
- aesdec %xmm11, %xmm2\r
- aesdec %xmm12, %xmm2\r
- aesdec %xmm13, %xmm2\r
- aesdec %xmm14, %xmm2\r
- aesdeclast %xmm15, %xmm2\r
-#else\r
- movups 112(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 96(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 80(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 64(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 48(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 32(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 16(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups (ctx), %xmm1\r
- aesdeclast %xmm1, %xmm2\r
-#endif\r
-\r
- pxor iv, %xmm2 // obuf ^= iv; \r
- movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
-\r
- movups %xmm2, (obuf) // write obuf\r
-\r
- add $16, ibuf // ibuf += AES_BLOCK_SIZE; \r
- add $16, obuf // obuf += AES_BLOCK_SIZE; \r
- sub $1, num_blk // num_blk --\r
- jg 0b // if num_blk > 0, repeat the loop\r
-\r
- jmp L_HW_cbc_done\r
-\r
- //\r
- // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
- //\r
-\r
-L_decrypt_256:\r
-\r
- cmp $1, num_blk\r
- jl L_HW_cbc_done \r
-\r
- movups 224(ctx), %xmm3\r
- movups 208(ctx), %xmm4\r
- movups 192(ctx), %xmm5\r
- movups 176(ctx), %xmm6\r
- movups 160(ctx), %xmm7\r
-#if defined __x86_64__\r
- movups 144(ctx), %xmm8\r
- movups 128(ctx), %xmm9\r
- movups 112(ctx), %xmm10\r
- movups 96(ctx), %xmm11\r
- movups 80(ctx), %xmm12\r
- movups 64(ctx), %xmm13\r
- movups 48(ctx), %xmm14\r
- movups 32(ctx), %xmm15\r
-// movups 16(ctx), %xmm14\r
-// movups (ctx), %xmm15\r
-#endif\r
-\r
-#if defined __x86_64__\r
-\r
- sub $4, num_blk // pre decrement num_blk by 4\r
- jl 9f // if num_blk < 4, skip the per-4-blocks processing code\r
-0:\r
- movups (ibuf), %xmm1 // tmp = 1st ibuf\r
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
- movups 32(ibuf), %xmm14 // tmp = 3rd ibuf\r
- movups 48(ibuf), %xmm15 // tmp = 4th ibuf\r
-\r
- // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13\r
- pxor %xmm3, %xmm1\r
- pxor %xmm3, %xmm2\r
- pxor %xmm3, %xmm14\r
- pxor %xmm3, %xmm15\r
-\r
- aesdec %xmm4, %xmm1\r
- aesdec %xmm4, %xmm2\r
- aesdec %xmm4, %xmm14\r
- aesdec %xmm4, %xmm15\r
-\r
- aesdec %xmm5, %xmm1\r
- aesdec %xmm5, %xmm2\r
- aesdec %xmm5, %xmm14\r
- aesdec %xmm5, %xmm15\r
-\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm14\r
- aesdec %xmm6, %xmm15\r
-\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm14\r
- aesdec %xmm7, %xmm15\r
-\r
- aesdec %xmm8, %xmm1\r
- aesdec %xmm8, %xmm2\r
- aesdec %xmm8, %xmm14\r
- aesdec %xmm8, %xmm15\r
-\r
- aesdec %xmm9, %xmm1\r
- aesdec %xmm9, %xmm2\r
- aesdec %xmm9, %xmm14\r
- aesdec %xmm9, %xmm15\r
-\r
- aesdec %xmm10, %xmm1\r
- aesdec %xmm10, %xmm2\r
- aesdec %xmm10, %xmm14\r
- aesdec %xmm10, %xmm15\r
-\r
- aesdec %xmm11, %xmm1\r
- aesdec %xmm11, %xmm2\r
- aesdec %xmm11, %xmm14\r
- aesdec %xmm11, %xmm15\r
-\r
- aesdec %xmm12, %xmm1\r
- aesdec %xmm12, %xmm2\r
- aesdec %xmm12, %xmm14\r
- aesdec %xmm12, %xmm15\r
- movups 48(ctx), %xmm12\r
-\r
- aesdec %xmm13, %xmm1\r
- aesdec %xmm13, %xmm2\r
- aesdec %xmm13, %xmm14\r
- aesdec %xmm13, %xmm15\r
- movups 32(ctx), %xmm13\r
-\r
- aesdec %xmm12, %xmm1\r
- aesdec %xmm12, %xmm2\r
- aesdec %xmm12, %xmm14\r
- aesdec %xmm12, %xmm15\r
- movups 16(ctx), %xmm12\r
-\r
- aesdec %xmm13, %xmm1\r
- aesdec %xmm13, %xmm2\r
- aesdec %xmm13, %xmm14\r
- aesdec %xmm13, %xmm15\r
- movups (ctx), %xmm13\r
-\r
- aesdec %xmm12, %xmm1\r
- aesdec %xmm12, %xmm2\r
- aesdec %xmm12, %xmm14\r
- aesdec %xmm12, %xmm15\r
- movups 80(ctx), %xmm12\r
-\r
- aesdeclast %xmm13, %xmm1\r
- aesdeclast %xmm13, %xmm2\r
- aesdeclast %xmm13, %xmm14\r
- aesdeclast %xmm13, %xmm15\r
- movups 64(ctx), %xmm13\r
-\r
- pxor iv, %xmm1 // obuf ^= iv; \r
- movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm2 // obuf ^= iv; \r
- movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm14 // obuf ^= iv; \r
- movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm15 // obuf ^= iv; \r
- movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
-\r
- movups %xmm1, (obuf) // write 1st obuf\r
- movups %xmm2, 16(obuf) // write 2nd obuf\r
- movups %xmm14, 32(obuf) // write 3rd obuf\r
- movups %xmm15, 48(obuf) // write 4th obuf\r
-\r
- add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; \r
- add $64, obuf // obuf += AES_BLOCK_SIZE*4; \r
-\r
- sub $4, num_blk // num_blk -= 4\r
- jge 0b // if num_blk > 0, repeat the loop\r
-\r
-9:	add	$4, num_blk		// post increment num_blk by 4\r
-	je	L_HW_cbc_done	// if num_blk == 0, no need for further processing code\r
-\r
- movups 48(ctx), %xmm14\r
- movups 32(ctx), %xmm15\r
-\r
-#else\r
-\r
- sub $4, num_blk // pre decrement num_blk by 4\r
-	jl	9f		// if num_blk < 4, skip the per-4-blocks processing code\r
-0:\r
- movups (ibuf), %xmm1 // tmp = 1st ibuf\r
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
- movups 32(ibuf), %xmm4 // tmp = 3rd ibuf\r
- movups 48(ibuf), %xmm5 // tmp = 4th ibuf\r
-\r
- // aes_decrypt\r
- // for i386, sequentially load expanded keys into xmm6/xmm7\r
- movups 208(ctx), %xmm6\r
- pxor %xmm3, %xmm1\r
- pxor %xmm3, %xmm2\r
- pxor %xmm3, %xmm4\r
- pxor %xmm3, %xmm5\r
-\r
- movups 192(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 176(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 160(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 144(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 128(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 112(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 96(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 80(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 64(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 48(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 32(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- movups 16(ctx), %xmm6\r
- aesdec %xmm7, %xmm1\r
- aesdec %xmm7, %xmm2\r
- aesdec %xmm7, %xmm4\r
- aesdec %xmm7, %xmm5\r
-\r
- movups 0(ctx), %xmm7\r
- aesdec %xmm6, %xmm1\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm6, %xmm4\r
- aesdec %xmm6, %xmm5\r
-\r
- aesdeclast %xmm7, %xmm1\r
- aesdeclast %xmm7, %xmm2\r
- aesdeclast %xmm7, %xmm4\r
- aesdeclast %xmm7, %xmm5\r
-\r
- pxor iv, %xmm1 // 1st obuf ^= iv; \r
- movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm2 // 2nd obuf ^= iv; \r
- movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm4 // 3rd obuf ^= iv; \r
- movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- pxor iv, %xmm5 // 4th obuf ^= iv; \r
- movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
- movups %xmm1, (obuf) // write 1st obuf\r
- movups %xmm2, 16(obuf) // write 2nd obuf\r
- movups %xmm4, 32(obuf) // write 3rd obuf\r
- movups %xmm5, 48(obuf) // write 4th obuf\r
-\r
- add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; \r
- add $64, obuf // obuf += AES_BLOCK_SIZE * 4; \r
-\r
- sub $4, num_blk // num_blk -= 4\r
- jge 0b // if num_blk > 0, repeat the loop\r
-\r
-\r
-9:	add	$4, num_blk		// post increment num_blk by 4\r
-	je	L_HW_cbc_done	// if num_blk == 0, no need for further processing code\r
-\r
- movups 208(ctx), %xmm4\r
- movups 192(ctx), %xmm5\r
- movups 176(ctx), %xmm6\r
- movups 160(ctx), %xmm7\r
-\r
-#endif\r
-\r
-0:\r
- movups (ibuf), %xmm2 // tmp = ibuf\r
-\r
- // aes_decrypt\r
- pxor %xmm3, %xmm2\r
- aesdec %xmm4, %xmm2\r
- aesdec %xmm5, %xmm2\r
- aesdec %xmm6, %xmm2\r
- aesdec %xmm7, %xmm2\r
-#if defined __x86_64__\r
- aesdec %xmm8, %xmm2\r
- aesdec %xmm9, %xmm2\r
- aesdec %xmm10, %xmm2\r
- aesdec %xmm11, %xmm2\r
- aesdec %xmm12, %xmm2\r
- aesdec %xmm13, %xmm2\r
- aesdec %xmm14, %xmm2\r
- aesdec %xmm15, %xmm2\r
-#else\r
- movups 144(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 128(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 112(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 96(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 80(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 64(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 48(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups 32(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
-#endif\r
- movups 16(ctx), %xmm1\r
- aesdec %xmm1, %xmm2\r
- movups (ctx), %xmm1\r
- aesdeclast %xmm1, %xmm2\r
-\r
- pxor iv, %xmm2 // obuf ^= iv; \r
- movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
-\r
- movups %xmm2, (obuf) // write obuf\r
-\r
- add $16, ibuf // ibuf += AES_BLOCK_SIZE; \r
- add $16, obuf // obuf += AES_BLOCK_SIZE; \r
- sub $1, num_blk // num_blk --\r
- jg 0b // if num_blk > 0, repeat the loop\r
-\r
- jmp L_HW_cbc_done\r
-\r
- //\r
- // --------- END of aes_decrypt_cbc_hw -------------------\r
- //\r
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+ 1. distributions of this source code include the above copyright
+ notice, this list of conditions and the following disclaimer;
+
+ 2. distributions in binary form include the above copyright
+ notice, this list of conditions and the following disclaimer
+ in the documentation and/or other associated materials;
+
+ 3. the copyright holder's name is not used to endorse products
+ built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 31/01/2006
+
+ These subroutines implement multiple block AES modes for ECB, CBC, CFB,
+ OFB and CTR encryption, The code provides support for the VIA Advanced
+ Cryptography Engine (ACE).
+
+ NOTE: In the following subroutines, the AES contexts (ctx) must be
+ 16 byte aligned if VIA ACE is being used
+*/
+
+/* ----------------------------------------------------------------------------------------------------------------
+
+ aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
+
+ For simplicity, I am assuming all variables are in 128-bit data type.
+
+ aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
+ {
+ while(num_blk--) {
+ *iv ^= *ibuf++;
+ aes_encrypt(iv, iv, ctx);
+ *obuf++ = *iv;
+ }
+ return 0;
+ }
+
+ The following is an implementation of this function using Intel AESNI.
+ This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
+ Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
+ to this aesni-based function should it detect that aesni is available.
+ Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.
+
+ Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks
+ are serially chained. This prevents us from arranging several blocks for encryption in parallel.
+
+ ----------------------------------------------------------------------------------------------------------------*/
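+
+/*
+    For illustration only -- a hedged sketch, not part of this build: the aes-128 path below is
+    equivalent to the following C intrinsics code, assuming the 11 round keys are stored
+    contiguously at ctx+0 .. ctx+160 as the movups loads below imply (the helper name is made up).
+
+        #include <immintrin.h>
+
+        static __m128i cbc_encrypt_block_aes128(__m128i ibuf_blk, __m128i iv, const __m128i *rk)
+        {
+            __m128i s = _mm_xor_si128(ibuf_blk, iv);                  // *iv ^= *ibuf
+            s = _mm_xor_si128(s, _mm_loadu_si128(rk + 0));            // round 0: whitening with key0
+            for (int r = 1; r < 10; ++r)                              // rounds 1..9
+                s = _mm_aesenc_si128(s, _mm_loadu_si128(rk + r));
+            return _mm_aesenclast_si128(s, _mm_loadu_si128(rk + 10)); // final round; becomes *obuf and the next iv
+        }
+
+    The aes-192/aes-256 paths are identical except that they run 12/14 rounds.
+*/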
+
+ .text
+ .align 4,0x90
+ .globl _aes_encrypt_cbc_hw
+_aes_encrypt_cbc_hw:
+
+ // push/save registers for local use
+#if defined __i386__
+
+ push %ebp
+ movl %esp, %ebp
+ push %ebx
+ push %edi
+
+ #define sp %esp
+
+#else // __x86_64__
+
+ push %rbp
+ mov %rsp, %rbp
+ push %rbx
+ push %r13
+ push %r14
+ push %r15
+
+ #define sp %rsp
+
+#endif
+
+ // if this is kernel code, need to save used xmm registers
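+	// (assumption: in kernel context the xmm registers may hold live user-thread state,
+	//  so every xmm register used here must be saved now and restored before returning)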
+#ifdef KERNEL
+
+#if defined __i386__
+ sub $(8*16), %esp // for possible xmm0-xmm7 save/restore
+#else
+ sub $(16*16), %rsp // xmm0-xmm15 save/restore
+#endif
+
+ movaps %xmm0, (sp)
+ movaps %xmm1, 16(sp)
+ movaps %xmm2, 32(sp)
+ movaps %xmm3, 48(sp)
+ movaps %xmm4, 64(sp)
+ movaps %xmm5, 80(sp)
+ movaps %xmm6, 96(sp)
+ movaps %xmm7, 112(sp)
+#if defined __x86_64__
+ movaps %xmm8, 16*8(sp)
+ movaps %xmm9, 16*9(sp)
+ movaps %xmm10, 16*10(sp)
+ movaps %xmm11, 16*11(sp)
+ movaps %xmm12, 16*12(sp)
+ movaps %xmm13, 16*13(sp)
+ movaps %xmm14, 16*14(sp)
+ movaps %xmm15, 16*15(sp)
+#endif // __x86_64__
+
+#endif // KERNEL
+
+ #define iv %xmm0
+
+#ifdef __i386__
+
+ mov 12(%ebp), %eax // in_iv
+ mov 24(%ebp), %edx // ctx
+ movups (%eax), iv // iv = in_iv
+ mov 8(%ebp), %ebx // ibuf
+ mov 16(%ebp), %ecx // num_blk
+ mov 20(%ebp), %edi // obuf
+
+ #define ibuf %ebx
+ #define obuf %edi
+ #define num_blk %ecx
+ #define ctx %edx
+
+#else
+
+ mov %rdi, %rbx // ibuf
+ movups (%rsi), iv // iv = in_iv
+ mov %rdx, %r13 // num_blk
+ mov %rcx, %r14 // obuf
+ mov %r8, %r15 // ctx
+
+ #define ibuf %rbx
+ #define num_blk %r13d
+ #define obuf %r14
+ #define ctx %r15
+
+#endif
+
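+	// the key-length field at byte offset 240 of the context is assumed to hold 16*(number of rounds),
+	// i.e. 160/192/224 for aes-128/aes-192/aes-256, which is what the dispatch below tests against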
+ mov 240(ctx), %eax // aes length
+ cmp $160, %eax // aes-128 encrypt ?
+ je L_encrypt_128
+ cmp $192, %eax // aes-192 encrypt ?
+ je L_encrypt_192
+ cmp $224, %eax // aes-256 encrypt ?
+ je L_encrypt_256
+ mov $-1, %eax // return error
+ jmp L_error
+
+ //
+ // aes-128 encrypt_cbc operation, up to L_HW_cbc_done
+ //
+
+L_encrypt_128:
+
+ cmp $1, num_blk // check number of block
+ jl L_HW_cbc_done // should it be less than 1, nothing to do
+
+ movups (ctx), %xmm2 // key0
+ movups 16(ctx), %xmm3 // key1
+ movups 32(ctx), %xmm4 // key2
+ movups 48(ctx), %xmm5 // key3
+ movups 64(ctx), %xmm6 // key4
+ movups 80(ctx), %xmm7 // key5
+#if defined __x86_64__
+ movups 96(ctx), %xmm8 // key6
+ movups 112(ctx), %xmm9 // key7
+ movups 128(ctx), %xmm10 // key8
+ movups 144(ctx), %xmm11 // key9
+ movups 160(ctx), %xmm12 // keyA
+#endif
+
+ // while (num_blk--) {
+ // *iv ^= *ibuf++;
+ // aes_encrypt(iv, iv, ctx);
+ // *obuf++ = *iv;
+ // }
+0:
+ movups (ibuf), %xmm1 // *ibuf
+ pxor %xmm2, iv // 1st instruction inside aes_encrypt
+ pxor %xmm1, iv // *iv ^= *ibuf
+
+ // finishing up the rest of aes_encrypt
+ aesenc %xmm3, iv
+ aesenc %xmm4, iv
+ aesenc %xmm5, iv
+ aesenc %xmm6, iv
+ aesenc %xmm7, iv
+#if defined __x86_64__
+ aesenc %xmm8, iv
+ aesenc %xmm9, iv
+ aesenc %xmm10, iv
+ aesenc %xmm11, iv
+ aesenclast %xmm12, iv
+#else
+ movups 96(ctx), %xmm1 // key6
+ aesenc %xmm1, iv
+ movups 112(ctx), %xmm1 // key7
+ aesenc %xmm1, iv
+ movups 128(ctx), %xmm1 // key8
+ aesenc %xmm1, iv
+ movups 144(ctx), %xmm1 // key9
+ aesenc %xmm1, iv
+ movups 160(ctx), %xmm1 // keyA
+ aesenclast %xmm1, iv
+#endif
+
+ movups iv, (obuf) // *obuf = *iv;
+ add $16, obuf // obuf++;
+ add $16, ibuf // ibuf++;
+ sub $1, num_blk // num_blk --
+ jg 0b // if num_blk > 0, repeat the loop
+
+ // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)
+
+L_HW_cbc_done:
+
+ xor %eax, %eax // to return CRYPT_OK
+
+L_error:
+
+ // if kernel, restore xmm registers
+#ifdef KERNEL
+ movaps 0(sp), %xmm0
+ movaps 16(sp), %xmm1
+ movaps 32(sp), %xmm2
+ movaps 48(sp), %xmm3
+ movaps 64(sp), %xmm4
+ movaps 80(sp), %xmm5
+ movaps 96(sp), %xmm6
+ movaps 112(sp), %xmm7
+#if defined __x86_64__
+ movaps 16*8(sp), %xmm8
+ movaps 16*9(sp), %xmm9
+ movaps 16*10(sp), %xmm10
+ movaps 16*11(sp), %xmm11
+ movaps 16*12(sp), %xmm12
+ movaps 16*13(sp), %xmm13
+ movaps 16*14(sp), %xmm14
+ movaps 16*15(sp), %xmm15
+#endif // __x86_64__
+#endif // KERNEL
+
+ // release used stack memory, restore used callee-saved registers, and return
+#if defined __i386__
+#ifdef KERNEL
+ add $(8*16), %esp
+#endif
+ pop %edi
+ pop %ebx
+#else
+#ifdef KERNEL
+ add $(16*16), %rsp
+#endif
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %rbx
+#endif
+ leave
+ ret
+
+ //
+ // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
+ //
+
+L_encrypt_192:
+
+ cmp $1, num_blk // check number of block
+ jl L_HW_cbc_done // should it be less than 1, nothing to do
+
+ movups (ctx), %xmm2 // key0
+ movups 16(ctx), %xmm3 // key1
+ movups 32(ctx), %xmm4 // key2
+ movups 48(ctx), %xmm5 // key3
+ movups 64(ctx), %xmm6 // key4
+ movups 80(ctx), %xmm7 // key5
+#if defined __x86_64__
+ movups 96(ctx), %xmm8 // key6
+ movups 112(ctx), %xmm9 // key7
+ movups 128(ctx), %xmm10 // key8
+ movups 144(ctx), %xmm11 // key9
+ movups 160(ctx), %xmm12 // keyA
+ movups 176(ctx), %xmm13 // keyB
+ movups 192(ctx), %xmm14 // keyC
+#endif
+
+ // while (num_blk--) {
+ // *iv ^= *ibuf++;
+ // aes_encrypt(iv, iv, ctx);
+ // *obuf++ = *iv;
+ // }
+0:
+ movups (ibuf), %xmm1 // *ibuf
+ pxor %xmm1, iv // *iv ^= ibuf
+
+ // aes_encrypt(iv, iv, ctx);
+
+ pxor %xmm2, iv
+ aesenc %xmm3, iv
+ aesenc %xmm4, iv
+ aesenc %xmm5, iv
+ aesenc %xmm6, iv
+ aesenc %xmm7, iv
+#if defined __x86_64__
+ aesenc %xmm8, iv
+ aesenc %xmm9, iv
+ aesenc %xmm10, iv
+ aesenc %xmm11, iv
+ aesenc %xmm12, iv
+ aesenc %xmm13, iv
+ aesenclast %xmm14, iv
+#else
+ movups 96(ctx), %xmm1
+ aesenc %xmm1, iv
+ movups 112(ctx), %xmm1
+ aesenc %xmm1, iv
+ movups 128(ctx), %xmm1
+ aesenc %xmm1, iv
+ movups 144(ctx), %xmm1
+ aesenc %xmm1, iv
+ movups 160(ctx), %xmm1
+ aesenc %xmm1, iv
+ movups 176(ctx), %xmm1
+ aesenc %xmm1, iv
+ movups 192(ctx), %xmm1
+ aesenclast %xmm1, iv
+#endif
+
+ movups iv, (obuf) // *obuf = *iv;
+ add $16, ibuf // ibuf++
+ add $16, obuf // obuf++
+
+ sub $1, num_blk // num_blk --
+ jg 0b // if num_blk > 0, repeat the loop
+
+ jmp L_HW_cbc_done // share with the common exit code
+
+ //
+ // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
+ //
+
+L_encrypt_256:
+
+ cmp $1, num_blk // check number of block
+ jl L_HW_cbc_done // should it be less than 1, nothing to do
+
+ movups (ctx), %xmm2 // key0
+ movups 16(ctx), %xmm3 // key1
+ movups 32(ctx), %xmm4 // key2
+ movups 48(ctx), %xmm5 // key3
+ movups 64(ctx), %xmm6 // key4
+ movups 80(ctx), %xmm7 // key5
+#if defined __x86_64__
+ movups 96(ctx), %xmm8 // key6
+ movups 112(ctx), %xmm9 // key7
+ movups 128(ctx), %xmm10 // key8
+ movups 144(ctx), %xmm11 // key9
+ movups 160(ctx), %xmm12 // keyA
+ movups 176(ctx), %xmm13 // keyB
+ movups 192(ctx), %xmm14 // keyC
+ movups 208(ctx), %xmm15 // keyD
+ // movups 224(ctx), %xmm1 // keyE
+#endif
+
+ // while (num_blk--) {
+ // *iv ^= *ibuf++;
+ // aes_encrypt(iv, iv, ctx);
+ // *obuf++ = *iv;
+ // }
+0:
+ movups (ibuf), %xmm1 // *ibuf
+ pxor %xmm1, iv // *iv ^= ibuf
+
+ // aes_encrypt(iv, iv, ctx);
+ pxor %xmm2, iv
+ aesenc %xmm3, iv
+ aesenc %xmm4, iv
+ aesenc %xmm5, iv
+ aesenc %xmm6, iv
+ aesenc %xmm7, iv
+#if defined __x86_64__
+ movups 224(ctx), %xmm1 // keyE
+ aesenc %xmm8, iv
+ aesenc %xmm9, iv
+ aesenc %xmm10, iv
+ aesenc %xmm11, iv
+ aesenc %xmm12, iv
+ aesenc %xmm13, iv
+ aesenc %xmm14, iv
+ aesenc %xmm15, iv
+ aesenclast %xmm1, iv
+#else
+ movups 96(ctx), %xmm1 // key6
+ aesenc %xmm1, iv
+ movups 112(ctx), %xmm1 // key7
+ aesenc %xmm1, iv
+ movups 128(ctx), %xmm1 // key8
+ aesenc %xmm1, iv
+ movups 144(ctx), %xmm1 // key9
+ aesenc %xmm1, iv
+ movups 160(ctx), %xmm1 // keyA
+ aesenc %xmm1, iv
+ movups 176(ctx), %xmm1 // keyB
+ aesenc %xmm1, iv
+ movups 192(ctx), %xmm1 // keyC
+ aesenc %xmm1, iv
+ movups 208(ctx), %xmm1 // keyD
+ aesenc %xmm1, iv
+ movups 224(ctx), %xmm1 // keyE
+ aesenclast %xmm1, iv
+#endif
+
+ movups iv, (obuf) // *obuf = *iv;
+ add $16, ibuf // ibuf++
+ add $16, obuf // obuf++
+
+ sub $1, num_blk // num_blk --
+ jg 0b // if num_blk > 0, repeat the loop
+
+ jmp L_HW_cbc_done // share with the common exit code
+
+
+
+ //
+ // --------- END of aes_encrypt_cbc_hw -------------------
+ //
+
+
+/* ----------------------------------------------------------------------------------------------------------------
+
+ aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
+
+ For simplicity, I am assuming all variables are in 128-bit data type.
+
+ aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
+ {
+ while(num_blk--) {
+ aes_decrypt(ibuf, obuf, ctx);
+ *obuf++ ^= *iv;
+ *iv = *ibuf++;
+ }
+ return 0;
+ }
+
+ The following is an implementation of this function using Intel AESNI.
+ This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
+ Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
+ to this aesni-based function should it detect that aesni is available.
+ Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.
+
+ Note that the decryption operations are independent across blocks.
+ This gives the opportunity to arrange aes_decrypt operations in parallel to speed up the code.
+ This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
+ The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc.
+
+	Example C code for processing 4 blocks per iteration is shown as follows:
+
+ while ((num_blk-=4)>=0) {
+
+ // the following 4 functions can be interleaved to exploit parallelism
+ aes_decrypt(ibuf, obuf, ctx);
+ aes_decrypt(ibuf+1, obuf+1, ctx);
+ aes_decrypt(ibuf+2, obuf+2, ctx);
+ aes_decrypt(ibuf+3, obuf+3, ctx);
+
+		obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
+ *iv = ibuf[3]; ibuf += 4; obuf += 4;
+ }
+ num_blk+=4;
+
+ ----------------------------------------------------------------------------------------------------------------*/
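+
+/*
+	For completeness, a C sketch (illustrative, not taken from aes_modes.c) of how the residual
+	blocks left over after the 4-block loop are handled below.  The aes-128 path finishes with a
+	2-block step followed by a 1-block step; the aes-192/aes-256 paths instead fall back to a
+	simple per-block loop.
+
+	num_blk += 4;							// undo the pre-decrement of the 4-block loop
+	if (num_blk & 2) {						// 2 residual blocks, decrypted together
+		aes_decrypt(ibuf,   obuf,   ctx);
+		aes_decrypt(ibuf+1, obuf+1, ctx);
+		obuf[0] ^= *iv; obuf[1] ^= ibuf[0];
+		*iv = ibuf[1]; ibuf += 2; obuf += 2;
+	}
+	if (num_blk & 1) {						// final residual block
+		aes_decrypt(ibuf, obuf, ctx);
+		*obuf ^= *iv;
+		*iv = *ibuf;
+	}
+*/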
+
+ .text
+ .align 4,0x90
+ .globl _aes_decrypt_cbc_hw
+_aes_decrypt_cbc_hw:
+
+ // push/save registers for local use
+#if defined __i386__
+
+ push %ebp
+ movl %esp, %ebp
+ push %ebx // ibuf
+ push %edi // obuf
+
+ #define sp %esp
+
+#else // __x86_64__
+
+ push %rbp
+ mov %rsp, %rbp
+ push %rbx
+ push %r13
+ push %r14
+ push %r15
+
+ #define sp %rsp
+
+#endif
+
+
+ // if kernel, allocate stack space to save xmm registers
+#ifdef KERNEL
+#if defined __i386__
+ sub $(8*16), %esp
+#else
+ sub $(16*16), %rsp
+#endif
+ movaps %xmm0, (sp)
+ movaps %xmm1, 16(sp)
+ movaps %xmm2, 32(sp)
+ movaps %xmm3, 48(sp)
+ movaps %xmm4, 64(sp)
+ movaps %xmm5, 80(sp)
+ movaps %xmm6, 96(sp)
+ movaps %xmm7, 112(sp)
+#if defined __x86_64__
+ movaps %xmm8, 16*8(sp)
+ movaps %xmm9, 16*9(sp)
+ movaps %xmm10, 16*10(sp)
+ movaps %xmm11, 16*11(sp)
+ movaps %xmm12, 16*12(sp)
+ movaps %xmm13, 16*13(sp)
+ movaps %xmm14, 16*14(sp)
+ movaps %xmm15, 16*15(sp)
+#endif // __x86_64__
+#endif
+
+ #undef iv
+ #define iv %xmm0
+
+#if defined __i386__
+ mov 12(%ebp), %eax // in_iv
+ mov 24(%ebp), %edx // ctx
+ movups (%eax), iv // iv = in_iv
+ mov 8(%ebp), %ebx // ibuf
+ mov 16(%ebp), %ecx // num_blk
+ mov 20(%ebp), %edi // obuf
+
+ #define ibuf %ebx
+ #define obuf %edi
+ #define num_blk %ecx
+ #define ctx %edx
+
+#else // __x86_64__, rdi/rsi/rdx/rcx/r8
+
+ mov %rdi, %rbx // ibuf
+ movups (%rsi), iv // iv = in_iv
+ mov %rdx, %r13 // num_blk
+ mov %rcx, %r14 // obuf
+ mov %r8, %r15 // ctx
+
+ #define ibuf %rbx
+ #define num_blk %r13d
+ #define obuf %r14
+ #define ctx %r15
+
+#endif
+
+ mov 240(ctx), %eax // aes length
+ cmp $160, %eax // aes-128 decrypt
+ je L_decrypt_128
+ cmp $192, %eax // aes-192 decrypt
+ je L_decrypt_192
+ cmp $224, %eax // aes-256 decrypt
+ je L_decrypt_256
+
+ mov $-1, %eax // wrong aes length, to return -1
+ jmp L_error // early exit due to wrong aes length
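+
+	// Note: the word at byte offset 240 of the expanded-key context encodes the AES variant
+	// as 16*Nr, i.e. 160/192/224 for AES-128/-192/-256 (10/12/14 rounds).
+	// In C terms the dispatch above is roughly (sketch; "key_len" is illustrative, not an
+	// actual field name from the headers):
+	//
+	//	switch (key_len) {
+	//	case 160: goto L_decrypt_128;
+	//	case 192: goto L_decrypt_192;
+	//	case 224: goto L_decrypt_256;
+	//	default:  return -1;				// unsupported expanded-key length
+	//	}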
+
+
+ //
+ // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
+ //
+
+L_decrypt_128:
+
+ cmp $1, num_blk
+ jl L_HW_cbc_done // if num_blk < 1, early return
+
+ // aes-128 decrypt expanded keys
+ movups 160(ctx), %xmm3
+ movups 144(ctx), %xmm4
+ movups 128(ctx), %xmm5
+ movups 112(ctx), %xmm6
+ movups 96(ctx), %xmm7
+#if defined __x86_64__
+ movups 80(ctx), %xmm8
+ movups 64(ctx), %xmm9
+ movups 48(ctx), %xmm10
+ movups 32(ctx), %xmm11
+ movups 16(ctx), %xmm12
+ movups 0(ctx), %xmm13
+#endif
+
+	// decrypts 4 blocks per iteration to exploit parallelism among the aesdec instructions
+
+ // while ((num_blk-=4)>=0) {
+ // aes_decrypt(ibuf, obuf, ctx);
+ // aes_decrypt(ibuf+1, obuf+1, ctx);
+ // aes_decrypt(ibuf+2, obuf+2, ctx);
+ // aes_decrypt(ibuf+3, obuf+3, ctx);
+	//		obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
+ // *iv = ibuf[3]; ibuf += 4; obuf += 4;
+ // }
+
+ sub $4, num_blk // pre decrement num_blk by 4
+ jl 9f // if num_blk < 4, skip the per-4-blocks processing code
+
+0:
+
+
+#if defined __x86_64__
+
+ movups (ibuf), %xmm1 // tmp = 1st ibuf
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
+ movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
+ movups 48(ibuf), %xmm15 // tmp = 4th ibuf
+
+ // for x86_64, the expanded keys are already stored in xmm3-xmm13
+
+ // aes-128 decrypt round 0 per 4 blocks
+ pxor %xmm3, %xmm1
+ pxor %xmm3, %xmm2
+ pxor %xmm3, %xmm14
+ pxor %xmm3, %xmm15
+
+ // aes-128 decrypt round 1 per 4 blocks
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm14
+ aesdec %xmm4, %xmm15
+
+ // aes-128 decrypt round 2 per 4 blocks
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm14
+ aesdec %xmm5, %xmm15
+
+ // aes-128 decrypt round 3 per 4 blocks
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm14
+ aesdec %xmm6, %xmm15
+
+ // aes-128 decrypt round 4 per 4 blocks
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm14
+ aesdec %xmm7, %xmm15
+
+ // aes-128 decrypt round 5 per 4 blocks
+ aesdec %xmm8, %xmm1
+ aesdec %xmm8, %xmm2
+ aesdec %xmm8, %xmm14
+ aesdec %xmm8, %xmm15
+
+ // aes-128 decrypt round 6 per 4 blocks
+ aesdec %xmm9, %xmm1
+ aesdec %xmm9, %xmm2
+ aesdec %xmm9, %xmm14
+ aesdec %xmm9, %xmm15
+
+ // aes-128 decrypt round 7 per 4 blocks
+ aesdec %xmm10, %xmm1
+ aesdec %xmm10, %xmm2
+ aesdec %xmm10, %xmm14
+ aesdec %xmm10, %xmm15
+
+ // aes-128 decrypt round 8 per 4 blocks
+ aesdec %xmm11, %xmm1
+ aesdec %xmm11, %xmm2
+ aesdec %xmm11, %xmm14
+ aesdec %xmm11, %xmm15
+
+ // aes-128 decrypt round 9 per 4 blocks
+ aesdec %xmm12, %xmm1
+ aesdec %xmm12, %xmm2
+ aesdec %xmm12, %xmm14
+ aesdec %xmm12, %xmm15
+
+ // aes-128 decrypt round 10 (last) per 4 blocks
+ aesdeclast %xmm13, %xmm1
+ aesdeclast %xmm13, %xmm2
+ aesdeclast %xmm13, %xmm14
+ aesdeclast %xmm13, %xmm15
+
+ pxor iv, %xmm1 // obuf[0] ^= *iv;
+ movups (ibuf), iv // ibuf[0]
+ pxor iv, %xmm2 // obuf[1] ^= ibuf[0];
+ movups 16(ibuf), iv // ibuf[1]
+ pxor iv, %xmm14 // obuf[2] ^= ibuf[1];
+ movups 32(ibuf), iv // ibuf[2]
+	pxor	iv, %xmm15					// obuf[3] ^= ibuf[2];
+ movups 48(ibuf), iv // *iv = ibuf[3]
+
+ movups %xmm1, (obuf) // write 1st obuf
+ movups %xmm2, 16(obuf) // write 2nd obuf
+ movups %xmm14, 32(obuf) // write 3rd obuf
+ movups %xmm15, 48(obuf) // write 4th obuf
+
+
+#else
+
+ // aes_decrypt_cbc per 4 blocks using aes-128 for i386
+ // xmm1/xmm2/xmm4/xmm5 used for obuf per block
+ // xmm3 = key0
+ // xmm0 = iv
+	// xmm6/xmm7 are dynamically loaded with the remaining expanded keys
+
+ movups (ibuf), %xmm1 // tmp = 1st ibuf
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
+ movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
+ movups 48(ibuf), %xmm5 // tmp = 4th ibuf
+
+ // aes_decrypt
+ // for i386, sequentially load expanded keys into xmm6/xmm7
+
+ movups 144(ctx), %xmm6 // key1
+
+ // aes-128 decrypt round 0 per 4 blocks
+ pxor %xmm3, %xmm1
+ pxor %xmm3, %xmm2
+ pxor %xmm3, %xmm4
+ pxor %xmm3, %xmm5
+
+ movups 128(ctx), %xmm7 // key2
+
+ // aes-128 decrypt round 1 per 4 blocks
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 112(ctx), %xmm6 // key3
+
+ // aes-128 decrypt round 2 per 4 blocks
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 96(ctx), %xmm7 // key4
+
+ // aes-128 decrypt round 3 per 4 blocks
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 80(ctx), %xmm6 // key5
+
+ // aes-128 decrypt round 4 per 4 blocks
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 64(ctx), %xmm7 // key6
+
+ // aes-128 decrypt round 5 per 4 blocks
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 48(ctx), %xmm6 // key7
+
+ // aes-128 decrypt round 6 per 4 blocks
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 32(ctx), %xmm7 // key8
+
+ // aes-128 decrypt round 7 per 4 blocks
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 16(ctx), %xmm6 // key9
+
+ // aes-128 decrypt round 8 per 4 blocks
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 0(ctx), %xmm7 // keyA
+
+ // aes-128 decrypt round 9 per 4 blocks
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ // aes-128 decrypt round 10 (last) per 4 blocks
+ aesdeclast %xmm7, %xmm1
+ aesdeclast %xmm7, %xmm2
+ aesdeclast %xmm7, %xmm4
+ aesdeclast %xmm7, %xmm5
+
+ pxor iv, %xmm1 // 1st obuf ^= iv;
+ movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm2 // 2nd obuf ^= iv;
+ movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm4 // 3rd obuf ^= iv;
+ movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm5 // 4th obuf ^= iv;
+ movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
+
+ movups %xmm1, (obuf) // write 1st obuf
+ movups %xmm2, 16(obuf) // write 2nd obuf
+ movups %xmm4, 32(obuf) // write 3rd obuf
+ movups %xmm5, 48(obuf) // write 4th obuf
+#endif
+
+ add $64, ibuf // ibuf += 4;
+ add $64, obuf // obuf += 4;
+
+ sub $4, num_blk // num_blk -= 4
+	jge		0b					// if num_blk >= 0 (at least 4 blocks remain), repeat the loop
+
+9:	add		$4, num_blk			// post increment num_blk by 4
+	je		L_HW_cbc_done		// if num_blk == 0, no need for further processing
+
+#if defined __i386__
+	// reload xmm4-xmm7, as they may be needed as expanded keys for the remaining blocks
+ movups 144(ctx), %xmm4
+ movups 128(ctx), %xmm5
+ movups 112(ctx), %xmm6
+ movups 96(ctx), %xmm7
+#endif
+
+ test $2, num_blk // check whether num_blk has 2 blocks
+ je 9f // if num_blk & 2 == 0, skip the per-pair processing code
+
+ // do the remaining 2 blocks together
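+	// in C (sketch): aes_decrypt(ibuf, obuf, ctx); aes_decrypt(ibuf+1, obuf+1, ctx);
+	//               obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; *iv = ibuf[1]; ibuf += 2; obuf += 2;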
+
+ movups (ibuf), %xmm1 // tmp = 1st ibuf
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
+
+ // aes_decrypt
+ pxor %xmm3, %xmm1
+ pxor %xmm3, %xmm2
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+#if defined __x86_64__
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm8, %xmm1
+ aesdec %xmm8, %xmm2
+ aesdec %xmm9, %xmm1
+ aesdec %xmm9, %xmm2
+ aesdec %xmm10, %xmm1
+ aesdec %xmm10, %xmm2
+ aesdec %xmm11, %xmm1
+ aesdec %xmm11, %xmm2
+ aesdec %xmm12, %xmm1
+ aesdec %xmm12, %xmm2
+ aesdeclast %xmm13, %xmm1
+ aesdeclast %xmm13, %xmm2
+#else
+ movups 80(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ movups 64(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ movups 48(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ movups 32(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ movups 16(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ movups 0(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdeclast %xmm7, %xmm1
+ aesdeclast %xmm7, %xmm2
+	movups	112(ctx), %xmm6				// restore %xmm6 (key 112) for the possible residual block
+	movups	96(ctx), %xmm7				// restore %xmm7 (key 96) for the possible residual block
+#endif
+
+ pxor iv, %xmm1 // obuf[0] ^= *iv;
+ movups (ibuf), iv // ibuf[0]
+ pxor iv, %xmm2 // obuf[1] ^= ibuf[0]
+ movups 16(ibuf), iv // *iv = ibuf[1]
+
+ movups %xmm1, (obuf) // write obuf[0]
+ movups %xmm2, 16(obuf) // write obuf[1]
+
+ add $32, ibuf // ibuf += 2
+ add $32, obuf // obuf += 2
+
+9:
+ test $1, num_blk // check whether num_blk has residual 1 block
+	je		L_HW_cbc_done		// if num_blk & 1 == 0, no residual block to process
+
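+	// in C (sketch): aes_decrypt(ibuf, obuf, ctx); *obuf ^= *iv; *iv = *ibuf;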
+ movups (ibuf), %xmm2 // tmp = ibuf
+ // aes_decrypt
+ pxor %xmm3, %xmm2
+ aesdec %xmm4, %xmm2
+ aesdec %xmm5, %xmm2
+ aesdec %xmm6, %xmm2
+ aesdec %xmm7, %xmm2
+#if defined __x86_64__
+ aesdec %xmm8, %xmm2
+ aesdec %xmm9, %xmm2
+ aesdec %xmm10, %xmm2
+ aesdec %xmm11, %xmm2
+ aesdec %xmm12, %xmm2
+ aesdeclast %xmm13, %xmm2
+#else
+ movups 80(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 64(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 48(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 32(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 16(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups (ctx), %xmm1
+ aesdeclast %xmm1, %xmm2
+#endif
+
+ pxor iv, %xmm2 // *obuf ^= *iv;
+ movups (ibuf), iv // *iv = *ibuf;
+ movups %xmm2, (obuf) // write *obuf
+
+ jmp L_HW_cbc_done
+
+ //
+ // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
+ //
+
+L_decrypt_192:
+
+ cmp $1, num_blk
+ jl L_HW_cbc_done // if num_blk < 1, early return
+
+	// aes-192 decrypt expanded keys
+ movups 192(ctx), %xmm3
+ movups 176(ctx), %xmm4
+ movups 160(ctx), %xmm5
+ movups 144(ctx), %xmm6
+ movups 128(ctx), %xmm7
+#if defined __x86_64__
+ movups 112(ctx), %xmm8
+ movups 96(ctx), %xmm9
+ movups 80(ctx), %xmm10
+ movups 64(ctx), %xmm11
+ movups 48(ctx), %xmm12
+ movups 32(ctx), %xmm13
+ movups 16(ctx), %xmm14
+ movups (ctx), %xmm15
+#endif
+
+	// decrypts 4 blocks per iteration to exploit parallelism among the aesdec instructions
+
+ // while ((num_blk-=4)>=0) {
+ // aes_decrypt(ibuf, obuf, ctx);
+ // aes_decrypt(ibuf+1, obuf+1, ctx);
+ // aes_decrypt(ibuf+2, obuf+2, ctx);
+ // aes_decrypt(ibuf+3, obuf+3, ctx);
+	//		obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
+ // *iv = ibuf[3]; ibuf += 4; obuf += 4;
+ // }
+
+ sub $4, num_blk // pre decrement num_blk by 4
+ jl 9f // if num_blk < 4, skip the per-4-blocks processing code
+0:
+
+#if defined __x86_64__
+
+ movups (ibuf), %xmm1 // tmp = 1st ibuf
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
+ movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
+ movups 48(ibuf), %xmm15 // tmp = 4th ibuf
+
+	// aes_decrypt; for x86_64, the expanded keys are already stored in xmm3-xmm13
+	// use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards
+
+ // round 0 for 4 blocks
+ pxor %xmm3, %xmm1
+ pxor %xmm3, %xmm2
+ pxor %xmm3, %xmm14
+ pxor %xmm3, %xmm15
+
+ // round 1 for 4 blocks
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm14
+ aesdec %xmm4, %xmm15
+
+ // round 2 for 4 blocks
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm14
+ aesdec %xmm5, %xmm15
+
+ // round 3 for 4 blocks
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm14
+ aesdec %xmm6, %xmm15
+
+ // round 4 for 4 blocks
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm14
+ aesdec %xmm7, %xmm15
+
+ // round 5 for 4 blocks
+ aesdec %xmm8, %xmm1
+ aesdec %xmm8, %xmm2
+ aesdec %xmm8, %xmm14
+ aesdec %xmm8, %xmm15
+
+ // round 6 for 4 blocks
+ aesdec %xmm9, %xmm1
+ aesdec %xmm9, %xmm2
+ aesdec %xmm9, %xmm14
+ aesdec %xmm9, %xmm15
+
+ // round 7 for 4 blocks
+ aesdec %xmm10, %xmm1
+ aesdec %xmm10, %xmm2
+ aesdec %xmm10, %xmm14
+ aesdec %xmm10, %xmm15
+
+ // round 8 for 4 blocks
+ aesdec %xmm11, %xmm1
+ aesdec %xmm11, %xmm2
+ aesdec %xmm11, %xmm14
+ aesdec %xmm11, %xmm15
+
+ // round 9 for 4 blocks
+ aesdec %xmm12, %xmm1
+ aesdec %xmm12, %xmm2
+ aesdec %xmm12, %xmm14
+ aesdec %xmm12, %xmm15
+
+	movups	16(ctx), %xmm12				// load key 16 into %xmm12 for round B
+
+ // round A for 4 blocks
+ aesdec %xmm13, %xmm1
+ aesdec %xmm13, %xmm2
+ aesdec %xmm13, %xmm14
+ aesdec %xmm13, %xmm15
+
+	movups	(ctx), %xmm13				// load key 0 into %xmm13 for round C (last)
+
+ // round B for 4 blocks
+ aesdec %xmm12, %xmm1
+ aesdec %xmm12, %xmm2
+ aesdec %xmm12, %xmm14
+ aesdec %xmm12, %xmm15
+
+ movups 48(ctx), %xmm12 // restore %xmm12 to its original key
+
+ // round C (last) for 4 blocks
+ aesdeclast %xmm13, %xmm1
+ aesdeclast %xmm13, %xmm2
+ aesdeclast %xmm13, %xmm14
+ aesdeclast %xmm13, %xmm15
+
+ movups 32(ctx), %xmm13 // restore %xmm13 to its original key
+
+ pxor iv, %xmm1 // obuf[0] ^= *iv;
+ movups (ibuf), iv // ibuf[0]
+ pxor iv, %xmm2 // obuf[1] ^= ibuf[0]
+ movups 16(ibuf), iv // ibuf[1]
+ pxor iv, %xmm14 // obuf[2] ^= ibuf[1]
+ movups 32(ibuf), iv // ibuf[2]
+ pxor iv, %xmm15 // obuf[3] ^= ibuf[2]
+ movups 48(ibuf), iv // *iv = ibuf[3]
+
+ movups %xmm1, (obuf) // write 1st obuf
+ movups %xmm2, 16(obuf) // write 2nd obuf
+ movups %xmm14, 32(obuf) // write 3rd obuf
+ movups %xmm15, 48(obuf) // write 4th obuf
+
+ add $64, ibuf // ibuf += 4;
+ add $64, obuf // obuf += 4;
+
+ sub $4, num_blk // num_blk -= 4
+	jge		0b					// if num_blk >= 0 (at least 4 blocks remain), repeat the loop
+
+9:	add		$4, num_blk			// post increment num_blk by 4
+	je		L_HW_cbc_done		// if num_blk == 0, prepare to return
+
+ movups 16(ctx), %xmm14 // restore %xmm14 to its key
+ movups (ctx), %xmm15 // restore %xmm15 to its key
+
+#else
+
+ movups (ibuf), %xmm1 // tmp = 1st ibuf
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
+ movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
+ movups 48(ibuf), %xmm5 // tmp = 4th ibuf
+
+ // aes_decrypt
+ // for i386, sequentially load expanded keys into xmm6/xmm7
+ movups 176(ctx), %xmm6
+ pxor %xmm3, %xmm1
+ pxor %xmm3, %xmm2
+ pxor %xmm3, %xmm4
+ pxor %xmm3, %xmm5
+
+ movups 160(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 144(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 128(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 112(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 96(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 80(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 64(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 48(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 32(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 16(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 0(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ aesdeclast %xmm7, %xmm1
+ aesdeclast %xmm7, %xmm2
+ aesdeclast %xmm7, %xmm4
+ aesdeclast %xmm7, %xmm5
+
+ pxor iv, %xmm1 // 1st obuf ^= iv;
+ movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm2 // 2nd obuf ^= iv;
+ movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm4 // 3rd obuf ^= iv;
+ movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm5 // 4th obuf ^= iv;
+ movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
+ movups %xmm1, (obuf) // write 1st obuf
+ movups %xmm2, 16(obuf) // write 2nd obuf
+ movups %xmm4, 32(obuf) // write 3rd obuf
+ movups %xmm5, 48(obuf) // write 4th obuf
+
+ add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4;
+ add $64, obuf // obuf += AES_BLOCK_SIZE * 4;
+
+ sub $4, num_blk // num_blk -= 4
+	jge		0b					// if num_blk >= 0 (at least 4 blocks remain), repeat the loop
+
+
+9:	add		$4, num_blk			// post increment num_blk by 4
+	je		L_HW_cbc_done		// if num_blk == 0, no need for further processing
+
+ movups 176(ctx), %xmm4
+ movups 160(ctx), %xmm5
+ movups 144(ctx), %xmm6
+ movups 128(ctx), %xmm7
+
+#endif
+
+ // per-block aes_decrypt_cbc loop
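+	// in C (sketch, same loop as in the header comment above):
+	//	while (num_blk--) {
+	//		aes_decrypt(ibuf, obuf, ctx);
+	//		*obuf++ ^= *iv;
+	//		*iv = *ibuf++;
+	//	}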
+
+0:
+ movups (ibuf), %xmm2 // tmp = ibuf
+
+ // aes_decrypt
+ pxor %xmm3, %xmm2
+ aesdec %xmm4, %xmm2
+ aesdec %xmm5, %xmm2
+ aesdec %xmm6, %xmm2
+ aesdec %xmm7, %xmm2
+#if defined __x86_64__
+ aesdec %xmm8, %xmm2
+ aesdec %xmm9, %xmm2
+ aesdec %xmm10, %xmm2
+ aesdec %xmm11, %xmm2
+ aesdec %xmm12, %xmm2
+ aesdec %xmm13, %xmm2
+ aesdec %xmm14, %xmm2
+ aesdeclast %xmm15, %xmm2
+#else
+ movups 112(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 96(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 80(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 64(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 48(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 32(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 16(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups (ctx), %xmm1
+ aesdeclast %xmm1, %xmm2
+#endif
+
+ pxor iv, %xmm2 // obuf ^= iv;
+ movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
+
+ movups %xmm2, (obuf) // write obuf
+
+ add $16, ibuf // ibuf += AES_BLOCK_SIZE;
+ add $16, obuf // obuf += AES_BLOCK_SIZE;
+ sub $1, num_blk // num_blk --
+ jg 0b // if num_blk > 0, repeat the loop
+
+ jmp L_HW_cbc_done
+
+ //
+ // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
+ //
+
+L_decrypt_256:
+
+ cmp $1, num_blk
+ jl L_HW_cbc_done
+
+ movups 224(ctx), %xmm3
+ movups 208(ctx), %xmm4
+ movups 192(ctx), %xmm5
+ movups 176(ctx), %xmm6
+ movups 160(ctx), %xmm7
+#if defined __x86_64__
+ movups 144(ctx), %xmm8
+ movups 128(ctx), %xmm9
+ movups 112(ctx), %xmm10
+ movups 96(ctx), %xmm11
+ movups 80(ctx), %xmm12
+ movups 64(ctx), %xmm13
+ movups 48(ctx), %xmm14
+ movups 32(ctx), %xmm15
+// movups 16(ctx), %xmm14
+// movups (ctx), %xmm15
+#endif
+
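+	// Register allocation note for the x86_64 4-block loop below: aes-256 has 15 round keys,
+	// more than will fit once %xmm0 holds iv and %xmm1/%xmm2/%xmm14/%xmm15 hold the data blocks.
+	// Keys 224..64 stay resident in %xmm3-%xmm13, while the final keys 48/32/16/0 are loaded
+	// into %xmm12/%xmm13 on the fly; %xmm12/%xmm13 are restored to keys 80/64 before the next
+	// iteration, and %xmm14/%xmm15 are reloaded with expanded keys only after the loop.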
+#if defined __x86_64__
+
+ sub $4, num_blk // pre decrement num_blk by 4
+ jl 9f // if num_blk < 4, skip the per-4-blocks processing code
+0:
+ movups (ibuf), %xmm1 // tmp = 1st ibuf
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
+ movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
+ movups 48(ibuf), %xmm15 // tmp = 4th ibuf
+
+ // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
+ pxor %xmm3, %xmm1
+ pxor %xmm3, %xmm2
+ pxor %xmm3, %xmm14
+ pxor %xmm3, %xmm15
+
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm14
+ aesdec %xmm4, %xmm15
+
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm14
+ aesdec %xmm5, %xmm15
+
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm14
+ aesdec %xmm6, %xmm15
+
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm14
+ aesdec %xmm7, %xmm15
+
+ aesdec %xmm8, %xmm1
+ aesdec %xmm8, %xmm2
+ aesdec %xmm8, %xmm14
+ aesdec %xmm8, %xmm15
+
+ aesdec %xmm9, %xmm1
+ aesdec %xmm9, %xmm2
+ aesdec %xmm9, %xmm14
+ aesdec %xmm9, %xmm15
+
+ aesdec %xmm10, %xmm1
+ aesdec %xmm10, %xmm2
+ aesdec %xmm10, %xmm14
+ aesdec %xmm10, %xmm15
+
+ aesdec %xmm11, %xmm1
+ aesdec %xmm11, %xmm2
+ aesdec %xmm11, %xmm14
+ aesdec %xmm11, %xmm15
+
+ aesdec %xmm12, %xmm1
+ aesdec %xmm12, %xmm2
+ aesdec %xmm12, %xmm14
+ aesdec %xmm12, %xmm15
+ movups 48(ctx), %xmm12
+
+ aesdec %xmm13, %xmm1
+ aesdec %xmm13, %xmm2
+ aesdec %xmm13, %xmm14
+ aesdec %xmm13, %xmm15
+ movups 32(ctx), %xmm13
+
+ aesdec %xmm12, %xmm1
+ aesdec %xmm12, %xmm2
+ aesdec %xmm12, %xmm14
+ aesdec %xmm12, %xmm15
+ movups 16(ctx), %xmm12
+
+ aesdec %xmm13, %xmm1
+ aesdec %xmm13, %xmm2
+ aesdec %xmm13, %xmm14
+ aesdec %xmm13, %xmm15
+ movups (ctx), %xmm13
+
+ aesdec %xmm12, %xmm1
+ aesdec %xmm12, %xmm2
+ aesdec %xmm12, %xmm14
+ aesdec %xmm12, %xmm15
+	movups	80(ctx), %xmm12				// restore %xmm12 to its original key
+
+ aesdeclast %xmm13, %xmm1
+ aesdeclast %xmm13, %xmm2
+ aesdeclast %xmm13, %xmm14
+ aesdeclast %xmm13, %xmm15
+	movups	64(ctx), %xmm13				// restore %xmm13 to its original key
+
+ pxor iv, %xmm1 // obuf ^= iv;
+ movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm2 // obuf ^= iv;
+ movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm14 // obuf ^= iv;
+ movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm15 // obuf ^= iv;
+ movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
+
+ movups %xmm1, (obuf) // write 1st obuf
+ movups %xmm2, 16(obuf) // write 2nd obuf
+ movups %xmm14, 32(obuf) // write 3rd obuf
+ movups %xmm15, 48(obuf) // write 4th obuf
+
+ add $64, ibuf // ibuf += AES_BLOCK_SIZE*4;
+ add $64, obuf // obuf += AES_BLOCK_SIZE*4;
+
+ sub $4, num_blk // num_blk -= 4
+	jge		0b					// if num_blk >= 0 (at least 4 blocks remain), repeat the loop
+
+9:	add		$4, num_blk			// post increment num_blk by 4
+	je		L_HW_cbc_done		// if num_blk == 0, no need for further processing
+
+	movups	48(ctx), %xmm14				// restore %xmm14 to its expanded key
+	movups	32(ctx), %xmm15				// restore %xmm15 to its expanded key
+
+#else
+
+ sub $4, num_blk // pre decrement num_blk by 4
+	jl		9f							// if num_blk < 4, skip the per-4-blocks processing code
+0:
+ movups (ibuf), %xmm1 // tmp = 1st ibuf
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
+ movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
+ movups 48(ibuf), %xmm5 // tmp = 4th ibuf
+
+ // aes_decrypt
+ // for i386, sequentially load expanded keys into xmm6/xmm7
+ movups 208(ctx), %xmm6
+ pxor %xmm3, %xmm1
+ pxor %xmm3, %xmm2
+ pxor %xmm3, %xmm4
+ pxor %xmm3, %xmm5
+
+ movups 192(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 176(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 160(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 144(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 128(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 112(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 96(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 80(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 64(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 48(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 32(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ movups 16(ctx), %xmm6
+ aesdec %xmm7, %xmm1
+ aesdec %xmm7, %xmm2
+ aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5
+
+ movups 0(ctx), %xmm7
+ aesdec %xmm6, %xmm1
+ aesdec %xmm6, %xmm2
+ aesdec %xmm6, %xmm4
+ aesdec %xmm6, %xmm5
+
+ aesdeclast %xmm7, %xmm1
+ aesdeclast %xmm7, %xmm2
+ aesdeclast %xmm7, %xmm4
+ aesdeclast %xmm7, %xmm5
+
+ pxor iv, %xmm1 // 1st obuf ^= iv;
+ movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm2 // 2nd obuf ^= iv;
+ movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm4 // 3rd obuf ^= iv;
+ movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
+ pxor iv, %xmm5 // 4th obuf ^= iv;
+ movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
+ movups %xmm1, (obuf) // write 1st obuf
+ movups %xmm2, 16(obuf) // write 2nd obuf
+ movups %xmm4, 32(obuf) // write 3rd obuf
+ movups %xmm5, 48(obuf) // write 4th obuf
+
+ add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4;
+ add $64, obuf // obuf += AES_BLOCK_SIZE * 4;
+
+ sub $4, num_blk // num_blk -= 4
+	jge		0b					// if num_blk >= 0 (at least 4 blocks remain), repeat the loop
+
+
+9:	add		$4, num_blk			// post increment num_blk by 4
+	je		L_HW_cbc_done		// if num_blk == 0, no need for further processing
+
+ movups 208(ctx), %xmm4
+ movups 192(ctx), %xmm5
+ movups 176(ctx), %xmm6
+ movups 160(ctx), %xmm7
+
+#endif
+
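+	// per-block aes_decrypt_cbc loop for the residual blocks
+	// in C (sketch, same loop as in the header comment above):
+	//	while (num_blk--) { aes_decrypt(ibuf, obuf, ctx); *obuf++ ^= *iv; *iv = *ibuf++; }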
+0:
+ movups (ibuf), %xmm2 // tmp = ibuf
+
+ // aes_decrypt
+ pxor %xmm3, %xmm2
+ aesdec %xmm4, %xmm2
+ aesdec %xmm5, %xmm2
+ aesdec %xmm6, %xmm2
+ aesdec %xmm7, %xmm2
+#if defined __x86_64__
+ aesdec %xmm8, %xmm2
+ aesdec %xmm9, %xmm2
+ aesdec %xmm10, %xmm2
+ aesdec %xmm11, %xmm2
+ aesdec %xmm12, %xmm2
+ aesdec %xmm13, %xmm2
+ aesdec %xmm14, %xmm2
+ aesdec %xmm15, %xmm2
+#else
+ movups 144(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 128(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 112(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 96(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 80(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 64(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 48(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups 32(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+#endif
+ movups 16(ctx), %xmm1
+ aesdec %xmm1, %xmm2
+ movups (ctx), %xmm1
+ aesdeclast %xmm1, %xmm2
+
+ pxor iv, %xmm2 // obuf ^= iv;
+ movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
+
+ movups %xmm2, (obuf) // write obuf
+
+ add $16, ibuf // ibuf += AES_BLOCK_SIZE;
+ add $16, obuf // obuf += AES_BLOCK_SIZE;
+ sub $1, num_blk // num_blk --
+ jg 0b // if num_blk > 0, repeat the loop
+
+ jmp L_HW_cbc_done
+
+ //
+ // --------- END of aes_decrypt_cbc_hw -------------------
+ //