-/*
- ---------------------------------------------------------------------------
- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software in both source and binary
- form is allowed (with or without changes) provided that:
-
- 1. distributions of this source code include the above copyright
- notice, this list of conditions and the following disclaimer;
-
- 2. distributions in binary form include the above copyright
- notice, this list of conditions and the following disclaimer
- in the documentation and/or other associated materials;
-
- 3. the copyright holder's name is not used to endorse products
- built using this software without specific written permission.
-
- ALTERNATIVELY, provided that this notice is retained in full, this product
- may be distributed under the terms of the GNU General Public License (GPL),
- in which case the provisions of the GPL apply INSTEAD OF those given above.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue 31/01/2006
-
- These subroutines implement multiple block AES modes for ECB, CBC, CFB,
- OFB and CTR encryption. The code provides support for the VIA Advanced
- Cryptography Engine (ACE).
-
- NOTE: In the following subroutines, the AES contexts (ctx) must be
- 16 byte aligned if VIA ACE is being used
-*/
-
-/* ----------------------------------------------------------------------------------------------------------------
-
- aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
-
- For simplicity, I am assuming all variables are in 128-bit data type.
-
- aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
- {
- while(num_blk--) {
- *iv ^= *ibuf++;
- aes_encrypt(iv, iv, ctx);
- *obuf++ = *iv;
- }
- return 0;
- }
-
- The following is an implementation of this function using Intel AESNI.
- This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
- Developers should instead call _aes_encrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
- to this AESNI-based function only when it detects that AESNI is available.
- Calling this function directly on a system without AESNI support will cause a crash.
-
- Note that each block starts from *iv, which is the output of the previous block. Therefore, the CBC blocks
- are serially chained. This prevents us from arranging several blocks for encryption in parallel.
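-
- For illustration only (not part of this file), the aes-128 path of this loop maps roughly to the
- following C intrinsics sketch; the key[0..10] round-key array and the helper name are assumptions:
-
-	#include <immintrin.h>				// AESNI intrinsics, compile with -maes
-
-	static __m128i aes128_cbc_encrypt_block(__m128i iv, __m128i in, const __m128i key[11])
-	{
-		__m128i s = _mm_xor_si128(iv, in);		// *iv ^= *ibuf
-		s = _mm_xor_si128(s, key[0]);			// round 0 (AddRoundKey)
-		for (int r = 1; r < 10; r++)
-			s = _mm_aesenc_si128(s, key[r]);	// rounds 1..9
-		return _mm_aesenclast_si128(s, key[10]);	// final round; result is the new *iv / ciphertext block
-	}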
-
- ----------------------------------------------------------------------------------------------------------------*/
-
- .text
- .align 4,0x90
- .globl _aes_encrypt_cbc_hw
-_aes_encrypt_cbc_hw:
-
- // push/save registers for local use
-#if defined __i386__
-
- push %ebp
- movl %esp, %ebp
- push %ebx
- push %edi
-
- #define sp %esp
-
-#else // __x86_64__
-
- push %rbp
- mov %rsp, %rbp
- push %rbx
- push %r13
- push %r14
- push %r15
-
- #define sp %rsp
-
-#endif
-
- // if this is kernel code, need to save used xmm registers
-#ifdef KERNEL
-
-#if defined __i386__
- sub $(8*16), %esp // for possible xmm0-xmm7 save/restore
-#else
- sub $(16*16), %rsp // xmm0-xmm15 save/restore
-#endif
-
- movaps %xmm0, (sp)
- movaps %xmm1, 16(sp)
- movaps %xmm2, 32(sp)
- movaps %xmm3, 48(sp)
- movaps %xmm4, 64(sp)
- movaps %xmm5, 80(sp)
- movaps %xmm6, 96(sp)
- movaps %xmm7, 112(sp)
-#if defined __x86_64__
- movaps %xmm8, 16*8(sp)
- movaps %xmm9, 16*9(sp)
- movaps %xmm10, 16*10(sp)
- movaps %xmm11, 16*11(sp)
- movaps %xmm12, 16*12(sp)
- movaps %xmm13, 16*13(sp)
- movaps %xmm14, 16*14(sp)
- movaps %xmm15, 16*15(sp)
-#endif // __x86_64__
-
-#endif // KERNEL
-
- #define iv %xmm0
-
-#ifdef __i386__
-
- mov 12(%ebp), %eax // in_iv
- mov 24(%ebp), %edx // ctx
- movups (%eax), iv // iv = in_iv
- mov 8(%ebp), %ebx // ibuf
- mov 16(%ebp), %ecx // num_blk
- mov 20(%ebp), %edi // obuf
-
- #define ibuf %ebx
- #define obuf %edi
- #define num_blk %ecx
- #define ctx %edx
-
-#else
-
- mov %rdi, %rbx // ibuf
- movups (%rsi), iv // iv = in_iv
- mov %rdx, %r13 // num_blk
- mov %rcx, %r14 // obuf
- mov %r8, %r15 // ctx
-
- #define ibuf %rbx
- #define num_blk %r13d
- #define obuf %r14
- #define ctx %r15
-
-#endif
-
-	mov 240(ctx), %eax			// key length field: 160/192/224 selects aes-128/192/256
- cmp $160, %eax // aes-128 encrypt ?
- je L_encrypt_128
- cmp $192, %eax // aes-192 encrypt ?
- je L_encrypt_192
- cmp $224, %eax // aes-256 encrypt ?
- je L_encrypt_256
- mov $-1, %eax // return error
- jmp L_error
-
- //
- // aes-128 encrypt_cbc operation, up to L_HW_cbc_done
- //
-
-L_encrypt_128:
-
-	cmp $1, num_blk				// check number of blocks
- jl L_HW_cbc_done // should it be less than 1, nothing to do
-
- movups (ctx), %xmm2 // key0
- movups 16(ctx), %xmm3 // key1
- movups 32(ctx), %xmm4 // key2
- movups 48(ctx), %xmm5 // key3
- movups 64(ctx), %xmm6 // key4
- movups 80(ctx), %xmm7 // key5
-#if defined __x86_64__
- movups 96(ctx), %xmm8 // key6
- movups 112(ctx), %xmm9 // key7
- movups 128(ctx), %xmm10 // key8
- movups 144(ctx), %xmm11 // key9
- movups 160(ctx), %xmm12 // keyA
-#endif
-
- // while (num_blk--) {
- // *iv ^= *ibuf++;
- // aes_encrypt(iv, iv, ctx);
- // *obuf++ = *iv;
- // }
-0:
- movups (ibuf), %xmm1 // *ibuf
- pxor %xmm2, iv // 1st instruction inside aes_encrypt
- pxor %xmm1, iv // *iv ^= *ibuf
-
- // finishing up the rest of aes_encrypt
- aesenc %xmm3, iv
- aesenc %xmm4, iv
- aesenc %xmm5, iv
- aesenc %xmm6, iv
- aesenc %xmm7, iv
-#if defined __x86_64__
- aesenc %xmm8, iv
- aesenc %xmm9, iv
- aesenc %xmm10, iv
- aesenc %xmm11, iv
- aesenclast %xmm12, iv
-#else
- movups 96(ctx), %xmm1 // key6
- aesenc %xmm1, iv
- movups 112(ctx), %xmm1 // key7
- aesenc %xmm1, iv
- movups 128(ctx), %xmm1 // key8
- aesenc %xmm1, iv
- movups 144(ctx), %xmm1 // key9
- aesenc %xmm1, iv
- movups 160(ctx), %xmm1 // keyA
- aesenclast %xmm1, iv
-#endif
-
- movups iv, (obuf) // *obuf = *iv;
- add $16, obuf // obuf++;
- add $16, ibuf // ibuf++;
- sub $1, num_blk // num_blk --
- jg 0b // if num_blk > 0, repeat the loop
-
- // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)
-
-L_HW_cbc_done:
-
- xor %eax, %eax // to return CRYPT_OK
-
-L_error:
-
- // if kernel, restore xmm registers
-#ifdef KERNEL
- movaps 0(sp), %xmm0
- movaps 16(sp), %xmm1
- movaps 32(sp), %xmm2
- movaps 48(sp), %xmm3
- movaps 64(sp), %xmm4
- movaps 80(sp), %xmm5
- movaps 96(sp), %xmm6
- movaps 112(sp), %xmm7
-#if defined __x86_64__
- movaps 16*8(sp), %xmm8
- movaps 16*9(sp), %xmm9
- movaps 16*10(sp), %xmm10
- movaps 16*11(sp), %xmm11
- movaps 16*12(sp), %xmm12
- movaps 16*13(sp), %xmm13
- movaps 16*14(sp), %xmm14
- movaps 16*15(sp), %xmm15
-#endif // __x86_64__
-#endif // KERNEL
-
- // release used stack memory, restore used callee-saved registers, and return
-#if defined __i386__
-#ifdef KERNEL
- add $(8*16), %esp
-#endif
- pop %edi
- pop %ebx
-#else
-#ifdef KERNEL
- add $(16*16), %rsp
-#endif
- pop %r15
- pop %r14
- pop %r13
- pop %rbx
-#endif
- leave
- ret
-
- //
- // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
- //
-
-L_encrypt_192:
-
-	cmp $1, num_blk				// check number of blocks
- jl L_HW_cbc_done // should it be less than 1, nothing to do
-
- movups (ctx), %xmm2 // key0
- movups 16(ctx), %xmm3 // key1
- movups 32(ctx), %xmm4 // key2
- movups 48(ctx), %xmm5 // key3
- movups 64(ctx), %xmm6 // key4
- movups 80(ctx), %xmm7 // key5
-#if defined __x86_64__
- movups 96(ctx), %xmm8 // key6
- movups 112(ctx), %xmm9 // key7
- movups 128(ctx), %xmm10 // key8
- movups 144(ctx), %xmm11 // key9
- movups 160(ctx), %xmm12 // keyA
- movups 176(ctx), %xmm13 // keyB
- movups 192(ctx), %xmm14 // keyC
-#endif
-
- // while (num_blk--) {
- // *iv ^= *ibuf++;
- // aes_encrypt(iv, iv, ctx);
- // *obuf++ = *iv;
- // }
-0:
- movups (ibuf), %xmm1 // *ibuf
-	pxor %xmm1, iv				// *iv ^= *ibuf
-
- // aes_encrypt(iv, iv, ctx);
-
- pxor %xmm2, iv
- aesenc %xmm3, iv
- aesenc %xmm4, iv
- aesenc %xmm5, iv
- aesenc %xmm6, iv
- aesenc %xmm7, iv
-#if defined __x86_64__
- aesenc %xmm8, iv
- aesenc %xmm9, iv
- aesenc %xmm10, iv
- aesenc %xmm11, iv
- aesenc %xmm12, iv
- aesenc %xmm13, iv
- aesenclast %xmm14, iv
-#else
- movups 96(ctx), %xmm1
- aesenc %xmm1, iv
- movups 112(ctx), %xmm1
- aesenc %xmm1, iv
- movups 128(ctx), %xmm1
- aesenc %xmm1, iv
- movups 144(ctx), %xmm1
- aesenc %xmm1, iv
- movups 160(ctx), %xmm1
- aesenc %xmm1, iv
- movups 176(ctx), %xmm1
- aesenc %xmm1, iv
- movups 192(ctx), %xmm1
- aesenclast %xmm1, iv
-#endif
-
- movups iv, (obuf) // *obuf = *iv;
- add $16, ibuf // ibuf++
- add $16, obuf // obuf++
-
- sub $1, num_blk // num_blk --
- jg 0b // if num_blk > 0, repeat the loop
-
- jmp L_HW_cbc_done // share with the common exit code
-
- //
- // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
- //
-
-L_encrypt_256:
-
-	cmp $1, num_blk				// check number of blocks
- jl L_HW_cbc_done // should it be less than 1, nothing to do
-
- movups (ctx), %xmm2 // key0
- movups 16(ctx), %xmm3 // key1
- movups 32(ctx), %xmm4 // key2
- movups 48(ctx), %xmm5 // key3
- movups 64(ctx), %xmm6 // key4
- movups 80(ctx), %xmm7 // key5
-#if defined __x86_64__
- movups 96(ctx), %xmm8 // key6
- movups 112(ctx), %xmm9 // key7
- movups 128(ctx), %xmm10 // key8
- movups 144(ctx), %xmm11 // key9
- movups 160(ctx), %xmm12 // keyA
- movups 176(ctx), %xmm13 // keyB
- movups 192(ctx), %xmm14 // keyC
- movups 208(ctx), %xmm15 // keyD
- // movups 224(ctx), %xmm1 // keyE
-#endif
-
- // while (num_blk--) {
- // *iv ^= *ibuf++;
- // aes_encrypt(iv, iv, ctx);
- // *obuf++ = *iv;
- // }
-0:
- movups (ibuf), %xmm1 // *ibuf
-	pxor %xmm1, iv				// *iv ^= *ibuf
-
- // aes_encrypt(iv, iv, ctx);
- pxor %xmm2, iv
- aesenc %xmm3, iv
- aesenc %xmm4, iv
- aesenc %xmm5, iv
- aesenc %xmm6, iv
- aesenc %xmm7, iv
-#if defined __x86_64__
- movups 224(ctx), %xmm1 // keyE
- aesenc %xmm8, iv
- aesenc %xmm9, iv
- aesenc %xmm10, iv
- aesenc %xmm11, iv
- aesenc %xmm12, iv
- aesenc %xmm13, iv
- aesenc %xmm14, iv
- aesenc %xmm15, iv
- aesenclast %xmm1, iv
-#else
- movups 96(ctx), %xmm1 // key6
- aesenc %xmm1, iv
- movups 112(ctx), %xmm1 // key7
- aesenc %xmm1, iv
- movups 128(ctx), %xmm1 // key8
- aesenc %xmm1, iv
- movups 144(ctx), %xmm1 // key9
- aesenc %xmm1, iv
- movups 160(ctx), %xmm1 // keyA
- aesenc %xmm1, iv
- movups 176(ctx), %xmm1 // keyB
- aesenc %xmm1, iv
- movups 192(ctx), %xmm1 // keyC
- aesenc %xmm1, iv
- movups 208(ctx), %xmm1 // keyD
- aesenc %xmm1, iv
- movups 224(ctx), %xmm1 // keyE
- aesenclast %xmm1, iv
-#endif
-
- movups iv, (obuf) // *obuf = *iv;
- add $16, ibuf // ibuf++
- add $16, obuf // obuf++
-
- sub $1, num_blk // num_blk --
- jg 0b // if num_blk > 0, repeat the loop
-
- jmp L_HW_cbc_done // share with the common exit code
-
-
-
- //
- // --------- END of aes_encrypt_cbc_hw -------------------
- //
-
-
-/* ----------------------------------------------------------------------------------------------------------------
-
- aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
-
- For simplicity, I am assuming all variables are in 128-bit data type.
-
- aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
- {
- while(num_blk--) {
- aes_decrypt(ibuf, obuf, ctx);
- *obuf++ ^= *iv;
- *iv = *ibuf++;
- }
- return 0;
- }
-
- The following is an implementation of this function using Intel AESNI.
- This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
- Developers should instead call _aes_decrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
- to this AESNI-based function only when it detects that AESNI is available.
- Calling this function directly on a system without AESNI support will cause a crash.
-
- Note that, unlike encryption, the aes_decrypt operations are independent across blocks.
- This gives us the opportunity to arrange several aes_decrypt operations in parallel to speed up the code.
- This is equivalent to what is described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
- The following assembly code exploits this idea to achieve roughly a 1.4x speedup in aes_decrypt_cbc.
-
- Example C code for packing 4 blocks in an iteration is shown as follows:
-
- while ((num_blk-=4)>=0) {
-
- // the following 4 functions can be interleaved to exploit parallelism
- aes_decrypt(ibuf, obuf, ctx);
- aes_decrypt(ibuf+1, obuf+1, ctx);
- aes_decrypt(ibuf+2, obuf+2, ctx);
- aes_decrypt(ibuf+3, obuf+3, ctx);
-
-		obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
- *iv = ibuf[3]; ibuf += 4; obuf += 4;
- }
- num_blk+=4;
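-
- As an illustrative sketch only (not part of this file), one 4-block iteration of the aes-128 path,
- written with C intrinsics, might look as follows; the dkey[0..10] array (decryption round keys in the
- order they are applied) and the function name are assumptions made for the sketch:
-
-	#include <immintrin.h>				// AESNI intrinsics, compile with -maes
-
-	static void aes128_cbc_decrypt4(const __m128i *ibuf, __m128i *obuf, __m128i *iv,
-					const __m128i dkey[11])
-	{
-		__m128i b0 = ibuf[0], b1 = ibuf[1], b2 = ibuf[2], b3 = ibuf[3];
-		__m128i t0 = _mm_xor_si128(b0, dkey[0]);	// round 0 for each of the 4 blocks
-		__m128i t1 = _mm_xor_si128(b1, dkey[0]);
-		__m128i t2 = _mm_xor_si128(b2, dkey[0]);
-		__m128i t3 = _mm_xor_si128(b3, dkey[0]);
-		for (int r = 1; r < 10; r++) {			// rounds 1..9, interleaved so latencies overlap
-			t0 = _mm_aesdec_si128(t0, dkey[r]);
-			t1 = _mm_aesdec_si128(t1, dkey[r]);
-			t2 = _mm_aesdec_si128(t2, dkey[r]);
-			t3 = _mm_aesdec_si128(t3, dkey[r]);
-		}
-		t0 = _mm_aesdeclast_si128(t0, dkey[10]);	// final round
-		t1 = _mm_aesdeclast_si128(t1, dkey[10]);
-		t2 = _mm_aesdeclast_si128(t2, dkey[10]);
-		t3 = _mm_aesdeclast_si128(t3, dkey[10]);
-		obuf[0] = _mm_xor_si128(t0, *iv);		// obuf[0] ^= *iv
-		obuf[1] = _mm_xor_si128(t1, b0);		// obuf[1] ^= ibuf[0]
-		obuf[2] = _mm_xor_si128(t2, b1);		// obuf[2] ^= ibuf[1]
-		obuf[3] = _mm_xor_si128(t3, b2);		// obuf[3] ^= ibuf[2]
-		*iv = b3;					// *iv = ibuf[3]
-	}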
-
- ----------------------------------------------------------------------------------------------------------------*/
-
- .text
- .align 4,0x90
- .globl _aes_decrypt_cbc_hw
-_aes_decrypt_cbc_hw:
-
- // push/save registers for local use
-#if defined __i386__
-
- push %ebp
- movl %esp, %ebp
- push %ebx // ibuf
- push %edi // obuf
-
- #define sp %esp
-
-#else // __x86_64__
-
- push %rbp
- mov %rsp, %rbp
- push %rbx
- push %r13
- push %r14
- push %r15
-
- #define sp %rsp
-
-#endif
-
-
- // if kernel, allocate stack space to save xmm registers
-#ifdef KERNEL
-#if defined __i386__
- sub $(8*16), %esp
-#else
- sub $(16*16), %rsp
-#endif
- movaps %xmm0, (sp)
- movaps %xmm1, 16(sp)
- movaps %xmm2, 32(sp)
- movaps %xmm3, 48(sp)
- movaps %xmm4, 64(sp)
- movaps %xmm5, 80(sp)
- movaps %xmm6, 96(sp)
- movaps %xmm7, 112(sp)
-#if defined __x86_64__
- movaps %xmm8, 16*8(sp)
- movaps %xmm9, 16*9(sp)
- movaps %xmm10, 16*10(sp)
- movaps %xmm11, 16*11(sp)
- movaps %xmm12, 16*12(sp)
- movaps %xmm13, 16*13(sp)
- movaps %xmm14, 16*14(sp)
- movaps %xmm15, 16*15(sp)
-#endif // __x86_64__
-#endif
-
- #undef iv
- #define iv %xmm0
-
-#if defined __i386__
- mov 12(%ebp), %eax // in_iv
- mov 24(%ebp), %edx // ctx
- movups (%eax), iv // iv = in_iv
- mov 8(%ebp), %ebx // ibuf
- mov 16(%ebp), %ecx // num_blk
- mov 20(%ebp), %edi // obuf
-
- #define ibuf %ebx
- #define obuf %edi
- #define num_blk %ecx
- #define ctx %edx
-
-#else // __x86_64__, rdi/rsi/rdx/rcx/r8
-
- mov %rdi, %rbx // ibuf
- movups (%rsi), iv // iv = in_iv
- mov %rdx, %r13 // num_blk
- mov %rcx, %r14 // obuf
- mov %r8, %r15 // ctx
-
- #define ibuf %rbx
- #define num_blk %r13d
- #define obuf %r14
- #define ctx %r15
-
-#endif
-
-	mov 240(ctx), %eax			// key length field: 160/192/224 selects aes-128/192/256
- cmp $160, %eax // aes-128 decrypt
- je L_decrypt_128
- cmp $192, %eax // aes-192 decrypt
- je L_decrypt_192
- cmp $224, %eax // aes-256 decrypt
- je L_decrypt_256
-
- mov $-1, %eax // wrong aes length, to return -1
- jmp L_error // early exit due to wrong aes length
-
-
- //
- // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
- //
-
-L_decrypt_128:
-
- cmp $1, num_blk
- jl L_HW_cbc_done // if num_blk < 1, early return
-
- // aes-128 decrypt expanded keys
- movups 160(ctx), %xmm3
- movups 144(ctx), %xmm4
- movups 128(ctx), %xmm5
- movups 112(ctx), %xmm6
- movups 96(ctx), %xmm7
-#if defined __x86_64__
- movups 80(ctx), %xmm8
- movups 64(ctx), %xmm9
- movups 48(ctx), %xmm10
- movups 32(ctx), %xmm11
- movups 16(ctx), %xmm12
- movups 0(ctx), %xmm13
-#endif
-
-	// performs 4-block decryption per iteration to exploit parallelism in aes_decrypt
-
- // while ((num_blk-=4)>=0) {
- // aes_decrypt(ibuf, obuf, ctx);
- // aes_decrypt(ibuf+1, obuf+1, ctx);
- // aes_decrypt(ibuf+2, obuf+2, ctx);
- // aes_decrypt(ibuf+3, obuf+3, ctx);
-	//		obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
- // *iv = ibuf[3]; ibuf += 4; obuf += 4;
- // }
-
- sub $4, num_blk // pre decrement num_blk by 4
- jl 9f // if num_blk < 4, skip the per-4-blocks processing code
-
-0:
-
-
-#if defined __x86_64__
-
- movups (ibuf), %xmm1 // tmp = 1st ibuf
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
- movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
- movups 48(ibuf), %xmm15 // tmp = 4th ibuf
-
- // for x86_64, the expanded keys are already stored in xmm3-xmm13
-
- // aes-128 decrypt round 0 per 4 blocks
- pxor %xmm3, %xmm1
- pxor %xmm3, %xmm2
- pxor %xmm3, %xmm14
- pxor %xmm3, %xmm15
-
- // aes-128 decrypt round 1 per 4 blocks
- aesdec %xmm4, %xmm1
- aesdec %xmm4, %xmm2
- aesdec %xmm4, %xmm14
- aesdec %xmm4, %xmm15
-
- // aes-128 decrypt round 2 per 4 blocks
- aesdec %xmm5, %xmm1
- aesdec %xmm5, %xmm2
- aesdec %xmm5, %xmm14
- aesdec %xmm5, %xmm15
-
- // aes-128 decrypt round 3 per 4 blocks
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm14
- aesdec %xmm6, %xmm15
-
- // aes-128 decrypt round 4 per 4 blocks
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm14
- aesdec %xmm7, %xmm15
-
- // aes-128 decrypt round 5 per 4 blocks
- aesdec %xmm8, %xmm1
- aesdec %xmm8, %xmm2
- aesdec %xmm8, %xmm14
- aesdec %xmm8, %xmm15
-
- // aes-128 decrypt round 6 per 4 blocks
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm14
- aesdec %xmm9, %xmm15
-
- // aes-128 decrypt round 7 per 4 blocks
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm14
- aesdec %xmm10, %xmm15
-
- // aes-128 decrypt round 8 per 4 blocks
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm14
- aesdec %xmm11, %xmm15
-
- // aes-128 decrypt round 9 per 4 blocks
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm14
- aesdec %xmm12, %xmm15
-
- // aes-128 decrypt round 10 (last) per 4 blocks
- aesdeclast %xmm13, %xmm1
- aesdeclast %xmm13, %xmm2
- aesdeclast %xmm13, %xmm14
- aesdeclast %xmm13, %xmm15
-
- pxor iv, %xmm1 // obuf[0] ^= *iv;
- movups (ibuf), iv // ibuf[0]
- pxor iv, %xmm2 // obuf[1] ^= ibuf[0];
- movups 16(ibuf), iv // ibuf[1]
- pxor iv, %xmm14 // obuf[2] ^= ibuf[1];
- movups 32(ibuf), iv // ibuf[2]
-	pxor iv, %xmm15				// obuf[3] ^= ibuf[2];
- movups 48(ibuf), iv // *iv = ibuf[3]
-
- movups %xmm1, (obuf) // write 1st obuf
- movups %xmm2, 16(obuf) // write 2nd obuf
- movups %xmm14, 32(obuf) // write 3rd obuf
- movups %xmm15, 48(obuf) // write 4th obuf
-
-
-#else
-
- // aes_decrypt_cbc per 4 blocks using aes-128 for i386
- // xmm1/xmm2/xmm4/xmm5 used for obuf per block
- // xmm3 = key0
- // xmm0 = iv
-	// xmm6/xmm7 are dynamically loaded with the other expanded keys
-
- movups (ibuf), %xmm1 // tmp = 1st ibuf
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
- movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
- movups 48(ibuf), %xmm5 // tmp = 4th ibuf
-
- // aes_decrypt
- // for i386, sequentially load expanded keys into xmm6/xmm7
-
- movups 144(ctx), %xmm6 // key1
-
- // aes-128 decrypt round 0 per 4 blocks
- pxor %xmm3, %xmm1
- pxor %xmm3, %xmm2
- pxor %xmm3, %xmm4
- pxor %xmm3, %xmm5
-
- movups 128(ctx), %xmm7 // key2
-
- // aes-128 decrypt round 1 per 4 blocks
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 112(ctx), %xmm6 // key3
-
- // aes-128 decrypt round 2 per 4 blocks
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 96(ctx), %xmm7 // key4
-
- // aes-128 decrypt round 3 per 4 blocks
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 80(ctx), %xmm6 // key5
-
- // aes-128 decrypt round 4 per 4 blocks
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 64(ctx), %xmm7 // key6
-
- // aes-128 decrypt round 5 per 4 blocks
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 48(ctx), %xmm6 // key7
-
- // aes-128 decrypt round 6 per 4 blocks
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 32(ctx), %xmm7 // key8
-
- // aes-128 decrypt round 7 per 4 blocks
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 16(ctx), %xmm6 // key9
-
- // aes-128 decrypt round 8 per 4 blocks
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 0(ctx), %xmm7 // keyA
-
- // aes-128 decrypt round 9 per 4 blocks
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- // aes-128 decrypt round 10 (last) per 4 blocks
- aesdeclast %xmm7, %xmm1
- aesdeclast %xmm7, %xmm2
- aesdeclast %xmm7, %xmm4
- aesdeclast %xmm7, %xmm5
-
- pxor iv, %xmm1 // 1st obuf ^= iv;
- movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm2 // 2nd obuf ^= iv;
- movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm4 // 3rd obuf ^= iv;
- movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm5 // 4th obuf ^= iv;
- movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
-
- movups %xmm1, (obuf) // write 1st obuf
- movups %xmm2, 16(obuf) // write 2nd obuf
- movups %xmm4, 32(obuf) // write 3rd obuf
- movups %xmm5, 48(obuf) // write 4th obuf
-#endif
-
- add $64, ibuf // ibuf += 4;
- add $64, obuf // obuf += 4;
-
- sub $4, num_blk // num_blk -= 4
- jge 0b // if num_blk > 0, repeat the loop
-
-9:	add $4, num_blk				// post-increment num_blk by 4
-	je L_HW_cbc_done			// if num_blk == 0, no further processing is needed
-
-#if defined __i386__
-	// reload xmm4-xmm7, as they may be needed as expanded keys for the remaining blocks
- movups 144(ctx), %xmm4
- movups 128(ctx), %xmm5
- movups 112(ctx), %xmm6
- movups 96(ctx), %xmm7
-#endif
-
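-	// the remaining 1-3 blocks are handled via the bits of num_blk:
-	// a pair of blocks if bit 1 is set, then a single block if bit 0 is set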
- test $2, num_blk // check whether num_blk has 2 blocks
- je 9f // if num_blk & 2 == 0, skip the per-pair processing code
-
- // do the remaining 2 blocks together
-
- movups (ibuf), %xmm1 // tmp = 1st ibuf
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
-
- // aes_decrypt
- pxor %xmm3, %xmm1
- pxor %xmm3, %xmm2
- aesdec %xmm4, %xmm1
- aesdec %xmm4, %xmm2
- aesdec %xmm5, %xmm1
- aesdec %xmm5, %xmm2
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
-#if defined __x86_64__
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm8, %xmm1
- aesdec %xmm8, %xmm2
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdeclast %xmm13, %xmm1
- aesdeclast %xmm13, %xmm2
-#else
- movups 80(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- movups 64(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- movups 48(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- movups 32(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- movups 16(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- movups 0(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdeclast %xmm7, %xmm1
- aesdeclast %xmm7, %xmm2
- movups 112(ctx), %xmm6
- movups 96(ctx), %xmm7
-#endif
-
- pxor iv, %xmm1 // obuf[0] ^= *iv;
- movups (ibuf), iv // ibuf[0]
- pxor iv, %xmm2 // obuf[1] ^= ibuf[0]
- movups 16(ibuf), iv // *iv = ibuf[1]
-
- movups %xmm1, (obuf) // write obuf[0]
- movups %xmm2, 16(obuf) // write obuf[1]
-
- add $32, ibuf // ibuf += 2
- add $32, obuf // obuf += 2
-
-9:
- test $1, num_blk // check whether num_blk has residual 1 block
- je L_HW_cbc_done // if num_blk == 0, no need for residual processing code
-
- movups (ibuf), %xmm2 // tmp = ibuf
- // aes_decrypt
- pxor %xmm3, %xmm2
- aesdec %xmm4, %xmm2
- aesdec %xmm5, %xmm2
- aesdec %xmm6, %xmm2
- aesdec %xmm7, %xmm2
-#if defined __x86_64__
- aesdec %xmm8, %xmm2
- aesdec %xmm9, %xmm2
- aesdec %xmm10, %xmm2
- aesdec %xmm11, %xmm2
- aesdec %xmm12, %xmm2
- aesdeclast %xmm13, %xmm2
-#else
- movups 80(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 64(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 48(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 32(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 16(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups (ctx), %xmm1
- aesdeclast %xmm1, %xmm2
-#endif
-
- pxor iv, %xmm2 // *obuf ^= *iv;
- movups (ibuf), iv // *iv = *ibuf;
- movups %xmm2, (obuf) // write *obuf
-
- jmp L_HW_cbc_done
-
- //
- // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
- //
-
-L_decrypt_192:
-
- cmp $1, num_blk
- jl L_HW_cbc_done // if num_blk < 1, early return
-
-	// aes-192 decrypt expanded keys
- movups 192(ctx), %xmm3
- movups 176(ctx), %xmm4
- movups 160(ctx), %xmm5
- movups 144(ctx), %xmm6
- movups 128(ctx), %xmm7
-#if defined __x86_64__
- movups 112(ctx), %xmm8
- movups 96(ctx), %xmm9
- movups 80(ctx), %xmm10
- movups 64(ctx), %xmm11
- movups 48(ctx), %xmm12
- movups 32(ctx), %xmm13
- movups 16(ctx), %xmm14
- movups (ctx), %xmm15
-#endif
-
-	// performs 4-block decryption per iteration to exploit parallelism in aes_decrypt
-
- // while ((num_blk-=4)>=0) {
- // aes_decrypt(ibuf, obuf, ctx);
- // aes_decrypt(ibuf+1, obuf+1, ctx);
- // aes_decrypt(ibuf+2, obuf+2, ctx);
- // aes_decrypt(ibuf+3, obuf+3, ctx);
-	//		obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
- // *iv = ibuf[3]; ibuf += 4; obuf += 4;
- // }
-
- sub $4, num_blk // pre decrement num_blk by 4
- jl 9f // if num_blk < 4, skip the per-4-blocks processing code
-0:
-
-#if defined __x86_64__
-
- movups (ibuf), %xmm1 // tmp = 1st ibuf
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
- movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
- movups 48(ibuf), %xmm15 // tmp = 4th ibuf
-
-	// aes_decrypt: for x86_64, the expanded keys were preloaded into xmm3-xmm15,
-	// but xmm14/xmm15 now hold data blocks, so %xmm12/%xmm13 double as dynamic key
-	// registers in the middle and are restored to their original keys afterwards
-
- // round 0 for 4 blocks
- pxor %xmm3, %xmm1
- pxor %xmm3, %xmm2
- pxor %xmm3, %xmm14
- pxor %xmm3, %xmm15
-
- // round 1 for 4 blocks
- aesdec %xmm4, %xmm1
- aesdec %xmm4, %xmm2
- aesdec %xmm4, %xmm14
- aesdec %xmm4, %xmm15
-
- // round 2 for 4 blocks
- aesdec %xmm5, %xmm1
- aesdec %xmm5, %xmm2
- aesdec %xmm5, %xmm14
- aesdec %xmm5, %xmm15
-
- // round 3 for 4 blocks
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm14
- aesdec %xmm6, %xmm15
-
- // round 4 for 4 blocks
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm14
- aesdec %xmm7, %xmm15
-
- // round 5 for 4 blocks
- aesdec %xmm8, %xmm1
- aesdec %xmm8, %xmm2
- aesdec %xmm8, %xmm14
- aesdec %xmm8, %xmm15
-
- // round 6 for 4 blocks
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm14
- aesdec %xmm9, %xmm15
-
- // round 7 for 4 blocks
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm14
- aesdec %xmm10, %xmm15
-
- // round 8 for 4 blocks
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm14
- aesdec %xmm11, %xmm15
-
- // round 9 for 4 blocks
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm14
- aesdec %xmm12, %xmm15
-
- movups 16(ctx), %xmm12
-
- // round A for 4 blocks
- aesdec %xmm13, %xmm1
- aesdec %xmm13, %xmm2
- aesdec %xmm13, %xmm14
- aesdec %xmm13, %xmm15
-
- movups (ctx), %xmm13
-
- // round B for 4 blocks
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm14
- aesdec %xmm12, %xmm15
-
- movups 48(ctx), %xmm12 // restore %xmm12 to its original key
-
- // round C (last) for 4 blocks
- aesdeclast %xmm13, %xmm1
- aesdeclast %xmm13, %xmm2
- aesdeclast %xmm13, %xmm14
- aesdeclast %xmm13, %xmm15
-
- movups 32(ctx), %xmm13 // restore %xmm13 to its original key
-
- pxor iv, %xmm1 // obuf[0] ^= *iv;
- movups (ibuf), iv // ibuf[0]
- pxor iv, %xmm2 // obuf[1] ^= ibuf[0]
- movups 16(ibuf), iv // ibuf[1]
- pxor iv, %xmm14 // obuf[2] ^= ibuf[1]
- movups 32(ibuf), iv // ibuf[2]
- pxor iv, %xmm15 // obuf[3] ^= ibuf[2]
- movups 48(ibuf), iv // *iv = ibuf[3]
-
- movups %xmm1, (obuf) // write 1st obuf
- movups %xmm2, 16(obuf) // write 2nd obuf
- movups %xmm14, 32(obuf) // write 3rd obuf
- movups %xmm15, 48(obuf) // write 4th obuf
-
- add $64, ibuf // ibuf += 4;
- add $64, obuf // obuf += 4;
-
- sub $4, num_blk // num_blk -= 4
- jge 0b // if num_blk > 0, repeat the loop
-
-9:	add $4, num_blk				// post-increment num_blk by 4
- je L_HW_cbc_done // if num_blk == 0, prepare to return
-
- movups 16(ctx), %xmm14 // restore %xmm14 to its key
- movups (ctx), %xmm15 // restore %xmm15 to its key
-
-#else
-
- movups (ibuf), %xmm1 // tmp = 1st ibuf
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
- movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
- movups 48(ibuf), %xmm5 // tmp = 4th ibuf
-
- // aes_decrypt
- // for i386, sequentially load expanded keys into xmm6/xmm7
- movups 176(ctx), %xmm6
- pxor %xmm3, %xmm1
- pxor %xmm3, %xmm2
- pxor %xmm3, %xmm4
- pxor %xmm3, %xmm5
-
- movups 160(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 144(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 128(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 112(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 96(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 80(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 64(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 48(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 32(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 16(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 0(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- aesdeclast %xmm7, %xmm1
- aesdeclast %xmm7, %xmm2
- aesdeclast %xmm7, %xmm4
- aesdeclast %xmm7, %xmm5
-
- pxor iv, %xmm1 // 1st obuf ^= iv;
- movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm2 // 2nd obuf ^= iv;
- movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm4 // 3rd obuf ^= iv;
- movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm5 // 4th obuf ^= iv;
- movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
- movups %xmm1, (obuf) // write 1st obuf
- movups %xmm2, 16(obuf) // write 2nd obuf
- movups %xmm4, 32(obuf) // write 3rd obuf
- movups %xmm5, 48(obuf) // write 4th obuf
-
- add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4;
- add $64, obuf // obuf += AES_BLOCK_SIZE * 4;
-
- sub $4, num_blk // num_blk -= 4
- jge 0b // if num_blk > 0, repeat the loop
-
-
-9:	add $4, num_blk				// post-increment num_blk by 4
-	je L_HW_cbc_done			// if num_blk == 0, no further processing is needed
-
- movups 176(ctx), %xmm4
- movups 160(ctx), %xmm5
- movups 144(ctx), %xmm6
- movups 128(ctx), %xmm7
-
-#endif
-
- // per-block aes_decrypt_cbc loop
-
-0:
- movups (ibuf), %xmm2 // tmp = ibuf
-
- // aes_decrypt
- pxor %xmm3, %xmm2
- aesdec %xmm4, %xmm2
- aesdec %xmm5, %xmm2
- aesdec %xmm6, %xmm2
- aesdec %xmm7, %xmm2
-#if defined __x86_64__
- aesdec %xmm8, %xmm2
- aesdec %xmm9, %xmm2
- aesdec %xmm10, %xmm2
- aesdec %xmm11, %xmm2
- aesdec %xmm12, %xmm2
- aesdec %xmm13, %xmm2
- aesdec %xmm14, %xmm2
- aesdeclast %xmm15, %xmm2
-#else
- movups 112(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 96(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 80(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 64(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 48(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 32(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 16(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups (ctx), %xmm1
- aesdeclast %xmm1, %xmm2
-#endif
-
- pxor iv, %xmm2 // obuf ^= iv;
- movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
-
- movups %xmm2, (obuf) // write obuf
-
- add $16, ibuf // ibuf += AES_BLOCK_SIZE;
- add $16, obuf // obuf += AES_BLOCK_SIZE;
- sub $1, num_blk // num_blk --
- jg 0b // if num_blk > 0, repeat the loop
-
- jmp L_HW_cbc_done
-
- //
- // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
- //
-
-L_decrypt_256:
-
- cmp $1, num_blk
- jl L_HW_cbc_done
-
- movups 224(ctx), %xmm3
- movups 208(ctx), %xmm4
- movups 192(ctx), %xmm5
- movups 176(ctx), %xmm6
- movups 160(ctx), %xmm7
-#if defined __x86_64__
- movups 144(ctx), %xmm8
- movups 128(ctx), %xmm9
- movups 112(ctx), %xmm10
- movups 96(ctx), %xmm11
- movups 80(ctx), %xmm12
- movups 64(ctx), %xmm13
- movups 48(ctx), %xmm14
- movups 32(ctx), %xmm15
-// movups 16(ctx), %xmm14
-// movups (ctx), %xmm15
-#endif
-
-#if defined __x86_64__
-
- sub $4, num_blk // pre decrement num_blk by 4
- jl 9f // if num_blk < 4, skip the per-4-blocks processing code
-0:
- movups (ibuf), %xmm1 // tmp = 1st ibuf
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
- movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
- movups 48(ibuf), %xmm15 // tmp = 4th ibuf
-
-	// aes_decrypt: for x86_64, most expanded keys were preloaded into xmm3-xmm15;
-	// the remaining round keys are loaded into xmm12/xmm13 on the fly
- pxor %xmm3, %xmm1
- pxor %xmm3, %xmm2
- pxor %xmm3, %xmm14
- pxor %xmm3, %xmm15
-
- aesdec %xmm4, %xmm1
- aesdec %xmm4, %xmm2
- aesdec %xmm4, %xmm14
- aesdec %xmm4, %xmm15
-
- aesdec %xmm5, %xmm1
- aesdec %xmm5, %xmm2
- aesdec %xmm5, %xmm14
- aesdec %xmm5, %xmm15
-
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm14
- aesdec %xmm6, %xmm15
-
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm14
- aesdec %xmm7, %xmm15
-
- aesdec %xmm8, %xmm1
- aesdec %xmm8, %xmm2
- aesdec %xmm8, %xmm14
- aesdec %xmm8, %xmm15
-
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm14
- aesdec %xmm9, %xmm15
-
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm14
- aesdec %xmm10, %xmm15
-
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm14
- aesdec %xmm11, %xmm15
-
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm14
- aesdec %xmm12, %xmm15
- movups 48(ctx), %xmm12
-
- aesdec %xmm13, %xmm1
- aesdec %xmm13, %xmm2
- aesdec %xmm13, %xmm14
- aesdec %xmm13, %xmm15
- movups 32(ctx), %xmm13
-
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm14
- aesdec %xmm12, %xmm15
- movups 16(ctx), %xmm12
-
- aesdec %xmm13, %xmm1
- aesdec %xmm13, %xmm2
- aesdec %xmm13, %xmm14
- aesdec %xmm13, %xmm15
- movups (ctx), %xmm13
-
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm14
- aesdec %xmm12, %xmm15
- movups 80(ctx), %xmm12
-
- aesdeclast %xmm13, %xmm1
- aesdeclast %xmm13, %xmm2
- aesdeclast %xmm13, %xmm14
- aesdeclast %xmm13, %xmm15
- movups 64(ctx), %xmm13
-
- pxor iv, %xmm1 // obuf ^= iv;
- movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm2 // obuf ^= iv;
- movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm14 // obuf ^= iv;
- movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm15 // obuf ^= iv;
- movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
-
- movups %xmm1, (obuf) // write 1st obuf
- movups %xmm2, 16(obuf) // write 2nd obuf
- movups %xmm14, 32(obuf) // write 3rd obuf
- movups %xmm15, 48(obuf) // write 4th obuf
-
- add $64, ibuf // ibuf += AES_BLOCK_SIZE*4;
- add $64, obuf // obuf += AES_BLOCK_SIZE*4;
-
- sub $4, num_blk // num_blk -= 4
- jge 0b // if num_blk > 0, repeat the loop
-
-9:	add $4, num_blk				// post-increment num_blk by 4
-	je L_HW_cbc_done			// if num_blk == 0, no further processing is needed
-
- movups 48(ctx), %xmm14
- movups 32(ctx), %xmm15
-
-#else
-
- sub $4, num_blk // pre decrement num_blk by 4
-	jl 9f					// if num_blk < 4, skip the per-4-blocks processing code
-0:
- movups (ibuf), %xmm1 // tmp = 1st ibuf
- movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
- movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
- movups 48(ibuf), %xmm5 // tmp = 4th ibuf
-
- // aes_decrypt
- // for i386, sequentially load expanded keys into xmm6/xmm7
- movups 208(ctx), %xmm6
- pxor %xmm3, %xmm1
- pxor %xmm3, %xmm2
- pxor %xmm3, %xmm4
- pxor %xmm3, %xmm5
-
- movups 192(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 176(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 160(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 144(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 128(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 112(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 96(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 80(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 64(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 48(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 32(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- movups 16(ctx), %xmm6
- aesdec %xmm7, %xmm1
- aesdec %xmm7, %xmm2
- aesdec %xmm7, %xmm4
- aesdec %xmm7, %xmm5
-
- movups 0(ctx), %xmm7
- aesdec %xmm6, %xmm1
- aesdec %xmm6, %xmm2
- aesdec %xmm6, %xmm4
- aesdec %xmm6, %xmm5
-
- aesdeclast %xmm7, %xmm1
- aesdeclast %xmm7, %xmm2
- aesdeclast %xmm7, %xmm4
- aesdeclast %xmm7, %xmm5
-
- pxor iv, %xmm1 // 1st obuf ^= iv;
- movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm2 // 2nd obuf ^= iv;
- movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm4 // 3rd obuf ^= iv;
- movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
- pxor iv, %xmm5 // 4th obuf ^= iv;
- movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
- movups %xmm1, (obuf) // write 1st obuf
- movups %xmm2, 16(obuf) // write 2nd obuf
- movups %xmm4, 32(obuf) // write 3rd obuf
- movups %xmm5, 48(obuf) // write 4th obuf
-
- add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4;
- add $64, obuf // obuf += AES_BLOCK_SIZE * 4;
-
- sub $4, num_blk // num_blk -= 4
- jge 0b // if num_blk > 0, repeat the loop
-
-
-9:	add $4, num_blk				// post-increment num_blk by 4
-	je L_HW_cbc_done			// if num_blk == 0, no further processing is needed
-
- movups 208(ctx), %xmm4
- movups 192(ctx), %xmm5
- movups 176(ctx), %xmm6
- movups 160(ctx), %xmm7
-
-#endif
-
-0:
- movups (ibuf), %xmm2 // tmp = ibuf
-
- // aes_decrypt
- pxor %xmm3, %xmm2
- aesdec %xmm4, %xmm2
- aesdec %xmm5, %xmm2
- aesdec %xmm6, %xmm2
- aesdec %xmm7, %xmm2
-#if defined __x86_64__
- aesdec %xmm8, %xmm2
- aesdec %xmm9, %xmm2
- aesdec %xmm10, %xmm2
- aesdec %xmm11, %xmm2
- aesdec %xmm12, %xmm2
- aesdec %xmm13, %xmm2
- aesdec %xmm14, %xmm2
- aesdec %xmm15, %xmm2
-#else
- movups 144(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 128(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 112(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 96(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 80(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 64(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 48(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups 32(ctx), %xmm1
- aesdec %xmm1, %xmm2
-#endif
- movups 16(ctx), %xmm1
- aesdec %xmm1, %xmm2
- movups (ctx), %xmm1
- aesdeclast %xmm1, %xmm2
-
- pxor iv, %xmm2 // obuf ^= iv;
- movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
-
- movups %xmm2, (obuf) // write obuf
-
- add $16, ibuf // ibuf += AES_BLOCK_SIZE;
- add $16, obuf // obuf += AES_BLOCK_SIZE;
- sub $1, num_blk // num_blk --
- jg 0b // if num_blk > 0, repeat the loop
-
- jmp L_HW_cbc_done
-
- //
- // --------- END of aes_decrypt_cbc_hw -------------------
- //
+/*\r
+ ---------------------------------------------------------------------------\r
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.\r
+\r
+ LICENSE TERMS\r
+\r
+ The free distribution and use of this software in both source and binary\r
+ form is allowed (with or without changes) provided that:\r
+\r
+ 1. distributions of this source code include the above copyright\r
+ notice, this list of conditions and the following disclaimer;\r
+\r
+ 2. distributions in binary form include the above copyright\r
+ notice, this list of conditions and the following disclaimer\r
+ in the documentation and/or other associated materials;\r
+\r
+ 3. the copyright holder's name is not used to endorse products\r
+ built using this software without specific written permission.\r
+\r
+ ALTERNATIVELY, provided that this notice is retained in full, this product\r
+ may be distributed under the terms of the GNU General Public License (GPL),\r
+ in which case the provisions of the GPL apply INSTEAD OF those given above.\r
+\r
+ DISCLAIMER\r
+\r
+ This software is provided 'as is' with no explicit or implied warranties\r
+ in respect of its properties, including, but not limited to, correctness\r
+ and/or fitness for purpose.\r
+ ---------------------------------------------------------------------------\r
+ Issue 31/01/2006\r
+\r
+ These subroutines implement multiple block AES modes for ECB, CBC, CFB,\r
+ OFB and CTR encryption, The code provides support for the VIA Advanced \r
+ Cryptography Engine (ACE).\r
+\r
+ NOTE: In the following subroutines, the AES contexts (ctx) must be\r
+ 16 byte aligned if VIA ACE is being used\r
+*/\r
+\r
+\r
+/* ---------------------------------------------------------------------------------------------------------------- \r
+\r
+ aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :\r
+\r
+ For simplicity, I am assuming all variables are in 128-bit data type.\r
+\r
+ aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)\r
+ {\r
+ while(num_blk--) {\r
+ *iv ^= *ibuf++;\r
+ aes_encrypt(iv, iv, ctx);\r
+ *obuf++ = *iv;\r
+ }\r
+ return 0;\r
+ }\r
+\r
+ The following is an implementation of this function using Intel AESNI.\r
+ This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. \r
+ Developer should still call _aes_encrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch\r
+ to this aesni-based function should it detecs that aesni is available.\r
+ Blindly call this function SURELY will cause a CRASH on systems with no aesni support. \r
+\r
+ Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks\r
+ are serially chained. This prevents us from arranging several blocks for encryption in parallel.\r
+\r
+ ----------------------------------------------------------------------------------------------------------------*/\r
+\r
+ .text\r
+ .align 4,0x90\r
+ .globl _aes_encrypt_cbc_hw\r
+_aes_encrypt_cbc_hw:\r
+\r
+ // push/save registers for local use\r
+#if defined __i386__\r
+\r
+ push %ebp\r
+ movl %esp, %ebp\r
+ push %ebx\r
+ push %edi\r
+\r
+ #define sp %esp\r
+\r
+#else // __x86_64__\r
+\r
+ push %rbp\r
+ mov %rsp, %rbp\r
+ push %rbx\r
+ push %r13\r
+ push %r14\r
+ push %r15\r
+\r
+ #define sp %rsp\r
+\r
+#endif\r
+\r
+ // if this is kernel code, need to save used xmm registers\r
+#ifdef KERNEL\r
+\r
+#if defined __i386__\r
+ sub $(8*16), %esp // for possible xmm0-xmm7 save/restore\r
+#else\r
+ sub $(16*16), %rsp // xmm0-xmm15 save/restore \r
+#endif\r
+\r
+ movaps %xmm0, (sp)\r
+ movaps %xmm1, 16(sp)\r
+ movaps %xmm2, 32(sp)\r
+ movaps %xmm3, 48(sp)\r
+ movaps %xmm4, 64(sp)\r
+ movaps %xmm5, 80(sp)\r
+ movaps %xmm6, 96(sp)\r
+ movaps %xmm7, 112(sp)\r
+#if defined __x86_64__\r
+ movaps %xmm8, 16*8(sp)\r
+ movaps %xmm9, 16*9(sp)\r
+ movaps %xmm10, 16*10(sp)\r
+ movaps %xmm11, 16*11(sp)\r
+ movaps %xmm12, 16*12(sp)\r
+ movaps %xmm13, 16*13(sp)\r
+ movaps %xmm14, 16*14(sp)\r
+ movaps %xmm15, 16*15(sp)\r
+#endif // __x86_64__\r
+\r
+#endif // KERNEL\r
+\r
+ #define iv %xmm0\r
+\r
+#ifdef __i386__\r
+\r
+ mov 12(%ebp), %eax // in_iv\r
+ mov 24(%ebp), %edx // ctx\r
+ movups (%eax), iv // iv = in_iv \r
+ mov 8(%ebp), %ebx // ibuf\r
+ mov 16(%ebp), %ecx // num_blk\r
+ mov 20(%ebp), %edi // obuf\r
+\r
+ #define ibuf %ebx\r
+ #define obuf %edi\r
+ #define num_blk %ecx \r
+ #define ctx %edx\r
+\r
+#else\r
+\r
+ mov %rdi, %rbx // ibuf\r
+ movups (%rsi), iv // iv = in_iv\r
+ mov %rdx, %r13 // num_blk\r
+ mov %rcx, %r14 // obuf\r
+ mov %r8, %r15 // ctx \r
+\r
+ #define ibuf %rbx\r
+ #define num_blk %r13d\r
+ #define obuf %r14 \r
+ #define ctx %r15\r
+\r
+#endif\r
+\r
+ mov 240(ctx), %eax // aes length\r
+ cmp $160, %eax // aes-128 encrypt ?\r
+ je L_encrypt_128\r
+ cmp $192, %eax // aes-192 encrypt ?\r
+ je L_encrypt_192\r
+ cmp $224, %eax // aes-256 encrypt ?\r
+ je L_encrypt_256\r
+ mov $-1, %eax // return error\r
+ jmp L_error \r
+\r
+ //\r
+ // aes-128 encrypt_cbc operation, up to L_HW_cbc_done\r
+ //\r
+\r
+L_encrypt_128:\r
+\r
+ cmp $1, num_blk // check number of block\r
+ jl L_HW_cbc_done // should it be less than 1, nothing to do\r
+\r
+ movups (ctx), %xmm2 // key0\r
+ movups 16(ctx), %xmm3 // key1\r
+ movups 32(ctx), %xmm4 // key2\r
+ movups 48(ctx), %xmm5 // key3\r
+ movups 64(ctx), %xmm6 // key4\r
+ movups 80(ctx), %xmm7 // key5\r
+#if defined __x86_64__\r
+ movups 96(ctx), %xmm8 // key6\r
+ movups 112(ctx), %xmm9 // key7\r
+ movups 128(ctx), %xmm10 // key8\r
+ movups 144(ctx), %xmm11 // key9\r
+ movups 160(ctx), %xmm12 // keyA\r
+#endif\r
+\r
+ // while (num_blk--) {\r
+ // *iv ^= *ibuf++;\r
+ // aes_encrypt(iv, iv, ctx);\r
+ // *obuf++ = *iv;\r
+ // }\r
+0:\r
+ movups (ibuf), %xmm1 // *ibuf\r
+ pxor %xmm2, iv // 1st instruction inside aes_encrypt\r
+ pxor %xmm1, iv // *iv ^= *ibuf\r
+\r
+ // finishing up the rest of aes_encrypt\r
+ aesenc %xmm3, iv\r
+ aesenc %xmm4, iv\r
+ aesenc %xmm5, iv\r
+ aesenc %xmm6, iv\r
+ aesenc %xmm7, iv\r
+#if defined __x86_64__\r
+ aesenc %xmm8, iv\r
+ aesenc %xmm9, iv\r
+ aesenc %xmm10, iv\r
+ aesenc %xmm11, iv\r
+ aesenclast %xmm12, iv\r
+#else\r
+ movups 96(ctx), %xmm1 // key6\r
+ aesenc %xmm1, iv\r
+ movups 112(ctx), %xmm1 // key7\r
+ aesenc %xmm1, iv\r
+ movups 128(ctx), %xmm1 // key8\r
+ aesenc %xmm1, iv\r
+ movups 144(ctx), %xmm1 // key9\r
+ aesenc %xmm1, iv\r
+ movups 160(ctx), %xmm1 // keyA\r
+ aesenclast %xmm1, iv\r
+#endif\r
+\r
+ movups iv, (obuf) // *obuf = *iv;\r
+ add $16, obuf // obuf++;\r
+ add $16, ibuf // ibuf++;\r
+ sub $1, num_blk // num_blk --\r
+ jg 0b // if num_blk > 0, repeat the loop\r
+\r
+ // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)\r
+\r
+L_HW_cbc_done:\r
+\r
+ xor %eax, %eax // to return CRYPT_OK\r
+\r
+L_error:\r
+\r
+ // if kernel, restore xmm registers\r
+#ifdef KERNEL \r
+ movaps 0(sp), %xmm0\r
+ movaps 16(sp), %xmm1\r
+ movaps 32(sp), %xmm2\r
+ movaps 48(sp), %xmm3\r
+ movaps 64(sp), %xmm4\r
+ movaps 80(sp), %xmm5\r
+ movaps 96(sp), %xmm6\r
+ movaps 112(sp), %xmm7\r
+#if defined __x86_64__\r
+ movaps 16*8(sp), %xmm8\r
+ movaps 16*9(sp), %xmm9\r
+ movaps 16*10(sp), %xmm10\r
+ movaps 16*11(sp), %xmm11\r
+ movaps 16*12(sp), %xmm12\r
+ movaps 16*13(sp), %xmm13\r
+ movaps 16*14(sp), %xmm14\r
+ movaps 16*15(sp), %xmm15\r
+#endif // __x86_64__\r
+#endif // KERNEL\r
+\r
+ // release used stack memory, restore used callee-saved registers, and return \r
+#if defined __i386__\r
+#ifdef KERNEL\r
+ add $(8*16), %esp\r
+#endif\r
+ pop %edi\r
+ pop %ebx\r
+#else\r
+#ifdef KERNEL\r
+ add $(16*16), %rsp \r
+#endif\r
+ pop %r15\r
+ pop %r14\r
+ pop %r13\r
+ pop %rbx\r
+#endif\r
+ leave\r
+ ret\r
+\r
+ //\r
+ // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
+ //\r
+\r
+L_encrypt_192:\r
+\r
+ cmp $1, num_blk // check number of block\r
+ jl L_HW_cbc_done // should it be less than 1, nothing to do\r
+\r
+ movups (ctx), %xmm2 // key0\r
+ movups 16(ctx), %xmm3 // key1\r
+ movups 32(ctx), %xmm4 // key2\r
+ movups 48(ctx), %xmm5 // key3\r
+ movups 64(ctx), %xmm6 // key4\r
+ movups 80(ctx), %xmm7 // key5\r
+#if defined __x86_64__\r
+ movups 96(ctx), %xmm8 // key6\r
+ movups 112(ctx), %xmm9 // key7\r
+ movups 128(ctx), %xmm10 // key8\r
+ movups 144(ctx), %xmm11 // key9\r
+ movups 160(ctx), %xmm12 // keyA\r
+ movups 176(ctx), %xmm13 // keyB\r
+ movups 192(ctx), %xmm14 // keyC\r
+#endif\r
+ \r
+ // while (num_blk--) {\r
+ // *iv ^= *ibuf++;\r
+ // aes_encrypt(iv, iv, ctx);\r
+ // *obuf++ = *iv;\r
+ // }\r
+0:\r
+ movups (ibuf), %xmm1 // *ibuf\r
+ pxor %xmm1, iv // *iv ^= ibuf\r
+\r
+ // aes_encrypt(iv, iv, ctx);\r
+\r
+ pxor %xmm2, iv\r
+ aesenc %xmm3, iv\r
+ aesenc %xmm4, iv\r
+ aesenc %xmm5, iv\r
+ aesenc %xmm6, iv\r
+ aesenc %xmm7, iv\r
+#if defined __x86_64__\r
+ aesenc %xmm8, iv\r
+ aesenc %xmm9, iv\r
+ aesenc %xmm10, iv\r
+ aesenc %xmm11, iv\r
+ aesenc %xmm12, iv\r
+ aesenc %xmm13, iv\r
+ aesenclast %xmm14, iv\r
+#else\r
+ movups 96(ctx), %xmm1\r
+ aesenc %xmm1, iv\r
+ movups 112(ctx), %xmm1\r
+ aesenc %xmm1, iv\r
+ movups 128(ctx), %xmm1\r
+ aesenc %xmm1, iv\r
+ movups 144(ctx), %xmm1\r
+ aesenc %xmm1, iv\r
+ movups 160(ctx), %xmm1\r
+ aesenc %xmm1, iv\r
+ movups 176(ctx), %xmm1\r
+ aesenc %xmm1, iv\r
+ movups 192(ctx), %xmm1\r
+ aesenclast %xmm1, iv\r
+#endif\r
+\r
+ movups iv, (obuf) // *obuf = *iv;\r
+ add $16, ibuf // ibuf++\r
+ add $16, obuf // obuf++\r
+\r
+ sub $1, num_blk // num_blk --\r
+ jg 0b // if num_blk > 0, repeat the loop\r
+\r
+ jmp L_HW_cbc_done // share with the common exit code\r
+\r
+ //\r
+ // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
+ //\r
+\r
+L_encrypt_256:\r
+\r
+ cmp $1, num_blk // check number of block\r
+ jl L_HW_cbc_done // should it be less than 1, nothing to do\r
+\r
+ movups (ctx), %xmm2 // key0\r
+ movups 16(ctx), %xmm3 // key1\r
+ movups 32(ctx), %xmm4 // key2\r
+ movups 48(ctx), %xmm5 // key3\r
+ movups 64(ctx), %xmm6 // key4\r
+ movups 80(ctx), %xmm7 // key5\r
+#if defined __x86_64__\r
+ movups 96(ctx), %xmm8 // key6\r
+ movups 112(ctx), %xmm9 // key7\r
+ movups 128(ctx), %xmm10 // key8\r
+ movups 144(ctx), %xmm11 // key9\r
+ movups 160(ctx), %xmm12 // keyA\r
+ movups 176(ctx), %xmm13 // keyB\r
+ movups 192(ctx), %xmm14 // keyC\r
+ movups 208(ctx), %xmm15 // keyD\r
+ // movups 224(ctx), %xmm1 // keyE\r
+#endif\r
+\r
+ // while (num_blk--) {\r
+ // *iv ^= *ibuf++;\r
+ // aes_encrypt(iv, iv, ctx);\r
+ // *obuf++ = *iv;\r
+ // }\r
+0:\r
+ movups (ibuf), %xmm1 // *ibuf\r
+ pxor %xmm1, iv // *iv ^= ibuf\r
+ \r
+ // aes_encrypt(iv, iv, ctx);\r
+ pxor %xmm2, iv\r
+ aesenc %xmm3, iv\r
+ aesenc %xmm4, iv\r
+ aesenc %xmm5, iv\r
+ aesenc %xmm6, iv\r
+ aesenc %xmm7, iv\r
+#if defined __x86_64__\r
+ movups 224(ctx), %xmm1 // keyE\r
+ aesenc %xmm8, iv\r
+ aesenc %xmm9, iv\r
+ aesenc %xmm10, iv\r
+ aesenc %xmm11, iv\r
+ aesenc %xmm12, iv\r
+ aesenc %xmm13, iv\r
+ aesenc %xmm14, iv\r
+ aesenc %xmm15, iv\r
+ aesenclast %xmm1, iv\r
+#else\r
+ movups 96(ctx), %xmm1 // key6\r
+ aesenc %xmm1, iv\r
+ movups 112(ctx), %xmm1 // key7\r
+ aesenc %xmm1, iv\r
+ movups 128(ctx), %xmm1 // key8\r
+ aesenc %xmm1, iv\r
+ movups 144(ctx), %xmm1 // key9\r
+ aesenc %xmm1, iv\r
+ movups 160(ctx), %xmm1 // keyA\r
+ aesenc %xmm1, iv\r
+ movups 176(ctx), %xmm1 // keyB\r
+ aesenc %xmm1, iv\r
+ movups 192(ctx), %xmm1 // keyC\r
+ aesenc %xmm1, iv\r
+ movups 208(ctx), %xmm1 // keyD\r
+ aesenc %xmm1, iv\r
+ movups 224(ctx), %xmm1 // keyE\r
+ aesenclast %xmm1, iv\r
+#endif\r
+\r
+ movups iv, (obuf) // *obuf = *iv;\r
+ add $16, ibuf // ibuf++\r
+ add $16, obuf // obuf++\r
+\r
+ sub $1, num_blk // num_blk --\r
+ jg 0b // if num_blk > 0, repeat the loop\r
+\r
+ jmp L_HW_cbc_done // share with the common exit code\r
+\r
+\r
+\r
+ //\r
+ // --------- END of aes_encrypt_cbc_hw -------------------\r
+ //\r
+\r
+\r
+/* ---------------------------------------------------------------------------------------------------------------- \r
+\r
+ aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :\r
+\r
+ For simplicity, I am assuming all variables are in 128-bit data type.\r
+\r
+ aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)\r
+ {\r
+ while(num_blk--) {\r
+ aes_decrypt(ibuf, obuf, ctx);\r
+ *obuf++ ^= *iv;\r
+ *iv = *ibuf++;\r
+ }\r
+ return 0;\r
+ }\r
+\r
+ The following is an implementation of this function using Intel AESNI.\r
+ This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. \r
+ Developer should still call _aes_decrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch\r
+ to this aesni-based function should it detecs that aesni is available.\r
+ Blindly call this function SURELY will cause a CRASH on systems with no aesni support. \r
+\r
+ Note that the decryption operation is not related over blocks.\r
+ This gives opportunity of arranging aes_decrypt operations in parallel to speed up code.\r
+ This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55)\r
+ The following assembly code exploits this idea to achieve ~ 1.4 speed up in aes_decrypt_cbc.\r
+\r
+ Example C code for packing 4 blocks in an iteration is shown as follows:\r
+\r
+ while ((num_blk-=4)>=0) {\r
+\r
+ // the following 4 functions can be interleaved to exploit parallelism\r
+ aes_decrypt(ibuf, obuf, ctx);\r
+ aes_decrypt(ibuf+1, obuf+1, ctx);\r
+ aes_decrypt(ibuf+2, obuf+2, ctx);\r
+ aes_decrypt(ibuf+3, obuf+3, ctx);\r
+\r
+ obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];\r
+ *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
+ }\r
+ num_blk+=4;\r
+\r
+ ----------------------------------------------------------------------------------------------------------------*/\r
+\r
+ .text\r
+ .align 4,0x90\r
+ .globl _aes_decrypt_cbc_hw\r
+_aes_decrypt_cbc_hw:\r
+\r
+ // push/save registers for local use\r
+#if defined __i386__\r
+\r
+ push %ebp\r
+ movl %esp, %ebp\r
+ push %ebx // ibuf\r
+ push %edi // obuf\r
+\r
+ #define sp %esp\r
+\r
+#else // __x86_64__\r
+\r
+ push %rbp\r
+ mov %rsp, %rbp\r
+ push %rbx\r
+ push %r13\r
+ push %r14\r
+ push %r15\r
+\r
+ #define sp %rsp\r
+\r
+#endif\r
+\r
+\r
+ // if kernel, allocate stack space to save xmm registers\r
+#ifdef KERNEL\r
+#if defined __i386__\r
+ sub $(8*16), %esp\r
+#else\r
+ sub $(16*16), %rsp\r
+#endif\r
+ movaps %xmm0, (sp)\r
+ movaps %xmm1, 16(sp)\r
+ movaps %xmm2, 32(sp)\r
+ movaps %xmm3, 48(sp)\r
+ movaps %xmm4, 64(sp)\r
+ movaps %xmm5, 80(sp)\r
+ movaps %xmm6, 96(sp)\r
+ movaps %xmm7, 112(sp)\r
+#if defined __x86_64__\r
+ movaps %xmm8, 16*8(sp)\r
+ movaps %xmm9, 16*9(sp)\r
+ movaps %xmm10, 16*10(sp)\r
+ movaps %xmm11, 16*11(sp)\r
+ movaps %xmm12, 16*12(sp)\r
+ movaps %xmm13, 16*13(sp)\r
+ movaps %xmm14, 16*14(sp)\r
+ movaps %xmm15, 16*15(sp)\r
+#endif // __x86_64__\r
+#endif\r
+\r
+ #undef iv\r
+ #define iv %xmm0\r
+\r
+#if defined __i386__\r
+ mov 12(%ebp), %eax // in_iv\r
+ mov 24(%ebp), %edx // ctx\r
+ movups (%eax), iv // iv = in_iv \r
+ mov 8(%ebp), %ebx // ibuf\r
+ mov 16(%ebp), %ecx // num_blk\r
+ mov 20(%ebp), %edi // obuf\r
+\r
+ #define ibuf %ebx\r
+ #define obuf %edi\r
+ #define num_blk %ecx \r
+ #define ctx %edx\r
+\r
+#else // __x86_64__, rdi/rsi/rdx/rcx/r8\r
+\r
+ mov %rdi, %rbx // ibuf\r
+ movups (%rsi), iv // iv = in_iv\r
+ mov %rdx, %r13 // num_blk\r
+ mov %rcx, %r14 // obuf\r
+ mov %r8, %r15 // ctx \r
+\r
+ #define ibuf %rbx\r
+ #define num_blk %r13d\r
+ #define obuf %r14 \r
+ #define ctx %r15\r
+\r
+#endif\r
+\r
+ mov 240(ctx), %eax // aes length\r
+ cmp $160, %eax // aes-128 decrypt\r
+ je L_decrypt_128\r
+ cmp $192, %eax // aes-192 decrypt\r
+ je L_decrypt_192\r
+ cmp $224, %eax // aes-256 decrypt\r
+ je L_decrypt_256\r
+\r
+ mov $-1, %eax // wrong aes length, to return -1\r
+ jmp L_error // early exit due to wrong aes length\r
+\r
+\r
+ //\r
+ // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
+ //\r
+\r
+L_decrypt_128:\r
+\r
+ cmp $1, num_blk\r
+ jl L_HW_cbc_done // if num_blk < 1, early return\r
+\r
+ // aes-128 decrypt expanded keys\r
+ movups 160(ctx), %xmm3\r
+ movups 144(ctx), %xmm4\r
+ movups 128(ctx), %xmm5\r
+ movups 112(ctx), %xmm6\r
+ movups 96(ctx), %xmm7\r
+#if defined __x86_64__\r
+ movups 80(ctx), %xmm8\r
+ movups 64(ctx), %xmm9\r
+ movups 48(ctx), %xmm10\r
+ movups 32(ctx), %xmm11\r
+ movups 16(ctx), %xmm12\r
+ movups 0(ctx), %xmm13\r
+#endif\r
+\r
+ // performs 4 block decryption in an iteration to exploit decrypt in parallel\r
+\r
+ // while ((num_blk-=4)>=0) {\r
+ // aes_decrypt(ibuf, obuf, ctx);\r
+ // aes_decrypt(ibuf+1, obuf+1, ctx);\r
+ // aes_decrypt(ibuf+2, obuf+2, ctx);\r
+ // aes_decrypt(ibuf+3, obuf+3, ctx);\r
+ // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];\r
+ // *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
+ // }\r
+\r
+ sub $4, num_blk // pre decrement num_blk by 4\r
+ jl 9f // if num_blk < 4, skip the per-4-blocks processing code\r
+\r
+0:\r
+\r
+\r
+#if defined __x86_64__\r
+\r
+ movups (ibuf), %xmm1 // tmp = 1st ibuf\r
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
+ movups 32(ibuf), %xmm14 // tmp = 3rd ibuf\r
+ movups 48(ibuf), %xmm15 // tmp = 4th ibuf\r
+\r
+ // for x86_64, the expanded keys are already stored in xmm3-xmm13\r
+\r
+ // aes-128 decrypt round 0 per 4 blocks\r
+ pxor %xmm3, %xmm1\r
+ pxor %xmm3, %xmm2\r
+ pxor %xmm3, %xmm14\r
+ pxor %xmm3, %xmm15\r
+\r
+ // aes-128 decrypt round 1 per 4 blocks\r
+ aesdec %xmm4, %xmm1\r
+ aesdec %xmm4, %xmm2\r
+ aesdec %xmm4, %xmm14\r
+ aesdec %xmm4, %xmm15\r
+\r
+ // aes-128 decrypt round 2 per 4 blocks\r
+ aesdec %xmm5, %xmm1\r
+ aesdec %xmm5, %xmm2\r
+ aesdec %xmm5, %xmm14\r
+ aesdec %xmm5, %xmm15\r
+\r
+ // aes-128 decrypt round 3 per 4 blocks\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm14\r
+ aesdec %xmm6, %xmm15\r
+\r
+ // aes-128 decrypt round 4 per 4 blocks\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm14\r
+ aesdec %xmm7, %xmm15\r
+\r
+ // aes-128 decrypt round 5 per 4 blocks\r
+ aesdec %xmm8, %xmm1\r
+ aesdec %xmm8, %xmm2\r
+ aesdec %xmm8, %xmm14\r
+ aesdec %xmm8, %xmm15\r
+\r
+ // aes-128 decrypt round 6 per 4 blocks\r
+ aesdec %xmm9, %xmm1\r
+ aesdec %xmm9, %xmm2\r
+ aesdec %xmm9, %xmm14\r
+ aesdec %xmm9, %xmm15\r
+\r
+ // aes-128 decrypt round 7 per 4 blocks\r
+ aesdec %xmm10, %xmm1\r
+ aesdec %xmm10, %xmm2\r
+ aesdec %xmm10, %xmm14\r
+ aesdec %xmm10, %xmm15\r
+\r
+ // aes-128 decrypt round 8 per 4 blocks\r
+ aesdec %xmm11, %xmm1\r
+ aesdec %xmm11, %xmm2\r
+ aesdec %xmm11, %xmm14\r
+ aesdec %xmm11, %xmm15\r
+\r
+ // aes-128 decrypt round 9 per 4 blocks\r
+ aesdec %xmm12, %xmm1\r
+ aesdec %xmm12, %xmm2\r
+ aesdec %xmm12, %xmm14\r
+ aesdec %xmm12, %xmm15\r
+\r
+ // aes-128 decrypt round 10 (last) per 4 blocks\r
+ aesdeclast %xmm13, %xmm1\r
+ aesdeclast %xmm13, %xmm2\r
+ aesdeclast %xmm13, %xmm14\r
+ aesdeclast %xmm13, %xmm15\r
+\r
+ pxor iv, %xmm1 // obuf[0] ^= *iv; \r
+ movups (ibuf), iv // ibuf[0]\r
+ pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; \r
+ movups 16(ibuf), iv // ibuf[1]\r
+ pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; \r
+ movups 32(ibuf), iv // ibuf[2] \r
+ pxor iv, %xmm15 // obuf[3] ^= ibuf[2]; 
+ movups 48(ibuf), iv // *iv = ibuf[3]\r
+\r
+ movups %xmm1, (obuf) // write 1st obuf\r
+ movups %xmm2, 16(obuf) // write 2nd obuf\r
+ movups %xmm14, 32(obuf) // write 3rd obuf\r
+ movups %xmm15, 48(obuf) // write 4th obuf\r
+\r
+\r
+#else\r
+\r
+ // aes_decrypt_cbc per 4 blocks using aes-128 for i386\r
+ // xmm1/xmm2/xmm4/xmm5 used for obuf per block\r
+ // xmm3 = key0\r
+ // xmm0 = iv\r
+ // xmm6/xmm7 dynamically load with other expanded keys\r
+\r
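+ // rough sketch of the ping-pong key loading used below (not literal code):
+ // while one of xmm6/xmm7 feeds the current round for all 4 blocks, the
+ // other is refilled with the next round key:
+ //
+ //     xmm6 = key1;
+ //     round 0 : pxor with xmm3;        xmm7 = key2;
+ //     round 1 : aesdec with xmm6;      xmm6 = key3;
+ //     round 2 : aesdec with xmm7;      xmm7 = key4;
+ //     ...
+ //     round 9 : aesdec with xmm6;
+ //     round 10: aesdeclast with xmm7;
+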
+ movups (ibuf), %xmm1 // tmp = 1st ibuf\r
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
+ movups 32(ibuf), %xmm4 // tmp = 3rd ibuf\r
+ movups 48(ibuf), %xmm5 // tmp = 4th ibuf\r
+\r
+ // aes_decrypt\r
+ // for i386, sequentially load expanded keys into xmm6/xmm7\r
+\r
+ movups 144(ctx), %xmm6 // key1\r
+\r
+ // aes-128 decrypt round 0 per 4 blocks\r
+ pxor %xmm3, %xmm1\r
+ pxor %xmm3, %xmm2\r
+ pxor %xmm3, %xmm4\r
+ pxor %xmm3, %xmm5\r
+\r
+ movups 128(ctx), %xmm7 // key2\r
+\r
+ // aes-128 decrypt round 1 per 4 blocks\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 112(ctx), %xmm6 // key3\r
+\r
+ // aes-128 decrypt round 2 per 4 blocks\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 96(ctx), %xmm7 // key4\r
+\r
+ // aes-128 decrypt round 3 per 4 blocks\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 80(ctx), %xmm6 // key5\r
+\r
+ // aes-128 decrypt round 4 per 4 blocks\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 64(ctx), %xmm7 // key6\r
+\r
+ // aes-128 decrypt round 5 per 4 blocks\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 48(ctx), %xmm6 // key7\r
+\r
+ // aes-128 decrypt round 6 per 4 blocks\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 32(ctx), %xmm7 // key8\r
+\r
+ // aes-128 decrypt round 7 per 4 blocks\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 16(ctx), %xmm6 // key9\r
+\r
+ // aes-128 decrypt round 8 per 4 blocks\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 0(ctx), %xmm7 // keyA\r
+\r
+ // aes-128 decrypt round 9 per 4 blocks\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ // aes-128 decrypt round 10 (last) per 4 blocks\r
+ aesdeclast %xmm7, %xmm1\r
+ aesdeclast %xmm7, %xmm2\r
+ aesdeclast %xmm7, %xmm4\r
+ aesdeclast %xmm7, %xmm5\r
+\r
+ pxor iv, %xmm1 // 1st obuf ^= iv; \r
+ movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm2 // 2nd obuf ^= iv; \r
+ movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm4 // 3rd obuf ^= iv; \r
+ movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm5 // 4th obuf ^= iv; \r
+ movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+\r
+ movups %xmm1, (obuf) // write 1st obuf\r
+ movups %xmm2, 16(obuf) // write 2nd obuf\r
+ movups %xmm4, 32(obuf) // write 3rd obuf\r
+ movups %xmm5, 48(obuf) // write 4th obuf\r
+#endif\r
+\r
+ add $64, ibuf // ibuf += 4; \r
+ add $64, obuf // obuf += 4; \r
+\r
+ sub $4, num_blk // num_blk -= 4\r
+ jge 0b // if num_blk >= 0, repeat the loop
+\r
+9: add $4, num_blk // post increment num_blk by 4
+ je L_HW_cbc_done // if num_blk == 0, no need for further processing code
+\r
+#if defined __i386__\r
+ // reload xmm4-xmm7 with expanded keys, as they may be needed for the remaining blocks
+ movups 144(ctx), %xmm4\r
+ movups 128(ctx), %xmm5\r
+ movups 112(ctx), %xmm6\r
+ movups 96(ctx), %xmm7\r
+#endif\r
+\r
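+ // the remaining 1-3 blocks are handled as a pair plus a single block;
+ // in rough C (a sketch mirroring the code below):
+ //
+ //     if (num_blk & 2) {
+ //         aes_decrypt(ibuf,   obuf,   ctx);
+ //         aes_decrypt(ibuf+1, obuf+1, ctx);
+ //         obuf[0] ^= *iv; obuf[1] ^= ibuf[0];
+ //         *iv = ibuf[1]; ibuf += 2; obuf += 2;
+ //     }
+ //     if (num_blk & 1) {
+ //         aes_decrypt(ibuf, obuf, ctx);
+ //         *obuf ^= *iv; *iv = *ibuf;
+ //     }
+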
+ test $2, num_blk // check whether num_blk has 2 blocks\r
+ je 9f // if num_blk & 2 == 0, skip the per-pair processing code\r
+\r
+ // do the remaining 2 blocks together\r
+\r
+ movups (ibuf), %xmm1 // tmp = 1st ibuf\r
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
+\r
+ // aes_decrypt\r
+ pxor %xmm3, %xmm1\r
+ pxor %xmm3, %xmm2\r
+ aesdec %xmm4, %xmm1\r
+ aesdec %xmm4, %xmm2\r
+ aesdec %xmm5, %xmm1\r
+ aesdec %xmm5, %xmm2\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+#if defined __x86_64__\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm8, %xmm1\r
+ aesdec %xmm8, %xmm2\r
+ aesdec %xmm9, %xmm1\r
+ aesdec %xmm9, %xmm2\r
+ aesdec %xmm10, %xmm1\r
+ aesdec %xmm10, %xmm2\r
+ aesdec %xmm11, %xmm1\r
+ aesdec %xmm11, %xmm2\r
+ aesdec %xmm12, %xmm1\r
+ aesdec %xmm12, %xmm2\r
+ aesdeclast %xmm13, %xmm1\r
+ aesdeclast %xmm13, %xmm2\r
+#else\r
+ movups 80(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ movups 64(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ movups 48(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ movups 32(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ movups 16(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ movups 0(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdeclast %xmm7, %xmm1\r
+ aesdeclast %xmm7, %xmm2\r
+ movups 112(ctx), %xmm6\r
+ movups 96(ctx), %xmm7\r
+#endif\r
+\r
+ pxor iv, %xmm1 // obuf[0] ^= *iv; \r
+ movups (ibuf), iv // ibuf[0]\r
+ pxor iv, %xmm2 // obuf[1] ^= ibuf[0]\r
+ movups 16(ibuf), iv // *iv = ibuf[1]\r
+\r
+ movups %xmm1, (obuf) // write obuf[0]\r
+ movups %xmm2, 16(obuf) // write obuf[1]\r
+\r
+ add $32, ibuf // ibuf += 2\r
+ add $32, obuf // obuf += 2\r
+\r
+9:\r
+ test $1, num_blk // check whether num_blk has residual 1 block\r
+ je L_HW_cbc_done // if (num_blk & 1) == 0, no residual block to process
+ \r
+ movups (ibuf), %xmm2 // tmp = ibuf\r
+ // aes_decrypt\r
+ pxor %xmm3, %xmm2\r
+ aesdec %xmm4, %xmm2\r
+ aesdec %xmm5, %xmm2\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm7, %xmm2\r
+#if defined __x86_64__\r
+ aesdec %xmm8, %xmm2\r
+ aesdec %xmm9, %xmm2\r
+ aesdec %xmm10, %xmm2\r
+ aesdec %xmm11, %xmm2\r
+ aesdec %xmm12, %xmm2\r
+ aesdeclast %xmm13, %xmm2\r
+#else\r
+ movups 80(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 64(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 48(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 32(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 16(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups (ctx), %xmm1\r
+ aesdeclast %xmm1, %xmm2\r
+#endif\r
+\r
+ pxor iv, %xmm2 // *obuf ^= *iv; \r
+ movups (ibuf), iv // *iv = *ibuf;\r
+ movups %xmm2, (obuf) // write *obuf\r
+\r
+ jmp L_HW_cbc_done\r
+\r
+ //\r
+ // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
+ //\r
+\r
+L_decrypt_192:\r
+\r
+ cmp $1, num_blk\r
+ jl L_HW_cbc_done // if num_blk < 1, early return\r
+\r
+ // aes-192 decrypt expanded keys
+ movups 192(ctx), %xmm3\r
+ movups 176(ctx), %xmm4\r
+ movups 160(ctx), %xmm5\r
+ movups 144(ctx), %xmm6\r
+ movups 128(ctx), %xmm7\r
+#if defined __x86_64__\r
+ movups 112(ctx), %xmm8\r
+ movups 96(ctx), %xmm9\r
+ movups 80(ctx), %xmm10\r
+ movups 64(ctx), %xmm11\r
+ movups 48(ctx), %xmm12\r
+ movups 32(ctx), %xmm13\r
+ movups 16(ctx), %xmm14\r
+ movups (ctx), %xmm15\r
+#endif\r
+\r
+ // decrypt 4 blocks per iteration to exploit the available parallelism in cbc decryption
+\r
+ // while ((num_blk-=4)>=0) {\r
+ // aes_decrypt(ibuf, obuf, ctx);\r
+ // aes_decrypt(ibuf+1, obuf+1, ctx);\r
+ // aes_decrypt(ibuf+2, obuf+2, ctx);\r
+ // aes_decrypt(ibuf+3, obuf+3, ctx);\r
+ // obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
+ // *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
+ // }\r
+\r
+ sub $4, num_blk // pre decrement num_blk by 4\r
+ jl 9f // if num_blk < 4, skip the per-4-blocks processing code\r
+0:\r
+\r
+#if defined __x86_64__\r
+\r
+ movups (ibuf), %xmm1 // tmp = 1st ibuf\r
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
+ movups 32(ibuf), %xmm14 // tmp = 3rd ibuf\r
+ movups 48(ibuf), %xmm15 // tmp = 4th ibuf\r
+\r
+ // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13\r
+ // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards
+\r
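+ // sketch of the register/key schedule for the 12 aes-192 rounds (xmm14/xmm15
+ // carry data here, so the last two round keys are cycled through xmm12/xmm13
+ // and the original keys restored afterwards):
+ //
+ //     round 0    : pxor with 192(ctx)         (xmm3)
+ //     rounds 1-9 : aesdec with 176..48(ctx)   (xmm4-xmm12)
+ //     round 10   : aesdec with 32(ctx) (xmm13), reload xmm12 <- 16(ctx)
+ //     round 11   : aesdec with 16(ctx) (xmm12), reload xmm13 <-  0(ctx)
+ //     round 12   : aesdeclast with 0(ctx) (xmm13), then restore xmm12/xmm13
+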
+ // round 0 for 4 blocks\r
+ pxor %xmm3, %xmm1\r
+ pxor %xmm3, %xmm2\r
+ pxor %xmm3, %xmm14\r
+ pxor %xmm3, %xmm15\r
+\r
+ // round 1 for 4 blocks\r
+ aesdec %xmm4, %xmm1\r
+ aesdec %xmm4, %xmm2\r
+ aesdec %xmm4, %xmm14\r
+ aesdec %xmm4, %xmm15\r
+\r
+ // round 2 for 4 blocks\r
+ aesdec %xmm5, %xmm1\r
+ aesdec %xmm5, %xmm2\r
+ aesdec %xmm5, %xmm14\r
+ aesdec %xmm5, %xmm15\r
+\r
+ // round 3 for 4 blocks\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm14\r
+ aesdec %xmm6, %xmm15\r
+\r
+ // round 4 for 4 blocks\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm14\r
+ aesdec %xmm7, %xmm15\r
+\r
+ // round 5 for 4 blocks\r
+ aesdec %xmm8, %xmm1\r
+ aesdec %xmm8, %xmm2\r
+ aesdec %xmm8, %xmm14\r
+ aesdec %xmm8, %xmm15\r
+\r
+ // round 6 for 4 blocks\r
+ aesdec %xmm9, %xmm1\r
+ aesdec %xmm9, %xmm2\r
+ aesdec %xmm9, %xmm14\r
+ aesdec %xmm9, %xmm15\r
+\r
+ // round 7 for 4 blocks\r
+ aesdec %xmm10, %xmm1\r
+ aesdec %xmm10, %xmm2\r
+ aesdec %xmm10, %xmm14\r
+ aesdec %xmm10, %xmm15\r
+\r
+ // round 8 for 4 blocks\r
+ aesdec %xmm11, %xmm1\r
+ aesdec %xmm11, %xmm2\r
+ aesdec %xmm11, %xmm14\r
+ aesdec %xmm11, %xmm15\r
+\r
+ // round 9 for 4 blocks\r
+ aesdec %xmm12, %xmm1\r
+ aesdec %xmm12, %xmm2\r
+ aesdec %xmm12, %xmm14\r
+ aesdec %xmm12, %xmm15\r
+\r
+ movups 16(ctx), %xmm12\r
+\r
+ // round A for 4 blocks\r
+ aesdec %xmm13, %xmm1\r
+ aesdec %xmm13, %xmm2\r
+ aesdec %xmm13, %xmm14\r
+ aesdec %xmm13, %xmm15\r
+\r
+ movups (ctx), %xmm13\r
+\r
+ // round B for 4 blocks\r
+ aesdec %xmm12, %xmm1\r
+ aesdec %xmm12, %xmm2\r
+ aesdec %xmm12, %xmm14\r
+ aesdec %xmm12, %xmm15\r
+\r
+ movups 48(ctx), %xmm12 // restore %xmm12 to its original key\r
+\r
+ // round C (last) for 4 blocks\r
+ aesdeclast %xmm13, %xmm1\r
+ aesdeclast %xmm13, %xmm2\r
+ aesdeclast %xmm13, %xmm14\r
+ aesdeclast %xmm13, %xmm15\r
+\r
+ movups 32(ctx), %xmm13 // restore %xmm13 to its original key\r
+\r
+ pxor iv, %xmm1 // obuf[0] ^= *iv; \r
+ movups (ibuf), iv // ibuf[0]\r
+ pxor iv, %xmm2 // obuf[1] ^= ibuf[0] \r
+ movups 16(ibuf), iv // ibuf[1]\r
+ pxor iv, %xmm14 // obuf[2] ^= ibuf[1] \r
+ movups 32(ibuf), iv // ibuf[2] \r
+ pxor iv, %xmm15 // obuf[3] ^= ibuf[2] \r
+ movups 48(ibuf), iv // *iv = ibuf[3] \r
+\r
+ movups %xmm1, (obuf) // write 1st obuf\r
+ movups %xmm2, 16(obuf) // write 2nd obuf\r
+ movups %xmm14, 32(obuf) // write 3rd obuf\r
+ movups %xmm15, 48(obuf) // write 4th obuf\r
+\r
+ add $64, ibuf // ibuf += 4; \r
+ add $64, obuf // obuf += 4; \r
+\r
+ sub $4, num_blk // num_blk -= 4\r
+ jge 0b // if num_blk >= 0, repeat the loop
+\r
+9: add $4, num_blk // post increment num_blk by 4
+ je L_HW_cbc_done // if num_blk == 0, prepare to return \r
+\r
+ movups 16(ctx), %xmm14 // restore %xmm14 to its key\r
+ movups (ctx), %xmm15 // restore %xmm15 to its key\r
+\r
+#else\r
+\r
+ movups (ibuf), %xmm1 // tmp = 1st ibuf\r
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
+ movups 32(ibuf), %xmm4 // tmp = 3rd ibuf\r
+ movups 48(ibuf), %xmm5 // tmp = 4th ibuf\r
+\r
+ // aes_decrypt\r
+ // for i386, sequentially load expanded keys into xmm6/xmm7\r
+ movups 176(ctx), %xmm6\r
+ pxor %xmm3, %xmm1\r
+ pxor %xmm3, %xmm2\r
+ pxor %xmm3, %xmm4\r
+ pxor %xmm3, %xmm5\r
+\r
+ movups 160(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 144(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 128(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 112(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 96(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 80(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 64(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 48(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 32(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 16(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 0(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ aesdeclast %xmm7, %xmm1\r
+ aesdeclast %xmm7, %xmm2\r
+ aesdeclast %xmm7, %xmm4\r
+ aesdeclast %xmm7, %xmm5\r
+\r
+ pxor iv, %xmm1 // 1st obuf ^= iv; \r
+ movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm2 // 2nd obuf ^= iv; \r
+ movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm4 // 3rd obuf ^= iv; \r
+ movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm5 // 4th obuf ^= iv; \r
+ movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ movups %xmm1, (obuf) // write 1st obuf\r
+ movups %xmm2, 16(obuf) // write 2nd obuf\r
+ movups %xmm4, 32(obuf) // write 3rd obuf\r
+ movups %xmm5, 48(obuf) // write 4th obuf\r
+\r
+ add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; \r
+ add $64, obuf // obuf += AES_BLOCK_SIZE * 4; \r
+\r
+ sub $4, num_blk // num_blk -= 4\r
+ jge 0b // if num_blk >= 0, repeat the loop
+\r
+\r
+9: add $4, num_blk // post increment num_blk by 4
+ je L_HW_cbc_done // if num_blk == 0, no need for further processing code
+\r
+ movups 176(ctx), %xmm4\r
+ movups 160(ctx), %xmm5\r
+ movups 144(ctx), %xmm6\r
+ movups 128(ctx), %xmm7\r
+\r
+#endif\r
+\r
+ // per-block aes_decrypt_cbc loop\r
+\r
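+ // in rough C (a sketch; same notation as the 4-block pseudo-code above):
+ //
+ //     while (num_blk--) {
+ //         aes_decrypt(ibuf, obuf, ctx);
+ //         *obuf ^= *iv;
+ //         *iv = *ibuf;
+ //         ibuf++; obuf++;
+ //     }
+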
+0:\r
+ movups (ibuf), %xmm2 // tmp = ibuf\r
+\r
+ // aes_decrypt\r
+ pxor %xmm3, %xmm2\r
+ aesdec %xmm4, %xmm2\r
+ aesdec %xmm5, %xmm2\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm7, %xmm2\r
+#if defined __x86_64__\r
+ aesdec %xmm8, %xmm2\r
+ aesdec %xmm9, %xmm2\r
+ aesdec %xmm10, %xmm2\r
+ aesdec %xmm11, %xmm2\r
+ aesdec %xmm12, %xmm2\r
+ aesdec %xmm13, %xmm2\r
+ aesdec %xmm14, %xmm2\r
+ aesdeclast %xmm15, %xmm2\r
+#else\r
+ movups 112(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 96(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 80(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 64(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 48(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 32(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 16(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups (ctx), %xmm1\r
+ aesdeclast %xmm1, %xmm2\r
+#endif\r
+\r
+ pxor iv, %xmm2 // obuf ^= iv; \r
+ movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+\r
+ movups %xmm2, (obuf) // write obuf\r
+\r
+ add $16, ibuf // ibuf += AES_BLOCK_SIZE; \r
+ add $16, obuf // obuf += AES_BLOCK_SIZE; \r
+ sub $1, num_blk // num_blk --\r
+ jg 0b // if num_blk > 0, repeat the loop\r
+\r
+ jmp L_HW_cbc_done\r
+\r
+ //\r
+ // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
+ //\r
+\r
+L_decrypt_256:\r
+\r
+ cmp $1, num_blk\r
+ jl L_HW_cbc_done \r
+\r
+ movups 224(ctx), %xmm3\r
+ movups 208(ctx), %xmm4\r
+ movups 192(ctx), %xmm5\r
+ movups 176(ctx), %xmm6\r
+ movups 160(ctx), %xmm7\r
+#if defined __x86_64__\r
+ movups 144(ctx), %xmm8\r
+ movups 128(ctx), %xmm9\r
+ movups 112(ctx), %xmm10\r
+ movups 96(ctx), %xmm11\r
+ movups 80(ctx), %xmm12\r
+ movups 64(ctx), %xmm13\r
+ movups 48(ctx), %xmm14\r
+ movups 32(ctx), %xmm15\r
+// movups 16(ctx), %xmm14\r
+// movups (ctx), %xmm15\r
+#endif\r
+\r
+#if defined __x86_64__\r
+\r
+ sub $4, num_blk // pre decrement num_blk by 4\r
+ jl 9f // if num_blk < 4, skip the per-4-blocks processing code\r
+0:\r
+ movups (ibuf), %xmm1 // tmp = 1st ibuf\r
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
+ movups 32(ibuf), %xmm14 // tmp = 3rd ibuf\r
+ movups 48(ibuf), %xmm15 // tmp = 4th ibuf\r
+\r
+ // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13\r
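+ // sketch of the register/key schedule for the 14 aes-256 rounds (as in the
+ // aes-192 case, xmm14/xmm15 carry data, so the last four round keys are
+ // cycled through xmm12/xmm13 and the original keys restored at the end):
+ //
+ //     round 0    : pxor with 224(ctx)         (xmm3)
+ //     rounds 1-9 : aesdec with 208..80(ctx)   (xmm4-xmm12)
+ //     round 10   : aesdec with 64(ctx) (xmm13), reload xmm12 <- 48(ctx)
+ //     round 11   : aesdec with 48(ctx) (xmm12), reload xmm13 <- 32(ctx)
+ //     round 12   : aesdec with 32(ctx) (xmm13), reload xmm12 <- 16(ctx)
+ //     round 13   : aesdec with 16(ctx) (xmm12), reload xmm13 <-  0(ctx)
+ //     round 14   : aesdeclast with 0(ctx) (xmm13), then restore xmm12/xmm13
+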
+ pxor %xmm3, %xmm1\r
+ pxor %xmm3, %xmm2\r
+ pxor %xmm3, %xmm14\r
+ pxor %xmm3, %xmm15\r
+\r
+ aesdec %xmm4, %xmm1\r
+ aesdec %xmm4, %xmm2\r
+ aesdec %xmm4, %xmm14\r
+ aesdec %xmm4, %xmm15\r
+\r
+ aesdec %xmm5, %xmm1\r
+ aesdec %xmm5, %xmm2\r
+ aesdec %xmm5, %xmm14\r
+ aesdec %xmm5, %xmm15\r
+\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm14\r
+ aesdec %xmm6, %xmm15\r
+\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm14\r
+ aesdec %xmm7, %xmm15\r
+\r
+ aesdec %xmm8, %xmm1\r
+ aesdec %xmm8, %xmm2\r
+ aesdec %xmm8, %xmm14\r
+ aesdec %xmm8, %xmm15\r
+\r
+ aesdec %xmm9, %xmm1\r
+ aesdec %xmm9, %xmm2\r
+ aesdec %xmm9, %xmm14\r
+ aesdec %xmm9, %xmm15\r
+\r
+ aesdec %xmm10, %xmm1\r
+ aesdec %xmm10, %xmm2\r
+ aesdec %xmm10, %xmm14\r
+ aesdec %xmm10, %xmm15\r
+\r
+ aesdec %xmm11, %xmm1\r
+ aesdec %xmm11, %xmm2\r
+ aesdec %xmm11, %xmm14\r
+ aesdec %xmm11, %xmm15\r
+\r
+ aesdec %xmm12, %xmm1\r
+ aesdec %xmm12, %xmm2\r
+ aesdec %xmm12, %xmm14\r
+ aesdec %xmm12, %xmm15\r
+ movups 48(ctx), %xmm12\r
+\r
+ aesdec %xmm13, %xmm1\r
+ aesdec %xmm13, %xmm2\r
+ aesdec %xmm13, %xmm14\r
+ aesdec %xmm13, %xmm15\r
+ movups 32(ctx), %xmm13\r
+\r
+ aesdec %xmm12, %xmm1\r
+ aesdec %xmm12, %xmm2\r
+ aesdec %xmm12, %xmm14\r
+ aesdec %xmm12, %xmm15\r
+ movups 16(ctx), %xmm12\r
+\r
+ aesdec %xmm13, %xmm1\r
+ aesdec %xmm13, %xmm2\r
+ aesdec %xmm13, %xmm14\r
+ aesdec %xmm13, %xmm15\r
+ movups (ctx), %xmm13\r
+\r
+ aesdec %xmm12, %xmm1\r
+ aesdec %xmm12, %xmm2\r
+ aesdec %xmm12, %xmm14\r
+ aesdec %xmm12, %xmm15\r
+ movups 80(ctx), %xmm12\r
+\r
+ aesdeclast %xmm13, %xmm1\r
+ aesdeclast %xmm13, %xmm2\r
+ aesdeclast %xmm13, %xmm14\r
+ aesdeclast %xmm13, %xmm15\r
+ movups 64(ctx), %xmm13\r
+\r
+ pxor iv, %xmm1 // obuf ^= iv; \r
+ movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm2 // obuf ^= iv; \r
+ movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm14 // obuf ^= iv; \r
+ movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm15 // obuf ^= iv; \r
+ movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+\r
+ movups %xmm1, (obuf) // write 1st obuf\r
+ movups %xmm2, 16(obuf) // write 2nd obuf\r
+ movups %xmm14, 32(obuf) // write 3rd obuf\r
+ movups %xmm15, 48(obuf) // write 4th obuf\r
+\r
+ add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; \r
+ add $64, obuf // obuf += AES_BLOCK_SIZE*4; \r
+\r
+ sub $4, num_blk // num_blk -= 4\r
+ jge 0b // if num_blk >= 0, repeat the loop
+\r
+9: add $4, num_blk // post increment num_blk by 4
+ je L_HW_cbc_done // if num_blk == 0, no need for further processing code
+\r
+ movups 48(ctx), %xmm14\r
+ movups 32(ctx), %xmm15\r
+\r
+#else\r
+\r
+ sub $4, num_blk // pre decrement num_blk by 4\r
+ jl 9f // if num_blk < 4, skip the per-4-blocks processing code
+0:\r
+ movups (ibuf), %xmm1 // tmp = 1st ibuf\r
+ movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
+ movups 32(ibuf), %xmm4 // tmp = 3rd ibuf\r
+ movups 48(ibuf), %xmm5 // tmp = 4th ibuf\r
+\r
+ // aes_decrypt\r
+ // for i386, sequentially load expanded keys into xmm6/xmm7\r
+ movups 208(ctx), %xmm6\r
+ pxor %xmm3, %xmm1\r
+ pxor %xmm3, %xmm2\r
+ pxor %xmm3, %xmm4\r
+ pxor %xmm3, %xmm5\r
+\r
+ movups 192(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 176(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 160(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 144(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 128(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 112(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 96(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 80(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 64(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 48(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 32(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ movups 16(ctx), %xmm6\r
+ aesdec %xmm7, %xmm1\r
+ aesdec %xmm7, %xmm2\r
+ aesdec %xmm7, %xmm4\r
+ aesdec %xmm7, %xmm5\r
+\r
+ movups 0(ctx), %xmm7\r
+ aesdec %xmm6, %xmm1\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm6, %xmm4\r
+ aesdec %xmm6, %xmm5\r
+\r
+ aesdeclast %xmm7, %xmm1\r
+ aesdeclast %xmm7, %xmm2\r
+ aesdeclast %xmm7, %xmm4\r
+ aesdeclast %xmm7, %xmm5\r
+\r
+ pxor iv, %xmm1 // 1st obuf ^= iv; \r
+ movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm2 // 2nd obuf ^= iv; \r
+ movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm4 // 3rd obuf ^= iv; \r
+ movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ pxor iv, %xmm5 // 4th obuf ^= iv; \r
+ movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+ movups %xmm1, (obuf) // write 1st obuf\r
+ movups %xmm2, 16(obuf) // write 2nd obuf\r
+ movups %xmm4, 32(obuf) // write 3rd obuf\r
+ movups %xmm5, 48(obuf) // write 4th obuf\r
+\r
+ add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; \r
+ add $64, obuf // obuf += AES_BLOCK_SIZE * 4; \r
+\r
+ sub $4, num_blk // num_blk -= 4\r
+ jge 0b // if num_blk >= 0, repeat the loop
+\r
+\r
+9: add $4, num_blk // post increment num_blk by 4
+ je L_HW_cbc_done // if num_blk == 0, no need for further processing code
+\r
+ movups 208(ctx), %xmm4\r
+ movups 192(ctx), %xmm5\r
+ movups 176(ctx), %xmm6\r
+ movups 160(ctx), %xmm7\r
+\r
+#endif\r
+\r
+0:\r
+ movups (ibuf), %xmm2 // tmp = ibuf\r
+\r
+ // aes_decrypt\r
+ pxor %xmm3, %xmm2\r
+ aesdec %xmm4, %xmm2\r
+ aesdec %xmm5, %xmm2\r
+ aesdec %xmm6, %xmm2\r
+ aesdec %xmm7, %xmm2\r
+#if defined __x86_64__\r
+ aesdec %xmm8, %xmm2\r
+ aesdec %xmm9, %xmm2\r
+ aesdec %xmm10, %xmm2\r
+ aesdec %xmm11, %xmm2\r
+ aesdec %xmm12, %xmm2\r
+ aesdec %xmm13, %xmm2\r
+ aesdec %xmm14, %xmm2\r
+ aesdec %xmm15, %xmm2\r
+#else\r
+ movups 144(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 128(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 112(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 96(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 80(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 64(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 48(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups 32(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+#endif\r
+ movups 16(ctx), %xmm1\r
+ aesdec %xmm1, %xmm2\r
+ movups (ctx), %xmm1\r
+ aesdeclast %xmm1, %xmm2\r
+\r
+ pxor iv, %xmm2 // obuf ^= iv; \r
+ movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+\r
+ movups %xmm2, (obuf) // write obuf\r
+\r
+ add $16, ibuf // ibuf += AES_BLOCK_SIZE; \r
+ add $16, obuf // obuf += AES_BLOCK_SIZE; \r
+ sub $1, num_blk // num_blk --\r
+ jg 0b // if num_blk > 0, repeat the loop\r
+\r
+ jmp L_HW_cbc_done\r
+\r
+ //\r
+ // --------- END of aes_decrypt_cbc_hw -------------------\r
+ //\r