/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 These subroutines implement multiple block AES modes for ECB, CBC, CFB,
 OFB and CTR encryption. The code provides support for the VIA Advanced
 Cryptography Engine (ACE).

 NOTE: In the following subroutines, the AES contexts (ctx) must be
 16 byte aligned if VIA ACE is being used.
*/

/* modified 3/5/10 cclee */
/* Cleaned up the parts related to VIA ACE and hand-optimized aes_cbc_encrypt and aes_cbc_decrypt. */
/* Moved the xmm register save/restore, originally inside the callee functions, into these 2 caller functions. */

/* HW-AES specific implementation cclee 3-12-10 */
/* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled,
   and if kHasAES is detected, branch to the hw-specific functions here. */


/*
   This file defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- the Intel Westmere HW AES-based implementation
   of _aes_encrypt_cbc and _aes_decrypt_cbc.

   These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available.
   They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.

   The AES HW is detected 1st thing in
       _aes_encrypt_cbc (aes_modes_asm.s)
       _aes_decrypt_cbc (aes_modes_asm.s)
   and, if AES HW is detected, branch without link (ie, jump) to the functions here.

   The implementation here follows the examples in the Intel White Paper
   "Intel Advanced Encryption Standard (AES) Instruction Set" Rev. 2.01

   Note: Rev. 03 Final 2010 01 26 is available. Looks like some code changed from Rev. 2.01.

   cclee 3-13-10
*/

/*
   The function _aes_decrypt_cbc_hw previously simply decrypted serially, block by block.
   In our group meeting, Eric/Ali suggested that I take a look at combining multiple blocks
   in a loop and interleaving multiple aesdec instructions to absorb/hide stalls and improve the decrypt throughput.

   The idea was actually described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55).

   This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode.
   On a K18 (2.4GHz core-i5/2.66GHz core-i7), the x86_64 decrypt throughput (in xnu-iokit) has been improved
   from 1180/1332 to 1667/1858 MBytes/sec. This is approximately a 1.40x speedup in the decryption.
   The encrypt throughput is not changed.

   I also enhanced the assembly code comments.

   cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.)

*/

/* ----------------------------------------------------------------------------------------------------------------

    aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :

    For simplicity, I am assuming all variables are in 128-bit data type.

    aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
    {
        while (num_blk--) {
            *iv ^= *ibuf++;
            aes_encrypt(iv, iv, ctx);
            *obuf++ = *iv;
        }
        return 0;
    }

    The following is an implementation of this function using Intel AESNI.
    This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
    Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
    to this aesni-based function should it detect that aesni is available.
    Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.

    Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks
    are serially chained. This prevents us from arranging several blocks for encryption in parallel.

   ----------------------------------------------------------------------------------------------------------------*/
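
/*
   For illustration only: a minimal C intrinsics sketch (NOT part of this build) of the serial aes-128
   cbc encrypt loop implemented below. It assumes <wmmintrin.h> and a caller-supplied expanded key
   schedule key[0..10] (11 round keys), mirroring the 160-byte schedule read from ctx:

        __m128i iv = _mm_loadu_si128((const __m128i *) in_iv);
        while (num_blk--) {
            iv = _mm_xor_si128(iv, _mm_loadu_si128((const __m128i *) ibuf++)); // *iv ^= *ibuf++
            iv = _mm_xor_si128(iv, key[0]);             // round 0 : AddRoundKey
            for (int r = 1; r < 10; r++)
                iv = _mm_aesenc_si128(iv, key[r]);      // rounds 1..9
            iv = _mm_aesenclast_si128(iv, key[10]);     // final round
            _mm_storeu_si128((__m128i *) obuf++, iv);   // *obuf++ = *iv
        }

   Each iteration consumes the previous ciphertext as iv, which is the serial chain mentioned above.
*/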

    .text
    .align  4,0x90
    .globl  _aes_encrypt_cbc_hw
_aes_encrypt_cbc_hw:

    // push/save registers for local use
#if defined __i386__

    push    %ebp
    movl    %esp, %ebp
    push    %ebx
    push    %edi

    #define sp  %esp

#else   // __x86_64__

    push    %rbp
    mov     %rsp, %rbp
    push    %rbx
    push    %r13
    push    %r14
    push    %r15

    #define sp  %rsp

#endif

    // if this is kernel code, need to save used xmm registers
#ifdef  KERNEL

#if defined __i386__
    sub     $(8*16), %esp       // for possible xmm0-xmm7 save/restore
#else
    sub     $(16*16), %rsp      // xmm0-xmm15 save/restore
#endif

    movaps  %xmm0, (sp)
    movaps  %xmm1, 16(sp)
    movaps  %xmm2, 32(sp)
    movaps  %xmm3, 48(sp)
    movaps  %xmm4, 64(sp)
    movaps  %xmm5, 80(sp)
    movaps  %xmm6, 96(sp)
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)
#endif  // __x86_64__

#endif  // KERNEL

    #define iv  %xmm0

#ifdef  __i386__

    mov     12(%ebp), %eax      // in_iv
    mov     24(%ebp), %edx      // ctx
    movups  (%eax), iv          // iv = in_iv
    mov     8(%ebp), %ebx       // ibuf
    mov     16(%ebp), %ecx      // num_blk
    mov     20(%ebp), %edi      // obuf

    #define ibuf    %ebx
    #define obuf    %edi
    #define num_blk %ecx
    #define ctx     %edx

#else

    mov     %rdi, %rbx          // ibuf
    movups  (%rsi), iv          // iv = in_iv
    mov     %rdx, %r13          // num_blk
    mov     %rcx, %r14          // obuf
    mov     %r8, %r15           // ctx

    #define ibuf    %rbx
    #define num_blk %r13d
    #define obuf    %r14
    #define ctx     %r15

#endif

    mov     240(ctx), %eax      // aes length
    cmp     $160, %eax          // aes-128 encrypt ?
    je      L_encrypt_128
    cmp     $192, %eax          // aes-192 encrypt ?
    je      L_encrypt_192
    cmp     $224, %eax          // aes-256 encrypt ?
    je      L_encrypt_256
    mov     $-1, %eax           // return error
    jmp     L_error
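
    // Note (added for clarity): the value at offset 240 of ctx is the expanded key-schedule length in bytes;
    // 160/192/224 = 16 bytes per round for the 10/12/14 rounds of aes-128/192/256 respectively.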

    //
    // aes-128 encrypt_cbc operation, up to L_HW_cbc_done
    //

L_encrypt_128:

    cmp     $1, num_blk         // check number of blocks
    jl      L_HW_cbc_done       // should it be less than 1, nothing to do

    movups  (ctx), %xmm2        // key0
    movups  16(ctx), %xmm3      // key1
    movups  32(ctx), %xmm4      // key2
    movups  48(ctx), %xmm5      // key3
    movups  64(ctx), %xmm6      // key4
    movups  80(ctx), %xmm7      // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8      // key6
    movups  112(ctx), %xmm9     // key7
    movups  128(ctx), %xmm10    // key8
    movups  144(ctx), %xmm11    // key9
    movups  160(ctx), %xmm12    // keyA
#endif

    // while (num_blk--) {
    //     *iv ^= *ibuf++;
    //     aes_encrypt(iv, iv, ctx);
    //     *obuf++ = *iv;
    // }
0:
    movups  (ibuf), %xmm1       // *ibuf
    pxor    %xmm2, iv           // 1st instruction inside aes_encrypt
    pxor    %xmm1, iv           // *iv ^= *ibuf

    // finishing up the rest of aes_encrypt
    aesenc  %xmm3, iv
    aesenc  %xmm4, iv
    aesenc  %xmm5, iv
    aesenc  %xmm6, iv
    aesenc  %xmm7, iv
#if defined __x86_64__
    aesenc  %xmm8, iv
    aesenc  %xmm9, iv
    aesenc  %xmm10, iv
    aesenc  %xmm11, iv
    aesenclast  %xmm12, iv
#else
    movups  96(ctx), %xmm1      // key6
    aesenc  %xmm1, iv
    movups  112(ctx), %xmm1     // key7
    aesenc  %xmm1, iv
    movups  128(ctx), %xmm1     // key8
    aesenc  %xmm1, iv
    movups  144(ctx), %xmm1     // key9
    aesenc  %xmm1, iv
    movups  160(ctx), %xmm1     // keyA
    aesenclast  %xmm1, iv
#endif

    movups  iv, (obuf)          // *obuf = *iv;
    add     $16, obuf           // obuf++;
    add     $16, ibuf           // ibuf++;
    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)

L_HW_cbc_done:

    xor     %eax, %eax          // to return CRYPT_OK

L_error:

    // if kernel, restore xmm registers
#ifdef  KERNEL
    movaps  0(sp), %xmm0
    movaps  16(sp), %xmm1
    movaps  32(sp), %xmm2
    movaps  48(sp), %xmm3
    movaps  64(sp), %xmm4
    movaps  80(sp), %xmm5
    movaps  96(sp), %xmm6
    movaps  112(sp), %xmm7
#if defined __x86_64__
    movaps  16*8(sp), %xmm8
    movaps  16*9(sp), %xmm9
    movaps  16*10(sp), %xmm10
    movaps  16*11(sp), %xmm11
    movaps  16*12(sp), %xmm12
    movaps  16*13(sp), %xmm13
    movaps  16*14(sp), %xmm14
    movaps  16*15(sp), %xmm15
#endif  // __x86_64__
#endif  // KERNEL

    // release used stack memory, restore used callee-saved registers, and return
#if defined __i386__
#ifdef  KERNEL
    add     $(8*16), %esp
#endif
    pop     %edi
    pop     %ebx
#else
#ifdef  KERNEL
    add     $(16*16), %rsp
#endif
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %rbx
#endif
    leave
    ret

    //
    // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_encrypt_192:

    cmp     $1, num_blk         // check number of blocks
    jl      L_HW_cbc_done       // should it be less than 1, nothing to do

    movups  (ctx), %xmm2        // key0
    movups  16(ctx), %xmm3      // key1
    movups  32(ctx), %xmm4      // key2
    movups  48(ctx), %xmm5      // key3
    movups  64(ctx), %xmm6      // key4
    movups  80(ctx), %xmm7      // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8      // key6
    movups  112(ctx), %xmm9     // key7
    movups  128(ctx), %xmm10    // key8
    movups  144(ctx), %xmm11    // key9
    movups  160(ctx), %xmm12    // keyA
    movups  176(ctx), %xmm13    // keyB
    movups  192(ctx), %xmm14    // keyC
#endif

    // while (num_blk--) {
    //     *iv ^= *ibuf++;
    //     aes_encrypt(iv, iv, ctx);
    //     *obuf++ = *iv;
    // }
0:
    movups  (ibuf), %xmm1       // *ibuf
    pxor    %xmm1, iv           // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);

    pxor    %xmm2, iv
    aesenc  %xmm3, iv
    aesenc  %xmm4, iv
    aesenc  %xmm5, iv
    aesenc  %xmm6, iv
    aesenc  %xmm7, iv
#if defined __x86_64__
    aesenc  %xmm8, iv
    aesenc  %xmm9, iv
    aesenc  %xmm10, iv
    aesenc  %xmm11, iv
    aesenc  %xmm12, iv
    aesenc  %xmm13, iv
    aesenclast  %xmm14, iv
#else
    movups  96(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  112(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  128(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  144(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  160(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  176(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  192(ctx), %xmm1
    aesenclast  %xmm1, iv
#endif

    movups  iv, (obuf)          // *obuf = *iv;
    add     $16, ibuf           // ibuf++
    add     $16, obuf           // obuf++

    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done       // share with the common exit code

    //
    // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_encrypt_256:

    cmp     $1, num_blk         // check number of blocks
    jl      L_HW_cbc_done       // should it be less than 1, nothing to do

    movups  (ctx), %xmm2        // key0
    movups  16(ctx), %xmm3      // key1
    movups  32(ctx), %xmm4      // key2
    movups  48(ctx), %xmm5      // key3
    movups  64(ctx), %xmm6      // key4
    movups  80(ctx), %xmm7      // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8      // key6
    movups  112(ctx), %xmm9     // key7
    movups  128(ctx), %xmm10    // key8
    movups  144(ctx), %xmm11    // key9
    movups  160(ctx), %xmm12    // keyA
    movups  176(ctx), %xmm13    // keyB
    movups  192(ctx), %xmm14    // keyC
    movups  208(ctx), %xmm15    // keyD
    // movups  224(ctx), %xmm1  // keyE
#endif

    // while (num_blk--) {
    //     *iv ^= *ibuf++;
    //     aes_encrypt(iv, iv, ctx);
    //     *obuf++ = *iv;
    // }
0:
    movups  (ibuf), %xmm1       // *ibuf
    pxor    %xmm1, iv           // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);
    pxor    %xmm2, iv
    aesenc  %xmm3, iv
    aesenc  %xmm4, iv
    aesenc  %xmm5, iv
    aesenc  %xmm6, iv
    aesenc  %xmm7, iv
#if defined __x86_64__
    movups  224(ctx), %xmm1     // keyE
    aesenc  %xmm8, iv
    aesenc  %xmm9, iv
    aesenc  %xmm10, iv
    aesenc  %xmm11, iv
    aesenc  %xmm12, iv
    aesenc  %xmm13, iv
    aesenc  %xmm14, iv
    aesenc  %xmm15, iv
    aesenclast  %xmm1, iv
#else
    movups  96(ctx), %xmm1      // key6
    aesenc  %xmm1, iv
    movups  112(ctx), %xmm1     // key7
    aesenc  %xmm1, iv
    movups  128(ctx), %xmm1     // key8
    aesenc  %xmm1, iv
    movups  144(ctx), %xmm1     // key9
    aesenc  %xmm1, iv
    movups  160(ctx), %xmm1     // keyA
    aesenc  %xmm1, iv
    movups  176(ctx), %xmm1     // keyB
    aesenc  %xmm1, iv
    movups  192(ctx), %xmm1     // keyC
    aesenc  %xmm1, iv
    movups  208(ctx), %xmm1     // keyD
    aesenc  %xmm1, iv
    movups  224(ctx), %xmm1     // keyE
    aesenclast  %xmm1, iv
#endif

    movups  iv, (obuf)          // *obuf = *iv;
    add     $16, ibuf           // ibuf++
    add     $16, obuf           // obuf++

    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done       // share with the common exit code



    //
    // --------- END of aes_encrypt_cbc_hw -------------------
    //


/* ----------------------------------------------------------------------------------------------------------------

    aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :

    For simplicity, I am assuming all variables are in 128-bit data type.

    aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
    {
        while (num_blk--) {
            aes_decrypt(ibuf, obuf, ctx);
            *obuf++ ^= *iv;
            *iv = *ibuf++;
        }
        return 0;
    }

    The following is an implementation of this function using Intel AESNI.
    This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
    Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
    to this aesni-based function should it detect that aesni is available.
    Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.

    Note that the decryption operations are independent across blocks.
    This gives us the opportunity to arrange aes_decrypt operations in parallel to speed up the code.
    This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55).
    The following assembly code exploits this idea to achieve an ~1.4x speedup in aes_decrypt_cbc.

    Example C code for packing 4 blocks in an iteration is shown as follows:

        while ((num_blk-=4)>=0) {

            // the following 4 functions can be interleaved to exploit parallelism
            aes_decrypt(ibuf, obuf, ctx);
            aes_decrypt(ibuf+1, obuf+1, ctx);
            aes_decrypt(ibuf+2, obuf+2, ctx);
            aes_decrypt(ibuf+3, obuf+3, ctx);

            obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
            *iv = ibuf[3]; ibuf += 4; obuf += 4;
        }
        num_blk += 4;

   ----------------------------------------------------------------------------------------------------------------*/
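
/*
   For illustration only: a minimal C intrinsics sketch (NOT part of this build) of the 4-way interleaving
   for one aes-128 iteration. It assumes <wmmintrin.h> and a caller-supplied decrypt key schedule
   dkey[0..10], ordered as consumed (dkey[0] applied first, dkey[10] used with aesdeclast):

        __m128i b0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)ibuf + 0), dkey[0]);
        __m128i b1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)ibuf + 1), dkey[0]);
        __m128i b2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)ibuf + 2), dkey[0]);
        __m128i b3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)ibuf + 3), dkey[0]);
        for (int r = 1; r < 10; r++) {          // each round is issued for all 4 blocks back to back,
            b0 = _mm_aesdec_si128(b0, dkey[r]); // so the 4 independent aesdec chains hide each
            b1 = _mm_aesdec_si128(b1, dkey[r]); // other's latency
            b2 = _mm_aesdec_si128(b2, dkey[r]);
            b3 = _mm_aesdec_si128(b3, dkey[r]);
        }
        b0 = _mm_aesdeclast_si128(b0, dkey[10]);
        b1 = _mm_aesdeclast_si128(b1, dkey[10]);
        b2 = _mm_aesdeclast_si128(b2, dkey[10]);
        b3 = _mm_aesdeclast_si128(b3, dkey[10]);

   The CBC xors and the iv update then follow as in the example C code above.
*/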

    .text
    .align  4,0x90
    .globl  _aes_decrypt_cbc_hw
_aes_decrypt_cbc_hw:

    // push/save registers for local use
#if defined __i386__

    push    %ebp
    movl    %esp, %ebp
    push    %ebx                // ibuf
    push    %edi                // obuf

    #define sp  %esp

#else   // __x86_64__

    push    %rbp
    mov     %rsp, %rbp
    push    %rbx
    push    %r13
    push    %r14
    push    %r15

    #define sp  %rsp

#endif


    // if kernel, allocate stack space to save xmm registers
#ifdef  KERNEL
#if defined __i386__
    sub     $(8*16), %esp
#else
    sub     $(16*16), %rsp
#endif
    movaps  %xmm0, (sp)
    movaps  %xmm1, 16(sp)
    movaps  %xmm2, 32(sp)
    movaps  %xmm3, 48(sp)
    movaps  %xmm4, 64(sp)
    movaps  %xmm5, 80(sp)
    movaps  %xmm6, 96(sp)
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)
#endif  // __x86_64__
#endif

    #undef  iv
    #define iv  %xmm0

#if defined __i386__
    mov     12(%ebp), %eax      // in_iv
    mov     24(%ebp), %edx      // ctx
    movups  (%eax), iv          // iv = in_iv
    mov     8(%ebp), %ebx       // ibuf
    mov     16(%ebp), %ecx      // num_blk
    mov     20(%ebp), %edi      // obuf

    #define ibuf    %ebx
    #define obuf    %edi
    #define num_blk %ecx
    #define ctx     %edx

#else   // __x86_64__, rdi/rsi/rdx/rcx/r8

    mov     %rdi, %rbx          // ibuf
    movups  (%rsi), iv          // iv = in_iv
    mov     %rdx, %r13          // num_blk
    mov     %rcx, %r14          // obuf
    mov     %r8, %r15           // ctx

    #define ibuf    %rbx
    #define num_blk %r13d
    #define obuf    %r14
    #define ctx     %r15

#endif

    mov     240(ctx), %eax      // aes length
    cmp     $160, %eax          // aes-128 decrypt
    je      L_decrypt_128
    cmp     $192, %eax          // aes-192 decrypt
    je      L_decrypt_192
    cmp     $224, %eax          // aes-256 decrypt
    je      L_decrypt_256

    mov     $-1, %eax           // wrong aes length, to return -1
    jmp     L_error             // early exit due to wrong aes length


    //
    // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_decrypt_128:

    cmp     $1, num_blk
    jl      L_HW_cbc_done       // if num_blk < 1, early return

    // aes-128 decrypt expanded keys
    movups  160(ctx), %xmm3
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#if defined __x86_64__
    movups  80(ctx), %xmm8
    movups  64(ctx), %xmm9
    movups  48(ctx), %xmm10
    movups  32(ctx), %xmm11
    movups  16(ctx), %xmm12
    movups  0(ctx), %xmm13
#endif

    // performs 4 block decryption in an iteration to exploit decrypt in parallel

    // while ((num_blk-=4)>=0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }

    sub     $4, num_blk         // pre decrement num_blk by 4
    jl      9f                  // if num_blk < 4, skip the per-4-blocks processing code

0:


#if defined __x86_64__

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14    // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15    // tmp = 4th ibuf

    // for x86_64, the expanded keys are already stored in xmm3-xmm13

    // aes-128 decrypt round 0 per 4 blocks
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm14
    pxor    %xmm3, %xmm15

    // aes-128 decrypt round 1 per 4 blocks
    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    // aes-128 decrypt round 2 per 4 blocks
    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    // aes-128 decrypt round 3 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    // aes-128 decrypt round 4 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    // aes-128 decrypt round 5 per 4 blocks
    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    // aes-128 decrypt round 6 per 4 blocks
    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    // aes-128 decrypt round 7 per 4 blocks
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // aes-128 decrypt round 8 per 4 blocks
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // aes-128 decrypt round 9 per 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15

    pxor    iv, %xmm1           // obuf[0] ^= *iv;
    movups  (ibuf), iv          // ibuf[0]
    pxor    iv, %xmm2           // obuf[1] ^= ibuf[0];
    movups  16(ibuf), iv        // ibuf[1]
    pxor    iv, %xmm14          // obuf[2] ^= ibuf[1];
    movups  32(ibuf), iv        // ibuf[2]
    pxor    iv, %xmm15          // obuf[3] ^= ibuf[2];
    movups  48(ibuf), iv        // *iv = ibuf[3]

    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm14, 32(obuf)    // write 3rd obuf
    movups  %xmm15, 48(obuf)    // write 4th obuf


#else

    // aes_decrypt_cbc per 4 blocks using aes-128 for i386
    // xmm1/xmm2/xmm4/xmm5 used for obuf per block
    // xmm3 = key0
    // xmm0 = iv
    // xmm6/xmm7 dynamically loaded with the other expanded keys

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4     // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5     // tmp = 4th ibuf

    // aes_decrypt
    // for i386, sequentially load expanded keys into xmm6/xmm7

    movups  144(ctx), %xmm6     // key1

    // aes-128 decrypt round 0 per 4 blocks
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm4
    pxor    %xmm3, %xmm5

    movups  128(ctx), %xmm7     // key2

    // aes-128 decrypt round 1 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  112(ctx), %xmm6     // key3

    // aes-128 decrypt round 2 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  96(ctx), %xmm7      // key4

    // aes-128 decrypt round 3 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  80(ctx), %xmm6      // key5

    // aes-128 decrypt round 4 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  64(ctx), %xmm7      // key6

    // aes-128 decrypt round 5 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  48(ctx), %xmm6      // key7

    // aes-128 decrypt round 6 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  32(ctx), %xmm7      // key8

    // aes-128 decrypt round 7 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  16(ctx), %xmm6      // key9

    // aes-128 decrypt round 8 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  0(ctx), %xmm7       // keyA

    // aes-128 decrypt round 9 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5

    pxor    iv, %xmm1           // 1st obuf ^= iv;
    movups  (ibuf), iv          // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2           // 2nd obuf ^= iv;
    movups  16(ibuf), iv        // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4           // 3rd obuf ^= iv;
    movups  32(ibuf), iv        // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5           // 4th obuf ^= iv;
    movups  48(ibuf), iv        // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm4, 32(obuf)     // write 3rd obuf
    movups  %xmm5, 48(obuf)     // write 4th obuf
#endif

    add     $64, ibuf           // ibuf += 4;
    add     $64, obuf           // obuf += 4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop

9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, no further processing needed

#if defined __i386__
    // reloaded, as they might be needed as expanded keys in the remaining blocks
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#endif

    test    $2, num_blk         // check whether num_blk has 2 blocks
    je      9f                  // if num_blk & 2 == 0, skip the per-pair processing code

    // do the remaining 2 blocks together

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf

    // aes_decrypt
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
#if defined __x86_64__
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
#else
    movups  80(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    movups  64(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    movups  48(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    movups  32(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    movups  16(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    movups  0(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#endif

    pxor    iv, %xmm1           // obuf[0] ^= *iv;
    movups  (ibuf), iv          // ibuf[0]
    pxor    iv, %xmm2           // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv        // *iv = ibuf[1]

    movups  %xmm1, (obuf)       // write obuf[0]
    movups  %xmm2, 16(obuf)     // write obuf[1]

    add     $32, ibuf           // ibuf += 2
    add     $32, obuf           // obuf += 2

9:
    test    $1, num_blk         // check whether num_blk has a residual 1 block
    je      L_HW_cbc_done       // if num_blk == 0, no need for residual processing code

    movups  (ibuf), %xmm2       // tmp = ibuf
    // aes_decrypt
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm2
    aesdec  %xmm7, %xmm2
#if defined __x86_64__
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdeclast  %xmm13, %xmm2
#else
    movups  80(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  64(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  48(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  32(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  16(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  (ctx), %xmm1
    aesdeclast  %xmm1, %xmm2
#endif

    pxor    iv, %xmm2           // *obuf ^= *iv;
    movups  (ibuf), iv          // *iv = *ibuf;
    movups  %xmm2, (obuf)       // write *obuf

    jmp     L_HW_cbc_done

    //
    // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_decrypt_192:

    cmp     $1, num_blk
    jl      L_HW_cbc_done       // if num_blk < 1, early return

    // aes-192 decrypt expanded keys
    movups  192(ctx), %xmm3
    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
#if defined __x86_64__
    movups  112(ctx), %xmm8
    movups  96(ctx), %xmm9
    movups  80(ctx), %xmm10
    movups  64(ctx), %xmm11
    movups  48(ctx), %xmm12
    movups  32(ctx), %xmm13
    movups  16(ctx), %xmm14
    movups  (ctx), %xmm15
#endif

    // performs 4 block decryption in an iteration to exploit decrypt in parallel

    // while ((num_blk-=4)>=0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }

    sub     $4, num_blk         // pre decrement num_blk by 4
    jl      9f                  // if num_blk < 4, skip the per-4-blocks processing code
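
    // Note (added for clarity): aes-192 needs 13 round keys. On x86_64 they are preloaded into
    // xmm3-xmm15 above, but the 4-block loop below also needs xmm14/xmm15 as data registers, so
    // the two keys they held (16(ctx) and (ctx)) are cycled through %xmm12/%xmm13 mid-loop and
    // the original keys are restored afterwards.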
0:

#if defined __x86_64__

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14    // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15    // tmp = 4th ibuf

    // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
    // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards

    // round 0 for 4 blocks
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm14
    pxor    %xmm3, %xmm15

    // round 1 for 4 blocks
    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    // round 2 for 4 blocks
    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    // round 3 for 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    // round 4 for 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    // round 5 for 4 blocks
    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    // round 6 for 4 blocks
    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    // round 7 for 4 blocks
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // round 8 for 4 blocks
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // round 9 for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    movups  16(ctx), %xmm12

    // round A for 4 blocks
    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15

    movups  (ctx), %xmm13

    // round B for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    movups  48(ctx), %xmm12     // restore %xmm12 to its original key

    // round C (last) for 4 blocks
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15

    movups  32(ctx), %xmm13     // restore %xmm13 to its original key

    pxor    iv, %xmm1           // obuf[0] ^= *iv;
    movups  (ibuf), iv          // ibuf[0]
    pxor    iv, %xmm2           // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv        // ibuf[1]
    pxor    iv, %xmm14          // obuf[2] ^= ibuf[1]
    movups  32(ibuf), iv        // ibuf[2]
    pxor    iv, %xmm15          // obuf[3] ^= ibuf[2]
    movups  48(ibuf), iv        // *iv = ibuf[3]

    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm14, 32(obuf)    // write 3rd obuf
    movups  %xmm15, 48(obuf)    // write 4th obuf

    add     $64, ibuf           // ibuf += 4;
    add     $64, obuf           // obuf += 4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop

9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, prepare to return

    movups  16(ctx), %xmm14     // restore %xmm14 to its key
    movups  (ctx), %xmm15       // restore %xmm15 to its key

#else

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4     // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5     // tmp = 4th ibuf

    // aes_decrypt
    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  176(ctx), %xmm6
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm4
    pxor    %xmm3, %xmm5

    movups  160(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  144(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  128(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  112(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  96(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  80(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  64(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  48(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  32(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  16(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  0(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5

    pxor    iv, %xmm1           // 1st obuf ^= iv;
    movups  (ibuf), iv          // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2           // 2nd obuf ^= iv;
    movups  16(ibuf), iv        // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4           // 3rd obuf ^= iv;
    movups  32(ibuf), iv        // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5           // 4th obuf ^= iv;
    movups  48(ibuf), iv        // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm4, 32(obuf)     // write 3rd obuf
    movups  %xmm5, 48(obuf)     // write 4th obuf

    add     $64, ibuf           // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf           // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop


9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, no further processing needed

    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7

#endif

    // per-block aes_decrypt_cbc loop

0:
    movups  (ibuf), %xmm2       // tmp = ibuf

    // aes_decrypt
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm2
    aesdec  %xmm7, %xmm2
#if defined __x86_64__
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdeclast  %xmm15, %xmm2
#else
    movups  112(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  96(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  80(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  64(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  48(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  32(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  16(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  (ctx), %xmm1
    aesdeclast  %xmm1, %xmm2
#endif

    pxor    iv, %xmm2           // obuf ^= iv;
    movups  (ibuf), iv          // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)       // write obuf

    add     $16, ibuf           // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf           // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done

    //
    // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_decrypt_256:

    cmp     $1, num_blk
    jl      L_HW_cbc_done

    // aes-256 decrypt expanded keys
    movups  224(ctx), %xmm3
    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
#if defined __x86_64__
    movups  144(ctx), %xmm8
    movups  128(ctx), %xmm9
    movups  112(ctx), %xmm10
    movups  96(ctx), %xmm11
    movups  80(ctx), %xmm12
    movups  64(ctx), %xmm13
    movups  48(ctx), %xmm14
    movups  32(ctx), %xmm15
    // movups  16(ctx), %xmm14
    // movups  (ctx), %xmm15
#endif

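    // Note (added for clarity): aes-256 needs 15 round keys, more than fit in xmm3-xmm15 alongside
    // the 4 data blocks, so on x86_64 the tail of the schedule (48/32/16/0(ctx)) is cycled through
    // %xmm12/%xmm13 inside the loop below, and %xmm14/%xmm15 are reloaded with their keys before
    // the remainder processing.
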
#if defined __x86_64__

    sub     $4, num_blk         // pre decrement num_blk by 4
    jl      9f                  // if num_blk < 4, skip the per-4-blocks processing code
0:
    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14    // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15    // tmp = 4th ibuf

    // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm14
    pxor    %xmm3, %xmm15

    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  48(ctx), %xmm12

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  32(ctx), %xmm13

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  16(ctx), %xmm12

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  (ctx), %xmm13

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  80(ctx), %xmm12     // restore %xmm12 to its original key

    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15
    movups  64(ctx), %xmm13     // restore %xmm13 to its original key

    pxor    iv, %xmm1           // obuf[0] ^= *iv;
    movups  (ibuf), iv          // ibuf[0]
    pxor    iv, %xmm2           // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv        // ibuf[1]
    pxor    iv, %xmm14          // obuf[2] ^= ibuf[1]
    movups  32(ibuf), iv        // ibuf[2]
    pxor    iv, %xmm15          // obuf[3] ^= ibuf[2]
    movups  48(ibuf), iv        // *iv = ibuf[3]

    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm14, 32(obuf)    // write 3rd obuf
    movups  %xmm15, 48(obuf)    // write 4th obuf

    add     $64, ibuf           // ibuf += AES_BLOCK_SIZE*4;
    add     $64, obuf           // obuf += AES_BLOCK_SIZE*4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop

9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, no further processing needed

    movups  48(ctx), %xmm14     // restore %xmm14 to its key
    movups  32(ctx), %xmm15     // restore %xmm15 to its key

#else

    sub     $4, num_blk         // pre decrement num_blk by 4
    jl      9f                  // if num_blk < 4, skip the per-4-blocks processing code
0:
    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4     // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5     // tmp = 4th ibuf

    // aes_decrypt
    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  208(ctx), %xmm6
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm4
    pxor    %xmm3, %xmm5

    movups  192(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  176(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  160(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  144(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  128(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  112(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  96(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  80(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  64(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  48(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  32(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  16(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  0(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5

    pxor    iv, %xmm1           // 1st obuf ^= iv;
    movups  (ibuf), iv          // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2           // 2nd obuf ^= iv;
    movups  16(ibuf), iv        // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4           // 3rd obuf ^= iv;
    movups  32(ibuf), iv        // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5           // 4th obuf ^= iv;
    movups  48(ibuf), iv        // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm4, 32(obuf)     // write 3rd obuf
    movups  %xmm5, 48(obuf)     // write 4th obuf

    add     $64, ibuf           // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf           // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop


9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, no further processing needed

    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7

#endif

0:
    movups  (ibuf), %xmm2       // tmp = ibuf

    // aes_decrypt
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm2
    aesdec  %xmm7, %xmm2
#if defined __x86_64__
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdec  %xmm15, %xmm2
#else
    movups  144(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  128(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  112(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  96(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  80(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  64(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  48(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  32(ctx), %xmm1
    aesdec  %xmm1, %xmm2
#endif
    movups  16(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  (ctx), %xmm1
    aesdeclast  %xmm1, %xmm2

    pxor    iv, %xmm2           // obuf ^= iv;
    movups  (ibuf), iv          // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)       // write obuf

    add     $16, ibuf           // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf           // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done

    //
    // --------- END of aes_decrypt_cbc_hw -------------------
    //