git.saurik.com Git - apple/xnu.git/blame_incremental - bsd/crypto/aes/i386/aes_modes

... / ...

Commit	Line	Data
	1	/*
	2	---------------------------------------------------------------------------
	3	Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
	4
	5	LICENSE TERMS
	6
	7	The free distribution and use of this software in both source and binary
	8	form is allowed (with or without changes) provided that:
	9
	10	1. distributions of this source code include the above copyright
	11	notice, this list of conditions and the following disclaimer;
	12
	13	2. distributions in binary form include the above copyright
	14	notice, this list of conditions and the following disclaimer
	15	in the documentation and/or other associated materials;
	16
	17	3. the copyright holder's name is not used to endorse products
	18	built using this software without specific written permission.
	19
	20	ALTERNATIVELY, provided that this notice is retained in full, this product
	21	may be distributed under the terms of the GNU General Public License (GPL),
	22	in which case the provisions of the GPL apply INSTEAD OF those given above.
	23
	24	DISCLAIMER
	25
	26	This software is provided 'as is' with no explicit or implied warranties
	27	in respect of its properties, including, but not limited to, correctness
	28	and/or fitness for purpose.
	29	---------------------------------------------------------------------------
	30	Issue 31/01/2006
	31
	32	These subroutines implement multiple block AES modes for ECB, CBC, CFB,
	33	OFB and CTR encryption. The code provides support for the VIA Advanced
	34	Cryptography Engine (ACE).
	35
	36	NOTE: In the following subroutines, the AES contexts (ctx) must be
	37	16 byte aligned if VIA ACE is being used
	38	*/
	39
	40	/* ----------------------------------------------------------------------------------------------------------------
	41
	42	aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
	43
	44	For simplicity, I am assuming all variables are in 128-bit data type.
	45
	46	aes_rval aes_encrypt_cbc(const __m128 ibuf, __m128 iv, int num_blk, __m128 obuf, const aes_encrypt_ctx ctx)
	47	{
	48	while(num_blk--) {
	49	iv ^= ibuf++;
	50	aes_encrypt(iv, iv, ctx);
	51	obuf++ = iv;
	52	}
	53	return 0;
	54	}
	55
	56	The following is an implementation of this function using Intel AESNI.
	57	This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
	58	Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
	59	to this AES-NI based function if it detects that AES-NI is available.
	60	Calling this function blindly WILL cause a CRASH on systems without AES-NI support.
	61
	62	Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks
	63	are serially chained. This prevents us from arranging several blocks for encryption in parallel.
	64
	65	----------------------------------------------------------------------------------------------------------------*/
	66
	67	.text
	68	.align 4,0x90 // padding bytes are 0x90 (nop)
	69	.globl _aes_encrypt_cbc_hw
		//
		// _aes_encrypt_cbc_hw(ibuf, in_iv, num_blk, obuf, ctx)
		//
		// CBC-encrypts num_blk 16-byte blocks from ibuf into obuf using AES-NI.
		// Arguments, as they are picked up below:
		//   i386   : ibuf 8(%ebp), in_iv 12(%ebp), num_blk 16(%ebp), obuf 20(%ebp), ctx 24(%ebp)
		//   x86_64 : ibuf %rdi,    in_iv %rsi,     num_blk %rdx,     obuf %rcx,     ctx %r8
		// Result in %eax: -1 if the 32-bit value at 240(ctx) is not 160, 192 or 224
		// (this check comes first); otherwise 0, including when num_blk < 1.
		// The chaining value is read once from in_iv and then lives in %xmm0; nothing
		// in this routine stores it back through in_iv.
		// All data/key loads and stores use movups, so ibuf, obuf, in_iv and ctx are
		// not required to be 16-byte aligned by this code.
		//
	70	_aes_encrypt_cbc_hw:
	71
	72	// push/save registers for local use
	73	#if defined __i386__
	74
	75	push %ebp
	76	movl %esp, %ebp // frame pointer: the arguments are read relative to %ebp below
	77	push %ebx // restored in the epilogue; will hold ibuf
	78	push %edi // restored in the epilogue; will hold obuf
	79
	80	#define sp %esp
	81
	82	#else // __x86_64__
	83
	84	push %rbp
	85	mov %rsp, %rbp
	86	push %rbx // restored in the epilogue; will hold ibuf
	87	push %r13 // restored in the epilogue; will hold num_blk
	88	push %r14 // restored in the epilogue; will hold obuf
	89	push %r15 // restored in the epilogue; will hold ctx
	90
	91	#define sp %rsp
	92
	93	#endif
	94
	95	// if this is kernel code, need to save used xmm registers
		// Every xmm register available in the mode is saved (xmm0-xmm7 on i386,
		// xmm0-xmm15 on x86_64), regardless of which key-length path runs.
		// NOTE(review): movaps faults on an address that is not 16-byte aligned, so sp
		// must be 16-byte aligned after the sub below. Presumably this follows from the
		// stack alignment at the call plus the pushes above - confirm.
	96	#ifdef KERNEL
	97
	98	#if defined __i386__
	99	sub $(8*16), %esp // for possible xmm0-xmm7 save/restore
	100	#else
	101	sub $(16*16), %rsp // xmm0-xmm15 save/restore
	102	#endif
	103
	104	movaps %xmm0, (sp)
	105	movaps %xmm1, 16(sp)
	106	movaps %xmm2, 32(sp)
	107	movaps %xmm3, 48(sp)
	108	movaps %xmm4, 64(sp)
	109	movaps %xmm5, 80(sp)
	110	movaps %xmm6, 96(sp)
	111	movaps %xmm7, 112(sp)
	112	#if defined __x86_64__
	113	movaps %xmm8, 16*8(sp)
	114	movaps %xmm9, 16*9(sp)
	115	movaps %xmm10, 16*10(sp)
	116	movaps %xmm11, 16*11(sp)
	117	movaps %xmm12, 16*12(sp)
	118	movaps %xmm13, 16*13(sp)
	119	movaps %xmm14, 16*14(sp)
	120	movaps %xmm15, 16*15(sp)
	121	#endif // __x86_64__
	122
	123	#endif // KERNEL
	124
		// iv is the running CBC chaining value: in_iv at first, then each ciphertext block
	125	#define iv %xmm0
	126
	127	#ifdef __i386__
	128
	129	mov 12(%ebp), %eax // in_iv
	130	mov 24(%ebp), %edx // ctx
	131	movups (%eax), iv // iv = in_iv
	132	mov 8(%ebp), %ebx // ibuf
	133	mov 16(%ebp), %ecx // num_blk
	134	mov 20(%ebp), %edi // obuf
	135
	136	#define ibuf %ebx
	137	#define obuf %edi
	138	#define num_blk %ecx
	139	#define ctx %edx
	140
	141	#else
	142
	143	mov %rdi, %rbx // ibuf
	144	movups (%rsi), iv // iv = in_iv
	145	mov %rdx, %r13 // num_blk
	146	mov %rcx, %r14 // obuf
	147	mov %r8, %r15 // ctx
	148
	149	#define ibuf %rbx
		// only the low 32 bits of the block count are used from here on
	150	#define num_blk %r13d
	151	#define obuf %r14
	152	#define ctx %r15
	153
	154	#endif
	155
		// Dispatch on the length field at 240(ctx). The accepted values equal the byte
		// offset of the last round key each path uses below (160, 192 or 224).
	156	mov 240(ctx), %eax // aes length
	157	cmp $160, %eax // aes-128 encrypt ?
	158	je L_encrypt_128
	159	cmp $192, %eax // aes-192 encrypt ?
	160	je L_encrypt_192
	161	cmp $224, %eax // aes-256 encrypt ?
	162	je L_encrypt_256
	163	mov $-1, %eax // return error
	164	jmp L_error // skips the "xor %eax, %eax" at L_HW_cbc_done so that -1 is returned
	165
	166	//
	167	// aes-128 encrypt_cbc operation, up to L_HW_cbc_done
	168	//
	169
	170	L_encrypt_128:
	171
	172	cmp $1, num_blk // check number of block
	173	jl L_HW_cbc_done // should it be less than 1, nothing to do
	174
		// Preload round keys. key0-key5 fit in xmm2-xmm7 in both modes; x86_64 also
		// keeps key6-keyA resident in xmm8-xmm12.
	175	movups (ctx), %xmm2 // key0
	176	movups 16(ctx), %xmm3 // key1
	177	movups 32(ctx), %xmm4 // key2
	178	movups 48(ctx), %xmm5 // key3
	179	movups 64(ctx), %xmm6 // key4
	180	movups 80(ctx), %xmm7 // key5
	181	#if defined __x86_64__
	182	movups 96(ctx), %xmm8 // key6
	183	movups 112(ctx), %xmm9 // key7
	184	movups 128(ctx), %xmm10 // key8
	185	movups 144(ctx), %xmm11 // key9
	186	movups 160(ctx), %xmm12 // keyA
	187	#endif
	188
	189	// while (num_blk--) {
	190	// iv ^= ibuf++;
	191	// aes_encrypt(iv, iv, ctx);
	192	// obuf++ = iv;
	193	// }
	194	0:
	195	movups (ibuf), %xmm1 // *ibuf
	196	pxor %xmm2, iv // 1st instruction inside aes_encrypt (xor with key0)
	197	pxor %xmm1, iv // iv ^= *ibuf (xor order does not matter, so key0 is applied first)
	198
	199	// finishing up the rest of aes_encrypt
	200	aesenc %xmm3, iv
	201	aesenc %xmm4, iv
	202	aesenc %xmm5, iv
	203	aesenc %xmm6, iv
	204	aesenc %xmm7, iv
	205	#if defined __x86_64__
	206	aesenc %xmm8, iv
	207	aesenc %xmm9, iv
	208	aesenc %xmm10, iv
	209	aesenc %xmm11, iv
	210	aesenclast %xmm12, iv
	211	#else
		// i386: xmm2-xmm7 hold key0-key5 and xmm0 holds iv, so key6-keyA are
		// reloaded through xmm1 for every block
	212	movups 96(ctx), %xmm1 // key6
	213	aesenc %xmm1, iv
	214	movups 112(ctx), %xmm1 // key7
	215	aesenc %xmm1, iv
	216	movups 128(ctx), %xmm1 // key8
	217	aesenc %xmm1, iv
	218	movups 144(ctx), %xmm1 // key9
	219	aesenc %xmm1, iv
	220	movups 160(ctx), %xmm1 // keyA
	221	aesenclast %xmm1, iv
	222	#endif
	223
	224	movups iv, (obuf) // *obuf = iv; iv stays in xmm0 as the next chaining value
	225	add $16, obuf // obuf++;
	226	add $16, ibuf // ibuf++;
	227	sub $1, num_blk // num_blk --
	228	jg 0b // if num_blk > 0, repeat the loop
	229
	230	// the following will be branched to from all other cases (encrypt/decrypt 128/192/256)
		// (the decrypt routine later in this file jumps to L_HW_cbc_done and L_error too,
		// so this epilogue must stay valid for both entry points)
	231
	232	L_HW_cbc_done:
	233
	234	xor %eax, %eax // to return CRYPT_OK
	235
	236	L_error:
	237
	238	// if kernel, restore xmm registers
	239	#ifdef KERNEL
	240	movaps 0(sp), %xmm0
	241	movaps 16(sp), %xmm1
	242	movaps 32(sp), %xmm2
	243	movaps 48(sp), %xmm3
	244	movaps 64(sp), %xmm4
	245	movaps 80(sp), %xmm5
	246	movaps 96(sp), %xmm6
	247	movaps 112(sp), %xmm7
	248	#if defined __x86_64__
	249	movaps 16*8(sp), %xmm8
	250	movaps 16*9(sp), %xmm9
	251	movaps 16*10(sp), %xmm10
	252	movaps 16*11(sp), %xmm11
	253	movaps 16*12(sp), %xmm12
	254	movaps 16*13(sp), %xmm13
	255	movaps 16*14(sp), %xmm14
	256	movaps 16*15(sp), %xmm15
	257	#endif // __x86_64__
	258	#endif // KERNEL
	259
	260	// release used stack memory, restore used callee-saved registers, and return
	261	#if defined __i386__
	262	#ifdef KERNEL
	263	add $(8*16), %esp // drop the xmm save area
	264	#endif
	265	pop %edi // pops mirror the pushes in the prologue, in reverse order
	266	pop %ebx
	267	#else
	268	#ifdef KERNEL
	269	add $(16*16), %rsp // drop the xmm save area
	270	#endif
	271	pop %r15 // pops mirror the pushes in the prologue, in reverse order
	272	pop %r14
	273	pop %r13
	274	pop %rbx
	275	#endif
	276	leave // restore the stack pointer from the frame pointer, then pop the saved frame pointer
	277	ret
	278
	279	//
	280	// aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
	281	//
	282
	283	L_encrypt_192:
	284
	285	cmp $1, num_blk // check number of block
	286	jl L_HW_cbc_done // should it be less than 1, nothing to do
	287
		// Preload round keys: key0-key5 in xmm2-xmm7; x86_64 also keeps key6-keyC in xmm8-xmm14
	288	movups (ctx), %xmm2 // key0
	289	movups 16(ctx), %xmm3 // key1
	290	movups 32(ctx), %xmm4 // key2
	291	movups 48(ctx), %xmm5 // key3
	292	movups 64(ctx), %xmm6 // key4
	293	movups 80(ctx), %xmm7 // key5
	294	#if defined __x86_64__
	295	movups 96(ctx), %xmm8 // key6
	296	movups 112(ctx), %xmm9 // key7
	297	movups 128(ctx), %xmm10 // key8
	298	movups 144(ctx), %xmm11 // key9
	299	movups 160(ctx), %xmm12 // keyA
	300	movups 176(ctx), %xmm13 // keyB
	301	movups 192(ctx), %xmm14 // keyC
	302	#endif
	303
	304	// while (num_blk--) {
	305	// iv ^= ibuf++;
	306	// aes_encrypt(iv, iv, ctx);
	307	// obuf++ = iv;
	308	// }
	309	0:
	310	movups (ibuf), %xmm1 // *ibuf
	311	pxor %xmm1, iv // iv ^= *ibuf
	312
	313	// aes_encrypt(iv, iv, ctx);
	314
	315	pxor %xmm2, iv // xor with key0
	316	aesenc %xmm3, iv
	317	aesenc %xmm4, iv
	318	aesenc %xmm5, iv
	319	aesenc %xmm6, iv
	320	aesenc %xmm7, iv
	321	#if defined __x86_64__
	322	aesenc %xmm8, iv
	323	aesenc %xmm9, iv
	324	aesenc %xmm10, iv
	325	aesenc %xmm11, iv
	326	aesenc %xmm12, iv
	327	aesenc %xmm13, iv
	328	aesenclast %xmm14, iv
	329	#else
		// i386: key6-keyC are reloaded through xmm1 for every block
	330	movups 96(ctx), %xmm1 // key6
	331	aesenc %xmm1, iv
	332	movups 112(ctx), %xmm1 // key7
	333	aesenc %xmm1, iv
	334	movups 128(ctx), %xmm1 // key8
	335	aesenc %xmm1, iv
	336	movups 144(ctx), %xmm1 // key9
	337	aesenc %xmm1, iv
	338	movups 160(ctx), %xmm1 // keyA
	339	aesenc %xmm1, iv
	340	movups 176(ctx), %xmm1 // keyB
	341	aesenc %xmm1, iv
	342	movups 192(ctx), %xmm1 // keyC
	343	aesenclast %xmm1, iv
	344	#endif
	345
	346	movups iv, (obuf) // *obuf = iv; iv stays in xmm0 as the next chaining value
	347	add $16, ibuf // ibuf++
	348	add $16, obuf // obuf++
	349
	350	sub $1, num_blk // num_blk --
	351	jg 0b // if num_blk > 0, repeat the loop
	352
	353	jmp L_HW_cbc_done // share with the common exit code
	354
	355	//
	356	// aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
	357	//
	358
	359	L_encrypt_256:
	360
	361	cmp $1, num_blk // check number of block
	362	jl L_HW_cbc_done // should it be less than 1, nothing to do
	363
		// Preload round keys: key0-key5 in xmm2-xmm7; x86_64 also keeps key6-keyD in xmm8-xmm15
	364	movups (ctx), %xmm2 // key0
	365	movups 16(ctx), %xmm3 // key1
	366	movups 32(ctx), %xmm4 // key2
	367	movups 48(ctx), %xmm5 // key3
	368	movups 64(ctx), %xmm6 // key4
	369	movups 80(ctx), %xmm7 // key5
	370	#if defined __x86_64__
	371	movups 96(ctx), %xmm8 // key6
	372	movups 112(ctx), %xmm9 // key7
	373	movups 128(ctx), %xmm10 // key8
	374	movups 144(ctx), %xmm11 // key9
	375	movups 160(ctx), %xmm12 // keyA
	376	movups 176(ctx), %xmm13 // keyB
	377	movups 192(ctx), %xmm14 // keyC
	378	movups 208(ctx), %xmm15 // keyD
	379	// movups 224(ctx), %xmm1 // keyE
		// (keyE cannot be preloaded: xmm1 doubles as the input-block temporary, so it
		// is loaded inside the loop instead)
	380	#endif
	381
	382	// while (num_blk--) {
	383	// iv ^= ibuf++;
	384	// aes_encrypt(iv, iv, ctx);
	385	// obuf++ = iv;
	386	// }
	387	0:
	388	movups (ibuf), %xmm1 // *ibuf
	389	pxor %xmm1, iv // iv ^= *ibuf
	390
	391	// aes_encrypt(iv, iv, ctx);
	392	pxor %xmm2, iv // xor with key0
	393	aesenc %xmm3, iv
	394	aesenc %xmm4, iv
	395	aesenc %xmm5, iv
	396	aesenc %xmm6, iv
	397	aesenc %xmm7, iv
	398	#if defined __x86_64__
	399	movups 224(ctx), %xmm1 // keyE (xmm1 is free again: *ibuf was already xored into iv)
	400	aesenc %xmm8, iv
	401	aesenc %xmm9, iv
	402	aesenc %xmm10, iv
	403	aesenc %xmm11, iv
	404	aesenc %xmm12, iv
	405	aesenc %xmm13, iv
	406	aesenc %xmm14, iv
	407	aesenc %xmm15, iv
	408	aesenclast %xmm1, iv
	409	#else
		// i386: key6-keyE are reloaded through xmm1 for every block
	410	movups 96(ctx), %xmm1 // key6
	411	aesenc %xmm1, iv
	412	movups 112(ctx), %xmm1 // key7
	413	aesenc %xmm1, iv
	414	movups 128(ctx), %xmm1 // key8
	415	aesenc %xmm1, iv
	416	movups 144(ctx), %xmm1 // key9
	417	aesenc %xmm1, iv
	418	movups 160(ctx), %xmm1 // keyA
	419	aesenc %xmm1, iv
	420	movups 176(ctx), %xmm1 // keyB
	421	aesenc %xmm1, iv
	422	movups 192(ctx), %xmm1 // keyC
	423	aesenc %xmm1, iv
	424	movups 208(ctx), %xmm1 // keyD
	425	aesenc %xmm1, iv
	426	movups 224(ctx), %xmm1 // keyE
	427	aesenclast %xmm1, iv
	428	#endif
	429
	430	movups iv, (obuf) // *obuf = iv; iv stays in xmm0 as the next chaining value
	431	add $16, ibuf // ibuf++
	432	add $16, obuf // obuf++
	433
	434	sub $1, num_blk // num_blk --
	435	jg 0b // if num_blk > 0, repeat the loop
	436
	437	jmp L_HW_cbc_done // share with the common exit code
	438
	439
	440
	441	//
	442	// --------- END of aes_encrypt_cbc_hw -------------------
	443	//
	444
	445
	446	/* ----------------------------------------------------------------------------------------------------------------
	447
	448	aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
	449
	450	For simplicity, I am assuming all variables are in 128-bit data type.
	451
	452	aes_rval aes_decrypt_cbc(const __m128 ibuf, __m128 iv, int num_blk, __m128 obuf, const aes_decrypt_ctx ctx)
	453	{
	454	while(num_blk--) {
	455	aes_decrypt(ibuf, obuf, ctx);
	456	obuf++ ^= iv;
	457	iv = ibuf++;
	458	}
	459	return 0;
	460	}
	461
	462	The following is an implementation of this function using Intel AESNI.
	463	This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
	464	Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
	465	to this AES-NI based function if it detects that AES-NI is available.
	466	Calling this function blindly WILL cause a CRASH on systems without AES-NI support.
	467
	468	Note that the decryption operation is not related over blocks.
	469	This gives opportunity of arranging aes_decrypt operations in parallel to speed up code.
	470	This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55)
	471	The following assembly code exploits this idea to achieve ~ 1.4 speed up in aes_decrypt_cbc.
	472
	473	Example C code for packing 4 blocks in an iteration is shown as follows:
	474
	475	while ((num_blk-=4)>=0) {
	476
	477	// the following 4 functions can be interleaved to exploit parallelism
	478	aes_decrypt(ibuf, obuf, ctx);
	479	aes_decrypt(ibuf+1, obuf+1, ctx);
	480	aes_decrypt(ibuf+2, obuf+2, ctx);
	481	aes_decrypt(ibuf+3, obuf+3, ctx);
	482
	483	obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
	484	*iv = ibuf[3]; ibuf += 4; obuf += 4;
	485	}
	486	num_blk+=4;
	487
	488	----------------------------------------------------------------------------------------------------------------*/
	489
	490	.text
	491	.align 4,0x90
	492	.globl _aes_decrypt_cbc_hw
	493	_aes_decrypt_cbc_hw:
	494
	495	// push/save registers for local use
	496	#if defined __i386__
	497
	498	push %ebp
	499	movl %esp, %ebp
	500	push %ebx // ibuf
	501	push %edi // obuf
	502
	503	#define sp %esp
	504
	505	#else // __x86_64__
	506
	507	push %rbp
	508	mov %rsp, %rbp
	509	push %rbx
	510	push %r13
	511	push %r14
	512	push %r15
	513
	514	#define sp %rsp
	515
	516	#endif
	517
	518
	519	// if kernel, allocate stack space to save xmm registers
	520	#ifdef KERNEL
	521	#if defined __i386__
	522	sub $(8*16), %esp
	523	#else
	524	sub $(16*16), %rsp
	525	#endif
	526	movaps %xmm0, (sp)
	527	movaps %xmm1, 16(sp)
	528	movaps %xmm2, 32(sp)
	529	movaps %xmm3, 48(sp)
	530	movaps %xmm4, 64(sp)
	531	movaps %xmm5, 80(sp)
	532	movaps %xmm6, 96(sp)
	533	movaps %xmm7, 112(sp)
	534	#if defined __x86_64__
	535	movaps %xmm8, 16*8(sp)
	536	movaps %xmm9, 16*9(sp)
	537	movaps %xmm10, 16*10(sp)
	538	movaps %xmm11, 16*11(sp)
	539	movaps %xmm12, 16*12(sp)
	540	movaps %xmm13, 16*13(sp)
	541	movaps %xmm14, 16*14(sp)
	542	movaps %xmm15, 16*15(sp)
	543	#endif // __x86_64__
	544	#endif
	545
	546	#undef iv
	547	#define iv %xmm0
	548
	549	#if defined __i386__
	550	mov 12(%ebp), %eax // in_iv
	551	mov 24(%ebp), %edx // ctx
	552	movups (%eax), iv // iv = in_iv
	553	mov 8(%ebp), %ebx // ibuf
	554	mov 16(%ebp), %ecx // num_blk
	555	mov 20(%ebp), %edi // obuf
	556
	557	#define ibuf %ebx
	558	#define obuf %edi
	559	#define num_blk %ecx
	560	#define ctx %edx
	561
	562	#else // __x86_64__, rdi/rsi/rdx/rcx/r8
	563
	564	mov %rdi, %rbx // ibuf
	565	movups (%rsi), iv // iv = in_iv
	566	mov %rdx, %r13 // num_blk
	567	mov %rcx, %r14 // obuf
	568	mov %r8, %r15 // ctx
	569
	570	#define ibuf %rbx
	571	#define num_blk %r13d
	572	#define obuf %r14
	573	#define ctx %r15
	574
	575	#endif
	576
	577	mov 240(ctx), %eax // aes length
	578	cmp $160, %eax // aes-128 decrypt
	579	je L_decrypt_128
	580	cmp $192, %eax // aes-192 decrypt
	581	je L_decrypt_192
	582	cmp $224, %eax // aes-256 decrypt
	583	je L_decrypt_256
	584
	585	mov $-1, %eax // wrong aes length, to return -1
	586	jmp L_error // early exit due to wrong aes length
	587
	588
	589	//
	590	// aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
	591	//
	592
	593	L_decrypt_128:
	594
	595	cmp $1, num_blk
	596	jl L_HW_cbc_done // if num_blk < 1, early return
	597
	598	// aes-128 decrypt expanded keys
	599	movups 160(ctx), %xmm3
	600	movups 144(ctx), %xmm4
	601	movups 128(ctx), %xmm5
	602	movups 112(ctx), %xmm6
	603	movups 96(ctx), %xmm7
	604	#if defined __x86_64__
	605	movups 80(ctx), %xmm8
	606	movups 64(ctx), %xmm9
	607	movups 48(ctx), %xmm10
	608	movups 32(ctx), %xmm11
	609	movups 16(ctx), %xmm12
	610	movups 0(ctx), %xmm13
	611	#endif
	612
	613	// performs 4 block decryption in an iteration to exploit decrypt in parallel
	614
	615	// while ((num_blk-=4)>=0) {
	616	// aes_decrypt(ibuf, obuf, ctx);
	617	// aes_decrypt(ibuf+1, obuf+1, ctx);
	618	// aes_decrypt(ibuf+2, obuf+2, ctx);
	619	// aes_decrypt(ibuf+3, obuf+3, ctx);
	620	// obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
	621	// *iv = ibuf[3]; ibuf += 4; obuf += 4;
	622	// }
	623
	624	sub $4, num_blk // pre decrement num_blk by 4
	625	jl 9f // if num_blk < 4, skip the per-4-blocks processing code
	626
	627	0:
	628
	629
	630	#if defined __x86_64__
	631
	632	movups (ibuf), %xmm1 // tmp = 1st ibuf
	633	movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
	634	movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
	635	movups 48(ibuf), %xmm15 // tmp = 4th ibuf
	636
	637	// for x86_64, the expanded keys are already stored in xmm3-xmm13
	638
	639	// aes-128 decrypt round 0 per 4 blocks
	640	pxor %xmm3, %xmm1
	641	pxor %xmm3, %xmm2
	642	pxor %xmm3, %xmm14
	643	pxor %xmm3, %xmm15
	644
	645	// aes-128 decrypt round 1 per 4 blocks
	646	aesdec %xmm4, %xmm1
	647	aesdec %xmm4, %xmm2
	648	aesdec %xmm4, %xmm14
	649	aesdec %xmm4, %xmm15
	650
	651	// aes-128 decrypt round 2 per 4 blocks
	652	aesdec %xmm5, %xmm1
	653	aesdec %xmm5, %xmm2
	654	aesdec %xmm5, %xmm14
	655	aesdec %xmm5, %xmm15
	656
	657	// aes-128 decrypt round 3 per 4 blocks
	658	aesdec %xmm6, %xmm1
	659	aesdec %xmm6, %xmm2
	660	aesdec %xmm6, %xmm14
	661	aesdec %xmm6, %xmm15
	662
	663	// aes-128 decrypt round 4 per 4 blocks
	664	aesdec %xmm7, %xmm1
	665	aesdec %xmm7, %xmm2
	666	aesdec %xmm7, %xmm14
	667	aesdec %xmm7, %xmm15
	668
	669	// aes-128 decrypt round 5 per 4 blocks
	670	aesdec %xmm8, %xmm1
	671	aesdec %xmm8, %xmm2
	672	aesdec %xmm8, %xmm14
	673	aesdec %xmm8, %xmm15
	674
	675	// aes-128 decrypt round 6 per 4 blocks
	676	aesdec %xmm9, %xmm1
	677	aesdec %xmm9, %xmm2
	678	aesdec %xmm9, %xmm14
	679	aesdec %xmm9, %xmm15
	680
	681	// aes-128 decrypt round 7 per 4 blocks
	682	aesdec %xmm10, %xmm1
	683	aesdec %xmm10, %xmm2
	684	aesdec %xmm10, %xmm14
	685	aesdec %xmm10, %xmm15
	686
	687	// aes-128 decrypt round 8 per 4 blocks
	688	aesdec %xmm11, %xmm1
	689	aesdec %xmm11, %xmm2
	690	aesdec %xmm11, %xmm14
	691	aesdec %xmm11, %xmm15
	692
	693	// aes-128 decrypt round 9 per 4 blocks
	694	aesdec %xmm12, %xmm1
	695	aesdec %xmm12, %xmm2
	696	aesdec %xmm12, %xmm14
	697	aesdec %xmm12, %xmm15
	698
	699	// aes-128 decrypt round 10 (last) per 4 blocks
	700	aesdeclast %xmm13, %xmm1
	701	aesdeclast %xmm13, %xmm2
	702	aesdeclast %xmm13, %xmm14
	703	aesdeclast %xmm13, %xmm15
	704
	705	pxor iv, %xmm1 // obuf[0] ^= *iv;
	706	movups (ibuf), iv // ibuf[0]
	707	pxor iv, %xmm2 // obuf[1] ^= ibuf[0];
	708	movups 16(ibuf), iv // ibuf[1]
	709	pxor iv, %xmm14 // obuf[2] ^= ibuf[1];
	710	movups 32(ibuf), iv // ibuf[2]
	711	pxor iv, %xmm15 // obuf[3] ^= obuf[2];
	712	movups 48(ibuf), iv // *iv = ibuf[3]
	713
	714	movups %xmm1, (obuf) // write 1st obuf
	715	movups %xmm2, 16(obuf) // write 2nd obuf
	716	movups %xmm14, 32(obuf) // write 3rd obuf
	717	movups %xmm15, 48(obuf) // write 4th obuf
	718
	719
	720	#else
	721
	722	// aes_decrypt_cbc per 4 blocks using aes-128 for i386
	723	// xmm1/xmm2/xmm4/xmm5 used for obuf per block
	724	// xmm3 = key0
	725	// xmm0 = iv
	726	// xmm6/xmm7 dynamically load with other expanded keys
	727
	728	movups (ibuf), %xmm1 // tmp = 1st ibuf
	729	movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
	730	movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
	731	movups 48(ibuf), %xmm5 // tmp = 4th ibuf
	732
	733	// aes_decrypt
	734	// for i386, sequentially load expanded keys into xmm6/xmm7
	735
	736	movups 144(ctx), %xmm6 // key1
	737
	738	// aes-128 decrypt round 0 per 4 blocks
	739	pxor %xmm3, %xmm1
	740	pxor %xmm3, %xmm2
	741	pxor %xmm3, %xmm4
	742	pxor %xmm3, %xmm5
	743
	744	movups 128(ctx), %xmm7 // key2
	745
	746	// aes-128 decrypt round 1 per 4 blocks
	747	aesdec %xmm6, %xmm1
	748	aesdec %xmm6, %xmm2
	749	aesdec %xmm6, %xmm4
	750	aesdec %xmm6, %xmm5
	751
	752	movups 112(ctx), %xmm6 // key3
	753
	754	// aes-128 decrypt round 2 per 4 blocks
	755	aesdec %xmm7, %xmm1
	756	aesdec %xmm7, %xmm2
	757	aesdec %xmm7, %xmm4
	758	aesdec %xmm7, %xmm5
	759
	760	movups 96(ctx), %xmm7 // key4
	761
	762	// aes-128 decrypt round 3 per 4 blocks
	763	aesdec %xmm6, %xmm1
	764	aesdec %xmm6, %xmm2
	765	aesdec %xmm6, %xmm4
	766	aesdec %xmm6, %xmm5
	767
	768	movups 80(ctx), %xmm6 // key5
	769
	770	// aes-128 decrypt round 4 per 4 blocks
	771	aesdec %xmm7, %xmm1
	772	aesdec %xmm7, %xmm2
	773	aesdec %xmm7, %xmm4
	774	aesdec %xmm7, %xmm5
	775
	776	movups 64(ctx), %xmm7 // key6
	777
	778	// aes-128 decrypt round 5 per 4 blocks
	779	aesdec %xmm6, %xmm1
	780	aesdec %xmm6, %xmm2
	781	aesdec %xmm6, %xmm4
	782	aesdec %xmm6, %xmm5
	783
	784	movups 48(ctx), %xmm6 // key7
	785
	786	// aes-128 decrypt round 6 per 4 blocks
	787	aesdec %xmm7, %xmm1
	788	aesdec %xmm7, %xmm2
	789	aesdec %xmm7, %xmm4
	790	aesdec %xmm7, %xmm5
	791
	792	movups 32(ctx), %xmm7 // key8
	793
	794	// aes-128 decrypt round 7 per 4 blocks
	795	aesdec %xmm6, %xmm1
	796	aesdec %xmm6, %xmm2
	797	aesdec %xmm6, %xmm4
	798	aesdec %xmm6, %xmm5
	799
	800	movups 16(ctx), %xmm6 // key9
	801
	802	// aes-128 decrypt round 8 per 4 blocks
	803	aesdec %xmm7, %xmm1
	804	aesdec %xmm7, %xmm2
	805	aesdec %xmm7, %xmm4
	806	aesdec %xmm7, %xmm5
	807
	808	movups 0(ctx), %xmm7 // keyA
	809
	810	// aes-128 decrypt round 9 per 4 blocks
	811	aesdec %xmm6, %xmm1
	812	aesdec %xmm6, %xmm2
	813	aesdec %xmm6, %xmm4
	814	aesdec %xmm6, %xmm5
	815
	816	// aes-128 decrypt round 10 (last) per 4 blocks
	817	aesdeclast %xmm7, %xmm1
	818	aesdeclast %xmm7, %xmm2
	819	aesdeclast %xmm7, %xmm4
	820	aesdeclast %xmm7, %xmm5
	821
	822	pxor iv, %xmm1 // 1st obuf ^= iv;
	823	movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
	824	pxor iv, %xmm2 // 2nd obuf ^= iv;
	825	movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
	826	pxor iv, %xmm4 // 3rd obuf ^= iv;
	827	movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
	828	pxor iv, %xmm5 // 4th obuf ^= iv;
	829	movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
	830
	831	movups %xmm1, (obuf) // write 1st obuf
	832	movups %xmm2, 16(obuf) // write 2nd obuf
	833	movups %xmm4, 32(obuf) // write 3rd obuf
	834	movups %xmm5, 48(obuf) // write 4th obuf
	835	#endif
	836
	837	add $64, ibuf // ibuf += 4;
	838	add $64, obuf // obuf += 4;
	839
	840	sub $4, num_blk // num_blk -= 4
	841	jge 0b // if num_blk > 0, repeat the loop
	842
	843	9: add $4, num_blk // post incremtn num_blk by 4
	844	je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code
	845
	846	#if defined __i386__
	847	// updated as they might be needed as expanded keys in the remaining
	848	movups 144(ctx), %xmm4
	849	movups 128(ctx), %xmm5
	850	movups 112(ctx), %xmm6
	851	movups 96(ctx), %xmm7
	852	#endif
	853
	854	test $2, num_blk // check whether num_blk has 2 blocks
	855	je 9f // if num_blk & 2 == 0, skip the per-pair processing code
	856
	857	// do the remaining 2 blocks together
	858
	859	movups (ibuf), %xmm1 // tmp = 1st ibuf
	860	movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
	861
	862	// aes_decrypt
	863	pxor %xmm3, %xmm1
	864	pxor %xmm3, %xmm2
	865	aesdec %xmm4, %xmm1
	866	aesdec %xmm4, %xmm2
	867	aesdec %xmm5, %xmm1
	868	aesdec %xmm5, %xmm2
	869	aesdec %xmm6, %xmm1
	870	aesdec %xmm6, %xmm2
	871	#if defined __x86_64__
	872	aesdec %xmm7, %xmm1
	873	aesdec %xmm7, %xmm2
	874	aesdec %xmm8, %xmm1
	875	aesdec %xmm8, %xmm2
	876	aesdec %xmm9, %xmm1
	877	aesdec %xmm9, %xmm2
	878	aesdec %xmm10, %xmm1
	879	aesdec %xmm10, %xmm2
	880	aesdec %xmm11, %xmm1
	881	aesdec %xmm11, %xmm2
	882	aesdec %xmm12, %xmm1
	883	aesdec %xmm12, %xmm2
	884	aesdeclast %xmm13, %xmm1
	885	aesdeclast %xmm13, %xmm2
	886	#else
	887	movups 80(ctx), %xmm6
	888	aesdec %xmm7, %xmm1
	889	aesdec %xmm7, %xmm2
	890	movups 64(ctx), %xmm7
	891	aesdec %xmm6, %xmm1
	892	aesdec %xmm6, %xmm2
	893	movups 48(ctx), %xmm6
	894	aesdec %xmm7, %xmm1
	895	aesdec %xmm7, %xmm2
	896	movups 32(ctx), %xmm7
	897	aesdec %xmm6, %xmm1
	898	aesdec %xmm6, %xmm2
	899	movups 16(ctx), %xmm6
	900	aesdec %xmm7, %xmm1
	901	aesdec %xmm7, %xmm2
	902	movups 0(ctx), %xmm7
	903	aesdec %xmm6, %xmm1
	904	aesdec %xmm6, %xmm2
	905	aesdeclast %xmm7, %xmm1
	906	aesdeclast %xmm7, %xmm2
	907	movups 112(ctx), %xmm6
	908	movups 96(ctx), %xmm7
	909	#endif
	910
	911	pxor iv, %xmm1 // obuf[0] ^= *iv;
	912	movups (ibuf), iv // ibuf[0]
	913	pxor iv, %xmm2 // obuf[1] ^= ibuf[0]
	914	movups 16(ibuf), iv // *iv = ibuf[1]
	915
	916	movups %xmm1, (obuf) // write obuf[0]
	917	movups %xmm2, 16(obuf) // write obuf[1]
	918
	919	add $32, ibuf // ibuf += 2
	920	add $32, obuf // obuf += 2
	921
	922	9:
	923	test $1, num_blk // check whether num_blk has residual 1 block
	924	je L_HW_cbc_done // if num_blk == 0, no need for residual processing code
	925
	926	movups (ibuf), %xmm2 // tmp = ibuf
	927	// aes_decrypt
	928	pxor %xmm3, %xmm2
	929	aesdec %xmm4, %xmm2
	930	aesdec %xmm5, %xmm2
	931	aesdec %xmm6, %xmm2
	932	aesdec %xmm7, %xmm2
	933	#if defined __x86_64__
	934	aesdec %xmm8, %xmm2
	935	aesdec %xmm9, %xmm2
	936	aesdec %xmm10, %xmm2
	937	aesdec %xmm11, %xmm2
	938	aesdec %xmm12, %xmm2
	939	aesdeclast %xmm13, %xmm2
	940	#else
	941	movups 80(ctx), %xmm1
	942	aesdec %xmm1, %xmm2
	943	movups 64(ctx), %xmm1
	944	aesdec %xmm1, %xmm2
	945	movups 48(ctx), %xmm1
	946	aesdec %xmm1, %xmm2
	947	movups 32(ctx), %xmm1
	948	aesdec %xmm1, %xmm2
	949	movups 16(ctx), %xmm1
	950	aesdec %xmm1, %xmm2
	951	movups (ctx), %xmm1
	952	aesdeclast %xmm1, %xmm2
	953	#endif
	954
	955	pxor iv, %xmm2 // obuf ^= iv;
	956	movups (ibuf), iv // iv = ibuf;
	957	movups %xmm2, (obuf) // write *obuf
	958
	959	jmp L_HW_cbc_done
	960
	961	//
	962	// aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
	963	//
	964
	965	L_decrypt_192:
	966
	967	cmp $1, num_blk
	968	jl L_HW_cbc_done // if num_blk < 1, early return
	969
	970	// aes-192 decryp expanded keys
	971	movups 192(ctx), %xmm3
	972	movups 176(ctx), %xmm4
	973	movups 160(ctx), %xmm5
	974	movups 144(ctx), %xmm6
	975	movups 128(ctx), %xmm7
	976	#if defined __x86_64__
	977	movups 112(ctx), %xmm8
	978	movups 96(ctx), %xmm9
	979	movups 80(ctx), %xmm10
	980	movups 64(ctx), %xmm11
	981	movups 48(ctx), %xmm12
	982	movups 32(ctx), %xmm13
	983	movups 16(ctx), %xmm14
	984	movups (ctx), %xmm15
	985	#endif
	986
	987	// performs 4 block decryption in an iteration to exploit decrypt in parallel
	988
	989	// while ((num_blk-=4)>=0) {
	990	// aes_decrypt(ibuf, obuf, ctx);
	991	// aes_decrypt(ibuf+1, obuf+1, ctx);
	992	// aes_decrypt(ibuf+2, obuf+2, ctx);
	993	// aes_decrypt(ibuf+3, obuf+3, ctx);
	994	// obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
	995	// *iv = ibuf[3]; ibuf += 4; obuf += 4;
	996	// }
	997
	998	sub $4, num_blk // pre decrement num_blk by 4
	999	jl 9f // if num_blk < 4, skip the per-4-blocks processing code
	1000	0:
	1001
	1002	#if defined __x86_64__
	1003
	1004	movups (ibuf), %xmm1 // tmp = 1st ibuf
	1005	movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
	1006	movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
	1007	movups 48(ibuf), %xmm15 // tmp = 4th ibuf
	1008
	1009	// aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
	1010	// use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards
	1011
	1012	// round 0 for 4 blocks
	1013	pxor %xmm3, %xmm1
	1014	pxor %xmm3, %xmm2
	1015	pxor %xmm3, %xmm14
	1016	pxor %xmm3, %xmm15
	1017
	1018	// round 1 for 4 blocks
	1019	aesdec %xmm4, %xmm1
	1020	aesdec %xmm4, %xmm2
	1021	aesdec %xmm4, %xmm14
	1022	aesdec %xmm4, %xmm15
	1023
	1024	// round 2 for 4 blocks
	1025	aesdec %xmm5, %xmm1
	1026	aesdec %xmm5, %xmm2
	1027	aesdec %xmm5, %xmm14
	1028	aesdec %xmm5, %xmm15
	1029
	1030	// round 3 for 4 blocks
	1031	aesdec %xmm6, %xmm1
	1032	aesdec %xmm6, %xmm2
	1033	aesdec %xmm6, %xmm14
	1034	aesdec %xmm6, %xmm15
	1035
	1036	// round 4 for 4 blocks
	1037	aesdec %xmm7, %xmm1
	1038	aesdec %xmm7, %xmm2
	1039	aesdec %xmm7, %xmm14
	1040	aesdec %xmm7, %xmm15
	1041
	1042	// round 5 for 4 blocks
	1043	aesdec %xmm8, %xmm1
	1044	aesdec %xmm8, %xmm2
	1045	aesdec %xmm8, %xmm14
	1046	aesdec %xmm8, %xmm15
	1047
	1048	// round 6 for 4 blocks
	1049	aesdec %xmm9, %xmm1
	1050	aesdec %xmm9, %xmm2
	1051	aesdec %xmm9, %xmm14
	1052	aesdec %xmm9, %xmm15
	1053
	1054	// round 7 for 4 blocks
	1055	aesdec %xmm10, %xmm1
	1056	aesdec %xmm10, %xmm2
	1057	aesdec %xmm10, %xmm14
	1058	aesdec %xmm10, %xmm15
	1059
	1060	// round 8 for 4 blocks
	1061	aesdec %xmm11, %xmm1
	1062	aesdec %xmm11, %xmm2
	1063	aesdec %xmm11, %xmm14
	1064	aesdec %xmm11, %xmm15
	1065
	1066	// round 9 for 4 blocks
	1067	aesdec %xmm12, %xmm1
	1068	aesdec %xmm12, %xmm2
	1069	aesdec %xmm12, %xmm14
	1070	aesdec %xmm12, %xmm15
	1071
	1072	movups 16(ctx), %xmm12
	1073
	1074	// round A for 4 blocks
	1075	aesdec %xmm13, %xmm1
	1076	aesdec %xmm13, %xmm2
	1077	aesdec %xmm13, %xmm14
	1078	aesdec %xmm13, %xmm15
	1079
	1080	movups (ctx), %xmm13
	1081
	1082	// round B for 4 blocks
	1083	aesdec %xmm12, %xmm1
	1084	aesdec %xmm12, %xmm2
	1085	aesdec %xmm12, %xmm14
	1086	aesdec %xmm12, %xmm15
	1087
	1088	movups 48(ctx), %xmm12 // restore %xmm12 to its original key
	1089
	1090	// round C (last) for 4 blocks
	1091	aesdeclast %xmm13, %xmm1
	1092	aesdeclast %xmm13, %xmm2
	1093	aesdeclast %xmm13, %xmm14
	1094	aesdeclast %xmm13, %xmm15
	1095
	1096	movups 32(ctx), %xmm13 // restore %xmm13 to its original key
	1097
	1098	pxor iv, %xmm1 // obuf[0] ^= *iv;
	1099	movups (ibuf), iv // ibuf[0]
	1100	pxor iv, %xmm2 // obuf[1] ^= ibuf[0]
	1101	movups 16(ibuf), iv // ibuf[1]
	1102	pxor iv, %xmm14 // obuf[2] ^= ibuf[1]
	1103	movups 32(ibuf), iv // ibuf[2]
	1104	pxor iv, %xmm15 // obuf[3] ^= ibuf[2]
	1105	movups 48(ibuf), iv // *iv = ibuf[3]
	1106
	1107	movups %xmm1, (obuf) // write 1st obuf
	1108	movups %xmm2, 16(obuf) // write 2nd obuf
	1109	movups %xmm14, 32(obuf) // write 3rd obuf
	1110	movups %xmm15, 48(obuf) // write 4th obuf
	1111
	1112	add $64, ibuf // ibuf += 4;
	1113	add $64, obuf // obuf += 4;
	1114
	1115	sub $4, num_blk // num_blk -= 4
	1116	jge 0b // if num_blk > 0, repeat the loop
	1117
	1118	9: add $4, num_blk // post incremtn num_blk by 4
	1119	je L_HW_cbc_done // if num_blk == 0, prepare to return
	1120
	1121	movups 16(ctx), %xmm14 // restore %xmm14 to its key
	1122	movups (ctx), %xmm15 // restore %xmm15 to its key
	1123
	1124	#else
	1125
	1126	movups (ibuf), %xmm1 // tmp = 1st ibuf
	1127	movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
	1128	movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
	1129	movups 48(ibuf), %xmm5 // tmp = 4th ibuf
	1130
	1131	// aes_decrypt
	1132	// for i386, sequentially load expanded keys into xmm6/xmm7
	1133	movups 176(ctx), %xmm6
	1134	pxor %xmm3, %xmm1
	1135	pxor %xmm3, %xmm2
	1136	pxor %xmm3, %xmm4
	1137	pxor %xmm3, %xmm5
	1138
	1139	movups 160(ctx), %xmm7
	1140	aesdec %xmm6, %xmm1
	1141	aesdec %xmm6, %xmm2
	1142	aesdec %xmm6, %xmm4
	1143	aesdec %xmm6, %xmm5
	1144
	1145	movups 144(ctx), %xmm6
	1146	aesdec %xmm7, %xmm1
	1147	aesdec %xmm7, %xmm2
	1148	aesdec %xmm7, %xmm4
	1149	aesdec %xmm7, %xmm5
	1150
	1151	movups 128(ctx), %xmm7
	1152	aesdec %xmm6, %xmm1
	1153	aesdec %xmm6, %xmm2
	1154	aesdec %xmm6, %xmm4
	1155	aesdec %xmm6, %xmm5
	1156
	1157	movups 112(ctx), %xmm6
	1158	aesdec %xmm7, %xmm1
	1159	aesdec %xmm7, %xmm2
	1160	aesdec %xmm7, %xmm4
	1161	aesdec %xmm7, %xmm5
	1162
	1163	movups 96(ctx), %xmm7
	1164	aesdec %xmm6, %xmm1
	1165	aesdec %xmm6, %xmm2
	1166	aesdec %xmm6, %xmm4
	1167	aesdec %xmm6, %xmm5
	1168
	1169	movups 80(ctx), %xmm6
	1170	aesdec %xmm7, %xmm1
	1171	aesdec %xmm7, %xmm2
	1172	aesdec %xmm7, %xmm4
	1173	aesdec %xmm7, %xmm5
	1174
	1175	movups 64(ctx), %xmm7
	1176	aesdec %xmm6, %xmm1
	1177	aesdec %xmm6, %xmm2
	1178	aesdec %xmm6, %xmm4
	1179	aesdec %xmm6, %xmm5
	1180
	1181	movups 48(ctx), %xmm6
	1182	aesdec %xmm7, %xmm1
	1183	aesdec %xmm7, %xmm2
	1184	aesdec %xmm7, %xmm4
	1185	aesdec %xmm7, %xmm5
	1186
	1187	movups 32(ctx), %xmm7
	1188	aesdec %xmm6, %xmm1
	1189	aesdec %xmm6, %xmm2
	1190	aesdec %xmm6, %xmm4
	1191	aesdec %xmm6, %xmm5
	1192
	1193	movups 16(ctx), %xmm6
	1194	aesdec %xmm7, %xmm1
	1195	aesdec %xmm7, %xmm2
	1196	aesdec %xmm7, %xmm4
	1197	aesdec %xmm7, %xmm5
	1198
	1199	movups 0(ctx), %xmm7
	1200	aesdec %xmm6, %xmm1
	1201	aesdec %xmm6, %xmm2
	1202	aesdec %xmm6, %xmm4
	1203	aesdec %xmm6, %xmm5
	1204
	1205	aesdeclast %xmm7, %xmm1
	1206	aesdeclast %xmm7, %xmm2
	1207	aesdeclast %xmm7, %xmm4
	1208	aesdeclast %xmm7, %xmm5
	1209
	1210	pxor iv, %xmm1 // 1st obuf ^= iv;
	1211	movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
	1212	pxor iv, %xmm2 // 2nd obuf ^= iv;
	1213	movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
	1214	pxor iv, %xmm4 // 3rd obuf ^= iv;
	1215	movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
	1216	pxor iv, %xmm5 // 4th obuf ^= iv;
	1217	movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
	1218	movups %xmm1, (obuf) // write 1st obuf
	1219	movups %xmm2, 16(obuf) // write 2nd obuf
	1220	movups %xmm4, 32(obuf) // write 3rd obuf
	1221	movups %xmm5, 48(obuf) // write 4th obuf
	1222
	1223	add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4;
	1224	add $64, obuf // obuf += AES_BLOCK_SIZE * 4;
	1225
	1226	sub $4, num_blk // num_blk -= 4
	1227	jge 0b // if num_blk > 0, repeat the loop
	1228
	1229
	1230	9: add $4, num_blk // post incremtn num_blk by 4
	1231	je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code
	1232
	1233	movups 176(ctx), %xmm4
	1234	movups 160(ctx), %xmm5
	1235	movups 144(ctx), %xmm6
	1236	movups 128(ctx), %xmm7
	1237
	1238	#endif
	1239
	1240	// per-block aes_decrypt_cbc loop
	1241
	1242	0:
	1243	movups (ibuf), %xmm2 // tmp = ibuf
	1244
	1245	// aes_decrypt
	1246	pxor %xmm3, %xmm2
	1247	aesdec %xmm4, %xmm2
	1248	aesdec %xmm5, %xmm2
	1249	aesdec %xmm6, %xmm2
	1250	aesdec %xmm7, %xmm2
	1251	#if defined __x86_64__
	1252	aesdec %xmm8, %xmm2
	1253	aesdec %xmm9, %xmm2
	1254	aesdec %xmm10, %xmm2
	1255	aesdec %xmm11, %xmm2
	1256	aesdec %xmm12, %xmm2
	1257	aesdec %xmm13, %xmm2
	1258	aesdec %xmm14, %xmm2
	1259	aesdeclast %xmm15, %xmm2
	1260	#else
	1261	movups 112(ctx), %xmm1
	1262	aesdec %xmm1, %xmm2
	1263	movups 96(ctx), %xmm1
	1264	aesdec %xmm1, %xmm2
	1265	movups 80(ctx), %xmm1
	1266	aesdec %xmm1, %xmm2
	1267	movups 64(ctx), %xmm1
	1268	aesdec %xmm1, %xmm2
	1269	movups 48(ctx), %xmm1
	1270	aesdec %xmm1, %xmm2
	1271	movups 32(ctx), %xmm1
	1272	aesdec %xmm1, %xmm2
	1273	movups 16(ctx), %xmm1
	1274	aesdec %xmm1, %xmm2
	1275	movups (ctx), %xmm1
	1276	aesdeclast %xmm1, %xmm2
	1277	#endif
	1278
	1279	pxor iv, %xmm2 // obuf ^= iv;
	1280	movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
	1281
	1282	movups %xmm2, (obuf) // write obuf
	1283
	1284	add $16, ibuf // ibuf += AES_BLOCK_SIZE;
	1285	add $16, obuf // obuf += AES_BLOCK_SIZE;
	1286	sub $1, num_blk // num_blk --
	1287	jg 0b // if num_blk > 0, repeat the loop
	1288
	1289	jmp L_HW_cbc_done
	1290
	1291	//
	1292	// aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
	1293	//
	1294
	1295	L_decrypt_256:
	1296	// CBC-decrypt num_blk 16-byte blocks from ibuf to obuf using 15 round keys: 224(ctx) is applied first (pxor), (ctx) last (aesdeclast)
	1297	cmp $1, num_blk // signed compare: nothing to do when num_blk < 1
	1298	jl L_HW_cbc_done
	1299	// preload round keys; xmm3 is XORed into every block before the first aesdec
	1300	movups 224(ctx), %xmm3
	1301	movups 208(ctx), %xmm4
	1302	movups 192(ctx), %xmm5
	1303	movups 176(ctx), %xmm6
	1304	movups 160(ctx), %xmm7
	1305	#if defined __x86_64__
	1306	movups 144(ctx), %xmm8
	1307	movups 128(ctx), %xmm9
	1308	movups 112(ctx), %xmm10
	1309	movups 96(ctx), %xmm11
	1310	movups 80(ctx), %xmm12
	1311	movups 64(ctx), %xmm13
	1312	movups 48(ctx), %xmm14
	1313	movups 32(ctx), %xmm15
	1314	// movups 16(ctx), %xmm14 (disabled: the key at 16(ctx) is not kept resident, it is loaded on demand below)
	1315	// movups (ctx), %xmm15 (disabled: the key at (ctx) is not kept resident, it is loaded on demand below)
	1316	#endif
	1317	// 4-blocks-per-iteration loop; each architecture has its own variant below
	1318	#if defined __x86_64__
	1319	// x86_64: xmm14/xmm15 are reused as data registers, xmm12/xmm13 are rotated through the keys at 48/32/16/0(ctx)
	1320	sub $4, num_blk // pre decrement num_blk by 4
	1321	jl 9f // if num_blk < 4, skip the per-4-blocks processing code
	1322	0: // loop invariant: num_blk = (blocks still to process) - 4, and it is >= 0 here
	1323	movups (ibuf), %xmm1 // tmp = 1st ibuf
	1324	movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
	1325	movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
	1326	movups 48(ibuf), %xmm15 // tmp = 4th ibuf
	1327
	1328	// aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
	1329	pxor %xmm3, %xmm1
	1330	pxor %xmm3, %xmm2
	1331	pxor %xmm3, %xmm14
	1332	pxor %xmm3, %xmm15
	1333	// round key 208(ctx)
	1334	aesdec %xmm4, %xmm1
	1335	aesdec %xmm4, %xmm2
	1336	aesdec %xmm4, %xmm14
	1337	aesdec %xmm4, %xmm15
	1338	// round key 192(ctx)
	1339	aesdec %xmm5, %xmm1
	1340	aesdec %xmm5, %xmm2
	1341	aesdec %xmm5, %xmm14
	1342	aesdec %xmm5, %xmm15
	1343	// round key 176(ctx)
	1344	aesdec %xmm6, %xmm1
	1345	aesdec %xmm6, %xmm2
	1346	aesdec %xmm6, %xmm14
	1347	aesdec %xmm6, %xmm15
	1348	// round key 160(ctx)
	1349	aesdec %xmm7, %xmm1
	1350	aesdec %xmm7, %xmm2
	1351	aesdec %xmm7, %xmm14
	1352	aesdec %xmm7, %xmm15
	1353	// round key 144(ctx)
	1354	aesdec %xmm8, %xmm1
	1355	aesdec %xmm8, %xmm2
	1356	aesdec %xmm8, %xmm14
	1357	aesdec %xmm8, %xmm15
	1358	// round key 128(ctx)
	1359	aesdec %xmm9, %xmm1
	1360	aesdec %xmm9, %xmm2
	1361	aesdec %xmm9, %xmm14
	1362	aesdec %xmm9, %xmm15
	1363	// round key 112(ctx)
	1364	aesdec %xmm10, %xmm1
	1365	aesdec %xmm10, %xmm2
	1366	aesdec %xmm10, %xmm14
	1367	aesdec %xmm10, %xmm15
	1368	// round key 96(ctx)
	1369	aesdec %xmm11, %xmm1
	1370	aesdec %xmm11, %xmm2
	1371	aesdec %xmm11, %xmm14
	1372	aesdec %xmm11, %xmm15
	1373	// round key 80(ctx)
	1374	aesdec %xmm12, %xmm1
	1375	aesdec %xmm12, %xmm2
	1376	aesdec %xmm12, %xmm14
	1377	aesdec %xmm12, %xmm15
	1378	movups 48(ctx), %xmm12 // xmm12 is free now: fetch the key that xmm14 held before it became a data register
	1379	// round key 64(ctx)
	1380	aesdec %xmm13, %xmm1
	1381	aesdec %xmm13, %xmm2
	1382	aesdec %xmm13, %xmm14
	1383	aesdec %xmm13, %xmm15
	1384	movups 32(ctx), %xmm13 // xmm13 is free now: fetch the key that xmm15 held before it became a data register
	1385	// round key 48(ctx)
	1386	aesdec %xmm12, %xmm1
	1387	aesdec %xmm12, %xmm2
	1388	aesdec %xmm12, %xmm14
	1389	aesdec %xmm12, %xmm15
	1390	movups 16(ctx), %xmm12 // fetch the key for the last aesdec round
	1391	// round key 32(ctx)
	1392	aesdec %xmm13, %xmm1
	1393	aesdec %xmm13, %xmm2
	1394	aesdec %xmm13, %xmm14
	1395	aesdec %xmm13, %xmm15
	1396	movups (ctx), %xmm13 // fetch the key for aesdeclast
	1397	// round key 16(ctx)
	1398	aesdec %xmm12, %xmm1
	1399	aesdec %xmm12, %xmm2
	1400	aesdec %xmm12, %xmm14
	1401	aesdec %xmm12, %xmm15
	1402	movups 80(ctx), %xmm12 // restore %xmm12 to its preloaded key
	1403	// final round, key (ctx)
	1404	aesdeclast %xmm13, %xmm1
	1405	aesdeclast %xmm13, %xmm2
	1406	aesdeclast %xmm13, %xmm14
	1407	aesdeclast %xmm13, %xmm15
	1408	movups 64(ctx), %xmm13 // restore %xmm13 to its preloaded key
	1409	// CBC chaining: each ciphertext block is re-read from ibuf before any obuf store (presumably so ibuf == obuf works; confirm with callers)
	1410	pxor iv, %xmm1 // 1st obuf ^= iv;
	1411	movups (ibuf), iv // iv = 1st ibuf
	1412	pxor iv, %xmm2 // 2nd obuf ^= 1st ibuf
	1413	movups 16(ibuf), iv // iv = 2nd ibuf
	1414	pxor iv, %xmm14 // 3rd obuf ^= 2nd ibuf
	1415	movups 32(ibuf), iv // iv = 3rd ibuf
	1416	pxor iv, %xmm15 // 4th obuf ^= 3rd ibuf
	1417	movups 48(ibuf), iv // iv = 4th ibuf, carried into the next iteration
	1418
	1419	movups %xmm1, (obuf) // write 1st obuf
	1420	movups %xmm2, 16(obuf) // write 2nd obuf
	1421	movups %xmm14, 32(obuf) // write 3rd obuf
	1422	movups %xmm15, 48(obuf) // write 4th obuf
	1423
	1424	add $64, ibuf // ibuf += AES_BLOCK_SIZE*4;
	1425	add $64, obuf // obuf += AES_BLOCK_SIZE*4;
	1426
	1427	sub $4, num_blk // num_blk -= 4
	1428	jge 0b // if num_blk >= 0 (at least 4 more blocks remain), repeat the loop
	1429
	1430	9: add $4, num_blk // undo the pre-decrement: num_blk = blocks still to process (0..3)
	1431	je L_HW_cbc_done // if num_blk == 0, no need for further processing code
	1432	// xmm14/xmm15 were used as data above; reload the keys the per-block loop expects in them
	1433	movups 48(ctx), %xmm14
	1434	movups 32(ctx), %xmm15
	1435
	1436	#else
	1437	// i386: xmm3 keeps the key from 224(ctx); xmm1/xmm2/xmm4/xmm5 hold data; xmm6/xmm7 alternate as round-key registers
	1438	sub $4, num_blk // pre decrement num_blk by 4
	1439	jl 9f // if num_blk < 4, skip the per-4-blocks processing code
	1440	0: // loop invariant: num_blk = (blocks still to process) - 4, and it is >= 0 here
	1441	movups (ibuf), %xmm1 // tmp = 1st ibuf
	1442	movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
	1443	movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
	1444	movups 48(ibuf), %xmm5 // tmp = 4th ibuf
	1445
	1446	// aes_decrypt
	1447	// for i386, sequentially load expanded keys into xmm6/xmm7 (each key is loaded one round ahead of its use)
	1448	movups 208(ctx), %xmm6
	1449	pxor %xmm3, %xmm1
	1450	pxor %xmm3, %xmm2
	1451	pxor %xmm3, %xmm4
	1452	pxor %xmm3, %xmm5
	1453
	1454	movups 192(ctx), %xmm7
	1455	aesdec %xmm6, %xmm1 // round key 208(ctx)
	1456	aesdec %xmm6, %xmm2
	1457	aesdec %xmm6, %xmm4
	1458	aesdec %xmm6, %xmm5
	1459
	1460	movups 176(ctx), %xmm6
	1461	aesdec %xmm7, %xmm1 // round key 192(ctx)
	1462	aesdec %xmm7, %xmm2
	1463	aesdec %xmm7, %xmm4
	1464	aesdec %xmm7, %xmm5
	1465
	1466	movups 160(ctx), %xmm7
	1467	aesdec %xmm6, %xmm1 // round key 176(ctx)
	1468	aesdec %xmm6, %xmm2
	1469	aesdec %xmm6, %xmm4
	1470	aesdec %xmm6, %xmm5
	1471
	1472	movups 144(ctx), %xmm6
	1473	aesdec %xmm7, %xmm1 // round key 160(ctx)
	1474	aesdec %xmm7, %xmm2
	1475	aesdec %xmm7, %xmm4
	1476	aesdec %xmm7, %xmm5
	1477
	1478	movups 128(ctx), %xmm7
	1479	aesdec %xmm6, %xmm1 // round key 144(ctx)
	1480	aesdec %xmm6, %xmm2
	1481	aesdec %xmm6, %xmm4
	1482	aesdec %xmm6, %xmm5
	1483
	1484	movups 112(ctx), %xmm6
	1485	aesdec %xmm7, %xmm1 // round key 128(ctx)
	1486	aesdec %xmm7, %xmm2
	1487	aesdec %xmm7, %xmm4
	1488	aesdec %xmm7, %xmm5
	1489
	1490	movups 96(ctx), %xmm7
	1491	aesdec %xmm6, %xmm1 // round key 112(ctx)
	1492	aesdec %xmm6, %xmm2
	1493	aesdec %xmm6, %xmm4
	1494	aesdec %xmm6, %xmm5
	1495
	1496	movups 80(ctx), %xmm6
	1497	aesdec %xmm7, %xmm1 // round key 96(ctx)
	1498	aesdec %xmm7, %xmm2
	1499	aesdec %xmm7, %xmm4
	1500	aesdec %xmm7, %xmm5
	1501
	1502	movups 64(ctx), %xmm7
	1503	aesdec %xmm6, %xmm1 // round key 80(ctx)
	1504	aesdec %xmm6, %xmm2
	1505	aesdec %xmm6, %xmm4
	1506	aesdec %xmm6, %xmm5
	1507
	1508	movups 48(ctx), %xmm6
	1509	aesdec %xmm7, %xmm1 // round key 64(ctx)
	1510	aesdec %xmm7, %xmm2
	1511	aesdec %xmm7, %xmm4
	1512	aesdec %xmm7, %xmm5
	1513
	1514	movups 32(ctx), %xmm7
	1515	aesdec %xmm6, %xmm1 // round key 48(ctx)
	1516	aesdec %xmm6, %xmm2
	1517	aesdec %xmm6, %xmm4
	1518	aesdec %xmm6, %xmm5
	1519
	1520	movups 16(ctx), %xmm6
	1521	aesdec %xmm7, %xmm1 // round key 32(ctx)
	1522	aesdec %xmm7, %xmm2
	1523	aesdec %xmm7, %xmm4
	1524	aesdec %xmm7, %xmm5
	1525
	1526	movups 0(ctx), %xmm7
	1527	aesdec %xmm6, %xmm1 // round key 16(ctx)
	1528	aesdec %xmm6, %xmm2
	1529	aesdec %xmm6, %xmm4
	1530	aesdec %xmm6, %xmm5
	1531
	1532	aesdeclast %xmm7, %xmm1 // final round, key (ctx)
	1533	aesdeclast %xmm7, %xmm2
	1534	aesdeclast %xmm7, %xmm4
	1535	aesdeclast %xmm7, %xmm5
	1536	// CBC chaining: each ciphertext block is re-read from ibuf before any obuf store (presumably so ibuf == obuf works; confirm with callers)
	1537	pxor iv, %xmm1 // 1st obuf ^= iv;
	1538	movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
	1539	pxor iv, %xmm2 // 2nd obuf ^= iv;
	1540	movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
	1541	pxor iv, %xmm4 // 3rd obuf ^= iv;
	1542	movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
	1543	pxor iv, %xmm5 // 4th obuf ^= iv;
	1544	movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
	1545	movups %xmm1, (obuf) // write 1st obuf
	1546	movups %xmm2, 16(obuf) // write 2nd obuf
	1547	movups %xmm4, 32(obuf) // write 3rd obuf
	1548	movups %xmm5, 48(obuf) // write 4th obuf
	1549
	1550	add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4;
	1551	add $64, obuf // obuf += AES_BLOCK_SIZE * 4;
	1552
	1553	sub $4, num_blk // num_blk -= 4
	1554	jge 0b // if num_blk >= 0 (at least 4 more blocks remain), repeat the loop
	1555
	1556
	1557	9: add $4, num_blk // undo the pre-decrement: num_blk = blocks still to process (0..3)
	1558	je L_HW_cbc_done // if num_blk == 0, no need for further processing code
	1559	// xmm4-xmm7 were overwritten above; reload the keys the per-block loop expects in them
	1560	movups 208(ctx), %xmm4
	1561	movups 192(ctx), %xmm5
	1562	movups 176(ctx), %xmm6
	1563	movups 160(ctx), %xmm7
	1564
	1565	#endif
	1566	// per-block loop for the remaining 1..3 blocks; expects xmm3-xmm7 (and xmm8-xmm15 on x86_64) to hold their preloaded keys
	1567	0:
	1568	movups (ibuf), %xmm2 // tmp = ibuf
	1569
	1570	// aes_decrypt
	1571	pxor %xmm3, %xmm2 // key 224(ctx)
	1572	aesdec %xmm4, %xmm2 // key 208(ctx)
	1573	aesdec %xmm5, %xmm2 // key 192(ctx)
	1574	aesdec %xmm6, %xmm2 // key 176(ctx)
	1575	aesdec %xmm7, %xmm2 // key 160(ctx)
	1576	#if defined __x86_64__
	1577	aesdec %xmm8, %xmm2 // key 144(ctx)
	1578	aesdec %xmm9, %xmm2 // key 128(ctx)
	1579	aesdec %xmm10, %xmm2 // key 112(ctx)
	1580	aesdec %xmm11, %xmm2 // key 96(ctx)
	1581	aesdec %xmm12, %xmm2 // key 80(ctx)
	1582	aesdec %xmm13, %xmm2 // key 64(ctx)
	1583	aesdec %xmm14, %xmm2 // key 48(ctx)
	1584	aesdec %xmm15, %xmm2 // key 32(ctx)
	1585	#else
	1586	movups 144(ctx), %xmm1 // i386: remaining keys are streamed through xmm1
	1587	aesdec %xmm1, %xmm2
	1588	movups 128(ctx), %xmm1
	1589	aesdec %xmm1, %xmm2
	1590	movups 112(ctx), %xmm1
	1591	aesdec %xmm1, %xmm2
	1592	movups 96(ctx), %xmm1
	1593	aesdec %xmm1, %xmm2
	1594	movups 80(ctx), %xmm1
	1595	aesdec %xmm1, %xmm2
	1596	movups 64(ctx), %xmm1
	1597	aesdec %xmm1, %xmm2
	1598	movups 48(ctx), %xmm1
	1599	aesdec %xmm1, %xmm2
	1600	movups 32(ctx), %xmm1
	1601	aesdec %xmm1, %xmm2
	1602	#endif
	1603	movups 16(ctx), %xmm1 // both architectures: the last two keys are loaded on demand through xmm1
	1604	aesdec %xmm1, %xmm2
	1605	movups (ctx), %xmm1
	1606	aesdeclast %xmm1, %xmm2
	1607
	1608	pxor iv, %xmm2 // obuf ^= iv;
	1609	movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); re-read from ibuf before obuf is written
	1610
	1611	movups %xmm2, (obuf) // write obuf
	1612
	1613	add $16, ibuf // ibuf += AES_BLOCK_SIZE;
	1614	add $16, obuf // obuf += AES_BLOCK_SIZE;
	1615	sub $1, num_blk // num_blk --
	1616	jg 0b // if num_blk > 0, repeat the loop
	1617
	1618	jmp L_HW_cbc_done
	1619
	1620	//
	1621	// --------- END of aes_decrypt_cbc_hw -------------------
	1622	//