1/*\r
2 ---------------------------------------------------------------------------\r
3 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.\r
4\r
5 LICENSE TERMS\r
6\r
7 The free distribution and use of this software in both source and binary\r
8 form is allowed (with or without changes) provided that:\r
9\r
10 1. distributions of this source code include the above copyright\r
11 notice, this list of conditions and the following disclaimer;\r
12\r
13 2. distributions in binary form include the above copyright\r
14 notice, this list of conditions and the following disclaimer\r
15 in the documentation and/or other associated materials;\r
16\r
17 3. the copyright holder's name is not used to endorse products\r
18 built using this software without specific written permission.\r
19\r
20 ALTERNATIVELY, provided that this notice is retained in full, this product\r
21 may be distributed under the terms of the GNU General Public License (GPL),\r
22 in which case the provisions of the GPL apply INSTEAD OF those given above.\r
23\r
24 DISCLAIMER\r
25\r
26 This software is provided 'as is' with no explicit or implied warranties\r
27 in respect of its properties, including, but not limited to, correctness\r
28 and/or fitness for purpose.\r
29 ---------------------------------------------------------------------------\r
30 Issue 31/01/2006\r
31\r
32 These subroutines implement multiple block AES modes for ECB, CBC, CFB,\r
33 OFB and CTR encryption. The code provides support for the VIA Advanced 
34 Cryptography Engine (ACE).\r
35\r
36 NOTE: In the following subroutines, the AES contexts (ctx) must be\r
37 16-byte aligned if VIA ACE is being used
38*/\r
39\r
40/* modified 3/5/10 cclee */\r
41/* Cleaned up code related to VIA ACE and hand-optimized aes_cbc_encrypt and aes_cbc_decrypt */
42/* Moved the xmm register save/restore originally inside the callee functions into these 2 caller functions */
43\r
44/* HW-AES specific implementation cclee 3-12-10 */\r
45/* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled, \r
46 and, if kHasAES is detected, execution branches to the HW-specific functions here */
47\r
48\r
49/* \r
50 This file defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- Intel Westmere HW AES-based implementations
51 of _aes_encrypt_cbc and _aes_decrypt_cbc. \r
52\r
53 These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available.
54 They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.
55\r
56 The AES HW is detected 1st thing in \r
57 _aes_encrypt_cbc (aes_modes_asm.s) \r
58 _aes_decrypt_cbc (aes_modes_asm.s)\r
59 and, if AES HW is detected, branch without link (ie, jump) to the functions here.\r
60\r
61 The implementation here follows the examples in an Intel White Paper\r
62 "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01\r
63\r
64 Note: Rev. 03 Final 2010 01 26 is available. It looks like there are some code changes from Rev. 2 01.
65\r
66 cclee 3-13-10\r
67*/\r
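
/*
   For illustration only, a hedged C-level sketch of the caller-side dispatch described above.
   The real dispatch is written in assembly in aes_modes_asm.s; reading __cpu_capabilities as a
   plain global and the _sw fallback name below are assumptions made for this sketch, not the
   actual kernel API.

       extern int __cpu_capabilities;                               // assumed accessible this way for the sketch

       aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk,
                                __m128 *obuf, const aes_encrypt_ctx *ctx)
       {
           if (__cpu_capabilities & kHasAES)                        // AES HW detected?
               return aes_encrypt_cbc_hw(ibuf, iv, num_blk, obuf, ctx);  // jump (no link) to the HW path below
           return aes_encrypt_cbc_sw(ibuf, iv, num_blk, obuf, ctx); // software path (name assumed)
       }
*/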
68\r
69/* \r
70 The function _aes_decrypt_cbc_hw previously decrypted serially, block by block.
71 In our group meeting, Eric/Ali suggested that I take a look at combining multiple blocks
72 in a loop and interleaving multiple aesdec instructions to absorb/hide stalls and improve the decrypt throughput.
73\r
74 The idea was actually described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
75\r
76 This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode.\r
77 On a K18 (2.4GHz core-i5/2.66GHz core-i7), the x86_64 decrypt throughput (in xnu-iokit) has been improved\r
78 from 1180/1332 to 1667/1858 MBytes/sec. This is approximately a 1.40x speedup in decryption.
79 The encrypt throughput is unchanged.
80\r
81 I also enhanced the assembly code comments.\r
82\r
83 cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.)\r
84\r
85*/\r
86\r
87/* ---------------------------------------------------------------------------------------------------------------- \r
88\r
89 aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :\r
90\r
91 For simplicity, I am assuming all variables are of a 128-bit data type.
92\r
93 aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)\r
94 {\r
95 while(num_blk--) {\r
96 *iv ^= *ibuf++;\r
97 aes_encrypt(iv, iv, ctx);\r
98 *obuf++ = *iv;\r
99 }\r
100 return 0;\r
101 }\r
102\r
103 The following is an implementation of this function using Intel AESNI.\r
104 This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. \r
105 Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
106 to this aesni-based function should it detect that aesni is available.
107 Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.
108\r
109 Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks\r
110 are serially chained. This prevents us from arranging several blocks for encryption in parallel.\r
111\r
112 ----------------------------------------------------------------------------------------------------------------*/\r
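
/*
   For reference, a hedged C-intrinsics sketch of the same single-block AES-128 CBC encrypt step,
   assuming <wmmintrin.h> (compile with -maes) and the 11 expanded round keys laid out as
   consecutive 16-byte values key0..keyA, as the assembly below loads them from ctx.
   This is a sketch of the technique, not the shipping implementation.

   #include <wmmintrin.h>

   static void aes128_cbc_encrypt_block(const __m128i *ibuf, __m128i *iv,
                                        __m128i *obuf, const __m128i key[11])
   {
       __m128i state = _mm_xor_si128(*iv, _mm_loadu_si128(ibuf));  // *iv ^= *ibuf
       state = _mm_xor_si128(state, key[0]);                       // round 0 (AddRoundKey)
       for (int r = 1; r < 10; r++)
           state = _mm_aesenc_si128(state, key[r]);                // rounds 1..9
       state = _mm_aesenclast_si128(state, key[10]);               // final round
       *iv = state;                                                // chain into the next block
       _mm_storeu_si128(obuf, state);                              // *obuf = *iv
   }
*/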
113\r
114 .text\r
115 .align 4,0x90\r
116 .globl _aes_encrypt_cbc_hw\r
117_aes_encrypt_cbc_hw:\r
118\r
119 // push/save registers for local use\r
120#if defined __i386__\r
121\r
122 push %ebp\r
123 movl %esp, %ebp\r
124 push %ebx\r
125 push %edi\r
126\r
127 #define sp %esp\r
128\r
129#else // __x86_64__\r
130\r
131 push %rbp\r
132 mov %rsp, %rbp\r
133 push %rbx\r
134 push %r13\r
135 push %r14\r
136 push %r15\r
137\r
138 #define sp %rsp\r
139\r
140#endif\r
141\r
142 // if this is kernel code, need to save used xmm registers\r
143#ifdef KERNEL\r
144\r
145#if defined __i386__\r
146 sub $(8*16), %esp // for possible xmm0-xmm7 save/restore\r
147#else\r
148 sub $(16*16), %rsp // xmm0-xmm15 save/restore \r
149#endif\r
150\r
151 movaps %xmm0, (sp)\r
152 movaps %xmm1, 16(sp)\r
153 movaps %xmm2, 32(sp)\r
154 movaps %xmm3, 48(sp)\r
155 movaps %xmm4, 64(sp)\r
156 movaps %xmm5, 80(sp)\r
157 movaps %xmm6, 96(sp)\r
158 movaps %xmm7, 112(sp)\r
159#if defined __x86_64__\r
160 movaps %xmm8, 16*8(sp)\r
161 movaps %xmm9, 16*9(sp)\r
162 movaps %xmm10, 16*10(sp)\r
163 movaps %xmm11, 16*11(sp)\r
164 movaps %xmm12, 16*12(sp)\r
165 movaps %xmm13, 16*13(sp)\r
166 movaps %xmm14, 16*14(sp)\r
167 movaps %xmm15, 16*15(sp)\r
168#endif // __x86_64__\r
169\r
170#endif // KERNEL\r
171\r
172 #define iv %xmm0\r
173\r
174#ifdef __i386__\r
175\r
176 mov 12(%ebp), %eax // in_iv\r
177 mov 24(%ebp), %edx // ctx\r
178 movups (%eax), iv // iv = in_iv \r
179 mov 8(%ebp), %ebx // ibuf\r
180 mov 16(%ebp), %ecx // num_blk\r
181 mov 20(%ebp), %edi // obuf\r
182\r
183 #define ibuf %ebx\r
184 #define obuf %edi\r
185 #define num_blk %ecx \r
186 #define ctx %edx\r
187\r
188#else\r
189\r
190 mov %rdi, %rbx // ibuf\r
191 movups (%rsi), iv // iv = in_iv\r
192 mov %rdx, %r13 // num_blk\r
193 mov %rcx, %r14 // obuf\r
194 mov %r8, %r15 // ctx \r
195\r
196 #define ibuf %rbx\r
197 #define num_blk %r13d\r
198 #define obuf %r14 \r
199 #define ctx %r15\r
200\r
201#endif\r
202\r
203 mov 240(ctx), %eax // aes length\r
204 cmp $160, %eax // aes-128 encrypt ?\r
205 je L_encrypt_128\r
206 cmp $192, %eax // aes-192 encrypt ?\r
207 je L_encrypt_192\r
208 cmp $224, %eax // aes-256 encrypt ?\r
209 je L_encrypt_256\r
210 mov $-1, %eax // return error\r
211 jmp L_error \r
212\r
213 //\r
214 // aes-128 encrypt_cbc operation, up to L_HW_cbc_done\r
215 //\r
216\r
217L_encrypt_128:\r
218\r
219	cmp	$1, num_blk				// check number of blocks
220 jl L_HW_cbc_done // should it be less than 1, nothing to do\r
221\r
222 movups (ctx), %xmm2 // key0\r
223 movups 16(ctx), %xmm3 // key1\r
224 movups 32(ctx), %xmm4 // key2\r
225 movups 48(ctx), %xmm5 // key3\r
226 movups 64(ctx), %xmm6 // key4\r
227 movups 80(ctx), %xmm7 // key5\r
228#if defined __x86_64__\r
229 movups 96(ctx), %xmm8 // key6\r
230 movups 112(ctx), %xmm9 // key7\r
231 movups 128(ctx), %xmm10 // key8\r
232 movups 144(ctx), %xmm11 // key9\r
233 movups 160(ctx), %xmm12 // keyA\r
234#endif\r
235\r
236 // while (num_blk--) {\r
237 // *iv ^= *ibuf++;\r
238 // aes_encrypt(iv, iv, ctx);\r
239 // *obuf++ = *iv;\r
240 // }\r
2410:\r
242 movups (ibuf), %xmm1 // *ibuf\r
243 pxor %xmm2, iv // 1st instruction inside aes_encrypt\r
244 pxor %xmm1, iv // *iv ^= *ibuf\r
245\r
246 // finishing up the rest of aes_encrypt\r
247 aesenc %xmm3, iv\r
248 aesenc %xmm4, iv\r
249 aesenc %xmm5, iv\r
250 aesenc %xmm6, iv\r
251 aesenc %xmm7, iv\r
252#if defined __x86_64__\r
253 aesenc %xmm8, iv\r
254 aesenc %xmm9, iv\r
255 aesenc %xmm10, iv\r
256 aesenc %xmm11, iv\r
257 aesenclast %xmm12, iv\r
258#else\r
259 movups 96(ctx), %xmm1 // key6\r
260 aesenc %xmm1, iv\r
261 movups 112(ctx), %xmm1 // key7\r
262 aesenc %xmm1, iv\r
263 movups 128(ctx), %xmm1 // key8\r
264 aesenc %xmm1, iv\r
265 movups 144(ctx), %xmm1 // key9\r
266 aesenc %xmm1, iv\r
267 movups 160(ctx), %xmm1 // keyA\r
268 aesenclast %xmm1, iv\r
269#endif\r
270\r
271 movups iv, (obuf) // *obuf = *iv;\r
272 add $16, obuf // obuf++;\r
273 add $16, ibuf // ibuf++;\r
274 sub $1, num_blk // num_blk --\r
275 jg 0b // if num_blk > 0, repeat the loop\r
276\r
277 // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)\r
278\r
279L_HW_cbc_done:\r
280\r
281 xor %eax, %eax // to return CRYPT_OK\r
282\r
283L_error:\r
284\r
285 // if kernel, restore xmm registers\r
286#ifdef KERNEL \r
287 movaps 0(sp), %xmm0\r
288 movaps 16(sp), %xmm1\r
289 movaps 32(sp), %xmm2\r
290 movaps 48(sp), %xmm3\r
291 movaps 64(sp), %xmm4\r
292 movaps 80(sp), %xmm5\r
293 movaps 96(sp), %xmm6\r
294 movaps 112(sp), %xmm7\r
295#if defined __x86_64__\r
296 movaps 16*8(sp), %xmm8\r
297 movaps 16*9(sp), %xmm9\r
298 movaps 16*10(sp), %xmm10\r
299 movaps 16*11(sp), %xmm11\r
300 movaps 16*12(sp), %xmm12\r
301 movaps 16*13(sp), %xmm13\r
302 movaps 16*14(sp), %xmm14\r
303 movaps 16*15(sp), %xmm15\r
304#endif // __x86_64__\r
305#endif // KERNEL\r
306\r
307 // release used stack memory, restore used callee-saved registers, and return \r
308#if defined __i386__\r
309#ifdef KERNEL\r
310 add $(8*16), %esp\r
311#endif\r
312 pop %edi\r
313 pop %ebx\r
314#else\r
315#ifdef KERNEL\r
316 add $(16*16), %rsp \r
317#endif\r
318 pop %r15\r
319 pop %r14\r
320 pop %r13\r
321 pop %rbx\r
322#endif\r
323 leave\r
324 ret\r
325\r
326 //\r
327 // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
328 //\r
329\r
330L_encrypt_192:\r
331\r
332	cmp	$1, num_blk				// check number of blocks
333 jl L_HW_cbc_done // should it be less than 1, nothing to do\r
334\r
335 movups (ctx), %xmm2 // key0\r
336 movups 16(ctx), %xmm3 // key1\r
337 movups 32(ctx), %xmm4 // key2\r
338 movups 48(ctx), %xmm5 // key3\r
339 movups 64(ctx), %xmm6 // key4\r
340 movups 80(ctx), %xmm7 // key5\r
341#if defined __x86_64__\r
342 movups 96(ctx), %xmm8 // key6\r
343 movups 112(ctx), %xmm9 // key7\r
344 movups 128(ctx), %xmm10 // key8\r
345 movups 144(ctx), %xmm11 // key9\r
346 movups 160(ctx), %xmm12 // keyA\r
347 movups 176(ctx), %xmm13 // keyB\r
348 movups 192(ctx), %xmm14 // keyC\r
349#endif\r
350 \r
351 // while (num_blk--) {\r
352 // *iv ^= *ibuf++;\r
353 // aes_encrypt(iv, iv, ctx);\r
354 // *obuf++ = *iv;\r
355 // }\r
3560:\r
357 movups (ibuf), %xmm1 // *ibuf\r
358 pxor %xmm1, iv // *iv ^= ibuf\r
359\r
360 // aes_encrypt(iv, iv, ctx);\r
361\r
362 pxor %xmm2, iv\r
363 aesenc %xmm3, iv\r
364 aesenc %xmm4, iv\r
365 aesenc %xmm5, iv\r
366 aesenc %xmm6, iv\r
367 aesenc %xmm7, iv\r
368#if defined __x86_64__\r
369 aesenc %xmm8, iv\r
370 aesenc %xmm9, iv\r
371 aesenc %xmm10, iv\r
372 aesenc %xmm11, iv\r
373 aesenc %xmm12, iv\r
374 aesenc %xmm13, iv\r
375 aesenclast %xmm14, iv\r
376#else\r
377 movups 96(ctx), %xmm1\r
378 aesenc %xmm1, iv\r
379 movups 112(ctx), %xmm1\r
380 aesenc %xmm1, iv\r
381 movups 128(ctx), %xmm1\r
382 aesenc %xmm1, iv\r
383 movups 144(ctx), %xmm1\r
384 aesenc %xmm1, iv\r
385 movups 160(ctx), %xmm1\r
386 aesenc %xmm1, iv\r
387 movups 176(ctx), %xmm1\r
388 aesenc %xmm1, iv\r
389 movups 192(ctx), %xmm1\r
390 aesenclast %xmm1, iv\r
391#endif\r
392\r
393 movups iv, (obuf) // *obuf = *iv;\r
394 add $16, ibuf // ibuf++\r
395 add $16, obuf // obuf++\r
396\r
397 sub $1, num_blk // num_blk --\r
398 jg 0b // if num_blk > 0, repeat the loop\r
399\r
400 jmp L_HW_cbc_done // share with the common exit code\r
401\r
402 //\r
403 // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
404 //\r
405\r
406L_encrypt_256:\r
407\r
408	cmp	$1, num_blk				// check number of blocks
409 jl L_HW_cbc_done // should it be less than 1, nothing to do\r
410\r
411 movups (ctx), %xmm2 // key0\r
412 movups 16(ctx), %xmm3 // key1\r
413 movups 32(ctx), %xmm4 // key2\r
414 movups 48(ctx), %xmm5 // key3\r
415 movups 64(ctx), %xmm6 // key4\r
416 movups 80(ctx), %xmm7 // key5\r
417#if defined __x86_64__\r
418 movups 96(ctx), %xmm8 // key6\r
419 movups 112(ctx), %xmm9 // key7\r
420 movups 128(ctx), %xmm10 // key8\r
421 movups 144(ctx), %xmm11 // key9\r
422 movups 160(ctx), %xmm12 // keyA\r
423 movups 176(ctx), %xmm13 // keyB\r
424 movups 192(ctx), %xmm14 // keyC\r
425 movups 208(ctx), %xmm15 // keyD\r
426 // movups 224(ctx), %xmm1 // keyE\r
427#endif\r
428\r
429 // while (num_blk--) {\r
430 // *iv ^= *ibuf++;\r
431 // aes_encrypt(iv, iv, ctx);\r
432 // *obuf++ = *iv;\r
433 // }\r
4340:\r
435 movups (ibuf), %xmm1 // *ibuf\r
436 pxor %xmm1, iv // *iv ^= ibuf\r
437 \r
438 // aes_encrypt(iv, iv, ctx);\r
439 pxor %xmm2, iv\r
440 aesenc %xmm3, iv\r
441 aesenc %xmm4, iv\r
442 aesenc %xmm5, iv\r
443 aesenc %xmm6, iv\r
444 aesenc %xmm7, iv\r
445#if defined __x86_64__\r
446 movups 224(ctx), %xmm1 // keyE\r
447 aesenc %xmm8, iv\r
448 aesenc %xmm9, iv\r
449 aesenc %xmm10, iv\r
450 aesenc %xmm11, iv\r
451 aesenc %xmm12, iv\r
452 aesenc %xmm13, iv\r
453 aesenc %xmm14, iv\r
454 aesenc %xmm15, iv\r
455 aesenclast %xmm1, iv\r
456#else\r
457 movups 96(ctx), %xmm1 // key6\r
458 aesenc %xmm1, iv\r
459 movups 112(ctx), %xmm1 // key7\r
460 aesenc %xmm1, iv\r
461 movups 128(ctx), %xmm1 // key8\r
462 aesenc %xmm1, iv\r
463 movups 144(ctx), %xmm1 // key9\r
464 aesenc %xmm1, iv\r
465 movups 160(ctx), %xmm1 // keyA\r
466 aesenc %xmm1, iv\r
467 movups 176(ctx), %xmm1 // keyB\r
468 aesenc %xmm1, iv\r
469 movups 192(ctx), %xmm1 // keyC\r
470 aesenc %xmm1, iv\r
471 movups 208(ctx), %xmm1 // keyD\r
472 aesenc %xmm1, iv\r
473 movups 224(ctx), %xmm1 // keyE\r
474 aesenclast %xmm1, iv\r
475#endif\r
476\r
477 movups iv, (obuf) // *obuf = *iv;\r
478 add $16, ibuf // ibuf++\r
479 add $16, obuf // obuf++\r
480\r
481 sub $1, num_blk // num_blk --\r
482 jg 0b // if num_blk > 0, repeat the loop\r
483\r
484 jmp L_HW_cbc_done // share with the common exit code\r
485\r
486\r
487\r
488 //\r
489 // --------- END of aes_encrypt_cbc_hw -------------------\r
490 //\r
491\r
492\r
493/* ---------------------------------------------------------------------------------------------------------------- \r
494\r
495 aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :\r
496\r
497 For simplicity, I am assuming all variables are of a 128-bit data type.
498\r
499 aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)\r
500 {\r
501 while(num_blk--) {\r
502 aes_decrypt(ibuf, obuf, ctx);\r
503 *obuf++ ^= *iv;\r
504 *iv = *ibuf++;\r
505 }\r
506 return 0;\r
507 }\r
508\r
509 The following is an implementation of this function using Intel AESNI.\r
510 This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. \r
511 Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
512 to this aesni-based function should it detect that aesni is available.
513 Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.
514\r
515 Note that the decryption operation is not chained across blocks (each block can be decrypted independently).
516 This gives an opportunity to arrange aes_decrypt operations in parallel to speed up the code.
517 This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
518 The following assembly code exploits this idea to achieve an ~1.4x speedup in aes_decrypt_cbc.
519\r
520 Example C code for packing 4 blocks in an iteration is shown as follows:\r
521\r
522 while ((num_blk-=4)>=0) {\r
523\r
524 // the following 4 functions can be interleaved to exploit parallelism\r
525 aes_decrypt(ibuf, obuf, ctx);\r
526 aes_decrypt(ibuf+1, obuf+1, ctx);\r
527 aes_decrypt(ibuf+2, obuf+2, ctx);\r
528 aes_decrypt(ibuf+3, obuf+3, ctx);\r
529\r
530 obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
531 *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
532 }\r
533 num_blk+=4;\r
534\r
535 ----------------------------------------------------------------------------------------------------------------*/\r
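
/*
   A hedged C-intrinsics sketch of the 4-block interleaved AES-128 CBC decrypt iteration described
   above, assuming <wmmintrin.h> and the 11 decrypt round keys (equivalent-inverse-cipher form, as
   stored in the expanded decrypt key schedule) supplied as dkey[0..10] in the order they are applied.
   A sketch of the technique only, not the shipping implementation.

   #include <wmmintrin.h>

   static void aes128_cbc_decrypt_4blocks(const __m128i *ibuf, __m128i *iv,
                                          __m128i *obuf, const __m128i dkey[11])
   {
       __m128i b0 = _mm_loadu_si128(ibuf + 0), b1 = _mm_loadu_si128(ibuf + 1);
       __m128i b2 = _mm_loadu_si128(ibuf + 2), b3 = _mm_loadu_si128(ibuf + 3);

       b0 = _mm_xor_si128(b0, dkey[0]); b1 = _mm_xor_si128(b1, dkey[0]);   // round 0 for 4 blocks
       b2 = _mm_xor_si128(b2, dkey[0]); b3 = _mm_xor_si128(b3, dkey[0]);

       for (int r = 1; r < 10; r++) {   // rounds 1..9, interleaved across the 4 blocks to hide latency
           b0 = _mm_aesdec_si128(b0, dkey[r]); b1 = _mm_aesdec_si128(b1, dkey[r]);
           b2 = _mm_aesdec_si128(b2, dkey[r]); b3 = _mm_aesdec_si128(b3, dkey[r]);
       }
       b0 = _mm_aesdeclast_si128(b0, dkey[10]); b1 = _mm_aesdeclast_si128(b1, dkey[10]);  // last round
       b2 = _mm_aesdeclast_si128(b2, dkey[10]); b3 = _mm_aesdeclast_si128(b3, dkey[10]);

       // CBC chaining: plaintext[i] = D(ciphertext[i]) ^ ciphertext[i-1], with *iv as ciphertext[-1]
       b0 = _mm_xor_si128(b0, *iv);
       b1 = _mm_xor_si128(b1, _mm_loadu_si128(ibuf + 0));
       b2 = _mm_xor_si128(b2, _mm_loadu_si128(ibuf + 1));
       b3 = _mm_xor_si128(b3, _mm_loadu_si128(ibuf + 2));
       *iv = _mm_loadu_si128(ibuf + 3);                  // last ciphertext block becomes the next iv

       _mm_storeu_si128(obuf + 0, b0); _mm_storeu_si128(obuf + 1, b1);
       _mm_storeu_si128(obuf + 2, b2); _mm_storeu_si128(obuf + 3, b3);
   }
*/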
536\r
537 .text\r
538 .align 4,0x90\r
539 .globl _aes_decrypt_cbc_hw\r
540_aes_decrypt_cbc_hw:\r
541\r
542 // push/save registers for local use\r
543#if defined __i386__\r
544\r
545 push %ebp\r
546 movl %esp, %ebp\r
547 push %ebx // ibuf\r
548 push %edi // obuf\r
549\r
550 #define sp %esp\r
551\r
552#else // __x86_64__\r
553\r
554 push %rbp\r
555 mov %rsp, %rbp\r
556 push %rbx\r
557 push %r13\r
558 push %r14\r
559 push %r15\r
560\r
561 #define sp %rsp\r
562\r
563#endif\r
564\r
565\r
566 // if kernel, allocate stack space to save xmm registers\r
567#ifdef KERNEL\r
568#if defined __i386__\r
569 sub $(8*16), %esp\r
570#else\r
571 sub $(16*16), %rsp\r
572#endif\r
573 movaps %xmm0, (sp)\r
574 movaps %xmm1, 16(sp)\r
575 movaps %xmm2, 32(sp)\r
576 movaps %xmm3, 48(sp)\r
577 movaps %xmm4, 64(sp)\r
578 movaps %xmm5, 80(sp)\r
579 movaps %xmm6, 96(sp)\r
580 movaps %xmm7, 112(sp)\r
581#if defined __x86_64__\r
582 movaps %xmm8, 16*8(sp)\r
583 movaps %xmm9, 16*9(sp)\r
584 movaps %xmm10, 16*10(sp)\r
585 movaps %xmm11, 16*11(sp)\r
586 movaps %xmm12, 16*12(sp)\r
587 movaps %xmm13, 16*13(sp)\r
588 movaps %xmm14, 16*14(sp)\r
589 movaps %xmm15, 16*15(sp)\r
590#endif // __x86_64__\r
591#endif\r
592\r
593 #undef iv\r
594 #define iv %xmm0\r
595\r
596#if defined __i386__\r
597 mov 12(%ebp), %eax // in_iv\r
598 mov 24(%ebp), %edx // ctx\r
599 movups (%eax), iv // iv = in_iv \r
600 mov 8(%ebp), %ebx // ibuf\r
601 mov 16(%ebp), %ecx // num_blk\r
602 mov 20(%ebp), %edi // obuf\r
603\r
604 #define ibuf %ebx\r
605 #define obuf %edi\r
606 #define num_blk %ecx \r
607 #define ctx %edx\r
608\r
609#else // __x86_64__, rdi/rsi/rdx/rcx/r8\r
610\r
611 mov %rdi, %rbx // ibuf\r
612 movups (%rsi), iv // iv = in_iv\r
613 mov %rdx, %r13 // num_blk\r
614 mov %rcx, %r14 // obuf\r
615 mov %r8, %r15 // ctx \r
616\r
617 #define ibuf %rbx\r
618 #define num_blk %r13d\r
619 #define obuf %r14 \r
620 #define ctx %r15\r
621\r
622#endif\r
623\r
624 mov 240(ctx), %eax // aes length\r
625 cmp $160, %eax // aes-128 decrypt\r
626 je L_decrypt_128\r
627 cmp $192, %eax // aes-192 decrypt\r
628 je L_decrypt_192\r
629 cmp $224, %eax // aes-256 decrypt\r
630 je L_decrypt_256\r
631\r
632 mov $-1, %eax // wrong aes length, to return -1\r
633 jmp L_error // early exit due to wrong aes length\r
634\r
635\r
636 //\r
637 // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
638 //\r
639\r
640L_decrypt_128:\r
641\r
642 cmp $1, num_blk\r
643 jl L_HW_cbc_done // if num_blk < 1, early return\r
644\r
645 // aes-128 decrypt expanded keys\r
646 movups 160(ctx), %xmm3\r
647 movups 144(ctx), %xmm4\r
648 movups 128(ctx), %xmm5\r
649 movups 112(ctx), %xmm6\r
650 movups 96(ctx), %xmm7\r
651#if defined __x86_64__\r
652 movups 80(ctx), %xmm8\r
653 movups 64(ctx), %xmm9\r
654 movups 48(ctx), %xmm10\r
655 movups 32(ctx), %xmm11\r
656 movups 16(ctx), %xmm12\r
657 movups 0(ctx), %xmm13\r
658#endif\r
659\r
660	// decrypts 4 blocks per iteration to exploit parallelism across independent blocks
661\r
662 // while ((num_blk-=4)>=0) {\r
663 // aes_decrypt(ibuf, obuf, ctx);\r
664 // aes_decrypt(ibuf+1, obuf+1, ctx);\r
665 // aes_decrypt(ibuf+2, obuf+2, ctx);\r
666 // aes_decrypt(ibuf+3, obuf+3, ctx);\r
667	//	obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
668 // *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
669 // }\r
670\r
671 sub $4, num_blk // pre decrement num_blk by 4\r
672 jl 9f // if num_blk < 4, skip the per-4-blocks processing code\r
673\r
6740:\r
675\r
676\r
677#if defined __x86_64__\r
678\r
679 movups (ibuf), %xmm1 // tmp = 1st ibuf\r
680 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
681 movups 32(ibuf), %xmm14 // tmp = 3rd ibuf\r
682 movups 48(ibuf), %xmm15 // tmp = 4th ibuf\r
683\r
684 // for x86_64, the expanded keys are already stored in xmm3-xmm13\r
685\r
686 // aes-128 decrypt round 0 per 4 blocks\r
687 pxor %xmm3, %xmm1\r
688 pxor %xmm3, %xmm2\r
689 pxor %xmm3, %xmm14\r
690 pxor %xmm3, %xmm15\r
691\r
692 // aes-128 decrypt round 1 per 4 blocks\r
693 aesdec %xmm4, %xmm1\r
694 aesdec %xmm4, %xmm2\r
695 aesdec %xmm4, %xmm14\r
696 aesdec %xmm4, %xmm15\r
697\r
698 // aes-128 decrypt round 2 per 4 blocks\r
699 aesdec %xmm5, %xmm1\r
700 aesdec %xmm5, %xmm2\r
701 aesdec %xmm5, %xmm14\r
702 aesdec %xmm5, %xmm15\r
703\r
704 // aes-128 decrypt round 3 per 4 blocks\r
705 aesdec %xmm6, %xmm1\r
706 aesdec %xmm6, %xmm2\r
707 aesdec %xmm6, %xmm14\r
708 aesdec %xmm6, %xmm15\r
709\r
710 // aes-128 decrypt round 4 per 4 blocks\r
711 aesdec %xmm7, %xmm1\r
712 aesdec %xmm7, %xmm2\r
713 aesdec %xmm7, %xmm14\r
714 aesdec %xmm7, %xmm15\r
715\r
716 // aes-128 decrypt round 5 per 4 blocks\r
717 aesdec %xmm8, %xmm1\r
718 aesdec %xmm8, %xmm2\r
719 aesdec %xmm8, %xmm14\r
720 aesdec %xmm8, %xmm15\r
721\r
722 // aes-128 decrypt round 6 per 4 blocks\r
723 aesdec %xmm9, %xmm1\r
724 aesdec %xmm9, %xmm2\r
725 aesdec %xmm9, %xmm14\r
726 aesdec %xmm9, %xmm15\r
727\r
728 // aes-128 decrypt round 7 per 4 blocks\r
729 aesdec %xmm10, %xmm1\r
730 aesdec %xmm10, %xmm2\r
731 aesdec %xmm10, %xmm14\r
732 aesdec %xmm10, %xmm15\r
733\r
734 // aes-128 decrypt round 8 per 4 blocks\r
735 aesdec %xmm11, %xmm1\r
736 aesdec %xmm11, %xmm2\r
737 aesdec %xmm11, %xmm14\r
738 aesdec %xmm11, %xmm15\r
739\r
740 // aes-128 decrypt round 9 per 4 blocks\r
741 aesdec %xmm12, %xmm1\r
742 aesdec %xmm12, %xmm2\r
743 aesdec %xmm12, %xmm14\r
744 aesdec %xmm12, %xmm15\r
745\r
746 // aes-128 decrypt round 10 (last) per 4 blocks\r
747 aesdeclast %xmm13, %xmm1\r
748 aesdeclast %xmm13, %xmm2\r
749 aesdeclast %xmm13, %xmm14\r
750 aesdeclast %xmm13, %xmm15\r
751\r
752 pxor iv, %xmm1 // obuf[0] ^= *iv; \r
753 movups (ibuf), iv // ibuf[0]\r
754 pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; \r
755 movups 16(ibuf), iv // ibuf[1]\r
756 pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; \r
757 movups 32(ibuf), iv // ibuf[2] \r
758	pxor	iv, %xmm15			// obuf[3] ^= ibuf[2];
759 movups 48(ibuf), iv // *iv = ibuf[3]\r
760\r
761 movups %xmm1, (obuf) // write 1st obuf\r
762 movups %xmm2, 16(obuf) // write 2nd obuf\r
763 movups %xmm14, 32(obuf) // write 3rd obuf\r
764 movups %xmm15, 48(obuf) // write 4th obuf\r
765\r
766\r
767#else\r
768\r
769 // aes_decrypt_cbc per 4 blocks using aes-128 for i386\r
770 // xmm1/xmm2/xmm4/xmm5 used for obuf per block\r
771 // xmm3 = key0\r
772 // xmm0 = iv\r
773 // xmm6/xmm7 dynamically load with other expanded keys\r
774\r
775 movups (ibuf), %xmm1 // tmp = 1st ibuf\r
776 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
777 movups 32(ibuf), %xmm4 // tmp = 3rd ibuf\r
778 movups 48(ibuf), %xmm5 // tmp = 4th ibuf\r
779\r
780 // aes_decrypt\r
781 // for i386, sequentially load expanded keys into xmm6/xmm7\r
782\r
783 movups 144(ctx), %xmm6 // key1\r
784\r
785 // aes-128 decrypt round 0 per 4 blocks\r
786 pxor %xmm3, %xmm1\r
787 pxor %xmm3, %xmm2\r
788 pxor %xmm3, %xmm4\r
789 pxor %xmm3, %xmm5\r
790\r
791 movups 128(ctx), %xmm7 // key2\r
792\r
793 // aes-128 decrypt round 1 per 4 blocks\r
794 aesdec %xmm6, %xmm1\r
795 aesdec %xmm6, %xmm2\r
796 aesdec %xmm6, %xmm4\r
797 aesdec %xmm6, %xmm5\r
798\r
799 movups 112(ctx), %xmm6 // key3\r
800\r
801 // aes-128 decrypt round 2 per 4 blocks\r
802 aesdec %xmm7, %xmm1\r
803 aesdec %xmm7, %xmm2\r
804 aesdec %xmm7, %xmm4\r
805 aesdec %xmm7, %xmm5\r
806\r
807 movups 96(ctx), %xmm7 // key4\r
808\r
809 // aes-128 decrypt round 3 per 4 blocks\r
810 aesdec %xmm6, %xmm1\r
811 aesdec %xmm6, %xmm2\r
812 aesdec %xmm6, %xmm4\r
813 aesdec %xmm6, %xmm5\r
814\r
815 movups 80(ctx), %xmm6 // key5\r
816\r
817 // aes-128 decrypt round 4 per 4 blocks\r
818 aesdec %xmm7, %xmm1\r
819 aesdec %xmm7, %xmm2\r
820 aesdec %xmm7, %xmm4\r
821 aesdec %xmm7, %xmm5\r
822\r
823 movups 64(ctx), %xmm7 // key6\r
824\r
825 // aes-128 decrypt round 5 per 4 blocks\r
826 aesdec %xmm6, %xmm1\r
827 aesdec %xmm6, %xmm2\r
828 aesdec %xmm6, %xmm4\r
829 aesdec %xmm6, %xmm5\r
830\r
831 movups 48(ctx), %xmm6 // key7\r
832\r
833 // aes-128 decrypt round 6 per 4 blocks\r
834 aesdec %xmm7, %xmm1\r
835 aesdec %xmm7, %xmm2\r
836 aesdec %xmm7, %xmm4\r
837 aesdec %xmm7, %xmm5\r
838\r
839 movups 32(ctx), %xmm7 // key8\r
840\r
841 // aes-128 decrypt round 7 per 4 blocks\r
842 aesdec %xmm6, %xmm1\r
843 aesdec %xmm6, %xmm2\r
844 aesdec %xmm6, %xmm4\r
845 aesdec %xmm6, %xmm5\r
846\r
847 movups 16(ctx), %xmm6 // key9\r
848\r
849 // aes-128 decrypt round 8 per 4 blocks\r
850 aesdec %xmm7, %xmm1\r
851 aesdec %xmm7, %xmm2\r
852 aesdec %xmm7, %xmm4\r
853 aesdec %xmm7, %xmm5\r
854\r
855 movups 0(ctx), %xmm7 // keyA\r
856\r
857 // aes-128 decrypt round 9 per 4 blocks\r
858 aesdec %xmm6, %xmm1\r
859 aesdec %xmm6, %xmm2\r
860 aesdec %xmm6, %xmm4\r
861 aesdec %xmm6, %xmm5\r
862\r
863 // aes-128 decrypt round 10 (last) per 4 blocks\r
864 aesdeclast %xmm7, %xmm1\r
865 aesdeclast %xmm7, %xmm2\r
866 aesdeclast %xmm7, %xmm4\r
867 aesdeclast %xmm7, %xmm5\r
868\r
869 pxor iv, %xmm1 // 1st obuf ^= iv; \r
870 movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
871 pxor iv, %xmm2 // 2nd obuf ^= iv; \r
872 movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
873 pxor iv, %xmm4 // 3rd obuf ^= iv; \r
874 movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
875 pxor iv, %xmm5 // 4th obuf ^= iv; \r
876 movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
877\r
878 movups %xmm1, (obuf) // write 1st obuf\r
879 movups %xmm2, 16(obuf) // write 2nd obuf\r
880 movups %xmm4, 32(obuf) // write 3rd obuf\r
881 movups %xmm5, 48(obuf) // write 4th obuf\r
882#endif\r
883\r
884 add $64, ibuf // ibuf += 4; \r
885 add $64, obuf // obuf += 4; \r
886\r
887 sub $4, num_blk // num_blk -= 4\r
888	jge	0b				// if num_blk >= 0, repeat the loop
889
8909:	add	$4, num_blk			// post increment num_blk by 4
891	je	L_HW_cbc_done			// if num_blk == 0, no further processing needed
892\r
893#if defined __i386__\r
894	// reload these, as they might be needed as expanded keys for the remaining blocks
895 movups 144(ctx), %xmm4\r
896 movups 128(ctx), %xmm5\r
897 movups 112(ctx), %xmm6\r
898 movups 96(ctx), %xmm7\r
899#endif\r
900\r
901 test $2, num_blk // check whether num_blk has 2 blocks\r
902 je 9f // if num_blk & 2 == 0, skip the per-pair processing code\r
903\r
904 // do the remaining 2 blocks together\r
905\r
906 movups (ibuf), %xmm1 // tmp = 1st ibuf\r
907 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
908\r
909 // aes_decrypt\r
910 pxor %xmm3, %xmm1\r
911 pxor %xmm3, %xmm2\r
912 aesdec %xmm4, %xmm1\r
913 aesdec %xmm4, %xmm2\r
914 aesdec %xmm5, %xmm1\r
915 aesdec %xmm5, %xmm2\r
916 aesdec %xmm6, %xmm1\r
917 aesdec %xmm6, %xmm2\r
918#if defined __x86_64__\r
919 aesdec %xmm7, %xmm1\r
920 aesdec %xmm7, %xmm2\r
921 aesdec %xmm8, %xmm1\r
922 aesdec %xmm8, %xmm2\r
923 aesdec %xmm9, %xmm1\r
924 aesdec %xmm9, %xmm2\r
925 aesdec %xmm10, %xmm1\r
926 aesdec %xmm10, %xmm2\r
927 aesdec %xmm11, %xmm1\r
928 aesdec %xmm11, %xmm2\r
929 aesdec %xmm12, %xmm1\r
930 aesdec %xmm12, %xmm2\r
931 aesdeclast %xmm13, %xmm1\r
932 aesdeclast %xmm13, %xmm2\r
933#else\r
934 movups 80(ctx), %xmm6\r
935 aesdec %xmm7, %xmm1\r
936 aesdec %xmm7, %xmm2\r
937 movups 64(ctx), %xmm7\r
938 aesdec %xmm6, %xmm1\r
939 aesdec %xmm6, %xmm2\r
940 movups 48(ctx), %xmm6\r
941 aesdec %xmm7, %xmm1\r
942 aesdec %xmm7, %xmm2\r
943 movups 32(ctx), %xmm7\r
944 aesdec %xmm6, %xmm1\r
945 aesdec %xmm6, %xmm2\r
946 movups 16(ctx), %xmm6\r
947 aesdec %xmm7, %xmm1\r
948 aesdec %xmm7, %xmm2\r
949 movups 0(ctx), %xmm7\r
950 aesdec %xmm6, %xmm1\r
951 aesdec %xmm6, %xmm2\r
952 aesdeclast %xmm7, %xmm1\r
953 aesdeclast %xmm7, %xmm2\r
954 movups 112(ctx), %xmm6\r
955 movups 96(ctx), %xmm7\r
956#endif\r
957\r
958 pxor iv, %xmm1 // obuf[0] ^= *iv; \r
959 movups (ibuf), iv // ibuf[0]\r
960 pxor iv, %xmm2 // obuf[1] ^= ibuf[0]\r
961 movups 16(ibuf), iv // *iv = ibuf[1]\r
962\r
963 movups %xmm1, (obuf) // write obuf[0]\r
964 movups %xmm2, 16(obuf) // write obuf[1]\r
965\r
966 add $32, ibuf // ibuf += 2\r
967 add $32, obuf // obuf += 2\r
968\r
9699:\r
970 test $1, num_blk // check whether num_blk has residual 1 block\r
971 je L_HW_cbc_done // if num_blk == 0, no need for residual processing code\r
972 \r
973 movups (ibuf), %xmm2 // tmp = ibuf\r
974 // aes_decrypt\r
975 pxor %xmm3, %xmm2\r
976 aesdec %xmm4, %xmm2\r
977 aesdec %xmm5, %xmm2\r
978 aesdec %xmm6, %xmm2\r
979 aesdec %xmm7, %xmm2\r
980#if defined __x86_64__\r
981 aesdec %xmm8, %xmm2\r
982 aesdec %xmm9, %xmm2\r
983 aesdec %xmm10, %xmm2\r
984 aesdec %xmm11, %xmm2\r
985 aesdec %xmm12, %xmm2\r
986 aesdeclast %xmm13, %xmm2\r
987#else\r
988 movups 80(ctx), %xmm1\r
989 aesdec %xmm1, %xmm2\r
990 movups 64(ctx), %xmm1\r
991 aesdec %xmm1, %xmm2\r
992 movups 48(ctx), %xmm1\r
993 aesdec %xmm1, %xmm2\r
994 movups 32(ctx), %xmm1\r
995 aesdec %xmm1, %xmm2\r
996 movups 16(ctx), %xmm1\r
997 aesdec %xmm1, %xmm2\r
998 movups (ctx), %xmm1\r
999 aesdeclast %xmm1, %xmm2\r
1000#endif\r
1001\r
1002 pxor iv, %xmm2 // *obuf ^= *iv; \r
1003 movups (ibuf), iv // *iv = *ibuf;\r
1004 movups %xmm2, (obuf) // write *obuf\r
1005\r
1006 jmp L_HW_cbc_done\r
1007\r
1008 //\r
1009 // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
1010 //\r
1011\r
1012L_decrypt_192:\r
1013\r
1014 cmp $1, num_blk\r
1015 jl L_HW_cbc_done // if num_blk < 1, early return\r
1016\r
1017	// aes-192 decrypt expanded keys
1018 movups 192(ctx), %xmm3\r
1019 movups 176(ctx), %xmm4\r
1020 movups 160(ctx), %xmm5\r
1021 movups 144(ctx), %xmm6\r
1022 movups 128(ctx), %xmm7\r
1023#if defined __x86_64__\r
1024 movups 112(ctx), %xmm8\r
1025 movups 96(ctx), %xmm9\r
1026 movups 80(ctx), %xmm10\r
1027 movups 64(ctx), %xmm11\r
1028 movups 48(ctx), %xmm12\r
1029 movups 32(ctx), %xmm13\r
1030 movups 16(ctx), %xmm14\r
1031 movups (ctx), %xmm15\r
1032#endif\r
1033\r
1034	// decrypts 4 blocks per iteration to exploit parallelism across independent blocks
1035\r
1036 // while ((num_blk-=4)>=0) {\r
1037 // aes_decrypt(ibuf, obuf, ctx);\r
1038 // aes_decrypt(ibuf+1, obuf+1, ctx);\r
1039 // aes_decrypt(ibuf+2, obuf+2, ctx);\r
1040 // aes_decrypt(ibuf+3, obuf+3, ctx);\r
1041	//	obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
1042 // *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
1043 // }\r
1044\r
1045 sub $4, num_blk // pre decrement num_blk by 4\r
1046 jl 9f // if num_blk < 4, skip the per-4-blocks processing code\r
10470:\r
1048\r
1049#if defined __x86_64__\r
1050\r
1051 movups (ibuf), %xmm1 // tmp = 1st ibuf\r
1052 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
1053 movups 32(ibuf), %xmm14 // tmp = 3rd ibuf\r
1054 movups 48(ibuf), %xmm15 // tmp = 4th ibuf\r
1055\r
1056 // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13\r
1057	// use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards
1058\r
1059 // round 0 for 4 blocks\r
1060 pxor %xmm3, %xmm1\r
1061 pxor %xmm3, %xmm2\r
1062 pxor %xmm3, %xmm14\r
1063 pxor %xmm3, %xmm15\r
1064\r
1065 // round 1 for 4 blocks\r
1066 aesdec %xmm4, %xmm1\r
1067 aesdec %xmm4, %xmm2\r
1068 aesdec %xmm4, %xmm14\r
1069 aesdec %xmm4, %xmm15\r
1070\r
1071 // round 2 for 4 blocks\r
1072 aesdec %xmm5, %xmm1\r
1073 aesdec %xmm5, %xmm2\r
1074 aesdec %xmm5, %xmm14\r
1075 aesdec %xmm5, %xmm15\r
1076\r
1077 // round 3 for 4 blocks\r
1078 aesdec %xmm6, %xmm1\r
1079 aesdec %xmm6, %xmm2\r
1080 aesdec %xmm6, %xmm14\r
1081 aesdec %xmm6, %xmm15\r
1082\r
1083 // round 4 for 4 blocks\r
1084 aesdec %xmm7, %xmm1\r
1085 aesdec %xmm7, %xmm2\r
1086 aesdec %xmm7, %xmm14\r
1087 aesdec %xmm7, %xmm15\r
1088\r
1089 // round 5 for 4 blocks\r
1090 aesdec %xmm8, %xmm1\r
1091 aesdec %xmm8, %xmm2\r
1092 aesdec %xmm8, %xmm14\r
1093 aesdec %xmm8, %xmm15\r
1094\r
1095 // round 6 for 4 blocks\r
1096 aesdec %xmm9, %xmm1\r
1097 aesdec %xmm9, %xmm2\r
1098 aesdec %xmm9, %xmm14\r
1099 aesdec %xmm9, %xmm15\r
1100\r
1101 // round 7 for 4 blocks\r
1102 aesdec %xmm10, %xmm1\r
1103 aesdec %xmm10, %xmm2\r
1104 aesdec %xmm10, %xmm14\r
1105 aesdec %xmm10, %xmm15\r
1106\r
1107 // round 8 for 4 blocks\r
1108 aesdec %xmm11, %xmm1\r
1109 aesdec %xmm11, %xmm2\r
1110 aesdec %xmm11, %xmm14\r
1111 aesdec %xmm11, %xmm15\r
1112\r
1113 // round 9 for 4 blocks\r
1114 aesdec %xmm12, %xmm1\r
1115 aesdec %xmm12, %xmm2\r
1116 aesdec %xmm12, %xmm14\r
1117 aesdec %xmm12, %xmm15\r
1118\r
1119 movups 16(ctx), %xmm12\r
1120\r
1121 // round A for 4 blocks\r
1122 aesdec %xmm13, %xmm1\r
1123 aesdec %xmm13, %xmm2\r
1124 aesdec %xmm13, %xmm14\r
1125 aesdec %xmm13, %xmm15\r
1126\r
1127 movups (ctx), %xmm13\r
1128\r
1129 // round B for 4 blocks\r
1130 aesdec %xmm12, %xmm1\r
1131 aesdec %xmm12, %xmm2\r
1132 aesdec %xmm12, %xmm14\r
1133 aesdec %xmm12, %xmm15\r
1134\r
1135 movups 48(ctx), %xmm12 // restore %xmm12 to its original key\r
1136\r
1137 // round C (last) for 4 blocks\r
1138 aesdeclast %xmm13, %xmm1\r
1139 aesdeclast %xmm13, %xmm2\r
1140 aesdeclast %xmm13, %xmm14\r
1141 aesdeclast %xmm13, %xmm15\r
1142\r
1143 movups 32(ctx), %xmm13 // restore %xmm13 to its original key\r
1144\r
1145 pxor iv, %xmm1 // obuf[0] ^= *iv; \r
1146 movups (ibuf), iv // ibuf[0]\r
1147 pxor iv, %xmm2 // obuf[1] ^= ibuf[0] \r
1148 movups 16(ibuf), iv // ibuf[1]\r
1149 pxor iv, %xmm14 // obuf[2] ^= ibuf[1] \r
1150 movups 32(ibuf), iv // ibuf[2] \r
1151 pxor iv, %xmm15 // obuf[3] ^= ibuf[2] \r
1152 movups 48(ibuf), iv // *iv = ibuf[3] \r
1153\r
1154 movups %xmm1, (obuf) // write 1st obuf\r
1155 movups %xmm2, 16(obuf) // write 2nd obuf\r
1156 movups %xmm14, 32(obuf) // write 3rd obuf\r
1157 movups %xmm15, 48(obuf) // write 4th obuf\r
1158\r
1159 add $64, ibuf // ibuf += 4; \r
1160 add $64, obuf // obuf += 4; \r
1161\r
1162 sub $4, num_blk // num_blk -= 4\r
1163	jge	0b				// if num_blk >= 0, repeat the loop
1164
11659:	add	$4, num_blk			// post increment num_blk by 4
1166 je L_HW_cbc_done // if num_blk == 0, prepare to return \r
1167\r
1168 movups 16(ctx), %xmm14 // restore %xmm14 to its key\r
1169 movups (ctx), %xmm15 // restore %xmm15 to its key\r
1170\r
1171#else\r
1172\r
1173 movups (ibuf), %xmm1 // tmp = 1st ibuf\r
1174 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
1175 movups 32(ibuf), %xmm4 // tmp = 3rd ibuf\r
1176 movups 48(ibuf), %xmm5 // tmp = 4th ibuf\r
1177\r
1178 // aes_decrypt\r
1179 // for i386, sequentially load expanded keys into xmm6/xmm7\r
1180 movups 176(ctx), %xmm6\r
1181 pxor %xmm3, %xmm1\r
1182 pxor %xmm3, %xmm2\r
1183 pxor %xmm3, %xmm4\r
1184 pxor %xmm3, %xmm5\r
1185\r
1186 movups 160(ctx), %xmm7\r
1187 aesdec %xmm6, %xmm1\r
1188 aesdec %xmm6, %xmm2\r
1189 aesdec %xmm6, %xmm4\r
1190 aesdec %xmm6, %xmm5\r
1191\r
1192 movups 144(ctx), %xmm6\r
1193 aesdec %xmm7, %xmm1\r
1194 aesdec %xmm7, %xmm2\r
1195 aesdec %xmm7, %xmm4\r
1196 aesdec %xmm7, %xmm5\r
1197\r
1198 movups 128(ctx), %xmm7\r
1199 aesdec %xmm6, %xmm1\r
1200 aesdec %xmm6, %xmm2\r
1201 aesdec %xmm6, %xmm4\r
1202 aesdec %xmm6, %xmm5\r
1203\r
1204 movups 112(ctx), %xmm6\r
1205 aesdec %xmm7, %xmm1\r
1206 aesdec %xmm7, %xmm2\r
1207 aesdec %xmm7, %xmm4\r
1208 aesdec %xmm7, %xmm5\r
1209\r
1210 movups 96(ctx), %xmm7\r
1211 aesdec %xmm6, %xmm1\r
1212 aesdec %xmm6, %xmm2\r
1213 aesdec %xmm6, %xmm4\r
1214 aesdec %xmm6, %xmm5\r
1215\r
1216 movups 80(ctx), %xmm6\r
1217 aesdec %xmm7, %xmm1\r
1218 aesdec %xmm7, %xmm2\r
1219 aesdec %xmm7, %xmm4\r
1220 aesdec %xmm7, %xmm5\r
1221\r
1222 movups 64(ctx), %xmm7\r
1223 aesdec %xmm6, %xmm1\r
1224 aesdec %xmm6, %xmm2\r
1225 aesdec %xmm6, %xmm4\r
1226 aesdec %xmm6, %xmm5\r
1227\r
1228 movups 48(ctx), %xmm6\r
1229 aesdec %xmm7, %xmm1\r
1230 aesdec %xmm7, %xmm2\r
1231 aesdec %xmm7, %xmm4\r
1232 aesdec %xmm7, %xmm5\r
1233\r
1234 movups 32(ctx), %xmm7\r
1235 aesdec %xmm6, %xmm1\r
1236 aesdec %xmm6, %xmm2\r
1237 aesdec %xmm6, %xmm4\r
1238 aesdec %xmm6, %xmm5\r
1239\r
1240 movups 16(ctx), %xmm6\r
1241 aesdec %xmm7, %xmm1\r
1242 aesdec %xmm7, %xmm2\r
1243 aesdec %xmm7, %xmm4\r
1244 aesdec %xmm7, %xmm5\r
1245\r
1246 movups 0(ctx), %xmm7\r
1247 aesdec %xmm6, %xmm1\r
1248 aesdec %xmm6, %xmm2\r
1249 aesdec %xmm6, %xmm4\r
1250 aesdec %xmm6, %xmm5\r
1251\r
1252 aesdeclast %xmm7, %xmm1\r
1253 aesdeclast %xmm7, %xmm2\r
1254 aesdeclast %xmm7, %xmm4\r
1255 aesdeclast %xmm7, %xmm5\r
1256\r
1257 pxor iv, %xmm1 // 1st obuf ^= iv; \r
1258 movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1259 pxor iv, %xmm2 // 2nd obuf ^= iv; \r
1260 movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1261 pxor iv, %xmm4 // 3rd obuf ^= iv; \r
1262 movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1263 pxor iv, %xmm5 // 4th obuf ^= iv; \r
1264 movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1265 movups %xmm1, (obuf) // write 1st obuf\r
1266 movups %xmm2, 16(obuf) // write 2nd obuf\r
1267 movups %xmm4, 32(obuf) // write 3rd obuf\r
1268 movups %xmm5, 48(obuf) // write 4th obuf\r
1269\r
1270 add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; \r
1271 add $64, obuf // obuf += AES_BLOCK_SIZE * 4; \r
1272\r
1273 sub $4, num_blk // num_blk -= 4\r
1274	jge	0b				// if num_blk >= 0, repeat the loop
1275
1276
12779:	add	$4, num_blk			// post increment num_blk by 4
1278	je	L_HW_cbc_done			// if num_blk == 0, no further processing needed
1279\r
1280 movups 176(ctx), %xmm4\r
1281 movups 160(ctx), %xmm5\r
1282 movups 144(ctx), %xmm6\r
1283 movups 128(ctx), %xmm7\r
1284\r
1285#endif\r
1286\r
1287 // per-block aes_decrypt_cbc loop\r
1288\r
12890:\r
1290 movups (ibuf), %xmm2 // tmp = ibuf\r
1291\r
1292 // aes_decrypt\r
1293 pxor %xmm3, %xmm2\r
1294 aesdec %xmm4, %xmm2\r
1295 aesdec %xmm5, %xmm2\r
1296 aesdec %xmm6, %xmm2\r
1297 aesdec %xmm7, %xmm2\r
1298#if defined __x86_64__\r
1299 aesdec %xmm8, %xmm2\r
1300 aesdec %xmm9, %xmm2\r
1301 aesdec %xmm10, %xmm2\r
1302 aesdec %xmm11, %xmm2\r
1303 aesdec %xmm12, %xmm2\r
1304 aesdec %xmm13, %xmm2\r
1305 aesdec %xmm14, %xmm2\r
1306 aesdeclast %xmm15, %xmm2\r
1307#else\r
1308 movups 112(ctx), %xmm1\r
1309 aesdec %xmm1, %xmm2\r
1310 movups 96(ctx), %xmm1\r
1311 aesdec %xmm1, %xmm2\r
1312 movups 80(ctx), %xmm1\r
1313 aesdec %xmm1, %xmm2\r
1314 movups 64(ctx), %xmm1\r
1315 aesdec %xmm1, %xmm2\r
1316 movups 48(ctx), %xmm1\r
1317 aesdec %xmm1, %xmm2\r
1318 movups 32(ctx), %xmm1\r
1319 aesdec %xmm1, %xmm2\r
1320 movups 16(ctx), %xmm1\r
1321 aesdec %xmm1, %xmm2\r
1322 movups (ctx), %xmm1\r
1323 aesdeclast %xmm1, %xmm2\r
1324#endif\r
1325\r
1326 pxor iv, %xmm2 // obuf ^= iv; \r
1327 movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1328\r
1329 movups %xmm2, (obuf) // write obuf\r
1330\r
1331 add $16, ibuf // ibuf += AES_BLOCK_SIZE; \r
1332 add $16, obuf // obuf += AES_BLOCK_SIZE; \r
1333 sub $1, num_blk // num_blk --\r
1334 jg 0b // if num_blk > 0, repeat the loop\r
1335\r
1336 jmp L_HW_cbc_done\r
1337\r
1338 //\r
1339 // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
1340 //\r
1341\r
1342L_decrypt_256:\r
1343\r
1344 cmp $1, num_blk\r
1345 jl L_HW_cbc_done \r
1346\r
1347 movups 224(ctx), %xmm3\r
1348 movups 208(ctx), %xmm4\r
1349 movups 192(ctx), %xmm5\r
1350 movups 176(ctx), %xmm6\r
1351 movups 160(ctx), %xmm7\r
1352#if defined __x86_64__\r
1353 movups 144(ctx), %xmm8\r
1354 movups 128(ctx), %xmm9\r
1355 movups 112(ctx), %xmm10\r
1356 movups 96(ctx), %xmm11\r
1357 movups 80(ctx), %xmm12\r
1358 movups 64(ctx), %xmm13\r
1359 movups 48(ctx), %xmm14\r
1360 movups 32(ctx), %xmm15\r
1361// movups 16(ctx), %xmm14\r
1362// movups (ctx), %xmm15\r
1363#endif\r
1364\r
1365#if defined __x86_64__\r
1366\r
1367 sub $4, num_blk // pre decrement num_blk by 4\r
1368 jl 9f // if num_blk < 4, skip the per-4-blocks processing code\r
13690:\r
1370 movups (ibuf), %xmm1 // tmp = 1st ibuf\r
1371 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
1372 movups 32(ibuf), %xmm14 // tmp = 3rd ibuf\r
1373 movups 48(ibuf), %xmm15 // tmp = 4th ibuf\r
1374\r
1375 // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13\r
1376 pxor %xmm3, %xmm1\r
1377 pxor %xmm3, %xmm2\r
1378 pxor %xmm3, %xmm14\r
1379 pxor %xmm3, %xmm15\r
1380\r
1381 aesdec %xmm4, %xmm1\r
1382 aesdec %xmm4, %xmm2\r
1383 aesdec %xmm4, %xmm14\r
1384 aesdec %xmm4, %xmm15\r
1385\r
1386 aesdec %xmm5, %xmm1\r
1387 aesdec %xmm5, %xmm2\r
1388 aesdec %xmm5, %xmm14\r
1389 aesdec %xmm5, %xmm15\r
1390\r
1391 aesdec %xmm6, %xmm1\r
1392 aesdec %xmm6, %xmm2\r
1393 aesdec %xmm6, %xmm14\r
1394 aesdec %xmm6, %xmm15\r
1395\r
1396 aesdec %xmm7, %xmm1\r
1397 aesdec %xmm7, %xmm2\r
1398 aesdec %xmm7, %xmm14\r
1399 aesdec %xmm7, %xmm15\r
1400\r
1401 aesdec %xmm8, %xmm1\r
1402 aesdec %xmm8, %xmm2\r
1403 aesdec %xmm8, %xmm14\r
1404 aesdec %xmm8, %xmm15\r
1405\r
1406 aesdec %xmm9, %xmm1\r
1407 aesdec %xmm9, %xmm2\r
1408 aesdec %xmm9, %xmm14\r
1409 aesdec %xmm9, %xmm15\r
1410\r
1411 aesdec %xmm10, %xmm1\r
1412 aesdec %xmm10, %xmm2\r
1413 aesdec %xmm10, %xmm14\r
1414 aesdec %xmm10, %xmm15\r
1415\r
1416 aesdec %xmm11, %xmm1\r
1417 aesdec %xmm11, %xmm2\r
1418 aesdec %xmm11, %xmm14\r
1419 aesdec %xmm11, %xmm15\r
1420\r
1421 aesdec %xmm12, %xmm1\r
1422 aesdec %xmm12, %xmm2\r
1423 aesdec %xmm12, %xmm14\r
1424 aesdec %xmm12, %xmm15\r
1425 movups 48(ctx), %xmm12\r
1426\r
1427 aesdec %xmm13, %xmm1\r
1428 aesdec %xmm13, %xmm2\r
1429 aesdec %xmm13, %xmm14\r
1430 aesdec %xmm13, %xmm15\r
1431 movups 32(ctx), %xmm13\r
1432\r
1433 aesdec %xmm12, %xmm1\r
1434 aesdec %xmm12, %xmm2\r
1435 aesdec %xmm12, %xmm14\r
1436 aesdec %xmm12, %xmm15\r
1437 movups 16(ctx), %xmm12\r
1438\r
1439 aesdec %xmm13, %xmm1\r
1440 aesdec %xmm13, %xmm2\r
1441 aesdec %xmm13, %xmm14\r
1442 aesdec %xmm13, %xmm15\r
1443 movups (ctx), %xmm13\r
1444\r
1445 aesdec %xmm12, %xmm1\r
1446 aesdec %xmm12, %xmm2\r
1447 aesdec %xmm12, %xmm14\r
1448 aesdec %xmm12, %xmm15\r
1449 movups 80(ctx), %xmm12\r
1450\r
1451 aesdeclast %xmm13, %xmm1\r
1452 aesdeclast %xmm13, %xmm2\r
1453 aesdeclast %xmm13, %xmm14\r
1454 aesdeclast %xmm13, %xmm15\r
1455 movups 64(ctx), %xmm13\r
1456\r
1457 pxor iv, %xmm1 // obuf ^= iv; \r
1458 movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1459 pxor iv, %xmm2 // obuf ^= iv; \r
1460 movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1461 pxor iv, %xmm14 // obuf ^= iv; \r
1462 movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1463 pxor iv, %xmm15 // obuf ^= iv; \r
1464 movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1465\r
1466 movups %xmm1, (obuf) // write 1st obuf\r
1467 movups %xmm2, 16(obuf) // write 2nd obuf\r
1468 movups %xmm14, 32(obuf) // write 3rd obuf\r
1469 movups %xmm15, 48(obuf) // write 4th obuf\r
1470\r
1471 add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; \r
1472 add $64, obuf // obuf += AES_BLOCK_SIZE*4; \r
1473\r
1474 sub $4, num_blk // num_blk -= 4\r
1475	jge	0b				// if num_blk >= 0, repeat the loop
1476
14779:	add	$4, num_blk			// post increment num_blk by 4
1478	je	L_HW_cbc_done			// if num_blk == 0, no further processing needed
1479\r
1480 movups 48(ctx), %xmm14\r
1481 movups 32(ctx), %xmm15\r
1482\r
1483#else\r
1484\r
1485 sub $4, num_blk // pre decrement num_blk by 4\r
1486	jl	9f				// if num_blk < 4, skip the per-4-blocks processing code
14870:\r
1488 movups (ibuf), %xmm1 // tmp = 1st ibuf\r
1489 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf\r
1490 movups 32(ibuf), %xmm4 // tmp = 3rd ibuf\r
1491 movups 48(ibuf), %xmm5 // tmp = 4th ibuf\r
1492\r
1493 // aes_decrypt\r
1494 // for i386, sequentially load expanded keys into xmm6/xmm7\r
1495 movups 208(ctx), %xmm6\r
1496 pxor %xmm3, %xmm1\r
1497 pxor %xmm3, %xmm2\r
1498 pxor %xmm3, %xmm4\r
1499 pxor %xmm3, %xmm5\r
1500\r
1501 movups 192(ctx), %xmm7\r
1502 aesdec %xmm6, %xmm1\r
1503 aesdec %xmm6, %xmm2\r
1504 aesdec %xmm6, %xmm4\r
1505 aesdec %xmm6, %xmm5\r
1506\r
1507 movups 176(ctx), %xmm6\r
1508 aesdec %xmm7, %xmm1\r
1509 aesdec %xmm7, %xmm2\r
1510 aesdec %xmm7, %xmm4\r
1511 aesdec %xmm7, %xmm5\r
1512\r
1513 movups 160(ctx), %xmm7\r
1514 aesdec %xmm6, %xmm1\r
1515 aesdec %xmm6, %xmm2\r
1516 aesdec %xmm6, %xmm4\r
1517 aesdec %xmm6, %xmm5\r
1518\r
1519 movups 144(ctx), %xmm6\r
1520 aesdec %xmm7, %xmm1\r
1521 aesdec %xmm7, %xmm2\r
1522 aesdec %xmm7, %xmm4\r
1523 aesdec %xmm7, %xmm5\r
1524\r
1525 movups 128(ctx), %xmm7\r
1526 aesdec %xmm6, %xmm1\r
1527 aesdec %xmm6, %xmm2\r
1528 aesdec %xmm6, %xmm4\r
1529 aesdec %xmm6, %xmm5\r
1530\r
1531 movups 112(ctx), %xmm6\r
1532 aesdec %xmm7, %xmm1\r
1533 aesdec %xmm7, %xmm2\r
1534 aesdec %xmm7, %xmm4\r
1535 aesdec %xmm7, %xmm5\r
1536\r
1537 movups 96(ctx), %xmm7\r
1538 aesdec %xmm6, %xmm1\r
1539 aesdec %xmm6, %xmm2\r
1540 aesdec %xmm6, %xmm4\r
1541 aesdec %xmm6, %xmm5\r
1542\r
1543 movups 80(ctx), %xmm6\r
1544 aesdec %xmm7, %xmm1\r
1545 aesdec %xmm7, %xmm2\r
1546 aesdec %xmm7, %xmm4\r
1547 aesdec %xmm7, %xmm5\r
1548\r
1549 movups 64(ctx), %xmm7\r
1550 aesdec %xmm6, %xmm1\r
1551 aesdec %xmm6, %xmm2\r
1552 aesdec %xmm6, %xmm4\r
1553 aesdec %xmm6, %xmm5\r
1554\r
1555 movups 48(ctx), %xmm6\r
1556 aesdec %xmm7, %xmm1\r
1557 aesdec %xmm7, %xmm2\r
1558 aesdec %xmm7, %xmm4\r
1559 aesdec %xmm7, %xmm5\r
1560\r
1561 movups 32(ctx), %xmm7\r
1562 aesdec %xmm6, %xmm1\r
1563 aesdec %xmm6, %xmm2\r
1564 aesdec %xmm6, %xmm4\r
1565 aesdec %xmm6, %xmm5\r
1566\r
1567 movups 16(ctx), %xmm6\r
1568 aesdec %xmm7, %xmm1\r
1569 aesdec %xmm7, %xmm2\r
1570 aesdec %xmm7, %xmm4\r
1571 aesdec %xmm7, %xmm5\r
1572\r
1573 movups 0(ctx), %xmm7\r
1574 aesdec %xmm6, %xmm1\r
1575 aesdec %xmm6, %xmm2\r
1576 aesdec %xmm6, %xmm4\r
1577 aesdec %xmm6, %xmm5\r
1578\r
1579 aesdeclast %xmm7, %xmm1\r
1580 aesdeclast %xmm7, %xmm2\r
1581 aesdeclast %xmm7, %xmm4\r
1582 aesdeclast %xmm7, %xmm5\r
1583\r
1584 pxor iv, %xmm1 // 1st obuf ^= iv; \r
1585 movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1586 pxor iv, %xmm2 // 2nd obuf ^= iv; \r
1587 movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1588 pxor iv, %xmm4 // 3rd obuf ^= iv; \r
1589 movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1590 pxor iv, %xmm5 // 4th obuf ^= iv; \r
1591 movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1592 movups %xmm1, (obuf) // write 1st obuf\r
1593 movups %xmm2, 16(obuf) // write 2nd obuf\r
1594 movups %xmm4, 32(obuf) // write 3rd obuf\r
1595 movups %xmm5, 48(obuf) // write 4th obuf\r
1596\r
1597 add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; \r
1598 add $64, obuf // obuf += AES_BLOCK_SIZE * 4; \r
1599\r
1600 sub $4, num_blk // num_blk -= 4\r
1601	jge	0b				// if num_blk >= 0, repeat the loop
1602
1603
16049:	add	$4, num_blk			// post increment num_blk by 4
1605	je	L_HW_cbc_done			// if num_blk == 0, no further processing needed
1606\r
1607 movups 208(ctx), %xmm4\r
1608 movups 192(ctx), %xmm5\r
1609 movups 176(ctx), %xmm6\r
1610 movups 160(ctx), %xmm7\r
1611\r
1612#endif\r
1613\r
16140:\r
1615 movups (ibuf), %xmm2 // tmp = ibuf\r
1616\r
1617 // aes_decrypt\r
1618 pxor %xmm3, %xmm2\r
1619 aesdec %xmm4, %xmm2\r
1620 aesdec %xmm5, %xmm2\r
1621 aesdec %xmm6, %xmm2\r
1622 aesdec %xmm7, %xmm2\r
1623#if defined __x86_64__\r
1624 aesdec %xmm8, %xmm2\r
1625 aesdec %xmm9, %xmm2\r
1626 aesdec %xmm10, %xmm2\r
1627 aesdec %xmm11, %xmm2\r
1628 aesdec %xmm12, %xmm2\r
1629 aesdec %xmm13, %xmm2\r
1630 aesdec %xmm14, %xmm2\r
1631 aesdec %xmm15, %xmm2\r
1632#else\r
1633 movups 144(ctx), %xmm1\r
1634 aesdec %xmm1, %xmm2\r
1635 movups 128(ctx), %xmm1\r
1636 aesdec %xmm1, %xmm2\r
1637 movups 112(ctx), %xmm1\r
1638 aesdec %xmm1, %xmm2\r
1639 movups 96(ctx), %xmm1\r
1640 aesdec %xmm1, %xmm2\r
1641 movups 80(ctx), %xmm1\r
1642 aesdec %xmm1, %xmm2\r
1643 movups 64(ctx), %xmm1\r
1644 aesdec %xmm1, %xmm2\r
1645 movups 48(ctx), %xmm1\r
1646 aesdec %xmm1, %xmm2\r
1647 movups 32(ctx), %xmm1\r
1648 aesdec %xmm1, %xmm2\r
1649#endif\r
1650 movups 16(ctx), %xmm1\r
1651 aesdec %xmm1, %xmm2\r
1652 movups (ctx), %xmm1\r
1653 aesdeclast %xmm1, %xmm2\r
1654\r
1655 pxor iv, %xmm2 // obuf ^= iv; \r
1656 movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
1657\r
1658 movups %xmm2, (obuf) // write obuf\r
1659\r
1660 add $16, ibuf // ibuf += AES_BLOCK_SIZE; \r
1661 add $16, obuf // obuf += AES_BLOCK_SIZE; \r
1662 sub $1, num_blk // num_blk --\r
1663 jg 0b // if num_blk > 0, repeat the loop\r
1664\r
1665 jmp L_HW_cbc_done\r
1666\r
1667 //\r
1668 // --------- END of aes_decrypt_cbc_hw -------------------\r
1669 //\r