1 /*
2 ---------------------------------------------------------------------------
3 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
4
5 LICENSE TERMS
6
7 The free distribution and use of this software in both source and binary
8 form is allowed (with or without changes) provided that:
9
10 1. distributions of this source code include the above copyright
11 notice, this list of conditions and the following disclaimer;
12
13 2. distributions in binary form include the above copyright
14 notice, this list of conditions and the following disclaimer
15 in the documentation and/or other associated materials;
16
17 3. the copyright holder's name is not used to endorse products
18 built using this software without specific written permission.
19
20 ALTERNATIVELY, provided that this notice is retained in full, this product
21 may be distributed under the terms of the GNU General Public License (GPL),
22 in which case the provisions of the GPL apply INSTEAD OF those given above.
23
24 DISCLAIMER
25
26 This software is provided 'as is' with no explicit or implied warranties
27 in respect of its properties, including, but not limited to, correctness
28 and/or fitness for purpose.
29 ---------------------------------------------------------------------------
30 Issue 31/01/2006
31
32 These subroutines implement multiple block AES modes for ECB, CBC, CFB,
33     OFB and CTR encryption. The code provides support for the VIA Advanced
34 Cryptography Engine (ACE).
35
36 NOTE: In the following subroutines, the AES contexts (ctx) must be
37     16-byte aligned if VIA ACE is being used.
38 */
39
40 /* modified 3/5/10 cclee */
41 /* Cleaned up the VIA ACE related code and hand-optimized aes_cbc_encrypt and aes_cbc_decrypt */
42 /* Moved the xmm register save/restore that was originally inside the callee functions into these 2 caller functions */
43
44 /* HW-AES specific implementation cclee 3-12-10 */
45 /* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled,
46    and, if kHasAES is detected, we branch to the hw-specific functions here */
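/*
	Roughly, the dispatch those wrappers perform corresponds to the following C sketch
	(illustrative only: kHasAES and _get_cpu_capabilities() are assumed to come from
	<i386/cpu_capabilities.h>, aes_encrypt_cbc_sw is a hypothetical name for the generic
	software path, and the simplified __m128 signature follows the pseudocode used later
	in this file):

		#include <i386/cpu_capabilities.h>

		aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk,
		                         __m128 *obuf, const aes_encrypt_ctx *ctx)
		{
			if (_get_cpu_capabilities() & kHasAES)				// AES HW available ?
				return aes_encrypt_cbc_hw(ibuf, iv, num_blk, obuf, ctx);	// this file
			return aes_encrypt_cbc_sw(ibuf, iv, num_blk, obuf, ctx);		// generic software path
		}
*/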
47
48
49 /*
50      This file defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- Intel Westmere HW AES-based implementations
51 of _aes_encrypt_cbc and _aes_decrypt_cbc.
52
53      These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available.
54      They SHOULD NOT be called without AES HW detection; doing so may cause xnu to crash.
55
56 The AES HW is detected 1st thing in
57 _aes_encrypt_cbc (aes_modes_asm.s)
58 _aes_decrypt_cbc (aes_modes_asm.s)
59      and, if AES HW is detected, they branch without link (i.e., jump) to the functions here.
60
61 The implementation here follows the examples in an Intel White Paper
62 "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01
63
64      Note: Rev. 03 Final (2010 01 26) is available; it appears to contain some code changes relative to Rev. 2 01.
65
66 cclee 3-13-10
67 */
68
69 /*
70      The function _aes_decrypt_cbc_hw previously decrypted serially, block by block.
71      In our group meeting, Eric/Ali suggested that I take a look at combining multiple blocks
72      in a loop and interleaving multiple aesdec instructions to absorb/hide stalls and improve the decrypt throughput.
73
74 The idea was actually described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55)
75
76 This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode.
77 On a 2.4GHz core-i5/2.66GHz core-i7, the x86_64 decrypt throughput (in xnu-iokit) has been improved
78      from 1180/1332 to 1667/1858 MBytes/sec. This is approximately a 1.40x speedup in decryption.
79 The encrypt throughput is not changed.
80
81 I also enhanced the assembly code comments.
82
83 cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.)
84
85 */
86
87 /* ----------------------------------------------------------------------------------------------------------------
88
89 aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
90
91      For simplicity, I am assuming all variables are of a 128-bit data type.
92
93 aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
94 {
95 while(num_blk--) {
96 *iv ^= *ibuf++;
97 aes_encrypt(iv, iv, ctx);
98 *obuf++ = *iv;
99 }
100 return 0;
101 }
102
103 The following is an implementation of this function using Intel AESNI.
104 This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
105     Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
106     to this aesni-based function should it detect that aesni is available.
107     Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.
108
109 Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks
110 are serially chained. This prevents us from arranging several blocks for encryption in parallel.
111
112 ----------------------------------------------------------------------------------------------------------------*/
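/*
	Equivalently, a minimal C intrinsics sketch of the aes-128 case of this loop
	(illustrative only, not the code used here; it assumes the expanded encrypt keys are
	eleven consecutive 16-byte round keys starting at ctx, as the movups offsets below
	suggest, and it needs a compiler with AESNI support, e.g. -maes):

		#include <emmintrin.h>
		#include <wmmintrin.h>

		static void cbc_encrypt_128(const __m128i *ibuf, __m128i *iv, int num_blk,
		                            __m128i *obuf, const __m128i *key)		// key[0..10]
		{
			__m128i v = _mm_loadu_si128(iv);
			while (num_blk-- > 0) {
				v = _mm_xor_si128(v, _mm_loadu_si128(ibuf++));			// *iv ^= *ibuf++
				v = _mm_xor_si128(v, _mm_loadu_si128(key));			// round 0 (whitening)
				for (int r = 1; r < 10; r++)
					v = _mm_aesenc_si128(v, _mm_loadu_si128(key + r));	// rounds 1..9
				v = _mm_aesenclast_si128(v, _mm_loadu_si128(key + 10));		// final round
				_mm_storeu_si128(obuf++, v);					// *obuf++ = *iv
			}
			_mm_storeu_si128(iv, v);						// write back the chaining value
		}
*/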
113
114 .text
115 .align 4,0x90
116 .globl _aes_encrypt_cbc_hw
117 _aes_encrypt_cbc_hw:
118
119 // push/save registers for local use
120 #if defined __i386__
121
122 push %ebp
123 movl %esp, %ebp
124 push %ebx
125 push %edi
126
127 #define sp %esp
128
129 #else // __x86_64__
130
131 push %rbp
132 mov %rsp, %rbp
133 push %rbx
134 push %r13
135 push %r14
136 push %r15
137
138 #define sp %rsp
139
140 #endif
141
142 // if this is kernel code, need to save used xmm registers
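	// (kernel code cannot treat the xmm registers as scratch: they may hold live
	//  user-thread FP/SIMD state, hence the manual save/restore to the stack below)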
143 #ifdef KERNEL
144
145 #if defined __i386__
146 sub $(8*16), %esp // for possible xmm0-xmm7 save/restore
147 #else
148 sub $(16*16), %rsp // xmm0-xmm15 save/restore
149 #endif
150
151 movaps %xmm0, (sp)
152 movaps %xmm1, 16(sp)
153 movaps %xmm2, 32(sp)
154 movaps %xmm3, 48(sp)
155 movaps %xmm4, 64(sp)
156 movaps %xmm5, 80(sp)
157 movaps %xmm6, 96(sp)
158 movaps %xmm7, 112(sp)
159 #if defined __x86_64__
160 movaps %xmm8, 16*8(sp)
161 movaps %xmm9, 16*9(sp)
162 movaps %xmm10, 16*10(sp)
163 movaps %xmm11, 16*11(sp)
164 movaps %xmm12, 16*12(sp)
165 movaps %xmm13, 16*13(sp)
166 movaps %xmm14, 16*14(sp)
167 movaps %xmm15, 16*15(sp)
168 #endif // __x86_64__
169
170 #endif // KERNEL
171
172 #define iv %xmm0
173
174 #ifdef __i386__
175
176 mov 12(%ebp), %eax // in_iv
177 mov 24(%ebp), %edx // ctx
178 movups (%eax), iv // iv = in_iv
179 mov 8(%ebp), %ebx // ibuf
180 mov 16(%ebp), %ecx // num_blk
181 mov 20(%ebp), %edi // obuf
182
183 #define ibuf %ebx
184 #define obuf %edi
185 #define num_blk %ecx
186 #define ctx %edx
187
188 #else
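	// __x86_64__ : per the SysV ABI, the arguments arrive as ibuf/iv/num_blk/obuf/ctx in rdi/rsi/rdx/rcx/r8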
189
190 mov %rdi, %rbx // ibuf
191 movups (%rsi), iv // iv = in_iv
192 mov %rdx, %r13 // num_blk
193 mov %rcx, %r14 // obuf
194 mov %r8, %r15 // ctx
195
196 #define ibuf %rbx
197 #define num_blk %r13d
198 #define obuf %r14
199 #define ctx %r15
200
201 #endif
202
203 	mov 	240(ctx), %eax 			// aes length field = 16 * (number of rounds): 160/192/224
204 cmp $160, %eax // aes-128 encrypt ?
205 je L_encrypt_128
206 cmp $192, %eax // aes-192 encrypt ?
207 je L_encrypt_192
208 cmp $224, %eax // aes-256 encrypt ?
209 je L_encrypt_256
210 mov $-1, %eax // return error
211 jmp L_error
212
213 //
214 // aes-128 encrypt_cbc operation, up to L_HW_cbc_done
215 //
216
217 L_encrypt_128:
218
219 	cmp 	$1, num_blk				// check number of blocks
220 	jl 	L_HW_cbc_done				// if num_blk < 1, nothing to do
221
222 movups (ctx), %xmm2 // key0
223 movups 16(ctx), %xmm3 // key1
224 movups 32(ctx), %xmm4 // key2
225 movups 48(ctx), %xmm5 // key3
226 movups 64(ctx), %xmm6 // key4
227 movups 80(ctx), %xmm7 // key5
228 #if defined __x86_64__
229 movups 96(ctx), %xmm8 // key6
230 movups 112(ctx), %xmm9 // key7
231 movups 128(ctx), %xmm10 // key8
232 movups 144(ctx), %xmm11 // key9
233 movups 160(ctx), %xmm12 // keyA
234 #endif
235
236 // while (num_blk--) {
237 // *iv ^= *ibuf++;
238 // aes_encrypt(iv, iv, ctx);
239 // *obuf++ = *iv;
240 // }
241 0:
242 movups (ibuf), %xmm1 // *ibuf
243 pxor %xmm2, iv // 1st instruction inside aes_encrypt
244 pxor %xmm1, iv // *iv ^= *ibuf
245
246 // finishing up the rest of aes_encrypt
247 aesenc %xmm3, iv
248 aesenc %xmm4, iv
249 aesenc %xmm5, iv
250 aesenc %xmm6, iv
251 aesenc %xmm7, iv
252 #if defined __x86_64__
253 aesenc %xmm8, iv
254 aesenc %xmm9, iv
255 aesenc %xmm10, iv
256 aesenc %xmm11, iv
257 aesenclast %xmm12, iv
258 #else
259 movups 96(ctx), %xmm1 // key6
260 aesenc %xmm1, iv
261 movups 112(ctx), %xmm1 // key7
262 aesenc %xmm1, iv
263 movups 128(ctx), %xmm1 // key8
264 aesenc %xmm1, iv
265 movups 144(ctx), %xmm1 // key9
266 aesenc %xmm1, iv
267 movups 160(ctx), %xmm1 // keyA
268 aesenclast %xmm1, iv
269 #endif
270
271 movups iv, (obuf) // *obuf = *iv;
272 add $16, obuf // obuf++;
273 add $16, ibuf // ibuf++;
274 sub $1, num_blk // num_blk --
275 jg 0b // if num_blk > 0, repeat the loop
276
277 // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)
278
279 L_HW_cbc_done:
280
281 xor %eax, %eax // to return CRYPT_OK
282
283 L_error:
284
285 // if kernel, restore xmm registers
286 #ifdef KERNEL
287 movaps 0(sp), %xmm0
288 movaps 16(sp), %xmm1
289 movaps 32(sp), %xmm2
290 movaps 48(sp), %xmm3
291 movaps 64(sp), %xmm4
292 movaps 80(sp), %xmm5
293 movaps 96(sp), %xmm6
294 movaps 112(sp), %xmm7
295 #if defined __x86_64__
296 movaps 16*8(sp), %xmm8
297 movaps 16*9(sp), %xmm9
298 movaps 16*10(sp), %xmm10
299 movaps 16*11(sp), %xmm11
300 movaps 16*12(sp), %xmm12
301 movaps 16*13(sp), %xmm13
302 movaps 16*14(sp), %xmm14
303 movaps 16*15(sp), %xmm15
304 #endif // __x86_64__
305 #endif // KERNEL
306
307 // release used stack memory, restore used callee-saved registers, and return
308 #if defined __i386__
309 #ifdef KERNEL
310 add $(8*16), %esp
311 #endif
312 pop %edi
313 pop %ebx
314 #else
315 #ifdef KERNEL
316 add $(16*16), %rsp
317 #endif
318 pop %r15
319 pop %r14
320 pop %r13
321 pop %rbx
322 #endif
323 leave
324 ret
325
326 //
327 // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
328 //
329
330 L_encrypt_192:
331
332 	cmp 	$1, num_blk				// check number of blocks
333 	jl 	L_HW_cbc_done				// if num_blk < 1, nothing to do
334
335 movups (ctx), %xmm2 // key0
336 movups 16(ctx), %xmm3 // key1
337 movups 32(ctx), %xmm4 // key2
338 movups 48(ctx), %xmm5 // key3
339 movups 64(ctx), %xmm6 // key4
340 movups 80(ctx), %xmm7 // key5
341 #if defined __x86_64__
342 movups 96(ctx), %xmm8 // key6
343 movups 112(ctx), %xmm9 // key7
344 movups 128(ctx), %xmm10 // key8
345 movups 144(ctx), %xmm11 // key9
346 movups 160(ctx), %xmm12 // keyA
347 movups 176(ctx), %xmm13 // keyB
348 movups 192(ctx), %xmm14 // keyC
349 #endif
350
351 // while (num_blk--) {
352 // *iv ^= *ibuf++;
353 // aes_encrypt(iv, iv, ctx);
354 // *obuf++ = *iv;
355 // }
356 0:
357 movups (ibuf), %xmm1 // *ibuf
358 pxor %xmm1, iv // *iv ^= ibuf
359
360 // aes_encrypt(iv, iv, ctx);
361
362 pxor %xmm2, iv
363 aesenc %xmm3, iv
364 aesenc %xmm4, iv
365 aesenc %xmm5, iv
366 aesenc %xmm6, iv
367 aesenc %xmm7, iv
368 #if defined __x86_64__
369 aesenc %xmm8, iv
370 aesenc %xmm9, iv
371 aesenc %xmm10, iv
372 aesenc %xmm11, iv
373 aesenc %xmm12, iv
374 aesenc %xmm13, iv
375 aesenclast %xmm14, iv
376 #else
377 movups 96(ctx), %xmm1
378 aesenc %xmm1, iv
379 movups 112(ctx), %xmm1
380 aesenc %xmm1, iv
381 movups 128(ctx), %xmm1
382 aesenc %xmm1, iv
383 movups 144(ctx), %xmm1
384 aesenc %xmm1, iv
385 movups 160(ctx), %xmm1
386 aesenc %xmm1, iv
387 movups 176(ctx), %xmm1
388 aesenc %xmm1, iv
389 movups 192(ctx), %xmm1
390 aesenclast %xmm1, iv
391 #endif
392
393 movups iv, (obuf) // *obuf = *iv;
394 add $16, ibuf // ibuf++
395 add $16, obuf // obuf++
396
397 sub $1, num_blk // num_blk --
398 jg 0b // if num_blk > 0, repeat the loop
399
400 jmp L_HW_cbc_done // share with the common exit code
401
402 //
403 // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
404 //
405
406 L_encrypt_256:
407
408 	cmp 	$1, num_blk				// check number of blocks
409 	jl 	L_HW_cbc_done				// if num_blk < 1, nothing to do
410
411 movups (ctx), %xmm2 // key0
412 movups 16(ctx), %xmm3 // key1
413 movups 32(ctx), %xmm4 // key2
414 movups 48(ctx), %xmm5 // key3
415 movups 64(ctx), %xmm6 // key4
416 movups 80(ctx), %xmm7 // key5
417 #if defined __x86_64__
418 movups 96(ctx), %xmm8 // key6
419 movups 112(ctx), %xmm9 // key7
420 movups 128(ctx), %xmm10 // key8
421 movups 144(ctx), %xmm11 // key9
422 movups 160(ctx), %xmm12 // keyA
423 movups 176(ctx), %xmm13 // keyB
424 movups 192(ctx), %xmm14 // keyC
425 movups 208(ctx), %xmm15 // keyD
426 // movups 224(ctx), %xmm1 // keyE
427 #endif
428
429 // while (num_blk--) {
430 // *iv ^= *ibuf++;
431 // aes_encrypt(iv, iv, ctx);
432 // *obuf++ = *iv;
433 // }
434 0:
435 movups (ibuf), %xmm1 // *ibuf
436 pxor %xmm1, iv // *iv ^= ibuf
437
438 // aes_encrypt(iv, iv, ctx);
439 pxor %xmm2, iv
440 aesenc %xmm3, iv
441 aesenc %xmm4, iv
442 aesenc %xmm5, iv
443 aesenc %xmm6, iv
444 aesenc %xmm7, iv
445 #if defined __x86_64__
446 movups 224(ctx), %xmm1 // keyE
447 aesenc %xmm8, iv
448 aesenc %xmm9, iv
449 aesenc %xmm10, iv
450 aesenc %xmm11, iv
451 aesenc %xmm12, iv
452 aesenc %xmm13, iv
453 aesenc %xmm14, iv
454 aesenc %xmm15, iv
455 aesenclast %xmm1, iv
456 #else
457 movups 96(ctx), %xmm1 // key6
458 aesenc %xmm1, iv
459 movups 112(ctx), %xmm1 // key7
460 aesenc %xmm1, iv
461 movups 128(ctx), %xmm1 // key8
462 aesenc %xmm1, iv
463 movups 144(ctx), %xmm1 // key9
464 aesenc %xmm1, iv
465 movups 160(ctx), %xmm1 // keyA
466 aesenc %xmm1, iv
467 movups 176(ctx), %xmm1 // keyB
468 aesenc %xmm1, iv
469 movups 192(ctx), %xmm1 // keyC
470 aesenc %xmm1, iv
471 movups 208(ctx), %xmm1 // keyD
472 aesenc %xmm1, iv
473 movups 224(ctx), %xmm1 // keyE
474 aesenclast %xmm1, iv
475 #endif
476
477 movups iv, (obuf) // *obuf = *iv;
478 add $16, ibuf // ibuf++
479 add $16, obuf // obuf++
480
481 sub $1, num_blk // num_blk --
482 jg 0b // if num_blk > 0, repeat the loop
483
484 jmp L_HW_cbc_done // share with the common exit code
485
486
487
488 //
489 // --------- END of aes_encrypt_cbc_hw -------------------
490 //
491
492
493 /* ----------------------------------------------------------------------------------------------------------------
494
495 aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
496
497     For simplicity, I am assuming all variables are of a 128-bit data type.
498
499 aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
500 {
501 while(num_blk--) {
502 aes_decrypt(ibuf, obuf, ctx);
503 *obuf++ ^= *iv;
504 *iv = *ibuf++;
505 }
506 return 0;
507 }
508
509 The following is an implementation of this function using Intel AESNI.
510 This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
511     Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
512     to this aesni-based function should it detect that aesni is available.
513     Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.
514
515     Note that the decryption operation is independent across blocks.
516     This gives us the opportunity to arrange aes_decrypt operations in parallel to speed up the code.
517     This is equivalent to what is described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
518     The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc.
519
520 Example C code for packing 4 blocks in an iteration is shown as follows:
521
522 while ((num_blk-=4)>=0) {
523
524 // the following 4 functions can be interleaved to exploit parallelism
525 aes_decrypt(ibuf, obuf, ctx);
526 aes_decrypt(ibuf+1, obuf+1, ctx);
527 aes_decrypt(ibuf+2, obuf+2, ctx);
528 aes_decrypt(ibuf+3, obuf+3, ctx);
529
530             obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
531 *iv = ibuf[3]; ibuf += 4; obuf += 4;
532 }
533 num_blk+=4;
534
535 ----------------------------------------------------------------------------------------------------------------*/
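/*
	For reference, a minimal C intrinsics sketch of the 4-way interleaved aes-128 path
	(illustrative only, not the code used here; dkey[0..10] stands for the decryption
	round keys in the order they are consumed, i.e. dkey[0] is applied first and dkey[10]
	feeds aesdeclast; the assembly below loads them from 160(ctx) down to (ctx)):

		#include <emmintrin.h>
		#include <wmmintrin.h>

		static void cbc_decrypt_128_by4(const __m128i *ibuf, __m128i *iv, int num_blk,
		                                __m128i *obuf, const __m128i *dkey)
		{
			__m128i v = _mm_loadu_si128(iv);
			__m128i k0 = _mm_loadu_si128(dkey);				// whitening key
			__m128i klast = _mm_loadu_si128(dkey + 10);			// key for aesdeclast
			while ((num_blk -= 4) >= 0) {
				__m128i b0 = _mm_loadu_si128(ibuf);			// keep the 4 ciphertexts,
				__m128i b1 = _mm_loadu_si128(ibuf + 1);			// they become the next IVs
				__m128i b2 = _mm_loadu_si128(ibuf + 2);
				__m128i b3 = _mm_loadu_si128(ibuf + 3);
				__m128i t0 = _mm_xor_si128(b0, k0);
				__m128i t1 = _mm_xor_si128(b1, k0);
				__m128i t2 = _mm_xor_si128(b2, k0);
				__m128i t3 = _mm_xor_si128(b3, k0);
				for (int r = 1; r < 10; r++) {				// interleave the 4 streams
					__m128i k = _mm_loadu_si128(dkey + r);
					t0 = _mm_aesdec_si128(t0, k);
					t1 = _mm_aesdec_si128(t1, k);
					t2 = _mm_aesdec_si128(t2, k);
					t3 = _mm_aesdec_si128(t3, k);
				}
				t0 = _mm_aesdeclast_si128(t0, klast);
				t1 = _mm_aesdeclast_si128(t1, klast);
				t2 = _mm_aesdeclast_si128(t2, klast);
				t3 = _mm_aesdeclast_si128(t3, klast);
				_mm_storeu_si128(obuf,     _mm_xor_si128(t0, v));	// obuf[0] ^= *iv
				_mm_storeu_si128(obuf + 1, _mm_xor_si128(t1, b0));	// obuf[1] ^= ibuf[0]
				_mm_storeu_si128(obuf + 2, _mm_xor_si128(t2, b1));	// obuf[2] ^= ibuf[1]
				_mm_storeu_si128(obuf + 3, _mm_xor_si128(t3, b2));	// obuf[3] ^= ibuf[2]
				v = b3;							// *iv = ibuf[3]
				ibuf += 4; obuf += 4;
			}
			num_blk += 4;		// 0..3 blocks remain; handled 2-at-a-time / 1-at-a-time as below
			_mm_storeu_si128(iv, v);
		}
*/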
536
537 .text
538 .align 4,0x90
539 .globl _aes_decrypt_cbc_hw
540 _aes_decrypt_cbc_hw:
541
542 // push/save registers for local use
543 #if defined __i386__
544
545 push %ebp
546 movl %esp, %ebp
547 push %ebx // ibuf
548 push %edi // obuf
549
550 #define sp %esp
551
552 #else // __x86_64__
553
554 push %rbp
555 mov %rsp, %rbp
556 push %rbx
557 push %r13
558 push %r14
559 push %r15
560
561 #define sp %rsp
562
563 #endif
564
565
566 // if kernel, allocate stack space to save xmm registers
567 #ifdef KERNEL
568 #if defined __i386__
569 sub $(8*16), %esp
570 #else
571 sub $(16*16), %rsp
572 #endif
573 movaps %xmm0, (sp)
574 movaps %xmm1, 16(sp)
575 movaps %xmm2, 32(sp)
576 movaps %xmm3, 48(sp)
577 movaps %xmm4, 64(sp)
578 movaps %xmm5, 80(sp)
579 movaps %xmm6, 96(sp)
580 movaps %xmm7, 112(sp)
581 #if defined __x86_64__
582 movaps %xmm8, 16*8(sp)
583 movaps %xmm9, 16*9(sp)
584 movaps %xmm10, 16*10(sp)
585 movaps %xmm11, 16*11(sp)
586 movaps %xmm12, 16*12(sp)
587 movaps %xmm13, 16*13(sp)
588 movaps %xmm14, 16*14(sp)
589 movaps %xmm15, 16*15(sp)
590 #endif // __x86_64__
591 #endif
592
593 #undef iv
594 #define iv %xmm0
595
596 #if defined __i386__
597 mov 12(%ebp), %eax // in_iv
598 mov 24(%ebp), %edx // ctx
599 movups (%eax), iv // iv = in_iv
600 mov 8(%ebp), %ebx // ibuf
601 mov 16(%ebp), %ecx // num_blk
602 mov 20(%ebp), %edi // obuf
603
604 #define ibuf %ebx
605 #define obuf %edi
606 #define num_blk %ecx
607 #define ctx %edx
608
609 #else // __x86_64__, rdi/rsi/rdx/rcx/r8
610
611 mov %rdi, %rbx // ibuf
612 movups (%rsi), iv // iv = in_iv
613 mov %rdx, %r13 // num_blk
614 mov %rcx, %r14 // obuf
615 mov %r8, %r15 // ctx
616
617 #define ibuf %rbx
618 #define num_blk %r13d
619 #define obuf %r14
620 #define ctx %r15
621
622 #endif
623
624 	mov 	240(ctx), %eax 			// aes length field = 16 * (number of rounds): 160/192/224
625 cmp $160, %eax // aes-128 decrypt
626 je L_decrypt_128
627 cmp $192, %eax // aes-192 decrypt
628 je L_decrypt_192
629 cmp $224, %eax // aes-256 decrypt
630 je L_decrypt_256
631
632 mov $-1, %eax // wrong aes length, to return -1
633 jmp L_error // early exit due to wrong aes length
634
635
636 //
637 // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
638 //
639
640 L_decrypt_128:
641
642 cmp $1, num_blk
643 jl L_HW_cbc_done // if num_blk < 1, early return
644
645 // aes-128 decrypt expanded keys
646 movups 160(ctx), %xmm3
647 movups 144(ctx), %xmm4
648 movups 128(ctx), %xmm5
649 movups 112(ctx), %xmm6
650 movups 96(ctx), %xmm7
651 #if defined __x86_64__
652 movups 80(ctx), %xmm8
653 movups 64(ctx), %xmm9
654 movups 48(ctx), %xmm10
655 movups 32(ctx), %xmm11
656 movups 16(ctx), %xmm12
657 movups 0(ctx), %xmm13
658 #endif
659
660 // performs 4 block decryption in an iteration to exploit decrypt in parallel
661
662 // while ((num_blk-=4)>=0) {
663 // aes_decrypt(ibuf, obuf, ctx);
664 // aes_decrypt(ibuf+1, obuf+1, ctx);
665 // aes_decrypt(ibuf+2, obuf+2, ctx);
666 // aes_decrypt(ibuf+3, obuf+3, ctx);
667 	//		obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
668 // *iv = ibuf[3]; ibuf += 4; obuf += 4;
669 // }
670
671 sub $4, num_blk // pre decrement num_blk by 4
672 jl 9f // if num_blk < 4, skip the per-4-blocks processing code
673
674 0:
675
676
677 #if defined __x86_64__
678
679 movups (ibuf), %xmm1 // tmp = 1st ibuf
680 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
681 movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
682 movups 48(ibuf), %xmm15 // tmp = 4th ibuf
683
684 // for x86_64, the expanded keys are already stored in xmm3-xmm13
685
686 // aes-128 decrypt round 0 per 4 blocks
687 pxor %xmm3, %xmm1
688 pxor %xmm3, %xmm2
689 pxor %xmm3, %xmm14
690 pxor %xmm3, %xmm15
691
692 // aes-128 decrypt round 1 per 4 blocks
693 aesdec %xmm4, %xmm1
694 aesdec %xmm4, %xmm2
695 aesdec %xmm4, %xmm14
696 aesdec %xmm4, %xmm15
697
698 // aes-128 decrypt round 2 per 4 blocks
699 aesdec %xmm5, %xmm1
700 aesdec %xmm5, %xmm2
701 aesdec %xmm5, %xmm14
702 aesdec %xmm5, %xmm15
703
704 // aes-128 decrypt round 3 per 4 blocks
705 aesdec %xmm6, %xmm1
706 aesdec %xmm6, %xmm2
707 aesdec %xmm6, %xmm14
708 aesdec %xmm6, %xmm15
709
710 // aes-128 decrypt round 4 per 4 blocks
711 aesdec %xmm7, %xmm1
712 aesdec %xmm7, %xmm2
713 aesdec %xmm7, %xmm14
714 aesdec %xmm7, %xmm15
715
716 // aes-128 decrypt round 5 per 4 blocks
717 aesdec %xmm8, %xmm1
718 aesdec %xmm8, %xmm2
719 aesdec %xmm8, %xmm14
720 aesdec %xmm8, %xmm15
721
722 // aes-128 decrypt round 6 per 4 blocks
723 aesdec %xmm9, %xmm1
724 aesdec %xmm9, %xmm2
725 aesdec %xmm9, %xmm14
726 aesdec %xmm9, %xmm15
727
728 // aes-128 decrypt round 7 per 4 blocks
729 aesdec %xmm10, %xmm1
730 aesdec %xmm10, %xmm2
731 aesdec %xmm10, %xmm14
732 aesdec %xmm10, %xmm15
733
734 // aes-128 decrypt round 8 per 4 blocks
735 aesdec %xmm11, %xmm1
736 aesdec %xmm11, %xmm2
737 aesdec %xmm11, %xmm14
738 aesdec %xmm11, %xmm15
739
740 // aes-128 decrypt round 9 per 4 blocks
741 aesdec %xmm12, %xmm1
742 aesdec %xmm12, %xmm2
743 aesdec %xmm12, %xmm14
744 aesdec %xmm12, %xmm15
745
746 // aes-128 decrypt round 10 (last) per 4 blocks
747 aesdeclast %xmm13, %xmm1
748 aesdeclast %xmm13, %xmm2
749 aesdeclast %xmm13, %xmm14
750 aesdeclast %xmm13, %xmm15
751
752 pxor iv, %xmm1 // obuf[0] ^= *iv;
753 movups (ibuf), iv // ibuf[0]
754 pxor iv, %xmm2 // obuf[1] ^= ibuf[0];
755 movups 16(ibuf), iv // ibuf[1]
756 pxor iv, %xmm14 // obuf[2] ^= ibuf[1];
757 movups 32(ibuf), iv // ibuf[2]
758 	pxor 	iv, %xmm15 				// obuf[3] ^= ibuf[2];
759 movups 48(ibuf), iv // *iv = ibuf[3]
760
761 movups %xmm1, (obuf) // write 1st obuf
762 movups %xmm2, 16(obuf) // write 2nd obuf
763 movups %xmm14, 32(obuf) // write 3rd obuf
764 movups %xmm15, 48(obuf) // write 4th obuf
765
766
767 #else
768
769 // aes_decrypt_cbc per 4 blocks using aes-128 for i386
770 // xmm1/xmm2/xmm4/xmm5 used for obuf per block
771 // xmm3 = key0
772 // xmm0 = iv
773 // xmm6/xmm7 dynamically load with other expanded keys
774
775 movups (ibuf), %xmm1 // tmp = 1st ibuf
776 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
777 movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
778 movups 48(ibuf), %xmm5 // tmp = 4th ibuf
779
780 // aes_decrypt
781 // for i386, sequentially load expanded keys into xmm6/xmm7
782
783 movups 144(ctx), %xmm6 // key1
784
785 // aes-128 decrypt round 0 per 4 blocks
786 pxor %xmm3, %xmm1
787 pxor %xmm3, %xmm2
788 pxor %xmm3, %xmm4
789 pxor %xmm3, %xmm5
790
791 movups 128(ctx), %xmm7 // key2
792
793 // aes-128 decrypt round 1 per 4 blocks
794 aesdec %xmm6, %xmm1
795 aesdec %xmm6, %xmm2
796 aesdec %xmm6, %xmm4
797 aesdec %xmm6, %xmm5
798
799 movups 112(ctx), %xmm6 // key3
800
801 // aes-128 decrypt round 2 per 4 blocks
802 aesdec %xmm7, %xmm1
803 aesdec %xmm7, %xmm2
804 aesdec %xmm7, %xmm4
805 aesdec %xmm7, %xmm5
806
807 movups 96(ctx), %xmm7 // key4
808
809 // aes-128 decrypt round 3 per 4 blocks
810 aesdec %xmm6, %xmm1
811 aesdec %xmm6, %xmm2
812 aesdec %xmm6, %xmm4
813 aesdec %xmm6, %xmm5
814
815 movups 80(ctx), %xmm6 // key5
816
817 // aes-128 decrypt round 4 per 4 blocks
818 aesdec %xmm7, %xmm1
819 aesdec %xmm7, %xmm2
820 aesdec %xmm7, %xmm4
821 aesdec %xmm7, %xmm5
822
823 movups 64(ctx), %xmm7 // key6
824
825 // aes-128 decrypt round 5 per 4 blocks
826 aesdec %xmm6, %xmm1
827 aesdec %xmm6, %xmm2
828 aesdec %xmm6, %xmm4
829 aesdec %xmm6, %xmm5
830
831 movups 48(ctx), %xmm6 // key7
832
833 // aes-128 decrypt round 6 per 4 blocks
834 aesdec %xmm7, %xmm1
835 aesdec %xmm7, %xmm2
836 aesdec %xmm7, %xmm4
837 aesdec %xmm7, %xmm5
838
839 movups 32(ctx), %xmm7 // key8
840
841 // aes-128 decrypt round 7 per 4 blocks
842 aesdec %xmm6, %xmm1
843 aesdec %xmm6, %xmm2
844 aesdec %xmm6, %xmm4
845 aesdec %xmm6, %xmm5
846
847 movups 16(ctx), %xmm6 // key9
848
849 // aes-128 decrypt round 8 per 4 blocks
850 aesdec %xmm7, %xmm1
851 aesdec %xmm7, %xmm2
852 aesdec %xmm7, %xmm4
853 aesdec %xmm7, %xmm5
854
855 movups 0(ctx), %xmm7 // keyA
856
857 // aes-128 decrypt round 9 per 4 blocks
858 aesdec %xmm6, %xmm1
859 aesdec %xmm6, %xmm2
860 aesdec %xmm6, %xmm4
861 aesdec %xmm6, %xmm5
862
863 // aes-128 decrypt round 10 (last) per 4 blocks
864 aesdeclast %xmm7, %xmm1
865 aesdeclast %xmm7, %xmm2
866 aesdeclast %xmm7, %xmm4
867 aesdeclast %xmm7, %xmm5
868
869 pxor iv, %xmm1 // 1st obuf ^= iv;
870 movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
871 pxor iv, %xmm2 // 2nd obuf ^= iv;
872 movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
873 pxor iv, %xmm4 // 3rd obuf ^= iv;
874 movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
875 pxor iv, %xmm5 // 4th obuf ^= iv;
876 movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
877
878 movups %xmm1, (obuf) // write 1st obuf
879 movups %xmm2, 16(obuf) // write 2nd obuf
880 movups %xmm4, 32(obuf) // write 3rd obuf
881 movups %xmm5, 48(obuf) // write 4th obuf
882 #endif
883
884 add $64, ibuf // ibuf += 4;
885 add $64, obuf // obuf += 4;
886
887 sub $4, num_blk // num_blk -= 4
888 	jge 	0b 					// if num_blk >= 0 (at least 4 more blocks), repeat the loop
889
890 9:	add 	$4, num_blk				// post increment num_blk by 4
891 	je 	L_HW_cbc_done				// if num_blk == 0, no further processing needed
892
893 #if defined __i386__
894 	// reload xmm4-xmm7 with expanded keys (they were used as data/scratch registers in the 4-block loop above)
895 movups 144(ctx), %xmm4
896 movups 128(ctx), %xmm5
897 movups 112(ctx), %xmm6
898 movups 96(ctx), %xmm7
899 #endif
900
901 test $2, num_blk // check whether num_blk has 2 blocks
902 je 9f // if num_blk & 2 == 0, skip the per-pair processing code
903
904 // do the remaining 2 blocks together
905
906 movups (ibuf), %xmm1 // tmp = 1st ibuf
907 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
908
909 // aes_decrypt
910 pxor %xmm3, %xmm1
911 pxor %xmm3, %xmm2
912 aesdec %xmm4, %xmm1
913 aesdec %xmm4, %xmm2
914 aesdec %xmm5, %xmm1
915 aesdec %xmm5, %xmm2
916 aesdec %xmm6, %xmm1
917 aesdec %xmm6, %xmm2
918 #if defined __x86_64__
919 aesdec %xmm7, %xmm1
920 aesdec %xmm7, %xmm2
921 aesdec %xmm8, %xmm1
922 aesdec %xmm8, %xmm2
923 aesdec %xmm9, %xmm1
924 aesdec %xmm9, %xmm2
925 aesdec %xmm10, %xmm1
926 aesdec %xmm10, %xmm2
927 aesdec %xmm11, %xmm1
928 aesdec %xmm11, %xmm2
929 aesdec %xmm12, %xmm1
930 aesdec %xmm12, %xmm2
931 aesdeclast %xmm13, %xmm1
932 aesdeclast %xmm13, %xmm2
933 #else
934 movups 80(ctx), %xmm6
935 aesdec %xmm7, %xmm1
936 aesdec %xmm7, %xmm2
937 movups 64(ctx), %xmm7
938 aesdec %xmm6, %xmm1
939 aesdec %xmm6, %xmm2
940 movups 48(ctx), %xmm6
941 aesdec %xmm7, %xmm1
942 aesdec %xmm7, %xmm2
943 movups 32(ctx), %xmm7
944 aesdec %xmm6, %xmm1
945 aesdec %xmm6, %xmm2
946 movups 16(ctx), %xmm6
947 aesdec %xmm7, %xmm1
948 aesdec %xmm7, %xmm2
949 movups 0(ctx), %xmm7
950 aesdec %xmm6, %xmm1
951 aesdec %xmm6, %xmm2
952 aesdeclast %xmm7, %xmm1
953 aesdeclast %xmm7, %xmm2
954 movups 112(ctx), %xmm6
955 movups 96(ctx), %xmm7
956 #endif
957
958 pxor iv, %xmm1 // obuf[0] ^= *iv;
959 movups (ibuf), iv // ibuf[0]
960 pxor iv, %xmm2 // obuf[1] ^= ibuf[0]
961 movups 16(ibuf), iv // *iv = ibuf[1]
962
963 movups %xmm1, (obuf) // write obuf[0]
964 movups %xmm2, 16(obuf) // write obuf[1]
965
966 add $32, ibuf // ibuf += 2
967 add $32, obuf // obuf += 2
968
969 9:
970 test $1, num_blk // check whether num_blk has residual 1 block
971 je L_HW_cbc_done // if num_blk == 0, no need for residual processing code
972
973 movups (ibuf), %xmm2 // tmp = ibuf
974 // aes_decrypt
975 pxor %xmm3, %xmm2
976 aesdec %xmm4, %xmm2
977 aesdec %xmm5, %xmm2
978 aesdec %xmm6, %xmm2
979 aesdec %xmm7, %xmm2
980 #if defined __x86_64__
981 aesdec %xmm8, %xmm2
982 aesdec %xmm9, %xmm2
983 aesdec %xmm10, %xmm2
984 aesdec %xmm11, %xmm2
985 aesdec %xmm12, %xmm2
986 aesdeclast %xmm13, %xmm2
987 #else
988 movups 80(ctx), %xmm1
989 aesdec %xmm1, %xmm2
990 movups 64(ctx), %xmm1
991 aesdec %xmm1, %xmm2
992 movups 48(ctx), %xmm1
993 aesdec %xmm1, %xmm2
994 movups 32(ctx), %xmm1
995 aesdec %xmm1, %xmm2
996 movups 16(ctx), %xmm1
997 aesdec %xmm1, %xmm2
998 movups (ctx), %xmm1
999 aesdeclast %xmm1, %xmm2
1000 #endif
1001
1002 pxor iv, %xmm2 // *obuf ^= *iv;
1003 movups (ibuf), iv // *iv = *ibuf;
1004 movups %xmm2, (obuf) // write *obuf
1005
1006 jmp L_HW_cbc_done
1007
1008 //
1009 // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
1010 //
1011
1012 L_decrypt_192:
1013
1014 cmp $1, num_blk
1015 jl L_HW_cbc_done // if num_blk < 1, early return
1016
1018 	// aes-192 decrypt expanded keys
1018 movups 192(ctx), %xmm3
1019 movups 176(ctx), %xmm4
1020 movups 160(ctx), %xmm5
1021 movups 144(ctx), %xmm6
1022 movups 128(ctx), %xmm7
1023 #if defined __x86_64__
1024 movups 112(ctx), %xmm8
1025 movups 96(ctx), %xmm9
1026 movups 80(ctx), %xmm10
1027 movups 64(ctx), %xmm11
1028 movups 48(ctx), %xmm12
1029 movups 32(ctx), %xmm13
1030 movups 16(ctx), %xmm14
1031 movups (ctx), %xmm15
1032 #endif
1033
1034 // performs 4 block decryption in an iteration to exploit decrypt in parallel
1035
1036 // while ((num_blk-=4)>=0) {
1037 // aes_decrypt(ibuf, obuf, ctx);
1038 // aes_decrypt(ibuf+1, obuf+1, ctx);
1039 // aes_decrypt(ibuf+2, obuf+2, ctx);
1040 // aes_decrypt(ibuf+3, obuf+3, ctx);
1041 	//		obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
1042 // *iv = ibuf[3]; ibuf += 4; obuf += 4;
1043 // }
1044
1045 sub $4, num_blk // pre decrement num_blk by 4
1046 jl 9f // if num_blk < 4, skip the per-4-blocks processing code
1047 0:
1048
1049 #if defined __x86_64__
1050
1051 movups (ibuf), %xmm1 // tmp = 1st ibuf
1052 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
1053 movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
1054 movups 48(ibuf), %xmm15 // tmp = 4th ibuf
1055
1056 // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
1057 	// %xmm12/%xmm13 are reused as dynamic keys in the middle, and restored afterwards
1058
1059 // round 0 for 4 blocks
1060 pxor %xmm3, %xmm1
1061 pxor %xmm3, %xmm2
1062 pxor %xmm3, %xmm14
1063 pxor %xmm3, %xmm15
1064
1065 // round 1 for 4 blocks
1066 aesdec %xmm4, %xmm1
1067 aesdec %xmm4, %xmm2
1068 aesdec %xmm4, %xmm14
1069 aesdec %xmm4, %xmm15
1070
1071 // round 2 for 4 blocks
1072 aesdec %xmm5, %xmm1
1073 aesdec %xmm5, %xmm2
1074 aesdec %xmm5, %xmm14
1075 aesdec %xmm5, %xmm15
1076
1077 // round 3 for 4 blocks
1078 aesdec %xmm6, %xmm1
1079 aesdec %xmm6, %xmm2
1080 aesdec %xmm6, %xmm14
1081 aesdec %xmm6, %xmm15
1082
1083 // round 4 for 4 blocks
1084 aesdec %xmm7, %xmm1
1085 aesdec %xmm7, %xmm2
1086 aesdec %xmm7, %xmm14
1087 aesdec %xmm7, %xmm15
1088
1089 // round 5 for 4 blocks
1090 aesdec %xmm8, %xmm1
1091 aesdec %xmm8, %xmm2
1092 aesdec %xmm8, %xmm14
1093 aesdec %xmm8, %xmm15
1094
1095 // round 6 for 4 blocks
1096 aesdec %xmm9, %xmm1
1097 aesdec %xmm9, %xmm2
1098 aesdec %xmm9, %xmm14
1099 aesdec %xmm9, %xmm15
1100
1101 // round 7 for 4 blocks
1102 aesdec %xmm10, %xmm1
1103 aesdec %xmm10, %xmm2
1104 aesdec %xmm10, %xmm14
1105 aesdec %xmm10, %xmm15
1106
1107 // round 8 for 4 blocks
1108 aesdec %xmm11, %xmm1
1109 aesdec %xmm11, %xmm2
1110 aesdec %xmm11, %xmm14
1111 aesdec %xmm11, %xmm15
1112
1113 // round 9 for 4 blocks
1114 aesdec %xmm12, %xmm1
1115 aesdec %xmm12, %xmm2
1116 aesdec %xmm12, %xmm14
1117 aesdec %xmm12, %xmm15
1118
1119 movups 16(ctx), %xmm12
1120
1121 // round A for 4 blocks
1122 aesdec %xmm13, %xmm1
1123 aesdec %xmm13, %xmm2
1124 aesdec %xmm13, %xmm14
1125 aesdec %xmm13, %xmm15
1126
1127 movups (ctx), %xmm13
1128
1129 // round B for 4 blocks
1130 aesdec %xmm12, %xmm1
1131 aesdec %xmm12, %xmm2
1132 aesdec %xmm12, %xmm14
1133 aesdec %xmm12, %xmm15
1134
1135 movups 48(ctx), %xmm12 // restore %xmm12 to its original key
1136
1137 // round C (last) for 4 blocks
1138 aesdeclast %xmm13, %xmm1
1139 aesdeclast %xmm13, %xmm2
1140 aesdeclast %xmm13, %xmm14
1141 aesdeclast %xmm13, %xmm15
1142
1143 movups 32(ctx), %xmm13 // restore %xmm13 to its original key
1144
1145 pxor iv, %xmm1 // obuf[0] ^= *iv;
1146 movups (ibuf), iv // ibuf[0]
1147 pxor iv, %xmm2 // obuf[1] ^= ibuf[0]
1148 movups 16(ibuf), iv // ibuf[1]
1149 pxor iv, %xmm14 // obuf[2] ^= ibuf[1]
1150 movups 32(ibuf), iv // ibuf[2]
1151 pxor iv, %xmm15 // obuf[3] ^= ibuf[2]
1152 movups 48(ibuf), iv // *iv = ibuf[3]
1153
1154 movups %xmm1, (obuf) // write 1st obuf
1155 movups %xmm2, 16(obuf) // write 2nd obuf
1156 movups %xmm14, 32(obuf) // write 3rd obuf
1157 movups %xmm15, 48(obuf) // write 4th obuf
1158
1159 add $64, ibuf // ibuf += 4;
1160 add $64, obuf // obuf += 4;
1161
1162 sub $4, num_blk // num_blk -= 4
1163 	jge 	0b 					// if num_blk >= 0 (at least 4 more blocks), repeat the loop
1164
1165 9:	add 	$4, num_blk				// post increment num_blk by 4
1166 je L_HW_cbc_done // if num_blk == 0, prepare to return
1167
1168 movups 16(ctx), %xmm14 // restore %xmm14 to its key
1169 movups (ctx), %xmm15 // restore %xmm15 to its key
1170
1171 #else
1172
1173 movups (ibuf), %xmm1 // tmp = 1st ibuf
1174 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
1175 movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
1176 movups 48(ibuf), %xmm5 // tmp = 4th ibuf
1177
1178 // aes_decrypt
1179 // for i386, sequentially load expanded keys into xmm6/xmm7
1180 movups 176(ctx), %xmm6
1181 pxor %xmm3, %xmm1
1182 pxor %xmm3, %xmm2
1183 pxor %xmm3, %xmm4
1184 pxor %xmm3, %xmm5
1185
1186 movups 160(ctx), %xmm7
1187 aesdec %xmm6, %xmm1
1188 aesdec %xmm6, %xmm2
1189 aesdec %xmm6, %xmm4
1190 aesdec %xmm6, %xmm5
1191
1192 movups 144(ctx), %xmm6
1193 aesdec %xmm7, %xmm1
1194 aesdec %xmm7, %xmm2
1195 aesdec %xmm7, %xmm4
1196 aesdec %xmm7, %xmm5
1197
1198 movups 128(ctx), %xmm7
1199 aesdec %xmm6, %xmm1
1200 aesdec %xmm6, %xmm2
1201 aesdec %xmm6, %xmm4
1202 aesdec %xmm6, %xmm5
1203
1204 movups 112(ctx), %xmm6
1205 aesdec %xmm7, %xmm1
1206 aesdec %xmm7, %xmm2
1207 aesdec %xmm7, %xmm4
1208 aesdec %xmm7, %xmm5
1209
1210 movups 96(ctx), %xmm7
1211 aesdec %xmm6, %xmm1
1212 aesdec %xmm6, %xmm2
1213 aesdec %xmm6, %xmm4
1214 aesdec %xmm6, %xmm5
1215
1216 movups 80(ctx), %xmm6
1217 aesdec %xmm7, %xmm1
1218 aesdec %xmm7, %xmm2
1219 aesdec %xmm7, %xmm4
1220 aesdec %xmm7, %xmm5
1221
1222 movups 64(ctx), %xmm7
1223 aesdec %xmm6, %xmm1
1224 aesdec %xmm6, %xmm2
1225 aesdec %xmm6, %xmm4
1226 aesdec %xmm6, %xmm5
1227
1228 movups 48(ctx), %xmm6
1229 aesdec %xmm7, %xmm1
1230 aesdec %xmm7, %xmm2
1231 aesdec %xmm7, %xmm4
1232 aesdec %xmm7, %xmm5
1233
1234 movups 32(ctx), %xmm7
1235 aesdec %xmm6, %xmm1
1236 aesdec %xmm6, %xmm2
1237 aesdec %xmm6, %xmm4
1238 aesdec %xmm6, %xmm5
1239
1240 movups 16(ctx), %xmm6
1241 aesdec %xmm7, %xmm1
1242 aesdec %xmm7, %xmm2
1243 aesdec %xmm7, %xmm4
1244 aesdec %xmm7, %xmm5
1245
1246 movups 0(ctx), %xmm7
1247 aesdec %xmm6, %xmm1
1248 aesdec %xmm6, %xmm2
1249 aesdec %xmm6, %xmm4
1250 aesdec %xmm6, %xmm5
1251
1252 aesdeclast %xmm7, %xmm1
1253 aesdeclast %xmm7, %xmm2
1254 aesdeclast %xmm7, %xmm4
1255 aesdeclast %xmm7, %xmm5
1256
1257 pxor iv, %xmm1 // 1st obuf ^= iv;
1258 movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
1259 pxor iv, %xmm2 // 2nd obuf ^= iv;
1260 movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
1261 pxor iv, %xmm4 // 3rd obuf ^= iv;
1262 movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
1263 pxor iv, %xmm5 // 4th obuf ^= iv;
1264 movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
1265 movups %xmm1, (obuf) // write 1st obuf
1266 movups %xmm2, 16(obuf) // write 2nd obuf
1267 movups %xmm4, 32(obuf) // write 3rd obuf
1268 movups %xmm5, 48(obuf) // write 4th obuf
1269
1270 add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4;
1271 add $64, obuf // obuf += AES_BLOCK_SIZE * 4;
1272
1273 sub $4, num_blk // num_blk -= 4
1274 	jge 	0b 					// if num_blk >= 0 (at least 4 more blocks), repeat the loop
1275
1276
1277 9:	add 	$4, num_blk				// post increment num_blk by 4
1278 	je 	L_HW_cbc_done				// if num_blk == 0, no further processing needed
1279
1280 movups 176(ctx), %xmm4
1281 movups 160(ctx), %xmm5
1282 movups 144(ctx), %xmm6
1283 movups 128(ctx), %xmm7
1284
1285 #endif
1286
1287 // per-block aes_decrypt_cbc loop
1288
1289 0:
1290 movups (ibuf), %xmm2 // tmp = ibuf
1291
1292 // aes_decrypt
1293 pxor %xmm3, %xmm2
1294 aesdec %xmm4, %xmm2
1295 aesdec %xmm5, %xmm2
1296 aesdec %xmm6, %xmm2
1297 aesdec %xmm7, %xmm2
1298 #if defined __x86_64__
1299 aesdec %xmm8, %xmm2
1300 aesdec %xmm9, %xmm2
1301 aesdec %xmm10, %xmm2
1302 aesdec %xmm11, %xmm2
1303 aesdec %xmm12, %xmm2
1304 aesdec %xmm13, %xmm2
1305 aesdec %xmm14, %xmm2
1306 aesdeclast %xmm15, %xmm2
1307 #else
1308 movups 112(ctx), %xmm1
1309 aesdec %xmm1, %xmm2
1310 movups 96(ctx), %xmm1
1311 aesdec %xmm1, %xmm2
1312 movups 80(ctx), %xmm1
1313 aesdec %xmm1, %xmm2
1314 movups 64(ctx), %xmm1
1315 aesdec %xmm1, %xmm2
1316 movups 48(ctx), %xmm1
1317 aesdec %xmm1, %xmm2
1318 movups 32(ctx), %xmm1
1319 aesdec %xmm1, %xmm2
1320 movups 16(ctx), %xmm1
1321 aesdec %xmm1, %xmm2
1322 movups (ctx), %xmm1
1323 aesdeclast %xmm1, %xmm2
1324 #endif
1325
1326 pxor iv, %xmm2 // obuf ^= iv;
1327 movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
1328
1329 movups %xmm2, (obuf) // write obuf
1330
1331 add $16, ibuf // ibuf += AES_BLOCK_SIZE;
1332 add $16, obuf // obuf += AES_BLOCK_SIZE;
1333 sub $1, num_blk // num_blk --
1334 jg 0b // if num_blk > 0, repeat the loop
1335
1336 jmp L_HW_cbc_done
1337
1338 //
1339 // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
1340 //
1341
1342 L_decrypt_256:
1343
1344 cmp $1, num_blk
1345 jl L_HW_cbc_done
1346
1347 movups 224(ctx), %xmm3
1348 movups 208(ctx), %xmm4
1349 movups 192(ctx), %xmm5
1350 movups 176(ctx), %xmm6
1351 movups 160(ctx), %xmm7
1352 #if defined __x86_64__
1353 movups 144(ctx), %xmm8
1354 movups 128(ctx), %xmm9
1355 movups 112(ctx), %xmm10
1356 movups 96(ctx), %xmm11
1357 movups 80(ctx), %xmm12
1358 movups 64(ctx), %xmm13
1359 movups 48(ctx), %xmm14
1360 movups 32(ctx), %xmm15
1361 // movups 16(ctx), %xmm14
1362 // movups (ctx), %xmm15
1363 #endif
1364
1365 #if defined __x86_64__
1366
1367 sub $4, num_blk // pre decrement num_blk by 4
1368 jl 9f // if num_blk < 4, skip the per-4-blocks processing code
1369 0:
1370 movups (ibuf), %xmm1 // tmp = 1st ibuf
1371 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
1372 movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
1373 movups 48(ibuf), %xmm15 // tmp = 4th ibuf
1374
1375 // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
1376 pxor %xmm3, %xmm1
1377 pxor %xmm3, %xmm2
1378 pxor %xmm3, %xmm14
1379 pxor %xmm3, %xmm15
1380
1381 aesdec %xmm4, %xmm1
1382 aesdec %xmm4, %xmm2
1383 aesdec %xmm4, %xmm14
1384 aesdec %xmm4, %xmm15
1385
1386 aesdec %xmm5, %xmm1
1387 aesdec %xmm5, %xmm2
1388 aesdec %xmm5, %xmm14
1389 aesdec %xmm5, %xmm15
1390
1391 aesdec %xmm6, %xmm1
1392 aesdec %xmm6, %xmm2
1393 aesdec %xmm6, %xmm14
1394 aesdec %xmm6, %xmm15
1395
1396 aesdec %xmm7, %xmm1
1397 aesdec %xmm7, %xmm2
1398 aesdec %xmm7, %xmm14
1399 aesdec %xmm7, %xmm15
1400
1401 aesdec %xmm8, %xmm1
1402 aesdec %xmm8, %xmm2
1403 aesdec %xmm8, %xmm14
1404 aesdec %xmm8, %xmm15
1405
1406 aesdec %xmm9, %xmm1
1407 aesdec %xmm9, %xmm2
1408 aesdec %xmm9, %xmm14
1409 aesdec %xmm9, %xmm15
1410
1411 aesdec %xmm10, %xmm1
1412 aesdec %xmm10, %xmm2
1413 aesdec %xmm10, %xmm14
1414 aesdec %xmm10, %xmm15
1415
1416 aesdec %xmm11, %xmm1
1417 aesdec %xmm11, %xmm2
1418 aesdec %xmm11, %xmm14
1419 aesdec %xmm11, %xmm15
1420
1421 aesdec %xmm12, %xmm1
1422 aesdec %xmm12, %xmm2
1423 aesdec %xmm12, %xmm14
1424 aesdec %xmm12, %xmm15
1425 movups 48(ctx), %xmm12
1426
1427 aesdec %xmm13, %xmm1
1428 aesdec %xmm13, %xmm2
1429 aesdec %xmm13, %xmm14
1430 aesdec %xmm13, %xmm15
1431 movups 32(ctx), %xmm13
1432
1433 aesdec %xmm12, %xmm1
1434 aesdec %xmm12, %xmm2
1435 aesdec %xmm12, %xmm14
1436 aesdec %xmm12, %xmm15
1437 movups 16(ctx), %xmm12
1438
1439 aesdec %xmm13, %xmm1
1440 aesdec %xmm13, %xmm2
1441 aesdec %xmm13, %xmm14
1442 aesdec %xmm13, %xmm15
1443 movups (ctx), %xmm13
1444
1445 aesdec %xmm12, %xmm1
1446 aesdec %xmm12, %xmm2
1447 aesdec %xmm12, %xmm14
1448 aesdec %xmm12, %xmm15
1449 movups 80(ctx), %xmm12
1450
1451 aesdeclast %xmm13, %xmm1
1452 aesdeclast %xmm13, %xmm2
1453 aesdeclast %xmm13, %xmm14
1454 aesdeclast %xmm13, %xmm15
1455 movups 64(ctx), %xmm13
1456
1457 pxor iv, %xmm1 // obuf ^= iv;
1458 movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
1459 pxor iv, %xmm2 // obuf ^= iv;
1460 movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
1461 pxor iv, %xmm14 // obuf ^= iv;
1462 movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
1463 pxor iv, %xmm15 // obuf ^= iv;
1464 movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
1465
1466 movups %xmm1, (obuf) // write 1st obuf
1467 movups %xmm2, 16(obuf) // write 2nd obuf
1468 movups %xmm14, 32(obuf) // write 3rd obuf
1469 movups %xmm15, 48(obuf) // write 4th obuf
1470
1471 add $64, ibuf // ibuf += AES_BLOCK_SIZE*4;
1472 add $64, obuf // obuf += AES_BLOCK_SIZE*4;
1473
1474 sub $4, num_blk // num_blk -= 4
1475 	jge 	0b 					// if num_blk >= 0 (at least 4 more blocks), repeat the loop
1476
1477 9:	add 	$4, num_blk				// post increment num_blk by 4
1478 	je 	L_HW_cbc_done				// if num_blk == 0, no further processing needed
1479
1480 movups 48(ctx), %xmm14
1481 movups 32(ctx), %xmm15
1482
1483 #else
1484
1485 sub $4, num_blk // pre decrement num_blk by 4
1486 	jl 	9f					// if num_blk < 4, skip the per-4-blocks processing code
1487 0:
1488 movups (ibuf), %xmm1 // tmp = 1st ibuf
1489 movups 16(ibuf), %xmm2 // tmp = 2nd ibuf
1490 movups 32(ibuf), %xmm4 // tmp = 3rd ibuf
1491 movups 48(ibuf), %xmm5 // tmp = 4th ibuf
1492
1493 // aes_decrypt
1494 // for i386, sequentially load expanded keys into xmm6/xmm7
1495 movups 208(ctx), %xmm6
1496 pxor %xmm3, %xmm1
1497 pxor %xmm3, %xmm2
1498 pxor %xmm3, %xmm4
1499 pxor %xmm3, %xmm5
1500
1501 movups 192(ctx), %xmm7
1502 aesdec %xmm6, %xmm1
1503 aesdec %xmm6, %xmm2
1504 aesdec %xmm6, %xmm4
1505 aesdec %xmm6, %xmm5
1506
1507 movups 176(ctx), %xmm6
1508 aesdec %xmm7, %xmm1
1509 aesdec %xmm7, %xmm2
1510 aesdec %xmm7, %xmm4
1511 aesdec %xmm7, %xmm5
1512
1513 movups 160(ctx), %xmm7
1514 aesdec %xmm6, %xmm1
1515 aesdec %xmm6, %xmm2
1516 aesdec %xmm6, %xmm4
1517 aesdec %xmm6, %xmm5
1518
1519 movups 144(ctx), %xmm6
1520 aesdec %xmm7, %xmm1
1521 aesdec %xmm7, %xmm2
1522 aesdec %xmm7, %xmm4
1523 aesdec %xmm7, %xmm5
1524
1525 movups 128(ctx), %xmm7
1526 aesdec %xmm6, %xmm1
1527 aesdec %xmm6, %xmm2
1528 aesdec %xmm6, %xmm4
1529 aesdec %xmm6, %xmm5
1530
1531 movups 112(ctx), %xmm6
1532 aesdec %xmm7, %xmm1
1533 aesdec %xmm7, %xmm2
1534 aesdec %xmm7, %xmm4
1535 aesdec %xmm7, %xmm5
1536
1537 movups 96(ctx), %xmm7
1538 aesdec %xmm6, %xmm1
1539 aesdec %xmm6, %xmm2
1540 aesdec %xmm6, %xmm4
1541 aesdec %xmm6, %xmm5
1542
1543 movups 80(ctx), %xmm6
1544 aesdec %xmm7, %xmm1
1545 aesdec %xmm7, %xmm2
1546 aesdec %xmm7, %xmm4
1547 aesdec %xmm7, %xmm5
1548
1549 movups 64(ctx), %xmm7
1550 aesdec %xmm6, %xmm1
1551 aesdec %xmm6, %xmm2
1552 aesdec %xmm6, %xmm4
1553 aesdec %xmm6, %xmm5
1554
1555 movups 48(ctx), %xmm6
1556 aesdec %xmm7, %xmm1
1557 aesdec %xmm7, %xmm2
1558 aesdec %xmm7, %xmm4
1559 aesdec %xmm7, %xmm5
1560
1561 movups 32(ctx), %xmm7
1562 aesdec %xmm6, %xmm1
1563 aesdec %xmm6, %xmm2
1564 aesdec %xmm6, %xmm4
1565 aesdec %xmm6, %xmm5
1566
1567 movups 16(ctx), %xmm6
1568 aesdec %xmm7, %xmm1
1569 aesdec %xmm7, %xmm2
1570 aesdec %xmm7, %xmm4
1571 aesdec %xmm7, %xmm5
1572
1573 movups 0(ctx), %xmm7
1574 aesdec %xmm6, %xmm1
1575 aesdec %xmm6, %xmm2
1576 aesdec %xmm6, %xmm4
1577 aesdec %xmm6, %xmm5
1578
1579 aesdeclast %xmm7, %xmm1
1580 aesdeclast %xmm7, %xmm2
1581 aesdeclast %xmm7, %xmm4
1582 aesdeclast %xmm7, %xmm5
1583
1584 pxor iv, %xmm1 // 1st obuf ^= iv;
1585 movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
1586 pxor iv, %xmm2 // 2nd obuf ^= iv;
1587 movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
1588 pxor iv, %xmm4 // 3rd obuf ^= iv;
1589 movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
1590 pxor iv, %xmm5 // 4th obuf ^= iv;
1591 movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
1592 movups %xmm1, (obuf) // write 1st obuf
1593 movups %xmm2, 16(obuf) // write 2nd obuf
1594 movups %xmm4, 32(obuf) // write 3rd obuf
1595 movups %xmm5, 48(obuf) // write 4th obuf
1596
1597 add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4;
1598 add $64, obuf // obuf += AES_BLOCK_SIZE * 4;
1599
1600 sub $4, num_blk // num_blk -= 4
1601 	jge 	0b 					// if num_blk >= 0 (at least 4 more blocks), repeat the loop
1602
1603
1604 9:	add 	$4, num_blk				// post increment num_blk by 4
1605 	je 	L_HW_cbc_done				// if num_blk == 0, no further processing needed
1606
1607 movups 208(ctx), %xmm4
1608 movups 192(ctx), %xmm5
1609 movups 176(ctx), %xmm6
1610 movups 160(ctx), %xmm7
1611
1612 #endif
1613
1614 0:
1615 movups (ibuf), %xmm2 // tmp = ibuf
1616
1617 // aes_decrypt
1618 pxor %xmm3, %xmm2
1619 aesdec %xmm4, %xmm2
1620 aesdec %xmm5, %xmm2
1621 aesdec %xmm6, %xmm2
1622 aesdec %xmm7, %xmm2
1623 #if defined __x86_64__
1624 aesdec %xmm8, %xmm2
1625 aesdec %xmm9, %xmm2
1626 aesdec %xmm10, %xmm2
1627 aesdec %xmm11, %xmm2
1628 aesdec %xmm12, %xmm2
1629 aesdec %xmm13, %xmm2
1630 aesdec %xmm14, %xmm2
1631 aesdec %xmm15, %xmm2
1632 #else
1633 movups 144(ctx), %xmm1
1634 aesdec %xmm1, %xmm2
1635 movups 128(ctx), %xmm1
1636 aesdec %xmm1, %xmm2
1637 movups 112(ctx), %xmm1
1638 aesdec %xmm1, %xmm2
1639 movups 96(ctx), %xmm1
1640 aesdec %xmm1, %xmm2
1641 movups 80(ctx), %xmm1
1642 aesdec %xmm1, %xmm2
1643 movups 64(ctx), %xmm1
1644 aesdec %xmm1, %xmm2
1645 movups 48(ctx), %xmm1
1646 aesdec %xmm1, %xmm2
1647 movups 32(ctx), %xmm1
1648 aesdec %xmm1, %xmm2
1649 #endif
1650 movups 16(ctx), %xmm1
1651 aesdec %xmm1, %xmm2
1652 movups (ctx), %xmm1
1653 aesdeclast %xmm1, %xmm2
1654
1655 pxor iv, %xmm2 // obuf ^= iv;
1656 movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE);
1657
1658 movups %xmm2, (obuf) // write obuf
1659
1660 add $16, ibuf // ibuf += AES_BLOCK_SIZE;
1661 add $16, obuf // obuf += AES_BLOCK_SIZE;
1662 sub $1, num_blk // num_blk --
1663 jg 0b // if num_blk > 0, repeat the loop
1664
1665 jmp L_HW_cbc_done
1666
1667 //
1668 // --------- END of aes_decrypt_cbc_hw -------------------
1669 //