---------------------------------------------------------------------------
Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.

The free distribution and use of this software in both source and binary
form is allowed (with or without changes) provided that:

1. distributions of this source code include the above copyright
   notice, this list of conditions and the following disclaimer;

2. distributions in binary form include the above copyright
   notice, this list of conditions and the following disclaimer
   in the documentation and/or other associated materials;

3. the copyright holder's name is not used to endorse products
   built using this software without specific written permission.

ALTERNATIVELY, provided that this notice is retained in full, this product
may be distributed under the terms of the GNU General Public License (GPL),
in which case the provisions of the GPL apply INSTEAD OF those given above.

This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
These subroutines implement multiple-block AES modes for ECB, CBC, CFB,
OFB and CTR encryption. The code provides support for the VIA Advanced
Cryptography Engine (ACE).

NOTE: In the following subroutines, the AES contexts (ctx) must be
16-byte aligned if VIA ACE is being used.
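
For illustration, a caller can guarantee that alignment with a compiler
attribute. A minimal sketch (the variable name and the use of the GCC/Clang
aligned attribute are illustrative assumptions, not requirements of this file
beyond the 16-byte alignment itself):

    #include "aes.h"

    /* 16-byte aligned AES context, satisfying the VIA ACE requirement */
    static aes_encrypt_ctx ectx __attribute__((aligned(16)));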
/* modified 3/5/10 cclee */
/* Cleaned up the code related to VIA ACE and hand-optimized aes_cbc_encrypt and aes_cbc_decrypt. */
/* Moved the xmm register save/restore, originally inside the callee functions, into these 2 caller functions. */

/* HW-AES specific implementation cclee 3-12-10 */
/* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled,
   and if kHasAES is detected, the code branches to the HW-specific functions here. */
This file defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- Intel Westmere HW AES-based implementations
of _aes_encrypt_cbc and _aes_decrypt_cbc.

These 2 functions SHOULD BE entered ONLY after the AES HW has been verified to be available.
They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.

The AES HW is detected as the first thing in
    _aes_encrypt_cbc (aes_modes_asm.s)
    _aes_decrypt_cbc (aes_modes_asm.s)
and, if AES HW is detected, those functions branch without link (i.e., jump) to the functions here.
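
A minimal C sketch of that dispatch (the _get_cpu_capabilities() accessor, the
argument types, and the aes_encrypt_cbc_sw fallback name are illustrative
assumptions; the real check lives in assembly in aes_modes_asm.s):

    #include <System/i386/cpu_capabilities.h>   /* defines kHasAES */

    extern int _get_cpu_capabilities(void);

    aes_rval aes_encrypt_cbc(const unsigned char *ibuf, const unsigned char *iv,
                             int num_blk, unsigned char *obuf,
                             const aes_encrypt_ctx *ctx)
    {
        if (_get_cpu_capabilities() & kHasAES)
            return aes_encrypt_cbc_hw(ibuf, iv, num_blk, obuf, ctx);  /* this file */
        return aes_encrypt_cbc_sw(ibuf, iv, num_blk, obuf, ctx);      /* hypothetical SW path */
    }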
The implementation here follows the examples in the Intel White Paper
"Intel Advanced Encryption Standard (AES) Instruction Set" Rev. 2.01.

Note: Rev. 3.0 (Final, 2010 01 26) is available. It looks like there are some code changes relative to Rev. 2.01.

The function _aes_decrypt_cbc_hw previously simply decrypted serially, block by block.
In our group meeting, Eric/Ali suggested that I take a look at combining multiple blocks
in a loop and interleaving multiple aesdec instructions to absorb/hide stalls and improve the decrypt throughput.

The idea is actually described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).

This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode.
On a 2.4GHz core-i5/2.66GHz core-i7, the x86_64 decrypt throughput (in xnu-iokit) has been improved
from 1180/1332 to 1667/1858 MBytes/sec. This is approximately a 1.4x speedup in decryption.
The encrypt throughput is not changed.
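
For readers more comfortable with intrinsics, here is a minimal C sketch of the
same 4-way interleaving (illustrative only; it assumes dk[0..10] holds the
AES-128 decryption round keys in the equivalent-inverse-cipher order that
aesdec/aesdeclast require):

    #include <wmmintrin.h>

    /* decrypt 4 independent AES-128 blocks with interleaved rounds */
    static void aes128_decrypt4(const __m128i *in, __m128i *out, const __m128i dk[11])
    {
        __m128i b0 = _mm_xor_si128(_mm_loadu_si128(in + 0), dk[0]);
        __m128i b1 = _mm_xor_si128(_mm_loadu_si128(in + 1), dk[0]);
        __m128i b2 = _mm_xor_si128(_mm_loadu_si128(in + 2), dk[0]);
        __m128i b3 = _mm_xor_si128(_mm_loadu_si128(in + 3), dk[0]);
        for (int r = 1; r < 10; r++) {          // rounds 1..9, interleaved:
            b0 = _mm_aesdec_si128(b0, dk[r]);   // the 4 chains are independent,
            b1 = _mm_aesdec_si128(b1, dk[r]);   // so the CPU can overlap their
            b2 = _mm_aesdec_si128(b2, dk[r]);   // aesdec latencies
            b3 = _mm_aesdec_si128(b3, dk[r]);
        }
        b0 = _mm_aesdeclast_si128(b0, dk[10]);  // final round
        b1 = _mm_aesdeclast_si128(b1, dk[10]);
        b2 = _mm_aesdeclast_si128(b2, dk[10]);
        b3 = _mm_aesdeclast_si128(b3, dk[10]);
        _mm_storeu_si128(out + 0, b0);
        _mm_storeu_si128(out + 1, b1);
        _mm_storeu_si128(out + 2, b2);
        _mm_storeu_si128(out + 3, b3);
    }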
I also enhanced the assembly code comments.

cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.)
/* ----------------------------------------------------------------------------------------------------------------

    aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

    For simplicity, I am assuming all variables are of 128-bit data type.

    aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
    {
        while (num_blk--) {
            *iv ^= *ibuf++;             // 128-bit xor with the plaintext block
            aes_encrypt(iv, iv, ctx);   // encrypt the chained block in place
            *obuf++ = *iv;              // emit ciphertext, which also seeds the next block
        }
        return 0;
    }
    The following is an implementation of this function using Intel AESNI.
    This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
    Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
    to this aesni-based function should it detect that aesni is available.
    Blindly calling this function WILL cause a CRASH on systems with no aesni support.

    Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks
    are serially chained. This prevents us from arranging several blocks for encryption in parallel.
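
    In equation form (a one-line restatement of the chaining): c[i] = aes_encrypt(p[i] ^ c[i-1]) with c[-1] = *iv,
    so encryption of block i cannot begin until c[i-1] is complete.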
   ----------------------------------------------------------------------------------------------------------------*/

    .globl  _aes_encrypt_cbc_hw

    // push/save registers for local use
    // if this is kernel code, need to save used xmm registers
    sub     $(8*16), %esp           // for possible xmm0-xmm7 save/restore
    sub     $(16*16), %rsp          // xmm0-xmm15 save/restore
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)
    mov     12(%ebp), %eax          // in_iv
    mov     24(%ebp), %edx          // ctx
    movups  (%eax), iv              // iv = in_iv
    mov     8(%ebp), %ebx           // ibuf
    mov     16(%ebp), %ecx          // num_blk
    mov     20(%ebp), %edi          // obuf

    mov     %rdi, %rbx              // ibuf
    movups  (%rsi), iv              // iv = in_iv
    mov     %rdx, %r13              // num_blk
    mov     %rcx, %r14              // obuf

#define num_blk %r13d
    mov     240(ctx), %eax          // aes length
    cmp     $160, %eax              // aes-128 encrypt ?
    cmp     $192, %eax              // aes-192 encrypt ?
    cmp     $224, %eax              // aes-256 encrypt ?
    mov     $-1, %eax               // return error
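
/*
    A C sketch of this key-length dispatch (illustrative; it assumes the word at
    byte offset 240 of the Gladman-style context holds 16*rounds, i.e. 160/192/224
    for AES-128/192/256, as the comparisons above suggest):

        switch (*(const unsigned int *)((const char *)ctx + 240)) {
        case 160: goto aes128_path;     // 10 rounds
        case 192: goto aes192_path;     // 12 rounds
        case 224: goto aes256_path;     // 14 rounds
        default:  return -1;            // unsupported key length
        }
*/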
    // aes-128 encrypt_cbc operation, up to L_HW_cbc_done

    cmp     $1, num_blk             // check the number of blocks
    jl      L_HW_cbc_done           // should it be less than 1, nothing to do
    movups  (ctx), %xmm2            // key0
    movups  16(ctx), %xmm3          // key1
    movups  32(ctx), %xmm4          // key2
    movups  48(ctx), %xmm5          // key3
    movups  64(ctx), %xmm6          // key4
    movups  80(ctx), %xmm7          // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8          // key6
    movups  112(ctx), %xmm9         // key7
    movups  128(ctx), %xmm10        // key8
    movups  144(ctx), %xmm11        // key9
    movups  160(ctx), %xmm12        // keyA

    // while (num_blk--) {
    //     aes_encrypt(iv, iv, ctx);

    movups  (ibuf), %xmm1           // *ibuf
    pxor    %xmm2, iv               // 1st instruction inside aes_encrypt
    pxor    %xmm1, iv               // *iv ^= *ibuf

    // finishing up the rest of aes_encrypt
#if defined __x86_64__
    aesenclast  %xmm12, iv
    movups  96(ctx), %xmm1          // key6
    movups  112(ctx), %xmm1         // key7
    movups  128(ctx), %xmm1         // key8
    movups  144(ctx), %xmm1         // key9
    movups  160(ctx), %xmm1         // keyA

    movups  iv, (obuf)              // *obuf = *iv;
    add     $16, obuf               // obuf++;
    add     $16, ibuf               // ibuf++;
    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop
    // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)

    xor     %eax, %eax              // to return CRYPT_OK

    // if kernel, restore xmm registers
    movaps  112(sp), %xmm7
#if defined __x86_64__
    movaps  16*8(sp), %xmm8
    movaps  16*9(sp), %xmm9
    movaps  16*10(sp), %xmm10
    movaps  16*11(sp), %xmm11
    movaps  16*12(sp), %xmm12
    movaps  16*13(sp), %xmm13
    movaps  16*14(sp), %xmm14
    movaps  16*15(sp), %xmm15

    // release used stack memory, restore used callee-saved registers, and return
    // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done

    cmp     $1, num_blk             // check the number of blocks
    jl      L_HW_cbc_done           // should it be less than 1, nothing to do
    movups  (ctx), %xmm2            // key0
    movups  16(ctx), %xmm3          // key1
    movups  32(ctx), %xmm4          // key2
    movups  48(ctx), %xmm5          // key3
    movups  64(ctx), %xmm6          // key4
    movups  80(ctx), %xmm7          // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8          // key6
    movups  112(ctx), %xmm9         // key7
    movups  128(ctx), %xmm10        // key8
    movups  144(ctx), %xmm11        // key9
    movups  160(ctx), %xmm12        // keyA
    movups  176(ctx), %xmm13        // keyB
    movups  192(ctx), %xmm14        // keyC

    // while (num_blk--) {
    //     aes_encrypt(iv, iv, ctx);

    movups  (ibuf), %xmm1           // *ibuf
    pxor    %xmm1, iv               // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);
#if defined __x86_64__
    aesenclast  %xmm14, iv
    movups  96(ctx), %xmm1
    movups  112(ctx), %xmm1
    movups  128(ctx), %xmm1
    movups  144(ctx), %xmm1
    movups  160(ctx), %xmm1
    movups  176(ctx), %xmm1
    movups  192(ctx), %xmm1

    movups  iv, (obuf)              // *obuf = *iv;
    add     $16, ibuf               // ibuf++
    add     $16, obuf               // obuf++

    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done           // share with the common exit code
    // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done

    cmp     $1, num_blk             // check the number of blocks
    jl      L_HW_cbc_done           // should it be less than 1, nothing to do
    movups  (ctx), %xmm2            // key0
    movups  16(ctx), %xmm3          // key1
    movups  32(ctx), %xmm4          // key2
    movups  48(ctx), %xmm5          // key3
    movups  64(ctx), %xmm6          // key4
    movups  80(ctx), %xmm7          // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8          // key6
    movups  112(ctx), %xmm9         // key7
    movups  128(ctx), %xmm10        // key8
    movups  144(ctx), %xmm11        // key9
    movups  160(ctx), %xmm12        // keyA
    movups  176(ctx), %xmm13        // keyB
    movups  192(ctx), %xmm14        // keyC
    movups  208(ctx), %xmm15        // keyD
    // movups 224(ctx), %xmm1       // keyE

    // while (num_blk--) {
    //     aes_encrypt(iv, iv, ctx);

    movups  (ibuf), %xmm1           // *ibuf
    pxor    %xmm1, iv               // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);
#if defined __x86_64__
    movups  224(ctx), %xmm1         // keyE
    movups  96(ctx), %xmm1          // key6
    movups  112(ctx), %xmm1         // key7
    movups  128(ctx), %xmm1         // key8
    movups  144(ctx), %xmm1         // key9
    movups  160(ctx), %xmm1         // keyA
    movups  176(ctx), %xmm1         // keyB
    movups  192(ctx), %xmm1         // keyC
    movups  208(ctx), %xmm1         // keyD
    movups  224(ctx), %xmm1         // keyE

    movups  iv, (obuf)              // *obuf = *iv;
    add     $16, ibuf               // ibuf++
    add     $16, obuf               // obuf++

    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done           // share with the common exit code

    // --------- END of aes_encrypt_cbc_hw -------------------
/* ----------------------------------------------------------------------------------------------------------------

    aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

    For simplicity, I am assuming all variables are of 128-bit data type.

    aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
    {
        while (num_blk--) {
            aes_decrypt(ibuf, obuf, ctx);   // decrypt the current block
            *obuf++ ^= *iv;                 // xor with the previous ciphertext (or the initial *iv)
            *iv = *ibuf++;                  // the current ciphertext seeds the next block
        }
        return 0;
    }
    The following is an implementation of this function using Intel AESNI.
    This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
    Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
    to this aesni-based function should it detect that aesni is available.
    Blindly calling this function WILL cause a CRASH on systems with no aesni support.

    Note that, unlike encryption, the decryption operation is not chained across blocks.
    This gives us the opportunity to arrange aes_decrypt operations in parallel to speed up the code.
    This is equivalent to what is described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
    The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc.
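
    In equation form: p[i] = aes_decrypt(c[i]) ^ c[i-1] with c[-1] = *iv. Each p[i] depends only on the
    input ciphertext, never on a previous output, so the aes_decrypt calls can proceed in parallel.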
    Example C code for packing 4 blocks in an iteration is shown as follows:

        while ((num_blk -= 4) >= 0) {
            // the following 4 calls can be interleaved to exploit parallelism
            aes_decrypt(ibuf,   obuf,   ctx);
            aes_decrypt(ibuf+1, obuf+1, ctx);
            aes_decrypt(ibuf+2, obuf+2, ctx);
            aes_decrypt(ibuf+3, obuf+3, ctx);

            obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
            *iv = ibuf[3]; ibuf += 4; obuf += 4;
        }
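
    After this loop, 0 to 3 blocks remain. A sketch of the tail handling that the
    assembly below implements (num_blk was left 4 short by the loop condition):

        num_blk += 4;                       // undo the final pre-decrement
        if (num_blk & 2) {                  // decrypt a leftover pair together
            aes_decrypt(ibuf,   obuf,   ctx);
            aes_decrypt(ibuf+1, obuf+1, ctx);
            obuf[0] ^= *iv; obuf[1] ^= ibuf[0];
            *iv = ibuf[1]; ibuf += 2; obuf += 2;
        }
        if (num_blk & 1) {                  // then a final single block
            aes_decrypt(ibuf, obuf, ctx);
            obuf[0] ^= *iv;
            *iv = ibuf[0];
        }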
   ----------------------------------------------------------------------------------------------------------------*/

    .globl  _aes_decrypt_cbc_hw

    // push/save registers for local use
    // if kernel, allocate stack space to save xmm registers
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)

    mov     12(%ebp), %eax          // in_iv
    mov     24(%ebp), %edx          // ctx
    movups  (%eax), iv              // iv = in_iv
    mov     8(%ebp), %ebx           // ibuf
    mov     16(%ebp), %ecx          // num_blk
    mov     20(%ebp), %edi          // obuf
#else // __x86_64__, arguments in rdi/rsi/rdx/rcx/r8
    mov     %rdi, %rbx              // ibuf
    movups  (%rsi), iv              // iv = in_iv
    mov     %rdx, %r13              // num_blk
    mov     %rcx, %r14              // obuf

#define num_blk %r13d
    mov     240(ctx), %eax          // aes length
    cmp     $160, %eax              // aes-128 decrypt
    cmp     $192, %eax              // aes-192 decrypt
    cmp     $224, %eax              // aes-256 decrypt

    mov     $-1, %eax               // wrong aes length, to return -1
    jmp     L_error                 // early exit due to wrong aes length

    // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done

    jl      L_HW_cbc_done           // if num_blk < 1, early return
    // aes-128 decrypt expanded keys
    movups  160(ctx), %xmm3
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#if defined __x86_64__
    movups  80(ctx), %xmm8
    movups  64(ctx), %xmm9
    movups  48(ctx), %xmm10
    movups  32(ctx), %xmm11
    movups  16(ctx), %xmm12
    movups  0(ctx), %xmm13
    // performs 4-block decryption in an iteration to exploit parallelism in decrypt
    //
    // while ((num_blk -= 4) >= 0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }
    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code

#if defined __x86_64__

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14        // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15        // tmp = 4th ibuf

    // for x86_64, the expanded keys are already stored in xmm3-xmm13
    // aes-128 decrypt round 0 per 4 blocks
    // aes-128 decrypt round 1 per 4 blocks
    // aes-128 decrypt round 2 per 4 blocks
    // aes-128 decrypt round 3 per 4 blocks
    // aes-128 decrypt round 4 per 4 blocks
    // aes-128 decrypt round 5 per 4 blocks
    // aes-128 decrypt round 6 per 4 blocks

    // aes-128 decrypt round 7 per 4 blocks
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // aes-128 decrypt round 8 per 4 blocks
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // aes-128 decrypt round 9 per 4 blocks
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15
    pxor    iv, %xmm1               // obuf[0] ^= *iv;
    movups  (ibuf), iv              // ibuf[0]
    pxor    iv, %xmm2               // obuf[1] ^= ibuf[0];
    movups  16(ibuf), iv            // ibuf[1]
    pxor    iv, %xmm14              // obuf[2] ^= ibuf[1];
    movups  32(ibuf), iv            // ibuf[2]
    pxor    iv, %xmm15              // obuf[3] ^= ibuf[2];
    movups  48(ibuf), iv            // *iv = ibuf[3]

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm14, 32(obuf)        // write 3rd obuf
    movups  %xmm15, 48(obuf)        // write 4th obuf
    // aes_decrypt_cbc per 4 blocks using aes-128 for i386
    // xmm1/xmm2/xmm4/xmm5 hold obuf per block
    // xmm6/xmm7 are dynamically loaded with the remaining expanded keys

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4         // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5         // tmp = 4th ibuf

    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  144(ctx), %xmm6         // key1
    // aes-128 decrypt round 0 per 4 blocks
    movups  128(ctx), %xmm7         // key2

    // aes-128 decrypt round 1 per 4 blocks
    movups  112(ctx), %xmm6         // key3

    // aes-128 decrypt round 2 per 4 blocks
    movups  96(ctx), %xmm7          // key4

    // aes-128 decrypt round 3 per 4 blocks
    movups  80(ctx), %xmm6          // key5

    // aes-128 decrypt round 4 per 4 blocks
    movups  64(ctx), %xmm7          // key6

    // aes-128 decrypt round 5 per 4 blocks
    movups  48(ctx), %xmm6          // key7

    // aes-128 decrypt round 6 per 4 blocks
    movups  32(ctx), %xmm7          // key8

    // aes-128 decrypt round 7 per 4 blocks
    movups  16(ctx), %xmm6          // key9

    // aes-128 decrypt round 8 per 4 blocks
    movups  0(ctx), %xmm7           // keyA

    // aes-128 decrypt round 9 per 4 blocks

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5

    pxor    iv, %xmm1               // 1st obuf ^= iv;
    movups  (ibuf), iv              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // 2nd obuf ^= iv;
    movups  16(ibuf), iv            // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4               // 3rd obuf ^= iv;
    movups  32(ibuf), iv            // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5               // 4th obuf ^= iv;
    movups  48(ibuf), iv            // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm4, 32(obuf)         // write 3rd obuf
    movups  %xmm5, 48(obuf)         // write 4th obuf
    add     $64, ibuf               // ibuf += 4;
    add     $64, obuf               // obuf += 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing
    // xmm4-xmm7 are reloaded here, as they may be needed as expanded keys in the remaining code
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
    test    $2, num_blk             // check whether num_blk has 2 blocks
    je      9f                      // if num_blk & 2 == 0, skip the per-pair processing code

    // do the remaining 2 blocks together

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf

#if defined __x86_64__
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    movups  80(ctx), %xmm6
    movups  64(ctx), %xmm7
    movups  48(ctx), %xmm6
    movups  32(ctx), %xmm7
    movups  16(ctx), %xmm6
    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7

    pxor    iv, %xmm1               // obuf[0] ^= *iv;
    movups  (ibuf), iv              // ibuf[0]
    pxor    iv, %xmm2               // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv            // *iv = ibuf[1]

    movups  %xmm1, (obuf)           // write obuf[0]
    movups  %xmm2, 16(obuf)         // write obuf[1]

    add     $32, ibuf               // ibuf += 2
    add     $32, obuf               // obuf += 2
    test    $1, num_blk             // check whether a residual block remains
    je      L_HW_cbc_done           // if num_blk == 0, no need for residual processing code

    movups  (ibuf), %xmm2           // tmp = ibuf

#if defined __x86_64__
    aesdeclast  %xmm13, %xmm2
    movups  80(ctx), %xmm1
    movups  64(ctx), %xmm1
    movups  48(ctx), %xmm1
    movups  32(ctx), %xmm1
    movups  16(ctx), %xmm1
    aesdeclast  %xmm1, %xmm2

    pxor    iv, %xmm2               // *obuf ^= *iv;
    movups  (ibuf), iv              // *iv = *ibuf;
    movups  %xmm2, (obuf)           // write *obuf
    // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done

    jl      L_HW_cbc_done           // if num_blk < 1, early return

    // aes-192 decrypt expanded keys
    movups  192(ctx), %xmm3
    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
#if defined __x86_64__
    movups  112(ctx), %xmm8
    movups  96(ctx), %xmm9
    movups  80(ctx), %xmm10
    movups  64(ctx), %xmm11
    movups  48(ctx), %xmm12
    movups  32(ctx), %xmm13
    movups  16(ctx), %xmm14
    movups  (ctx), %xmm15
    // performs 4-block decryption in an iteration to exploit parallelism in decrypt
    //
    // while ((num_blk -= 4) >= 0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }
    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code

#if defined __x86_64__

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14        // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15        // tmp = 4th ibuf

    // aes_decrypt; for x86_64, the expanded keys are already stored in xmm3-xmm13
    // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards
    // round 0 for 4 blocks

    // round 1 for 4 blocks
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    // round 2 for 4 blocks
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    // round 3 for 4 blocks
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    // round 4 for 4 blocks
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    // round 5 for 4 blocks
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    // round 6 for 4 blocks
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    // round 7 for 4 blocks
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // round 8 for 4 blocks
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // round 9 for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  16(ctx), %xmm12         // xmm12 <- round-B key (dynamic reload)

    // round A for 4 blocks
    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15

    movups  (ctx), %xmm13           // xmm13 <- last-round key (dynamic reload)

    // round B for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    movups  48(ctx), %xmm12         // restore %xmm12 to its original key

    // round C (last) for 4 blocks
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15

    movups  32(ctx), %xmm13         // restore %xmm13 to its original key
    pxor    iv, %xmm1               // obuf[0] ^= *iv;
    movups  (ibuf), iv              // ibuf[0]
    pxor    iv, %xmm2               // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv            // ibuf[1]
    pxor    iv, %xmm14              // obuf[2] ^= ibuf[1]
    movups  32(ibuf), iv            // ibuf[2]
    pxor    iv, %xmm15              // obuf[3] ^= ibuf[2]
    movups  48(ibuf), iv            // *iv = ibuf[3]

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm14, 32(obuf)        // write 3rd obuf
    movups  %xmm15, 48(obuf)        // write 4th obuf

    add     $64, ibuf               // ibuf += 4;
    add     $64, obuf               // obuf += 4;
    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, prepare to return

    movups  16(ctx), %xmm14         // restore %xmm14 to its key
    movups  (ctx), %xmm15           // restore %xmm15 to its key
    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4         // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5         // tmp = 4th ibuf

    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
    movups  80(ctx), %xmm6
    movups  64(ctx), %xmm7
    movups  48(ctx), %xmm6
    movups  32(ctx), %xmm7
    movups  16(ctx), %xmm6
    movups  0(ctx), %xmm7

    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5
    pxor    iv, %xmm1               // 1st obuf ^= iv;
    movups  (ibuf), iv              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // 2nd obuf ^= iv;
    movups  16(ibuf), iv            // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4               // 3rd obuf ^= iv;
    movups  32(ibuf), iv            // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5               // 4th obuf ^= iv;
    movups  48(ibuf), iv            // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm4, 32(obuf)         // write 3rd obuf
    movups  %xmm5, 48(obuf)         // write 4th obuf
    add     $64, ibuf               // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf               // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing
    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
    // per-block aes_decrypt_cbc loop

    movups  (ibuf), %xmm2           // tmp = ibuf

#if defined __x86_64__
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdeclast  %xmm15, %xmm2
    movups  112(ctx), %xmm1
    movups  96(ctx), %xmm1
    movups  80(ctx), %xmm1
    movups  64(ctx), %xmm1
    movups  48(ctx), %xmm1
    movups  32(ctx), %xmm1
    movups  16(ctx), %xmm1
    aesdeclast  %xmm1, %xmm2
    pxor    iv, %xmm2               // obuf ^= iv;
    movups  (ibuf), iv              // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)           // write obuf

    add     $16, ibuf               // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf               // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop
    // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done

    movups  224(ctx), %xmm3
    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
#if defined __x86_64__
    movups  144(ctx), %xmm8
    movups  128(ctx), %xmm9
    movups  112(ctx), %xmm10
    movups  96(ctx), %xmm11
    movups  80(ctx), %xmm12
    movups  64(ctx), %xmm13
    movups  48(ctx), %xmm14
    movups  32(ctx), %xmm15
    // movups 16(ctx), %xmm14
    // movups (ctx), %xmm15
#if defined __x86_64__

    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14        // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15        // tmp = 4th ibuf

    // aes_decrypt; for x86_64, the expanded keys are already stored in xmm3-xmm13
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  48(ctx), %xmm12         // xmm12 <- next key (dynamic reload)

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  32(ctx), %xmm13         // xmm13 <- next key (dynamic reload)

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  16(ctx), %xmm12         // xmm12 <- next key (dynamic reload)

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  (ctx), %xmm13           // xmm13 <- last-round key

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  80(ctx), %xmm12         // restore %xmm12 to its original key

    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15
    movups  64(ctx), %xmm13         // restore %xmm13 to its original key
    pxor    iv, %xmm1               // obuf ^= iv;
    movups  (ibuf), iv              // memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // obuf ^= iv;
    movups  16(ibuf), iv            // memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm14              // obuf ^= iv;
    movups  32(ibuf), iv            // memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm15              // obuf ^= iv;
    movups  48(ibuf), iv            // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm14, 32(obuf)        // write 3rd obuf
    movups  %xmm15, 48(obuf)        // write 4th obuf

    add     $64, ibuf               // ibuf += AES_BLOCK_SIZE*4;
    add     $64, obuf               // obuf += AES_BLOCK_SIZE*4;
    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing

    movups  48(ctx), %xmm14         // restore %xmm14 to its key
    movups  32(ctx), %xmm15         // restore %xmm15 to its key
    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4         // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5         // tmp = 4th ibuf
    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  208(ctx), %xmm6
    movups  192(ctx), %xmm7
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
    movups  80(ctx), %xmm6
    movups  64(ctx), %xmm7
    movups  48(ctx), %xmm6
    movups  32(ctx), %xmm7
    movups  16(ctx), %xmm6
    movups  0(ctx), %xmm7

    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5
    pxor    iv, %xmm1               // 1st obuf ^= iv;
    movups  (ibuf), iv              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // 2nd obuf ^= iv;
    movups  16(ibuf), iv            // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4               // 3rd obuf ^= iv;
    movups  32(ibuf), iv            // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5               // 4th obuf ^= iv;
    movups  48(ibuf), iv            // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm4, 32(obuf)         // write 3rd obuf
    movups  %xmm5, 48(obuf)         // write 4th obuf
    add     $64, ibuf               // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf               // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop
9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing

    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
    movups  (ibuf), %xmm2           // tmp = ibuf

#if defined __x86_64__
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdec  %xmm15, %xmm2
    movups  144(ctx), %xmm1
    movups  128(ctx), %xmm1
    movups  112(ctx), %xmm1
    movups  96(ctx), %xmm1
    movups  80(ctx), %xmm1
    movups  64(ctx), %xmm1
    movups  48(ctx), %xmm1
    movups  32(ctx), %xmm1
    movups  16(ctx), %xmm1
    aesdeclast  %xmm1, %xmm2
    pxor    iv, %xmm2               // obuf ^= iv;
    movups  (ibuf), iv              // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)           // write obuf

    add     $16, ibuf               // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf               // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    // --------- END of aes_decrypt_cbc_hw -------------------