---------------------------------------------------------------------------
Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.

The free distribution and use of this software in both source and binary
form is allowed (with or without changes) provided that:

1. distributions of this source code include the above copyright
   notice, this list of conditions and the following disclaimer;

2. distributions in binary form include the above copyright
   notice, this list of conditions and the following disclaimer
   in the documentation and/or other associated materials;

3. the copyright holder's name is not used to endorse products
   built using this software without specific written permission.

ALTERNATIVELY, provided that this notice is retained in full, this product
may be distributed under the terms of the GNU General Public License (GPL),
in which case the provisions of the GPL apply INSTEAD OF those given above.

This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
These subroutines implement multiple-block AES modes for ECB, CBC, CFB,
OFB and CTR encryption. The code provides support for the VIA Advanced
Cryptography Engine (ACE).

NOTE: In the following subroutines, the AES contexts (ctx) must be
16-byte aligned if VIA ACE is being used.
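As an illustrative sketch only (not part of this file), a caller could
obtain a 16-byte aligned context with posix_memalign; aes_encrypt_ctx
here is the context type from the prototypes below:

	#include <stdlib.h>

	aes_encrypt_ctx *ctx = NULL;
	if (posix_memalign((void **)&ctx, 16, sizeof *ctx) != 0)
		ctx = NULL;		// allocation failed; handle as needed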
/* ----------------------------------------------------------------------------------------------------------------

	aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

	For simplicity, I am assuming all variables are of a 128-bit data type.

	aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
	{
		while (num_blk--) {
			*iv ^= *ibuf++;
			aes_encrypt(iv, iv, ctx);
			*obuf++ = *iv;
		}
		return 0;		// CRYPT_OK
	}
	The following is an implementation of this function using Intel AESNI.
	This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
	Developers should instead call _aes_encrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
	to this AESNI-based function should it detect that AESNI is available.
	Blindly calling this function will surely cause a CRASH on systems with no AESNI support.

	Note that each block starts with *iv, which is the output of the previous block. Therefore, the CBC blocks
	are serially chained. This prevents us from arranging several blocks for encryption in parallel.

   ----------------------------------------------------------------------------------------------------------------*/
	.globl	_aes_encrypt_cbc_hw

	// push/save registers for local use

	// if this is kernel code, need to save the xmm registers used
	sub	$(8*16), %esp		// for possible xmm0-xmm7 save/restore
	sub	$(16*16), %rsp		// xmm0-xmm15 save/restore
	movaps	%xmm7, 112(sp)
#if defined __x86_64__
	movaps	%xmm8, 16*8(sp)
	movaps	%xmm9, 16*9(sp)
	movaps	%xmm10, 16*10(sp)
	movaps	%xmm11, 16*11(sp)
	movaps	%xmm12, 16*12(sp)
	movaps	%xmm13, 16*13(sp)
	movaps	%xmm14, 16*14(sp)
	movaps	%xmm15, 16*15(sp)
	mov	12(%ebp), %eax		// in_iv
	mov	24(%ebp), %edx		// ctx
	movups	(%eax), iv		// iv = in_iv
	mov	8(%ebp), %ebx		// ibuf
	mov	16(%ebp), %ecx		// num_blk
	mov	20(%ebp), %edi		// obuf

	mov	%rdi, %rbx		// ibuf
	movups	(%rsi), iv		// iv = in_iv
	mov	%rdx, %r13		// num_blk
	mov	%rcx, %r14		// obuf

	#define	num_blk	%r13d
	mov	240(ctx), %eax		// aes length: 160/192/224 for aes-128/192/256
	cmp	$160, %eax		// aes-128 encrypt ?
	cmp	$192, %eax		// aes-192 encrypt ?
	cmp	$224, %eax		// aes-256 encrypt ?
	mov	$-1, %eax		// wrong aes length, return error
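	// in C, the dispatch above reads roughly as follows (a sketch; it assumes
	// the 32-bit word at byte offset 240 of ctx holds 16*(number of rounds),
	// as the 160/192/224 comparisons suggest; label names are illustrative):
	//
	//	switch (*(const uint32_t *)((const uint8_t *)ctx + 240)) {
	//	case 160: goto aes_128_encrypt_cbc;	// 10 rounds
	//	case 192: goto aes_192_encrypt_cbc;	// 12 rounds
	//	case 224: goto aes_256_encrypt_cbc;	// 14 rounds
	//	default:  return -1;			// unsupported key length
	//	}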
	// aes-128 encrypt_cbc operation, up to L_HW_cbc_done

	cmp	$1, num_blk		// check the number of blocks
	jl	L_HW_cbc_done		// if less than 1, nothing to do

	movups	(ctx), %xmm2		// key0
	movups	16(ctx), %xmm3		// key1
	movups	32(ctx), %xmm4		// key2
	movups	48(ctx), %xmm5		// key3
	movups	64(ctx), %xmm6		// key4
	movups	80(ctx), %xmm7		// key5
#if defined __x86_64__
	movups	96(ctx), %xmm8		// key6
	movups	112(ctx), %xmm9		// key7
	movups	128(ctx), %xmm10	// key8
	movups	144(ctx), %xmm11	// key9
	movups	160(ctx), %xmm12	// keyA
	// while (num_blk--) {
	//	*iv ^= *ibuf++;
	//	aes_encrypt(iv, iv, ctx);
	//	*obuf++ = *iv;
	// }

	movups	(ibuf), %xmm1		// *ibuf
	pxor	%xmm2, iv		// 1st instruction inside aes_encrypt
	pxor	%xmm1, iv		// *iv ^= *ibuf

	// finishing up the rest of aes_encrypt
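	// i.e., the rest of aes_encrypt for aes-128 is, in outline
	// (reconstructed from the key loads and the aesenclast below):
	//	for (r = 1; r <= 9; r++)  iv = aesenc(iv, key[r]);
	//	iv = aesenclast(iv, keyA);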
#if defined __x86_64__
	aesenclast %xmm12, iv

	movups	96(ctx), %xmm1		// key6
	movups	112(ctx), %xmm1		// key7
	movups	128(ctx), %xmm1		// key8
	movups	144(ctx), %xmm1		// key9
	movups	160(ctx), %xmm1		// keyA

	movups	iv, (obuf)		// *obuf = *iv;
	add	$16, obuf		// obuf++;
	add	$16, ibuf		// ibuf++;
	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop
	// the following will be branched to from all other cases (encrypt/decrypt 128/192/256)

	xor	%eax, %eax		// to return CRYPT_OK
	// if kernel, restore xmm registers
	movaps	112(sp), %xmm7
#if defined __x86_64__
	movaps	16*8(sp), %xmm8
	movaps	16*9(sp), %xmm9
	movaps	16*10(sp), %xmm10
	movaps	16*11(sp), %xmm11
	movaps	16*12(sp), %xmm12
	movaps	16*13(sp), %xmm13
	movaps	16*14(sp), %xmm14
	movaps	16*15(sp), %xmm15
	// release used stack memory, restore used callee-saved registers, and return

	// aes-192 encrypt_cbc operation; after completion, branch to L_HW_cbc_done

	cmp	$1, num_blk		// check the number of blocks
	jl	L_HW_cbc_done		// if less than 1, nothing to do
	movups	(ctx), %xmm2		// key0
	movups	16(ctx), %xmm3		// key1
	movups	32(ctx), %xmm4		// key2
	movups	48(ctx), %xmm5		// key3
	movups	64(ctx), %xmm6		// key4
	movups	80(ctx), %xmm7		// key5
#if defined __x86_64__
	movups	96(ctx), %xmm8		// key6
	movups	112(ctx), %xmm9		// key7
	movups	128(ctx), %xmm10	// key8
	movups	144(ctx), %xmm11	// key9
	movups	160(ctx), %xmm12	// keyA
	movups	176(ctx), %xmm13	// keyB
	movups	192(ctx), %xmm14	// keyC
	// while (num_blk--) {
	//	*iv ^= *ibuf++;
	//	aes_encrypt(iv, iv, ctx);
	//	*obuf++ = *iv;
	// }

	movups	(ibuf), %xmm1		// *ibuf
	pxor	%xmm1, iv		// *iv ^= *ibuf

	// aes_encrypt(iv, iv, ctx);
#if defined __x86_64__
	aesenclast %xmm14, iv

	movups	96(ctx), %xmm1		// key6
	movups	112(ctx), %xmm1		// key7
	movups	128(ctx), %xmm1		// key8
	movups	144(ctx), %xmm1		// key9
	movups	160(ctx), %xmm1		// keyA
	movups	176(ctx), %xmm1		// keyB
	movups	192(ctx), %xmm1		// keyC

	movups	iv, (obuf)		// *obuf = *iv;
	add	$16, ibuf		// ibuf++
	add	$16, obuf		// obuf++

	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop

	jmp	L_HW_cbc_done		// share the common exit code
	// aes-256 encrypt_cbc operation; after completion, branch to L_HW_cbc_done

	cmp	$1, num_blk		// check the number of blocks
	jl	L_HW_cbc_done		// if less than 1, nothing to do

	movups	(ctx), %xmm2		// key0
	movups	16(ctx), %xmm3		// key1
	movups	32(ctx), %xmm4		// key2
	movups	48(ctx), %xmm5		// key3
	movups	64(ctx), %xmm6		// key4
	movups	80(ctx), %xmm7		// key5
#if defined __x86_64__
	movups	96(ctx), %xmm8		// key6
	movups	112(ctx), %xmm9		// key7
	movups	128(ctx), %xmm10	// key8
	movups	144(ctx), %xmm11	// key9
	movups	160(ctx), %xmm12	// keyA
	movups	176(ctx), %xmm13	// keyB
	movups	192(ctx), %xmm14	// keyC
	movups	208(ctx), %xmm15	// keyD
	// movups 224(ctx), %xmm1	// keyE (no register left; loaded inside the loop)
	// while (num_blk--) {
	//	*iv ^= *ibuf++;
	//	aes_encrypt(iv, iv, ctx);
	//	*obuf++ = *iv;
	// }

	movups	(ibuf), %xmm1		// *ibuf
	pxor	%xmm1, iv		// *iv ^= *ibuf

	// aes_encrypt(iv, iv, ctx);
#if defined __x86_64__
	movups	224(ctx), %xmm1		// keyE

	movups	96(ctx), %xmm1		// key6
	movups	112(ctx), %xmm1		// key7
	movups	128(ctx), %xmm1		// key8
	movups	144(ctx), %xmm1		// key9
	movups	160(ctx), %xmm1		// keyA
	movups	176(ctx), %xmm1		// keyB
	movups	192(ctx), %xmm1		// keyC
	movups	208(ctx), %xmm1		// keyD
	movups	224(ctx), %xmm1		// keyE

	movups	iv, (obuf)		// *obuf = *iv;
	add	$16, ibuf		// ibuf++
	add	$16, obuf		// obuf++

	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop

	jmp	L_HW_cbc_done		// share the common exit code
	// --------- END of aes_encrypt_cbc_hw -------------------
/* ----------------------------------------------------------------------------------------------------------------

	aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

	For simplicity, I am assuming all variables are of a 128-bit data type.

	aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
	{
		while (num_blk--) {
			aes_decrypt(ibuf, obuf, ctx);
			*obuf++ ^= *iv;
			*iv = *ibuf++;
		}
		return 0;		// CRYPT_OK
	}
	The following is an implementation of this function using Intel AESNI.
	This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
	Developers should instead call _aes_decrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
	to this AESNI-based function should it detect that AESNI is available.
	Blindly calling this function will surely cause a CRASH on systems with no AESNI support.

	Note that, unlike CBC encryption, the per-block aes_decrypt operations are independent of one another
	(the chaining XOR uses only ciphertext that is already available).
	This gives us the opportunity to arrange aes_decrypt operations in parallel to speed up the code.
	This is equivalent to what is described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
	The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc.
	Example C code packing 4 blocks per iteration is shown as follows:

		while ((num_blk-=4) >= 0) {
			// the following 4 calls can be interleaved to exploit parallelism
			aes_decrypt(ibuf, obuf, ctx);
			aes_decrypt(ibuf+1, obuf+1, ctx);
			aes_decrypt(ibuf+2, obuf+2, ctx);
			aes_decrypt(ibuf+3, obuf+3, ctx);
			obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
			*iv = ibuf[3];	ibuf += 4;	obuf += 4;
		}
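	When num_blk is not a multiple of 4, the leftover (num_blk & 3) blocks still have
	to be handled. A sketch of the tail, mirroring the aes-128 path below (the
	aes-192/256 paths instead fall back to a simple per-block loop):

		num_blk += 4;			// undo the final pre-decrement
		if (num_blk & 2) {		// a pair of blocks remains
			aes_decrypt(ibuf, obuf, ctx);
			aes_decrypt(ibuf+1, obuf+1, ctx);
			obuf[0] ^= *iv; obuf[1] ^= ibuf[0];
			*iv = ibuf[1];	ibuf += 2;	obuf += 2;
		}
		if (num_blk & 1) {		// a single block remains
			aes_decrypt(ibuf, obuf, ctx);
			*obuf ^= *iv;
			*iv = *ibuf;
		}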
   ----------------------------------------------------------------------------------------------------------------*/
	.globl	_aes_decrypt_cbc_hw

	// push/save registers for local use

	// if kernel, allocate stack space to save xmm registers

	movaps	%xmm7, 112(sp)
#if defined __x86_64__
	movaps	%xmm8, 16*8(sp)
	movaps	%xmm9, 16*9(sp)
	movaps	%xmm10, 16*10(sp)
	movaps	%xmm11, 16*11(sp)
	movaps	%xmm12, 16*12(sp)
	movaps	%xmm13, 16*13(sp)
	movaps	%xmm14, 16*14(sp)
	movaps	%xmm15, 16*15(sp)
	mov	12(%ebp), %eax		// in_iv
	mov	24(%ebp), %edx		// ctx
	movups	(%eax), iv		// iv = in_iv
	mov	8(%ebp), %ebx		// ibuf
	mov	16(%ebp), %ecx		// num_blk
	mov	20(%ebp), %edi		// obuf

#else	// __x86_64__, arguments in rdi/rsi/rdx/rcx/r8

	mov	%rdi, %rbx		// ibuf
	movups	(%rsi), iv		// iv = in_iv
	mov	%rdx, %r13		// num_blk
	mov	%rcx, %r14		// obuf

	#define	num_blk	%r13d
	mov	240(ctx), %eax		// aes length: 160/192/224 for aes-128/192/256
	cmp	$160, %eax		// aes-128 decrypt
	cmp	$192, %eax		// aes-192 decrypt
	cmp	$224, %eax		// aes-256 decrypt

	mov	$-1, %eax		// wrong aes length, to return -1
	jmp	L_error			// early exit due to wrong aes length
	// aes-128 decrypt_cbc operation; after completion, branch to L_HW_cbc_done

	jl	L_HW_cbc_done		// if num_blk < 1, early return

	// aes-128 decrypt expanded keys
	movups	160(ctx), %xmm3
	movups	144(ctx), %xmm4
	movups	128(ctx), %xmm5
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
#if defined __x86_64__
	movups	80(ctx), %xmm8
	movups	64(ctx), %xmm9
	movups	48(ctx), %xmm10
	movups	32(ctx), %xmm11
	movups	16(ctx), %xmm12
	movups	0(ctx), %xmm13
	// performs 4-block decryption per iteration to exploit parallelism in aes_decrypt

	// while ((num_blk-=4) >= 0) {
	//	aes_decrypt(ibuf, obuf, ctx);
	//	aes_decrypt(ibuf+1, obuf+1, ctx);
	//	aes_decrypt(ibuf+2, obuf+2, ctx);
	//	aes_decrypt(ibuf+3, obuf+3, ctx);
	//	obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
	//	*iv = ibuf[3]; ibuf += 4; obuf += 4;
	// }

	sub	$4, num_blk		// pre-decrement num_blk by 4
	jl	9f			// if num_blk < 4, skip the per-4-blocks processing code
#if defined __x86_64__

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm14	// tmp = 3rd ibuf
	movups	48(ibuf), %xmm15	// tmp = 4th ibuf

	// for x86_64, the expanded keys are already stored in xmm3-xmm13

	// aes-128 decrypt round 0 per 4 blocks
	// aes-128 decrypt round 1 per 4 blocks
	// aes-128 decrypt round 2 per 4 blocks
	// aes-128 decrypt round 3 per 4 blocks
	// aes-128 decrypt round 4 per 4 blocks
	// aes-128 decrypt round 5 per 4 blocks
	// aes-128 decrypt round 6 per 4 blocks

	// aes-128 decrypt round 7 per 4 blocks
	aesdec	%xmm10, %xmm14
	aesdec	%xmm10, %xmm15

	// aes-128 decrypt round 8 per 4 blocks
	aesdec	%xmm11, %xmm14
	aesdec	%xmm11, %xmm15

	// aes-128 decrypt round 9 per 4 blocks
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15

	// aes-128 decrypt round 10 (last) per 4 blocks
	aesdeclast %xmm13, %xmm1
	aesdeclast %xmm13, %xmm2
	aesdeclast %xmm13, %xmm14
	aesdeclast %xmm13, %xmm15
	pxor	iv, %xmm1		// obuf[0] ^= *iv;
	movups	(ibuf), iv		// ibuf[0]
	pxor	iv, %xmm2		// obuf[1] ^= ibuf[0];
	movups	16(ibuf), iv		// ibuf[1]
	pxor	iv, %xmm14		// obuf[2] ^= ibuf[1];
	movups	32(ibuf), iv		// ibuf[2]
	pxor	iv, %xmm15		// obuf[3] ^= ibuf[2];
	movups	48(ibuf), iv		// *iv = ibuf[3]

	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm14, 32(obuf)	// write 3rd obuf
	movups	%xmm15, 48(obuf)	// write 4th obuf
	// aes_decrypt_cbc per 4 blocks using aes-128 for i386
	//	xmm1/xmm2/xmm4/xmm5 used for obuf per block
	//	xmm6/xmm7 dynamically loaded with the remaining expanded keys

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm4		// tmp = 3rd ibuf
	movups	48(ibuf), %xmm5		// tmp = 4th ibuf
	// for i386, sequentially load expanded keys into xmm6/xmm7

	movups	144(ctx), %xmm6		// key1

	// aes-128 decrypt round 0 per 4 blocks
	movups	128(ctx), %xmm7		// key2

	// aes-128 decrypt round 1 per 4 blocks
	movups	112(ctx), %xmm6		// key3

	// aes-128 decrypt round 2 per 4 blocks
	movups	96(ctx), %xmm7		// key4

	// aes-128 decrypt round 3 per 4 blocks
	movups	80(ctx), %xmm6		// key5

	// aes-128 decrypt round 4 per 4 blocks
	movups	64(ctx), %xmm7		// key6

	// aes-128 decrypt round 5 per 4 blocks
	movups	48(ctx), %xmm6		// key7

	// aes-128 decrypt round 6 per 4 blocks
	movups	32(ctx), %xmm7		// key8

	// aes-128 decrypt round 7 per 4 blocks
	movups	16(ctx), %xmm6		// key9

	// aes-128 decrypt round 8 per 4 blocks
	movups	0(ctx), %xmm7		// keyA

	// aes-128 decrypt round 9 per 4 blocks

	// aes-128 decrypt round 10 (last) per 4 blocks
	aesdeclast %xmm7, %xmm1
	aesdeclast %xmm7, %xmm2
	aesdeclast %xmm7, %xmm4
	aesdeclast %xmm7, %xmm5
	pxor	iv, %xmm1		// 1st obuf ^= iv;
	movups	(ibuf), iv		// 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm2		// 2nd obuf ^= iv;
	movups	16(ibuf), iv		// 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm4		// 3rd obuf ^= iv;
	movups	32(ibuf), iv		// 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm5		// 4th obuf ^= iv;
	movups	48(ibuf), iv		// 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm4, 32(obuf)		// write 3rd obuf
	movups	%xmm5, 48(obuf)		// write 4th obuf

	add	$64, ibuf		// ibuf += 4;
	add	$64, obuf		// obuf += 4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop

9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, no further processing needed
	// reload xmm4-xmm7, which might have been
	// updated, as they might be needed as expanded keys in the remaining blocks
	movups	144(ctx), %xmm4
	movups	128(ctx), %xmm5
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
	test	$2, num_blk		// check whether num_blk has a pair of blocks
	je	9f			// if (num_blk & 2) == 0, skip the per-pair processing code

	// decrypt the remaining 2 blocks together

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
#if defined __x86_64__
	aesdeclast %xmm13, %xmm1
	aesdeclast %xmm13, %xmm2

	movups	80(ctx), %xmm6
	movups	64(ctx), %xmm7
	movups	48(ctx), %xmm6
	movups	32(ctx), %xmm7
	movups	16(ctx), %xmm6

	aesdeclast %xmm7, %xmm1
	aesdeclast %xmm7, %xmm2
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
	pxor	iv, %xmm1		// obuf[0] ^= *iv;
	movups	(ibuf), iv		// ibuf[0]
	pxor	iv, %xmm2		// obuf[1] ^= ibuf[0]
	movups	16(ibuf), iv		// *iv = ibuf[1]

	movups	%xmm1, (obuf)		// write obuf[0]
	movups	%xmm2, 16(obuf)		// write obuf[1]

	add	$32, ibuf		// ibuf += 2
	add	$32, obuf		// obuf += 2

	test	$1, num_blk		// check whether num_blk has a residual block
	je	L_HW_cbc_done		// if not, no residual processing needed
	movups	(ibuf), %xmm2		// tmp = ibuf

#if defined __x86_64__
	aesdeclast %xmm13, %xmm2

	movups	80(ctx), %xmm1
	movups	64(ctx), %xmm1
	movups	48(ctx), %xmm1
	movups	32(ctx), %xmm1
	movups	16(ctx), %xmm1

	aesdeclast %xmm1, %xmm2

	pxor	iv, %xmm2		// *obuf ^= *iv;
	movups	(ibuf), iv		// *iv = *ibuf;
	movups	%xmm2, (obuf)		// write *obuf
	// aes-192 decrypt_cbc operation; after completion, branch to L_HW_cbc_done

	jl	L_HW_cbc_done		// if num_blk < 1, early return

	// aes-192 decrypt expanded keys
	movups	192(ctx), %xmm3
	movups	176(ctx), %xmm4
	movups	160(ctx), %xmm5
	movups	144(ctx), %xmm6
	movups	128(ctx), %xmm7
#if defined __x86_64__
	movups	112(ctx), %xmm8
	movups	96(ctx), %xmm9
	movups	80(ctx), %xmm10
	movups	64(ctx), %xmm11
	movups	48(ctx), %xmm12
	movups	32(ctx), %xmm13
	movups	16(ctx), %xmm14
	// performs 4-block decryption per iteration to exploit parallelism in aes_decrypt

	// while ((num_blk-=4) >= 0) {
	//	aes_decrypt(ibuf, obuf, ctx);
	//	aes_decrypt(ibuf+1, obuf+1, ctx);
	//	aes_decrypt(ibuf+2, obuf+2, ctx);
	//	aes_decrypt(ibuf+3, obuf+3, ctx);
	//	obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
	//	*iv = ibuf[3]; ibuf += 4; obuf += 4;
	// }

	sub	$4, num_blk		// pre-decrement num_blk by 4
	jl	9f			// if num_blk < 4, skip the per-4-blocks processing code
#if defined __x86_64__

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm14	// tmp = 3rd ibuf
	movups	48(ibuf), %xmm15	// tmp = 4th ibuf

	// aes_decrypt; for x86_64, the expanded keys are already stored in xmm3-xmm13
	// use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards
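	// in outline (reconstructed from the loads/restores below): rounds 0-9
	// use the resident keys in xmm3-xmm12 (offsets 192(ctx) down to 48(ctx));
	// the two remaining keys at 16(ctx) and (ctx) are cycled through
	// xmm12/xmm13 for rounds B and C, after which xmm12/xmm13 are restored
	// to their resident keys at 48(ctx) and 32(ctx)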
	// round 0 for 4 blocks

	// round 1 for 4 blocks
	aesdec	%xmm4, %xmm14
	aesdec	%xmm4, %xmm15

	// round 2 for 4 blocks
	aesdec	%xmm5, %xmm14
	aesdec	%xmm5, %xmm15

	// round 3 for 4 blocks
	aesdec	%xmm6, %xmm14
	aesdec	%xmm6, %xmm15

	// round 4 for 4 blocks
	aesdec	%xmm7, %xmm14
	aesdec	%xmm7, %xmm15

	// round 5 for 4 blocks
	aesdec	%xmm8, %xmm14
	aesdec	%xmm8, %xmm15

	// round 6 for 4 blocks
	aesdec	%xmm9, %xmm14
	aesdec	%xmm9, %xmm15

	// round 7 for 4 blocks
	aesdec	%xmm10, %xmm1
	aesdec	%xmm10, %xmm2
	aesdec	%xmm10, %xmm14
	aesdec	%xmm10, %xmm15

	// round 8 for 4 blocks
	aesdec	%xmm11, %xmm1
	aesdec	%xmm11, %xmm2
	aesdec	%xmm11, %xmm14
	aesdec	%xmm11, %xmm15

	// round 9 for 4 blocks
	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15
	movups	16(ctx), %xmm12		// key for round B

	// round A for 4 blocks
	aesdec	%xmm13, %xmm1
	aesdec	%xmm13, %xmm2
	aesdec	%xmm13, %xmm14
	aesdec	%xmm13, %xmm15

	movups	(ctx), %xmm13		// key for round C (last)

	// round B for 4 blocks
	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15

	movups	48(ctx), %xmm12		// restore %xmm12 to its original key

	// round C (last) for 4 blocks
	aesdeclast %xmm13, %xmm1
	aesdeclast %xmm13, %xmm2
	aesdeclast %xmm13, %xmm14
	aesdeclast %xmm13, %xmm15

	movups	32(ctx), %xmm13		// restore %xmm13 to its original key
	pxor	iv, %xmm1		// obuf[0] ^= *iv;
	movups	(ibuf), iv		// ibuf[0]
	pxor	iv, %xmm2		// obuf[1] ^= ibuf[0]
	movups	16(ibuf), iv		// ibuf[1]
	pxor	iv, %xmm14		// obuf[2] ^= ibuf[1]
	movups	32(ibuf), iv		// ibuf[2]
	pxor	iv, %xmm15		// obuf[3] ^= ibuf[2]
	movups	48(ibuf), iv		// *iv = ibuf[3]

	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm14, 32(obuf)	// write 3rd obuf
	movups	%xmm15, 48(obuf)	// write 4th obuf

	add	$64, ibuf		// ibuf += 4;
	add	$64, obuf		// obuf += 4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop
9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, prepare to return

	movups	16(ctx), %xmm14		// restore %xmm14 to its key
	movups	(ctx), %xmm15		// restore %xmm15 to its key
	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm4		// tmp = 3rd ibuf
	movups	48(ibuf), %xmm5		// tmp = 4th ibuf
	// for i386, sequentially load expanded keys into xmm6/xmm7
	movups	176(ctx), %xmm6
	movups	160(ctx), %xmm7
	movups	144(ctx), %xmm6
	movups	128(ctx), %xmm7
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
	movups	80(ctx), %xmm6
	movups	64(ctx), %xmm7
	movups	48(ctx), %xmm6
	movups	32(ctx), %xmm7
	movups	16(ctx), %xmm6
	movups	0(ctx), %xmm7

	aesdeclast %xmm7, %xmm1
	aesdeclast %xmm7, %xmm2
	aesdeclast %xmm7, %xmm4
	aesdeclast %xmm7, %xmm5
	pxor	iv, %xmm1		// 1st obuf ^= iv;
	movups	(ibuf), iv		// 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm2		// 2nd obuf ^= iv;
	movups	16(ibuf), iv		// 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm4		// 3rd obuf ^= iv;
	movups	32(ibuf), iv		// 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm5		// 4th obuf ^= iv;
	movups	48(ibuf), iv		// 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm4, 32(obuf)		// write 3rd obuf
	movups	%xmm5, 48(obuf)		// write 4th obuf

	add	$64, ibuf		// ibuf += AES_BLOCK_SIZE * 4;
	add	$64, obuf		// obuf += AES_BLOCK_SIZE * 4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop

9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, no further processing needed
	movups	176(ctx), %xmm4
	movups	160(ctx), %xmm5
	movups	144(ctx), %xmm6
	movups	128(ctx), %xmm7

	// per-block aes_decrypt_cbc loop

	movups	(ibuf), %xmm2		// tmp = ibuf
#if defined __x86_64__
	aesdec	%xmm10, %xmm2
	aesdec	%xmm11, %xmm2
	aesdec	%xmm12, %xmm2
	aesdec	%xmm13, %xmm2
	aesdec	%xmm14, %xmm2
	aesdeclast %xmm15, %xmm2

	movups	112(ctx), %xmm1
	movups	96(ctx), %xmm1
	movups	80(ctx), %xmm1
	movups	64(ctx), %xmm1
	movups	48(ctx), %xmm1
	movups	32(ctx), %xmm1
	movups	16(ctx), %xmm1

	aesdeclast %xmm1, %xmm2
	pxor	iv, %xmm2		// obuf ^= iv;
	movups	(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);

	movups	%xmm2, (obuf)		// write obuf

	add	$16, ibuf		// ibuf += AES_BLOCK_SIZE;
	add	$16, obuf		// obuf += AES_BLOCK_SIZE;
	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop
	// aes-256 decrypt_cbc operation; after completion, branch to L_HW_cbc_done

	movups	224(ctx), %xmm3
	movups	208(ctx), %xmm4
	movups	192(ctx), %xmm5
	movups	176(ctx), %xmm6
	movups	160(ctx), %xmm7
#if defined __x86_64__
	movups	144(ctx), %xmm8
	movups	128(ctx), %xmm9
	movups	112(ctx), %xmm10
	movups	96(ctx), %xmm11
	movups	80(ctx), %xmm12
	movups	64(ctx), %xmm13
	movups	48(ctx), %xmm14
	movups	32(ctx), %xmm15
	// movups 16(ctx), %xmm14	// no registers left; these two keys are
	// movups (ctx), %xmm15		// loaded inside the loop instead
#if defined __x86_64__

	sub	$4, num_blk		// pre-decrement num_blk by 4
	jl	9f			// if num_blk < 4, skip the per-4-blocks processing code

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm14	// tmp = 3rd ibuf
	movups	48(ibuf), %xmm15	// tmp = 4th ibuf

	// aes_decrypt; for x86_64, the expanded keys are already stored in xmm3-xmm13
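	// as in the aes-192 path (outline reconstructed from the loads/restores
	// below): xmm14/xmm15 hold data blocks here, so the four remaining keys
	// at 48/32/16/0(ctx) are cycled through xmm12/xmm13, which are finally
	// restored to their resident keys at 80(ctx) and 64(ctx)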
	aesdec	%xmm4, %xmm14
	aesdec	%xmm4, %xmm15

	aesdec	%xmm5, %xmm14
	aesdec	%xmm5, %xmm15

	aesdec	%xmm6, %xmm14
	aesdec	%xmm6, %xmm15

	aesdec	%xmm7, %xmm14
	aesdec	%xmm7, %xmm15

	aesdec	%xmm8, %xmm14
	aesdec	%xmm8, %xmm15

	aesdec	%xmm9, %xmm14
	aesdec	%xmm9, %xmm15

	aesdec	%xmm10, %xmm1
	aesdec	%xmm10, %xmm2
	aesdec	%xmm10, %xmm14
	aesdec	%xmm10, %xmm15

	aesdec	%xmm11, %xmm1
	aesdec	%xmm11, %xmm2
	aesdec	%xmm11, %xmm14
	aesdec	%xmm11, %xmm15

	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15
	movups	48(ctx), %xmm12

	aesdec	%xmm13, %xmm1
	aesdec	%xmm13, %xmm2
	aesdec	%xmm13, %xmm14
	aesdec	%xmm13, %xmm15
	movups	32(ctx), %xmm13

	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15
	movups	16(ctx), %xmm12

	aesdec	%xmm13, %xmm1
	aesdec	%xmm13, %xmm2
	aesdec	%xmm13, %xmm14
	aesdec	%xmm13, %xmm15
	movups	(ctx), %xmm13

	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15
	movups	80(ctx), %xmm12		// restore %xmm12 to its resident key

	aesdeclast %xmm13, %xmm1
	aesdeclast %xmm13, %xmm2
	aesdeclast %xmm13, %xmm14
	aesdeclast %xmm13, %xmm15
	movups	64(ctx), %xmm13		// restore %xmm13 to its resident key
	pxor	iv, %xmm1		// obuf[0] ^= *iv;
	movups	(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm2		// obuf[1] ^= ibuf[0];
	movups	16(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm14		// obuf[2] ^= ibuf[1];
	movups	32(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm15		// obuf[3] ^= ibuf[2];
	movups	48(ibuf), iv		// *iv = ibuf[3]

	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm14, 32(obuf)	// write 3rd obuf
	movups	%xmm15, 48(obuf)	// write 4th obuf

	add	$64, ibuf		// ibuf += AES_BLOCK_SIZE*4;
	add	$64, obuf		// obuf += AES_BLOCK_SIZE*4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop

9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, no further processing needed
	movups	48(ctx), %xmm14		// restore %xmm14 to its key
	movups	32(ctx), %xmm15		// restore %xmm15 to its key

	sub	$4, num_blk		// pre-decrement num_blk by 4
	jl	9f			// if num_blk < 4, skip the per-4-blocks processing code

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm4		// tmp = 3rd ibuf
	movups	48(ibuf), %xmm5		// tmp = 4th ibuf
	// for i386, sequentially load expanded keys into xmm6/xmm7
	movups	208(ctx), %xmm6
	movups	192(ctx), %xmm7
	movups	176(ctx), %xmm6
	movups	160(ctx), %xmm7
	movups	144(ctx), %xmm6
	movups	128(ctx), %xmm7
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
	movups	80(ctx), %xmm6
	movups	64(ctx), %xmm7
	movups	48(ctx), %xmm6
	movups	32(ctx), %xmm7
	movups	16(ctx), %xmm6
	movups	0(ctx), %xmm7

	aesdeclast %xmm7, %xmm1
	aesdeclast %xmm7, %xmm2
	aesdeclast %xmm7, %xmm4
	aesdeclast %xmm7, %xmm5
	pxor	iv, %xmm1		// 1st obuf ^= iv;
	movups	(ibuf), iv		// 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm2		// 2nd obuf ^= iv;
	movups	16(ibuf), iv		// 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm4		// 3rd obuf ^= iv;
	movups	32(ibuf), iv		// 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm5		// 4th obuf ^= iv;
	movups	48(ibuf), iv		// 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm4, 32(obuf)		// write 3rd obuf
	movups	%xmm5, 48(obuf)		// write 4th obuf

	add	$64, ibuf		// ibuf += AES_BLOCK_SIZE * 4;
	add	$64, obuf		// obuf += AES_BLOCK_SIZE * 4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop

9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, no further processing needed
	movups	208(ctx), %xmm4
	movups	192(ctx), %xmm5
	movups	176(ctx), %xmm6
	movups	160(ctx), %xmm7

	movups	(ibuf), %xmm2		// tmp = ibuf
#if defined __x86_64__
	aesdec	%xmm10, %xmm2
	aesdec	%xmm11, %xmm2
	aesdec	%xmm12, %xmm2
	aesdec	%xmm13, %xmm2
	aesdec	%xmm14, %xmm2
	aesdec	%xmm15, %xmm2

	movups	144(ctx), %xmm1
	movups	128(ctx), %xmm1
	movups	112(ctx), %xmm1
	movups	96(ctx), %xmm1
	movups	80(ctx), %xmm1
	movups	64(ctx), %xmm1
	movups	48(ctx), %xmm1
	movups	32(ctx), %xmm1
	movups	16(ctx), %xmm1

	aesdeclast %xmm1, %xmm2
	pxor	iv, %xmm2		// obuf ^= iv;
	movups	(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);

	movups	%xmm2, (obuf)		// write obuf

	add	$16, ibuf		// ibuf += AES_BLOCK_SIZE;
	add	$16, obuf		// obuf += AES_BLOCK_SIZE;
	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop

	// --------- END of aes_decrypt_cbc_hw -------------------