---------------------------------------------------------------------------
Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.

The free distribution and use of this software in both source and binary
form is allowed (with or without changes) provided that:

1. distributions of this source code include the above copyright
   notice, this list of conditions and the following disclaimer;

2. distributions in binary form include the above copyright
   notice, this list of conditions and the following disclaimer
   in the documentation and/or other associated materials;

3. the copyright holder's name is not used to endorse products
   built using this software without specific written permission.

ALTERNATIVELY, provided that this notice is retained in full, this product
may be distributed under the terms of the GNU General Public License (GPL),
in which case the provisions of the GPL apply INSTEAD OF those given above.

This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
These subroutines implement multiple-block AES modes for ECB, CBC, CFB,
OFB and CTR encryption. The code provides support for the VIA Advanced
Cryptography Engine (ACE).

NOTE: In the following subroutines, the AES contexts (ctx) must be
16-byte aligned if VIA ACE is being used.
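As an illustrative sketch only (not part of this file), a caller could
obtain a 16-byte aligned context with posix_memalign; aes_encrypt_ctx
here is the context type from the prototypes below:

	#include <stdlib.h>

	aes_encrypt_ctx *ctx = NULL;
	if (posix_memalign((void **)&ctx, 16, sizeof *ctx) != 0)
		ctx = NULL;		// allocation failed; handle as needed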
/* ----------------------------------------------------------------------------------------------------------------

	aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

	For simplicity, I am assuming all variables are of a 128-bit data type.

	aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
	{
		while (num_blk--) {
			*iv ^= *ibuf++;
			aes_encrypt(iv, iv, ctx);
			*obuf++ = *iv;
		}
		return 0;		// CRYPT_OK
	}
	The following is an implementation of this function using Intel AESNI.
	This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
	Developers should instead call _aes_encrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
	to this AESNI-based function should it detect that AESNI is available.
	Blindly calling this function will surely cause a CRASH on systems with no AESNI support.

	Note that each block starts with *iv, which is the output of the previous block. Therefore, the CBC blocks
	are serially chained. This prevents us from arranging several blocks for encryption in parallel.

   ----------------------------------------------------------------------------------------------------------------*/
	.globl	_aes_encrypt_cbc_hw

	// push/save registers for local use

	// if this is kernel code, need to save the xmm registers used
	sub	$(8*16), %esp		// for possible xmm0-xmm7 save/restore
	sub	$(16*16), %rsp		// xmm0-xmm15 save/restore
	movaps	%xmm7, 112(sp)
#if defined __x86_64__
	movaps	%xmm8, 16*8(sp)
	movaps	%xmm9, 16*9(sp)
	movaps	%xmm10, 16*10(sp)
	movaps	%xmm11, 16*11(sp)
	movaps	%xmm12, 16*12(sp)
	movaps	%xmm13, 16*13(sp)
	movaps	%xmm14, 16*14(sp)
	movaps	%xmm15, 16*15(sp)
	mov	12(%ebp), %eax		// in_iv
	mov	24(%ebp), %edx		// ctx
	movups	(%eax), iv		// iv = in_iv
	mov	8(%ebp), %ebx		// ibuf
	mov	16(%ebp), %ecx		// num_blk
	mov	20(%ebp), %edi		// obuf

	mov	%rdi, %rbx		// ibuf
	movups	(%rsi), iv		// iv = in_iv
	mov	%rdx, %r13		// num_blk
	mov	%rcx, %r14		// obuf

	#define	num_blk	%r13d
	mov	240(ctx), %eax		// aes length: 160/192/224 for aes-128/192/256
	cmp	$160, %eax		// aes-128 encrypt ?
	cmp	$192, %eax		// aes-192 encrypt ?
	cmp	$224, %eax		// aes-256 encrypt ?
	mov	$-1, %eax		// wrong aes length, return error
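	// in C, the dispatch above reads roughly as follows (a sketch; it assumes
	// the 32-bit word at byte offset 240 of ctx holds 16*(number of rounds),
	// as the 160/192/224 comparisons suggest; label names are illustrative):
	//
	//	switch (*(const uint32_t *)((const uint8_t *)ctx + 240)) {
	//	case 160: goto aes_128_encrypt_cbc;	// 10 rounds
	//	case 192: goto aes_192_encrypt_cbc;	// 12 rounds
	//	case 224: goto aes_256_encrypt_cbc;	// 14 rounds
	//	default:  return -1;			// unsupported key length
	//	}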
	// aes-128 encrypt_cbc operation, up to L_HW_cbc_done

	cmp	$1, num_blk		// check the number of blocks
	jl	L_HW_cbc_done		// if less than 1, nothing to do

	movups	(ctx), %xmm2		// key0
	movups	16(ctx), %xmm3		// key1
	movups	32(ctx), %xmm4		// key2
	movups	48(ctx), %xmm5		// key3
	movups	64(ctx), %xmm6		// key4
	movups	80(ctx), %xmm7		// key5
#if defined __x86_64__
	movups	96(ctx), %xmm8		// key6
	movups	112(ctx), %xmm9		// key7
	movups	128(ctx), %xmm10	// key8
	movups	144(ctx), %xmm11	// key9
	movups	160(ctx), %xmm12	// keyA
	// while (num_blk--) {
	//	*iv ^= *ibuf++;
	//	aes_encrypt(iv, iv, ctx);
	//	*obuf++ = *iv;
	// }

	movups	(ibuf), %xmm1		// *ibuf
	pxor	%xmm2, iv		// 1st instruction inside aes_encrypt
	pxor	%xmm1, iv		// *iv ^= *ibuf

	// finishing up the rest of aes_encrypt
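	// i.e., the rest of aes_encrypt for aes-128 is, in outline
	// (reconstructed from the key loads and the aesenclast below):
	//	for (r = 1; r <= 9; r++)  iv = aesenc(iv, key[r]);
	//	iv = aesenclast(iv, keyA);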
#if defined __x86_64__
	aesenclast %xmm12, iv

	movups	96(ctx), %xmm1		// key6
	movups	112(ctx), %xmm1		// key7
	movups	128(ctx), %xmm1		// key8
	movups	144(ctx), %xmm1		// key9
	movups	160(ctx), %xmm1		// keyA

	movups	iv, (obuf)		// *obuf = *iv;
	add	$16, obuf		// obuf++;
	add	$16, ibuf		// ibuf++;
	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop
	// the following will be branched to from all other cases (encrypt/decrypt 128/192/256)

	xor	%eax, %eax		// to return CRYPT_OK
	// if kernel, restore xmm registers
	movaps	112(sp), %xmm7
#if defined __x86_64__
	movaps	16*8(sp), %xmm8
	movaps	16*9(sp), %xmm9
	movaps	16*10(sp), %xmm10
	movaps	16*11(sp), %xmm11
	movaps	16*12(sp), %xmm12
	movaps	16*13(sp), %xmm13
	movaps	16*14(sp), %xmm14
	movaps	16*15(sp), %xmm15
	// release used stack memory, restore used callee-saved registers, and return

	// aes-192 encrypt_cbc operation; after completion, branch to L_HW_cbc_done

	cmp	$1, num_blk		// check the number of blocks
	jl	L_HW_cbc_done		// if less than 1, nothing to do
	movups	(ctx), %xmm2		// key0
	movups	16(ctx), %xmm3		// key1
	movups	32(ctx), %xmm4		// key2
	movups	48(ctx), %xmm5		// key3
	movups	64(ctx), %xmm6		// key4
	movups	80(ctx), %xmm7		// key5
#if defined __x86_64__
	movups	96(ctx), %xmm8		// key6
	movups	112(ctx), %xmm9		// key7
	movups	128(ctx), %xmm10	// key8
	movups	144(ctx), %xmm11	// key9
	movups	160(ctx), %xmm12	// keyA
	movups	176(ctx), %xmm13	// keyB
	movups	192(ctx), %xmm14	// keyC
	// while (num_blk--) {
	//	*iv ^= *ibuf++;
	//	aes_encrypt(iv, iv, ctx);
	//	*obuf++ = *iv;
	// }

	movups	(ibuf), %xmm1		// *ibuf
	pxor	%xmm1, iv		// *iv ^= *ibuf

	// aes_encrypt(iv, iv, ctx);
#if defined __x86_64__
	aesenclast %xmm14, iv

	movups	96(ctx), %xmm1		// key6
	movups	112(ctx), %xmm1		// key7
	movups	128(ctx), %xmm1		// key8
	movups	144(ctx), %xmm1		// key9
	movups	160(ctx), %xmm1		// keyA
	movups	176(ctx), %xmm1		// keyB
	movups	192(ctx), %xmm1		// keyC

	movups	iv, (obuf)		// *obuf = *iv;
	add	$16, ibuf		// ibuf++
	add	$16, obuf		// obuf++

	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop

	jmp	L_HW_cbc_done		// share the common exit code
	// aes-256 encrypt_cbc operation; after completion, branch to L_HW_cbc_done

	cmp	$1, num_blk		// check the number of blocks
	jl	L_HW_cbc_done		// if less than 1, nothing to do

	movups	(ctx), %xmm2		// key0
	movups	16(ctx), %xmm3		// key1
	movups	32(ctx), %xmm4		// key2
	movups	48(ctx), %xmm5		// key3
	movups	64(ctx), %xmm6		// key4
	movups	80(ctx), %xmm7		// key5
#if defined __x86_64__
	movups	96(ctx), %xmm8		// key6
	movups	112(ctx), %xmm9		// key7
	movups	128(ctx), %xmm10	// key8
	movups	144(ctx), %xmm11	// key9
	movups	160(ctx), %xmm12	// keyA
	movups	176(ctx), %xmm13	// keyB
	movups	192(ctx), %xmm14	// keyC
	movups	208(ctx), %xmm15	// keyD
	// movups 224(ctx), %xmm1	// keyE (no register left; loaded inside the loop)
	// while (num_blk--) {
	//	*iv ^= *ibuf++;
	//	aes_encrypt(iv, iv, ctx);
	//	*obuf++ = *iv;
	// }

	movups	(ibuf), %xmm1		// *ibuf
	pxor	%xmm1, iv		// *iv ^= *ibuf

	// aes_encrypt(iv, iv, ctx);
#if defined __x86_64__
	movups	224(ctx), %xmm1		// keyE

	movups	96(ctx), %xmm1		// key6
	movups	112(ctx), %xmm1		// key7
	movups	128(ctx), %xmm1		// key8
	movups	144(ctx), %xmm1		// key9
	movups	160(ctx), %xmm1		// keyA
	movups	176(ctx), %xmm1		// keyB
	movups	192(ctx), %xmm1		// keyC
	movups	208(ctx), %xmm1		// keyD
	movups	224(ctx), %xmm1		// keyE

	movups	iv, (obuf)		// *obuf = *iv;
	add	$16, ibuf		// ibuf++
	add	$16, obuf		// obuf++

	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop

	jmp	L_HW_cbc_done		// share the common exit code
	// --------- END of aes_encrypt_cbc_hw -------------------
/* ----------------------------------------------------------------------------------------------------------------

	aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

	For simplicity, I am assuming all variables are of a 128-bit data type.

	aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
	{
		while (num_blk--) {
			aes_decrypt(ibuf, obuf, ctx);
			*obuf++ ^= *iv;
			*iv = *ibuf++;
		}
		return 0;		// CRYPT_OK
	}
	The following is an implementation of this function using Intel AESNI.
	This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
	Developers should instead call _aes_decrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
	to this AESNI-based function should it detect that AESNI is available.
	Blindly calling this function will surely cause a CRASH on systems with no AESNI support.

	Note that, unlike CBC encryption, the per-block aes_decrypt operations are independent of one another
	(the chaining XOR uses only ciphertext that is already available).
	This gives us the opportunity to arrange aes_decrypt operations in parallel to speed up the code.
	This is equivalent to what is described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
	The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc.
	Example C code packing 4 blocks per iteration is shown as follows:

		while ((num_blk-=4) >= 0) {
			// the following 4 calls can be interleaved to exploit parallelism
			aes_decrypt(ibuf, obuf, ctx);
			aes_decrypt(ibuf+1, obuf+1, ctx);
			aes_decrypt(ibuf+2, obuf+2, ctx);
			aes_decrypt(ibuf+3, obuf+3, ctx);
			obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
			*iv = ibuf[3];	ibuf += 4;	obuf += 4;
		}
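	When num_blk is not a multiple of 4, the leftover (num_blk & 3) blocks still have
	to be handled. A sketch of the tail, mirroring the aes-128 path below (the
	aes-192/256 paths instead fall back to a simple per-block loop):

		num_blk += 4;			// undo the final pre-decrement
		if (num_blk & 2) {		// a pair of blocks remains
			aes_decrypt(ibuf, obuf, ctx);
			aes_decrypt(ibuf+1, obuf+1, ctx);
			obuf[0] ^= *iv; obuf[1] ^= ibuf[0];
			*iv = ibuf[1];	ibuf += 2;	obuf += 2;
		}
		if (num_blk & 1) {		// a single block remains
			aes_decrypt(ibuf, obuf, ctx);
			*obuf ^= *iv;
			*iv = *ibuf;
		}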
   ----------------------------------------------------------------------------------------------------------------*/
	.globl	_aes_decrypt_cbc_hw

	// push/save registers for local use

	// if kernel, allocate stack space to save xmm registers

	movaps	%xmm7, 112(sp)
#if defined __x86_64__
	movaps	%xmm8, 16*8(sp)
	movaps	%xmm9, 16*9(sp)
	movaps	%xmm10, 16*10(sp)
	movaps	%xmm11, 16*11(sp)
	movaps	%xmm12, 16*12(sp)
	movaps	%xmm13, 16*13(sp)
	movaps	%xmm14, 16*14(sp)
	movaps	%xmm15, 16*15(sp)
	mov	12(%ebp), %eax		// in_iv
	mov	24(%ebp), %edx		// ctx
	movups	(%eax), iv		// iv = in_iv
	mov	8(%ebp), %ebx		// ibuf
	mov	16(%ebp), %ecx		// num_blk
	mov	20(%ebp), %edi		// obuf

#else	// __x86_64__, arguments in rdi/rsi/rdx/rcx/r8

	mov	%rdi, %rbx		// ibuf
	movups	(%rsi), iv		// iv = in_iv
	mov	%rdx, %r13		// num_blk
	mov	%rcx, %r14		// obuf

	#define	num_blk	%r13d
	mov	240(ctx), %eax		// aes length: 160/192/224 for aes-128/192/256
	cmp	$160, %eax		// aes-128 decrypt
	cmp	$192, %eax		// aes-192 decrypt
	cmp	$224, %eax		// aes-256 decrypt

	mov	$-1, %eax		// wrong aes length, to return -1
	jmp	L_error			// early exit due to wrong aes length
	// aes-128 decrypt_cbc operation; after completion, branch to L_HW_cbc_done

	jl	L_HW_cbc_done		// if num_blk < 1, early return

	// aes-128 decrypt expanded keys
	movups	160(ctx), %xmm3
	movups	144(ctx), %xmm4
	movups	128(ctx), %xmm5
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
#if defined __x86_64__
	movups	80(ctx), %xmm8
	movups	64(ctx), %xmm9
	movups	48(ctx), %xmm10
	movups	32(ctx), %xmm11
	movups	16(ctx), %xmm12
	movups	0(ctx), %xmm13
	// performs 4-block decryption per iteration to exploit parallelism in aes_decrypt

	// while ((num_blk-=4) >= 0) {
	//	aes_decrypt(ibuf, obuf, ctx);
	//	aes_decrypt(ibuf+1, obuf+1, ctx);
	//	aes_decrypt(ibuf+2, obuf+2, ctx);
	//	aes_decrypt(ibuf+3, obuf+3, ctx);
	//	obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
	//	*iv = ibuf[3]; ibuf += 4; obuf += 4;
	// }

	sub	$4, num_blk		// pre-decrement num_blk by 4
	jl	9f			// if num_blk < 4, skip the per-4-blocks processing code
#if defined __x86_64__

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm14	// tmp = 3rd ibuf
	movups	48(ibuf), %xmm15	// tmp = 4th ibuf

	// for x86_64, the expanded keys are already stored in xmm3-xmm13

	// aes-128 decrypt round 0 per 4 blocks
	// aes-128 decrypt round 1 per 4 blocks
	// aes-128 decrypt round 2 per 4 blocks
	// aes-128 decrypt round 3 per 4 blocks
	// aes-128 decrypt round 4 per 4 blocks
	// aes-128 decrypt round 5 per 4 blocks
	// aes-128 decrypt round 6 per 4 blocks

	// aes-128 decrypt round 7 per 4 blocks
	aesdec	%xmm10, %xmm14
	aesdec	%xmm10, %xmm15

	// aes-128 decrypt round 8 per 4 blocks
	aesdec	%xmm11, %xmm14
	aesdec	%xmm11, %xmm15

	// aes-128 decrypt round 9 per 4 blocks
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15

	// aes-128 decrypt round 10 (last) per 4 blocks
	aesdeclast %xmm13, %xmm1
	aesdeclast %xmm13, %xmm2
	aesdeclast %xmm13, %xmm14
	aesdeclast %xmm13, %xmm15
	pxor	iv, %xmm1		// obuf[0] ^= *iv;
	movups	(ibuf), iv		// ibuf[0]
	pxor	iv, %xmm2		// obuf[1] ^= ibuf[0];
	movups	16(ibuf), iv		// ibuf[1]
	pxor	iv, %xmm14		// obuf[2] ^= ibuf[1];
	movups	32(ibuf), iv		// ibuf[2]
	pxor	iv, %xmm15		// obuf[3] ^= ibuf[2];
	movups	48(ibuf), iv		// *iv = ibuf[3]

	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm14, 32(obuf)	// write 3rd obuf
	movups	%xmm15, 48(obuf)	// write 4th obuf
	// aes_decrypt_cbc per 4 blocks using aes-128 for i386
	//	xmm1/xmm2/xmm4/xmm5 used for obuf per block
	//	xmm6/xmm7 dynamically loaded with the remaining expanded keys

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm4		// tmp = 3rd ibuf
	movups	48(ibuf), %xmm5		// tmp = 4th ibuf
	// for i386, sequentially load expanded keys into xmm6/xmm7

	movups	144(ctx), %xmm6		// key1

	// aes-128 decrypt round 0 per 4 blocks
	movups	128(ctx), %xmm7		// key2

	// aes-128 decrypt round 1 per 4 blocks
	movups	112(ctx), %xmm6		// key3

	// aes-128 decrypt round 2 per 4 blocks
	movups	96(ctx), %xmm7		// key4

	// aes-128 decrypt round 3 per 4 blocks
	movups	80(ctx), %xmm6		// key5

	// aes-128 decrypt round 4 per 4 blocks
	movups	64(ctx), %xmm7		// key6

	// aes-128 decrypt round 5 per 4 blocks
	movups	48(ctx), %xmm6		// key7

	// aes-128 decrypt round 6 per 4 blocks
	movups	32(ctx), %xmm7		// key8

	// aes-128 decrypt round 7 per 4 blocks
	movups	16(ctx), %xmm6		// key9

	// aes-128 decrypt round 8 per 4 blocks
	movups	0(ctx), %xmm7		// keyA

	// aes-128 decrypt round 9 per 4 blocks

	// aes-128 decrypt round 10 (last) per 4 blocks
	aesdeclast %xmm7, %xmm1
	aesdeclast %xmm7, %xmm2
	aesdeclast %xmm7, %xmm4
	aesdeclast %xmm7, %xmm5
	pxor	iv, %xmm1		// 1st obuf ^= iv;
	movups	(ibuf), iv		// 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm2		// 2nd obuf ^= iv;
	movups	16(ibuf), iv		// 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm4		// 3rd obuf ^= iv;
	movups	32(ibuf), iv		// 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm5		// 4th obuf ^= iv;
	movups	48(ibuf), iv		// 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm4, 32(obuf)		// write 3rd obuf
	movups	%xmm5, 48(obuf)		// write 4th obuf

	add	$64, ibuf		// ibuf += 4;
	add	$64, obuf		// obuf += 4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop

9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, no further processing needed
	// reload xmm4-xmm7, which might have been
	// updated, as they might be needed as expanded keys in the remaining blocks
	movups	144(ctx), %xmm4
	movups	128(ctx), %xmm5
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
	test	$2, num_blk		// check whether num_blk has a pair of blocks
	je	9f			// if (num_blk & 2) == 0, skip the per-pair processing code

	// decrypt the remaining 2 blocks together

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
#if defined __x86_64__
	aesdeclast %xmm13, %xmm1
	aesdeclast %xmm13, %xmm2

	movups	80(ctx), %xmm6
	movups	64(ctx), %xmm7
	movups	48(ctx), %xmm6
	movups	32(ctx), %xmm7
	movups	16(ctx), %xmm6

	aesdeclast %xmm7, %xmm1
	aesdeclast %xmm7, %xmm2
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
	pxor	iv, %xmm1		// obuf[0] ^= *iv;
	movups	(ibuf), iv		// ibuf[0]
	pxor	iv, %xmm2		// obuf[1] ^= ibuf[0]
	movups	16(ibuf), iv		// *iv = ibuf[1]

	movups	%xmm1, (obuf)		// write obuf[0]
	movups	%xmm2, 16(obuf)		// write obuf[1]

	add	$32, ibuf		// ibuf += 2
	add	$32, obuf		// obuf += 2

	test	$1, num_blk		// check whether num_blk has a residual block
	je	L_HW_cbc_done		// if not, no residual processing needed
	movups	(ibuf), %xmm2		// tmp = ibuf

#if defined __x86_64__
	aesdeclast %xmm13, %xmm2

	movups	80(ctx), %xmm1
	movups	64(ctx), %xmm1
	movups	48(ctx), %xmm1
	movups	32(ctx), %xmm1
	movups	16(ctx), %xmm1

	aesdeclast %xmm1, %xmm2

	pxor	iv, %xmm2		// *obuf ^= *iv;
	movups	(ibuf), iv		// *iv = *ibuf;
	movups	%xmm2, (obuf)		// write *obuf
	// aes-192 decrypt_cbc operation; after completion, branch to L_HW_cbc_done

	jl	L_HW_cbc_done		// if num_blk < 1, early return

	// aes-192 decrypt expanded keys
	movups	192(ctx), %xmm3
	movups	176(ctx), %xmm4
	movups	160(ctx), %xmm5
	movups	144(ctx), %xmm6
	movups	128(ctx), %xmm7
#if defined __x86_64__
	movups	112(ctx), %xmm8
	movups	96(ctx), %xmm9
	movups	80(ctx), %xmm10
	movups	64(ctx), %xmm11
	movups	48(ctx), %xmm12
	movups	32(ctx), %xmm13
	movups	16(ctx), %xmm14
	// performs 4-block decryption per iteration to exploit parallelism in aes_decrypt

	// while ((num_blk-=4) >= 0) {
	//	aes_decrypt(ibuf, obuf, ctx);
	//	aes_decrypt(ibuf+1, obuf+1, ctx);
	//	aes_decrypt(ibuf+2, obuf+2, ctx);
	//	aes_decrypt(ibuf+3, obuf+3, ctx);
	//	obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
	//	*iv = ibuf[3]; ibuf += 4; obuf += 4;
	// }

	sub	$4, num_blk		// pre-decrement num_blk by 4
	jl	9f			// if num_blk < 4, skip the per-4-blocks processing code
#if defined __x86_64__

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm14	// tmp = 3rd ibuf
	movups	48(ibuf), %xmm15	// tmp = 4th ibuf

	// aes_decrypt; for x86_64, the expanded keys are already stored in xmm3-xmm13
	// use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards
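	// in outline (reconstructed from the loads/restores below): rounds 0-9
	// use the resident keys in xmm3-xmm12 (offsets 192(ctx) down to 48(ctx));
	// the two remaining keys at 16(ctx) and (ctx) are cycled through
	// xmm12/xmm13 for rounds B and C, after which xmm12/xmm13 are restored
	// to their resident keys at 48(ctx) and 32(ctx)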
	// round 0 for 4 blocks

	// round 1 for 4 blocks
	aesdec	%xmm4, %xmm14
	aesdec	%xmm4, %xmm15

	// round 2 for 4 blocks
	aesdec	%xmm5, %xmm14
	aesdec	%xmm5, %xmm15

	// round 3 for 4 blocks
	aesdec	%xmm6, %xmm14
	aesdec	%xmm6, %xmm15

	// round 4 for 4 blocks
	aesdec	%xmm7, %xmm14
	aesdec	%xmm7, %xmm15

	// round 5 for 4 blocks
	aesdec	%xmm8, %xmm14
	aesdec	%xmm8, %xmm15

	// round 6 for 4 blocks
	aesdec	%xmm9, %xmm14
	aesdec	%xmm9, %xmm15

	// round 7 for 4 blocks
	aesdec	%xmm10, %xmm1
	aesdec	%xmm10, %xmm2
	aesdec	%xmm10, %xmm14
	aesdec	%xmm10, %xmm15

	// round 8 for 4 blocks
	aesdec	%xmm11, %xmm1
	aesdec	%xmm11, %xmm2
	aesdec	%xmm11, %xmm14
	aesdec	%xmm11, %xmm15

	// round 9 for 4 blocks
	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15
	movups	16(ctx), %xmm12		// key for round B

	// round A for 4 blocks
	aesdec	%xmm13, %xmm1
	aesdec	%xmm13, %xmm2
	aesdec	%xmm13, %xmm14
	aesdec	%xmm13, %xmm15

	movups	(ctx), %xmm13		// key for round C (last)

	// round B for 4 blocks
	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15

	movups	48(ctx), %xmm12		// restore %xmm12 to its original key

	// round C (last) for 4 blocks
	aesdeclast %xmm13, %xmm1
	aesdeclast %xmm13, %xmm2
	aesdeclast %xmm13, %xmm14
	aesdeclast %xmm13, %xmm15

	movups	32(ctx), %xmm13		// restore %xmm13 to its original key
	pxor	iv, %xmm1		// obuf[0] ^= *iv;
	movups	(ibuf), iv		// ibuf[0]
	pxor	iv, %xmm2		// obuf[1] ^= ibuf[0]
	movups	16(ibuf), iv		// ibuf[1]
	pxor	iv, %xmm14		// obuf[2] ^= ibuf[1]
	movups	32(ibuf), iv		// ibuf[2]
	pxor	iv, %xmm15		// obuf[3] ^= ibuf[2]
	movups	48(ibuf), iv		// *iv = ibuf[3]

	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm14, 32(obuf)	// write 3rd obuf
	movups	%xmm15, 48(obuf)	// write 4th obuf

	add	$64, ibuf		// ibuf += 4;
	add	$64, obuf		// obuf += 4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop
9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, prepare to return

	movups	16(ctx), %xmm14		// restore %xmm14 to its key
	movups	(ctx), %xmm15		// restore %xmm15 to its key
	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm4		// tmp = 3rd ibuf
	movups	48(ibuf), %xmm5		// tmp = 4th ibuf
	// for i386, sequentially load expanded keys into xmm6/xmm7
	movups	176(ctx), %xmm6
	movups	160(ctx), %xmm7
	movups	144(ctx), %xmm6
	movups	128(ctx), %xmm7
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
	movups	80(ctx), %xmm6
	movups	64(ctx), %xmm7
	movups	48(ctx), %xmm6
	movups	32(ctx), %xmm7
	movups	16(ctx), %xmm6
	movups	0(ctx), %xmm7

	aesdeclast %xmm7, %xmm1
	aesdeclast %xmm7, %xmm2
	aesdeclast %xmm7, %xmm4
	aesdeclast %xmm7, %xmm5
	pxor	iv, %xmm1		// 1st obuf ^= iv;
	movups	(ibuf), iv		// 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm2		// 2nd obuf ^= iv;
	movups	16(ibuf), iv		// 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm4		// 3rd obuf ^= iv;
	movups	32(ibuf), iv		// 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm5		// 4th obuf ^= iv;
	movups	48(ibuf), iv		// 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm4, 32(obuf)		// write 3rd obuf
	movups	%xmm5, 48(obuf)		// write 4th obuf

	add	$64, ibuf		// ibuf += AES_BLOCK_SIZE * 4;
	add	$64, obuf		// obuf += AES_BLOCK_SIZE * 4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop

9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, no further processing needed
	movups	176(ctx), %xmm4
	movups	160(ctx), %xmm5
	movups	144(ctx), %xmm6
	movups	128(ctx), %xmm7

	// per-block aes_decrypt_cbc loop

	movups	(ibuf), %xmm2		// tmp = ibuf
#if defined __x86_64__
	aesdec	%xmm10, %xmm2
	aesdec	%xmm11, %xmm2
	aesdec	%xmm12, %xmm2
	aesdec	%xmm13, %xmm2
	aesdec	%xmm14, %xmm2
	aesdeclast %xmm15, %xmm2

	movups	112(ctx), %xmm1
	movups	96(ctx), %xmm1
	movups	80(ctx), %xmm1
	movups	64(ctx), %xmm1
	movups	48(ctx), %xmm1
	movups	32(ctx), %xmm1
	movups	16(ctx), %xmm1

	aesdeclast %xmm1, %xmm2
	pxor	iv, %xmm2		// obuf ^= iv;
	movups	(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);

	movups	%xmm2, (obuf)		// write obuf

	add	$16, ibuf		// ibuf += AES_BLOCK_SIZE;
	add	$16, obuf		// obuf += AES_BLOCK_SIZE;
	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop
	// aes-256 decrypt_cbc operation; after completion, branch to L_HW_cbc_done

	movups	224(ctx), %xmm3
	movups	208(ctx), %xmm4
	movups	192(ctx), %xmm5
	movups	176(ctx), %xmm6
	movups	160(ctx), %xmm7
#if defined __x86_64__
	movups	144(ctx), %xmm8
	movups	128(ctx), %xmm9
	movups	112(ctx), %xmm10
	movups	96(ctx), %xmm11
	movups	80(ctx), %xmm12
	movups	64(ctx), %xmm13
	movups	48(ctx), %xmm14
	movups	32(ctx), %xmm15
	// movups 16(ctx), %xmm14	// no registers left; these two keys are
	// movups (ctx), %xmm15		// loaded inside the loop instead
#if defined __x86_64__

	sub	$4, num_blk		// pre-decrement num_blk by 4
	jl	9f			// if num_blk < 4, skip the per-4-blocks processing code

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm14	// tmp = 3rd ibuf
	movups	48(ibuf), %xmm15	// tmp = 4th ibuf

	// aes_decrypt; for x86_64, the expanded keys are already stored in xmm3-xmm13
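	// as in the aes-192 path (outline reconstructed from the loads/restores
	// below): xmm14/xmm15 hold data blocks here, so the four remaining keys
	// at 48/32/16/0(ctx) are cycled through xmm12/xmm13, which are finally
	// restored to their resident keys at 80(ctx) and 64(ctx)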
	aesdec	%xmm4, %xmm14
	aesdec	%xmm4, %xmm15

	aesdec	%xmm5, %xmm14
	aesdec	%xmm5, %xmm15

	aesdec	%xmm6, %xmm14
	aesdec	%xmm6, %xmm15

	aesdec	%xmm7, %xmm14
	aesdec	%xmm7, %xmm15

	aesdec	%xmm8, %xmm14
	aesdec	%xmm8, %xmm15

	aesdec	%xmm9, %xmm14
	aesdec	%xmm9, %xmm15

	aesdec	%xmm10, %xmm1
	aesdec	%xmm10, %xmm2
	aesdec	%xmm10, %xmm14
	aesdec	%xmm10, %xmm15

	aesdec	%xmm11, %xmm1
	aesdec	%xmm11, %xmm2
	aesdec	%xmm11, %xmm14
	aesdec	%xmm11, %xmm15

	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15
	movups	48(ctx), %xmm12

	aesdec	%xmm13, %xmm1
	aesdec	%xmm13, %xmm2
	aesdec	%xmm13, %xmm14
	aesdec	%xmm13, %xmm15
	movups	32(ctx), %xmm13

	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15
	movups	16(ctx), %xmm12

	aesdec	%xmm13, %xmm1
	aesdec	%xmm13, %xmm2
	aesdec	%xmm13, %xmm14
	aesdec	%xmm13, %xmm15
	movups	(ctx), %xmm13

	aesdec	%xmm12, %xmm1
	aesdec	%xmm12, %xmm2
	aesdec	%xmm12, %xmm14
	aesdec	%xmm12, %xmm15
	movups	80(ctx), %xmm12		// restore %xmm12 to its resident key

	aesdeclast %xmm13, %xmm1
	aesdeclast %xmm13, %xmm2
	aesdeclast %xmm13, %xmm14
	aesdeclast %xmm13, %xmm15
	movups	64(ctx), %xmm13		// restore %xmm13 to its resident key
	pxor	iv, %xmm1		// obuf[0] ^= *iv;
	movups	(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm2		// obuf[1] ^= ibuf[0];
	movups	16(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm14		// obuf[2] ^= ibuf[1];
	movups	32(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm15		// obuf[3] ^= ibuf[2];
	movups	48(ibuf), iv		// *iv = ibuf[3]

	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm14, 32(obuf)	// write 3rd obuf
	movups	%xmm15, 48(obuf)	// write 4th obuf

	add	$64, ibuf		// ibuf += AES_BLOCK_SIZE*4;
	add	$64, obuf		// obuf += AES_BLOCK_SIZE*4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop

9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, no further processing needed
	movups	48(ctx), %xmm14		// restore %xmm14 to its key
	movups	32(ctx), %xmm15		// restore %xmm15 to its key

	sub	$4, num_blk		// pre-decrement num_blk by 4
	jl	9f			// if num_blk < 4, skip the per-4-blocks processing code

	movups	(ibuf), %xmm1		// tmp = 1st ibuf
	movups	16(ibuf), %xmm2		// tmp = 2nd ibuf
	movups	32(ibuf), %xmm4		// tmp = 3rd ibuf
	movups	48(ibuf), %xmm5		// tmp = 4th ibuf
	// for i386, sequentially load expanded keys into xmm6/xmm7
	movups	208(ctx), %xmm6
	movups	192(ctx), %xmm7
	movups	176(ctx), %xmm6
	movups	160(ctx), %xmm7
	movups	144(ctx), %xmm6
	movups	128(ctx), %xmm7
	movups	112(ctx), %xmm6
	movups	96(ctx), %xmm7
	movups	80(ctx), %xmm6
	movups	64(ctx), %xmm7
	movups	48(ctx), %xmm6
	movups	32(ctx), %xmm7
	movups	16(ctx), %xmm6
	movups	0(ctx), %xmm7

	aesdeclast %xmm7, %xmm1
	aesdeclast %xmm7, %xmm2
	aesdeclast %xmm7, %xmm4
	aesdeclast %xmm7, %xmm5
	pxor	iv, %xmm1		// 1st obuf ^= iv;
	movups	(ibuf), iv		// 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm2		// 2nd obuf ^= iv;
	movups	16(ibuf), iv		// 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm4		// 3rd obuf ^= iv;
	movups	32(ibuf), iv		// 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
	pxor	iv, %xmm5		// 4th obuf ^= iv;
	movups	48(ibuf), iv		// 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
	movups	%xmm1, (obuf)		// write 1st obuf
	movups	%xmm2, 16(obuf)		// write 2nd obuf
	movups	%xmm4, 32(obuf)		// write 3rd obuf
	movups	%xmm5, 48(obuf)		// write 4th obuf

	add	$64, ibuf		// ibuf += AES_BLOCK_SIZE * 4;
	add	$64, obuf		// obuf += AES_BLOCK_SIZE * 4;

	sub	$4, num_blk		// num_blk -= 4
	jge	0b			// if num_blk >= 0, repeat the loop

9:	add	$4, num_blk		// post-increment num_blk by 4
	je	L_HW_cbc_done		// if num_blk == 0, no further processing needed
	movups	208(ctx), %xmm4
	movups	192(ctx), %xmm5
	movups	176(ctx), %xmm6
	movups	160(ctx), %xmm7

	movups	(ibuf), %xmm2		// tmp = ibuf
#if defined __x86_64__
	aesdec	%xmm10, %xmm2
	aesdec	%xmm11, %xmm2
	aesdec	%xmm12, %xmm2
	aesdec	%xmm13, %xmm2
	aesdec	%xmm14, %xmm2
	aesdec	%xmm15, %xmm2

	movups	144(ctx), %xmm1
	movups	128(ctx), %xmm1
	movups	112(ctx), %xmm1
	movups	96(ctx), %xmm1
	movups	80(ctx), %xmm1
	movups	64(ctx), %xmm1
	movups	48(ctx), %xmm1
	movups	32(ctx), %xmm1
	movups	16(ctx), %xmm1

	aesdeclast %xmm1, %xmm2
	pxor	iv, %xmm2		// obuf ^= iv;
	movups	(ibuf), iv		// memcpy(iv, tmp, AES_BLOCK_SIZE);

	movups	%xmm2, (obuf)		// write obuf

	add	$16, ibuf		// ibuf += AES_BLOCK_SIZE;
	add	$16, obuf		// obuf += AES_BLOCK_SIZE;
	sub	$1, num_blk		// num_blk--
	jg	0b			// if num_blk > 0, repeat the loop

	// --------- END of aes_decrypt_cbc_hw -------------------