---------------------------------------------------------------------------
Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.

The free distribution and use of this software in both source and binary
form is allowed (with or without changes) provided that:

1. distributions of this source code include the above copyright
   notice, this list of conditions and the following disclaimer;

2. distributions in binary form include the above copyright
   notice, this list of conditions and the following disclaimer
   in the documentation and/or other associated materials;

3. the copyright holder's name is not used to endorse products
   built using this software without specific written permission.

ALTERNATIVELY, provided that this notice is retained in full, this product
may be distributed under the terms of the GNU General Public License (GPL),
in which case the provisions of the GPL apply INSTEAD OF those given above.

This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
These subroutines implement multiple-block AES modes for ECB, CBC, CFB,
OFB and CTR encryption. The code provides support for the VIA Advanced
Cryptography Engine (ACE).

NOTE: In the following subroutines, the AES contexts (ctx) must be
16-byte aligned if VIA ACE is being used.
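
For illustration, a caller can guarantee that alignment with a compiler
attribute. A minimal sketch (the variable name and the use of the GCC/Clang
aligned attribute are illustrative assumptions, not requirements of this file
beyond the 16-byte alignment itself):

    #include "aes.h"

    /* 16-byte aligned AES context, satisfying the VIA ACE requirement */
    static aes_encrypt_ctx ectx __attribute__((aligned(16)));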
/* modified 3/5/10 cclee */
/* Cleaned up the code related to VIA ACE and hand-optimized aes_cbc_encrypt and aes_cbc_decrypt. */
/* Moved the xmm register save/restore, originally inside the callee functions, into these 2 caller functions. */

/* HW-AES specific implementation cclee 3-12-10 */
/* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled,
   and if kHasAES is detected, the code branches to the HW-specific functions here. */
This file defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- Intel Westmere HW AES-based implementations
of _aes_encrypt_cbc and _aes_decrypt_cbc.

These 2 functions SHOULD BE entered ONLY after the AES HW has been verified to be available.
They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.

The AES HW is detected as the first thing in
    _aes_encrypt_cbc (aes_modes_asm.s)
    _aes_decrypt_cbc (aes_modes_asm.s)
and, if AES HW is detected, those functions branch without link (i.e., jump) to the functions here.
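
A minimal C sketch of that dispatch (the _get_cpu_capabilities() accessor, the
argument types, and the aes_encrypt_cbc_sw fallback name are illustrative
assumptions; the real check lives in assembly in aes_modes_asm.s):

    #include <System/i386/cpu_capabilities.h>   /* defines kHasAES */

    extern int _get_cpu_capabilities(void);

    aes_rval aes_encrypt_cbc(const unsigned char *ibuf, const unsigned char *iv,
                             int num_blk, unsigned char *obuf,
                             const aes_encrypt_ctx *ctx)
    {
        if (_get_cpu_capabilities() & kHasAES)
            return aes_encrypt_cbc_hw(ibuf, iv, num_blk, obuf, ctx);  /* this file */
        return aes_encrypt_cbc_sw(ibuf, iv, num_blk, obuf, ctx);      /* hypothetical SW path */
    }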
The implementation here follows the examples in the Intel White Paper
"Intel Advanced Encryption Standard (AES) Instruction Set" Rev. 2.01.

Note: Rev. 3.0 (Final, 2010 01 26) is available. It looks like there are some code changes relative to Rev. 2.01.

The function _aes_decrypt_cbc_hw previously simply decrypted serially, block by block.
In our group meeting, Eric/Ali suggested that I take a look at combining multiple blocks
in a loop and interleaving multiple aesdec instructions to absorb/hide stalls and improve the decrypt throughput.

The idea is actually described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).

This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode.
On a 2.4GHz core-i5/2.66GHz core-i7, the x86_64 decrypt throughput (in xnu-iokit) has been improved
from 1180/1332 to 1667/1858 MBytes/sec. This is approximately a 1.4x speedup in decryption.
The encrypt throughput is not changed.
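
For readers more comfortable with intrinsics, here is a minimal C sketch of the
same 4-way interleaving (illustrative only; it assumes dk[0..10] holds the
AES-128 decryption round keys in the equivalent-inverse-cipher order that
aesdec/aesdeclast require):

    #include <wmmintrin.h>

    /* decrypt 4 independent AES-128 blocks with interleaved rounds */
    static void aes128_decrypt4(const __m128i *in, __m128i *out, const __m128i dk[11])
    {
        __m128i b0 = _mm_xor_si128(_mm_loadu_si128(in + 0), dk[0]);
        __m128i b1 = _mm_xor_si128(_mm_loadu_si128(in + 1), dk[0]);
        __m128i b2 = _mm_xor_si128(_mm_loadu_si128(in + 2), dk[0]);
        __m128i b3 = _mm_xor_si128(_mm_loadu_si128(in + 3), dk[0]);
        for (int r = 1; r < 10; r++) {          // rounds 1..9, interleaved:
            b0 = _mm_aesdec_si128(b0, dk[r]);   // the 4 chains are independent,
            b1 = _mm_aesdec_si128(b1, dk[r]);   // so the CPU can overlap their
            b2 = _mm_aesdec_si128(b2, dk[r]);   // aesdec latencies
            b3 = _mm_aesdec_si128(b3, dk[r]);
        }
        b0 = _mm_aesdeclast_si128(b0, dk[10]);  // final round
        b1 = _mm_aesdeclast_si128(b1, dk[10]);
        b2 = _mm_aesdeclast_si128(b2, dk[10]);
        b3 = _mm_aesdeclast_si128(b3, dk[10]);
        _mm_storeu_si128(out + 0, b0);
        _mm_storeu_si128(out + 1, b1);
        _mm_storeu_si128(out + 2, b2);
        _mm_storeu_si128(out + 3, b3);
    }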
I also enhanced the assembly code comments.

cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.)
/* ----------------------------------------------------------------------------------------------------------------

    aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

    For simplicity, I am assuming all variables are of 128-bit data type.

    aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
    {
        while (num_blk--) {
            *iv ^= *ibuf++;             // 128-bit xor with the plaintext block
            aes_encrypt(iv, iv, ctx);   // encrypt the chained block in place
            *obuf++ = *iv;              // emit ciphertext, which also seeds the next block
        }
        return 0;
    }
    The following is an implementation of this function using Intel AESNI.
    This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
    Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
    to this aesni-based function should it detect that aesni is available.
    Blindly calling this function WILL cause a CRASH on systems with no aesni support.

    Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks
    are serially chained. This prevents us from arranging several blocks for encryption in parallel.
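
    In equation form (a one-line restatement of the chaining): c[i] = aes_encrypt(p[i] ^ c[i-1]) with c[-1] = *iv,
    so encryption of block i cannot begin until c[i-1] is complete.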
   ----------------------------------------------------------------------------------------------------------------*/

    .globl  _aes_encrypt_cbc_hw

    // push/save registers for local use
    // if this is kernel code, need to save used xmm registers
    sub     $(8*16), %esp           // for possible xmm0-xmm7 save/restore
    sub     $(16*16), %rsp          // xmm0-xmm15 save/restore
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)
    mov     12(%ebp), %eax          // in_iv
    mov     24(%ebp), %edx          // ctx
    movups  (%eax), iv              // iv = in_iv
    mov     8(%ebp), %ebx           // ibuf
    mov     16(%ebp), %ecx          // num_blk
    mov     20(%ebp), %edi          // obuf

    mov     %rdi, %rbx              // ibuf
    movups  (%rsi), iv              // iv = in_iv
    mov     %rdx, %r13              // num_blk
    mov     %rcx, %r14              // obuf

#define num_blk %r13d
    mov     240(ctx), %eax          // aes length
    cmp     $160, %eax              // aes-128 encrypt ?
    cmp     $192, %eax              // aes-192 encrypt ?
    cmp     $224, %eax              // aes-256 encrypt ?
    mov     $-1, %eax               // return error
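
/*
    A C sketch of this key-length dispatch (illustrative; it assumes the word at
    byte offset 240 of the Gladman-style context holds 16*rounds, i.e. 160/192/224
    for AES-128/192/256, as the comparisons above suggest):

        switch (*(const unsigned int *)((const char *)ctx + 240)) {
        case 160: goto aes128_path;     // 10 rounds
        case 192: goto aes192_path;     // 12 rounds
        case 224: goto aes256_path;     // 14 rounds
        default:  return -1;            // unsupported key length
        }
*/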
    // aes-128 encrypt_cbc operation, up to L_HW_cbc_done

    cmp     $1, num_blk             // check the number of blocks
    jl      L_HW_cbc_done           // should it be less than 1, nothing to do
    movups  (ctx), %xmm2            // key0
    movups  16(ctx), %xmm3          // key1
    movups  32(ctx), %xmm4          // key2
    movups  48(ctx), %xmm5          // key3
    movups  64(ctx), %xmm6          // key4
    movups  80(ctx), %xmm7          // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8          // key6
    movups  112(ctx), %xmm9         // key7
    movups  128(ctx), %xmm10        // key8
    movups  144(ctx), %xmm11        // key9
    movups  160(ctx), %xmm12        // keyA

    // while (num_blk--) {
    //     aes_encrypt(iv, iv, ctx);

    movups  (ibuf), %xmm1           // *ibuf
    pxor    %xmm2, iv               // 1st instruction inside aes_encrypt
    pxor    %xmm1, iv               // *iv ^= *ibuf

    // finishing up the rest of aes_encrypt
#if defined __x86_64__
    aesenclast  %xmm12, iv
    movups  96(ctx), %xmm1          // key6
    movups  112(ctx), %xmm1         // key7
    movups  128(ctx), %xmm1         // key8
    movups  144(ctx), %xmm1         // key9
    movups  160(ctx), %xmm1         // keyA

    movups  iv, (obuf)              // *obuf = *iv;
    add     $16, obuf               // obuf++;
    add     $16, ibuf               // ibuf++;
    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop
    // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)

    xor     %eax, %eax              // to return CRYPT_OK

    // if kernel, restore xmm registers
    movaps  112(sp), %xmm7
#if defined __x86_64__
    movaps  16*8(sp), %xmm8
    movaps  16*9(sp), %xmm9
    movaps  16*10(sp), %xmm10
    movaps  16*11(sp), %xmm11
    movaps  16*12(sp), %xmm12
    movaps  16*13(sp), %xmm13
    movaps  16*14(sp), %xmm14
    movaps  16*15(sp), %xmm15

    // release used stack memory, restore used callee-saved registers, and return
    // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done

    cmp     $1, num_blk             // check the number of blocks
    jl      L_HW_cbc_done           // should it be less than 1, nothing to do
    movups  (ctx), %xmm2            // key0
    movups  16(ctx), %xmm3          // key1
    movups  32(ctx), %xmm4          // key2
    movups  48(ctx), %xmm5          // key3
    movups  64(ctx), %xmm6          // key4
    movups  80(ctx), %xmm7          // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8          // key6
    movups  112(ctx), %xmm9         // key7
    movups  128(ctx), %xmm10        // key8
    movups  144(ctx), %xmm11        // key9
    movups  160(ctx), %xmm12        // keyA
    movups  176(ctx), %xmm13        // keyB
    movups  192(ctx), %xmm14        // keyC

    // while (num_blk--) {
    //     aes_encrypt(iv, iv, ctx);

    movups  (ibuf), %xmm1           // *ibuf
    pxor    %xmm1, iv               // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);
#if defined __x86_64__
    aesenclast  %xmm14, iv
    movups  96(ctx), %xmm1
    movups  112(ctx), %xmm1
    movups  128(ctx), %xmm1
    movups  144(ctx), %xmm1
    movups  160(ctx), %xmm1
    movups  176(ctx), %xmm1
    movups  192(ctx), %xmm1

    movups  iv, (obuf)              // *obuf = *iv;
    add     $16, ibuf               // ibuf++
    add     $16, obuf               // obuf++

    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done           // share with the common exit code
    // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done

    cmp     $1, num_blk             // check the number of blocks
    jl      L_HW_cbc_done           // should it be less than 1, nothing to do
    movups  (ctx), %xmm2            // key0
    movups  16(ctx), %xmm3          // key1
    movups  32(ctx), %xmm4          // key2
    movups  48(ctx), %xmm5          // key3
    movups  64(ctx), %xmm6          // key4
    movups  80(ctx), %xmm7          // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8          // key6
    movups  112(ctx), %xmm9         // key7
    movups  128(ctx), %xmm10        // key8
    movups  144(ctx), %xmm11        // key9
    movups  160(ctx), %xmm12        // keyA
    movups  176(ctx), %xmm13        // keyB
    movups  192(ctx), %xmm14        // keyC
    movups  208(ctx), %xmm15        // keyD
    // movups 224(ctx), %xmm1       // keyE

    // while (num_blk--) {
    //     aes_encrypt(iv, iv, ctx);

    movups  (ibuf), %xmm1           // *ibuf
    pxor    %xmm1, iv               // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);
#if defined __x86_64__
    movups  224(ctx), %xmm1         // keyE
    movups  96(ctx), %xmm1          // key6
    movups  112(ctx), %xmm1         // key7
    movups  128(ctx), %xmm1         // key8
    movups  144(ctx), %xmm1         // key9
    movups  160(ctx), %xmm1         // keyA
    movups  176(ctx), %xmm1         // keyB
    movups  192(ctx), %xmm1         // keyC
    movups  208(ctx), %xmm1         // keyD
    movups  224(ctx), %xmm1         // keyE

    movups  iv, (obuf)              // *obuf = *iv;
    add     $16, ibuf               // ibuf++
    add     $16, obuf               // obuf++

    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done           // share with the common exit code

    // --------- END of aes_encrypt_cbc_hw -------------------
/* ----------------------------------------------------------------------------------------------------------------

    aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

    For simplicity, I am assuming all variables are of 128-bit data type.

    aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
    {
        while (num_blk--) {
            aes_decrypt(ibuf, obuf, ctx);   // decrypt the current block
            *obuf++ ^= *iv;                 // xor with the previous ciphertext (or the initial *iv)
            *iv = *ibuf++;                  // the current ciphertext seeds the next block
        }
        return 0;
    }
    The following is an implementation of this function using Intel AESNI.
    This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
    Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
    to this aesni-based function should it detect that aesni is available.
    Blindly calling this function WILL cause a CRASH on systems with no aesni support.

    Note that, unlike encryption, the decryption operation is not chained across blocks.
    This gives us the opportunity to arrange aes_decrypt operations in parallel to speed up the code.
    This is equivalent to what is described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
    The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc.
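
    In equation form: p[i] = aes_decrypt(c[i]) ^ c[i-1] with c[-1] = *iv. Each p[i] depends only on the
    input ciphertext, never on a previous output, so the aes_decrypt calls can proceed in parallel.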
    Example C code for packing 4 blocks in an iteration is shown as follows:

        while ((num_blk -= 4) >= 0) {
            // the following 4 calls can be interleaved to exploit parallelism
            aes_decrypt(ibuf,   obuf,   ctx);
            aes_decrypt(ibuf+1, obuf+1, ctx);
            aes_decrypt(ibuf+2, obuf+2, ctx);
            aes_decrypt(ibuf+3, obuf+3, ctx);

            obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
            *iv = ibuf[3]; ibuf += 4; obuf += 4;
        }
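
    After this loop, 0 to 3 blocks remain. A sketch of the tail handling that the
    assembly below implements (num_blk was left 4 short by the loop condition):

        num_blk += 4;                       // undo the final pre-decrement
        if (num_blk & 2) {                  // decrypt a leftover pair together
            aes_decrypt(ibuf,   obuf,   ctx);
            aes_decrypt(ibuf+1, obuf+1, ctx);
            obuf[0] ^= *iv; obuf[1] ^= ibuf[0];
            *iv = ibuf[1]; ibuf += 2; obuf += 2;
        }
        if (num_blk & 1) {                  // then a final single block
            aes_decrypt(ibuf, obuf, ctx);
            obuf[0] ^= *iv;
            *iv = ibuf[0];
        }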
   ----------------------------------------------------------------------------------------------------------------*/

    .globl  _aes_decrypt_cbc_hw

    // push/save registers for local use
    // if kernel, allocate stack space to save xmm registers
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)

    mov     12(%ebp), %eax          // in_iv
    mov     24(%ebp), %edx          // ctx
    movups  (%eax), iv              // iv = in_iv
    mov     8(%ebp), %ebx           // ibuf
    mov     16(%ebp), %ecx          // num_blk
    mov     20(%ebp), %edi          // obuf
#else // __x86_64__, arguments in rdi/rsi/rdx/rcx/r8
    mov     %rdi, %rbx              // ibuf
    movups  (%rsi), iv              // iv = in_iv
    mov     %rdx, %r13              // num_blk
    mov     %rcx, %r14              // obuf

#define num_blk %r13d
    mov     240(ctx), %eax          // aes length
    cmp     $160, %eax              // aes-128 decrypt
    cmp     $192, %eax              // aes-192 decrypt
    cmp     $224, %eax              // aes-256 decrypt

    mov     $-1, %eax               // wrong aes length, to return -1
    jmp     L_error                 // early exit due to wrong aes length

    // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done

    jl      L_HW_cbc_done           // if num_blk < 1, early return
    // aes-128 decrypt expanded keys
    movups  160(ctx), %xmm3
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#if defined __x86_64__
    movups  80(ctx), %xmm8
    movups  64(ctx), %xmm9
    movups  48(ctx), %xmm10
    movups  32(ctx), %xmm11
    movups  16(ctx), %xmm12
    movups  0(ctx), %xmm13
    // performs 4-block decryption in an iteration to exploit parallelism in decrypt
    //
    // while ((num_blk -= 4) >= 0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }
    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code

#if defined __x86_64__

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14        // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15        // tmp = 4th ibuf

    // for x86_64, the expanded keys are already stored in xmm3-xmm13
    // aes-128 decrypt round 0 per 4 blocks
    // aes-128 decrypt round 1 per 4 blocks
    // aes-128 decrypt round 2 per 4 blocks
    // aes-128 decrypt round 3 per 4 blocks
    // aes-128 decrypt round 4 per 4 blocks
    // aes-128 decrypt round 5 per 4 blocks
    // aes-128 decrypt round 6 per 4 blocks

    // aes-128 decrypt round 7 per 4 blocks
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // aes-128 decrypt round 8 per 4 blocks
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // aes-128 decrypt round 9 per 4 blocks
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15
    pxor    iv, %xmm1               // obuf[0] ^= *iv;
    movups  (ibuf), iv              // ibuf[0]
    pxor    iv, %xmm2               // obuf[1] ^= ibuf[0];
    movups  16(ibuf), iv            // ibuf[1]
    pxor    iv, %xmm14              // obuf[2] ^= ibuf[1];
    movups  32(ibuf), iv            // ibuf[2]
    pxor    iv, %xmm15              // obuf[3] ^= ibuf[2];
    movups  48(ibuf), iv            // *iv = ibuf[3]

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm14, 32(obuf)        // write 3rd obuf
    movups  %xmm15, 48(obuf)        // write 4th obuf
    // aes_decrypt_cbc per 4 blocks using aes-128 for i386
    // xmm1/xmm2/xmm4/xmm5 hold obuf per block
    // xmm6/xmm7 are dynamically loaded with the remaining expanded keys

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4         // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5         // tmp = 4th ibuf

    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  144(ctx), %xmm6         // key1
    // aes-128 decrypt round 0 per 4 blocks
    movups  128(ctx), %xmm7         // key2

    // aes-128 decrypt round 1 per 4 blocks
    movups  112(ctx), %xmm6         // key3

    // aes-128 decrypt round 2 per 4 blocks
    movups  96(ctx), %xmm7          // key4

    // aes-128 decrypt round 3 per 4 blocks
    movups  80(ctx), %xmm6          // key5

    // aes-128 decrypt round 4 per 4 blocks
    movups  64(ctx), %xmm7          // key6

    // aes-128 decrypt round 5 per 4 blocks
    movups  48(ctx), %xmm6          // key7

    // aes-128 decrypt round 6 per 4 blocks
    movups  32(ctx), %xmm7          // key8

    // aes-128 decrypt round 7 per 4 blocks
    movups  16(ctx), %xmm6          // key9

    // aes-128 decrypt round 8 per 4 blocks
    movups  0(ctx), %xmm7           // keyA

    // aes-128 decrypt round 9 per 4 blocks

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5

    pxor    iv, %xmm1               // 1st obuf ^= iv;
    movups  (ibuf), iv              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // 2nd obuf ^= iv;
    movups  16(ibuf), iv            // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4               // 3rd obuf ^= iv;
    movups  32(ibuf), iv            // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5               // 4th obuf ^= iv;
    movups  48(ibuf), iv            // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm4, 32(obuf)         // write 3rd obuf
    movups  %xmm5, 48(obuf)         // write 4th obuf
    add     $64, ibuf               // ibuf += 4;
    add     $64, obuf               // obuf += 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing
    // xmm4-xmm7 are reloaded here, as they may be needed as expanded keys in the remaining code
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
    test    $2, num_blk             // check whether num_blk has 2 blocks
    je      9f                      // if num_blk & 2 == 0, skip the per-pair processing code

    // do the remaining 2 blocks together

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf

#if defined __x86_64__
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    movups  80(ctx), %xmm6
    movups  64(ctx), %xmm7
    movups  48(ctx), %xmm6
    movups  32(ctx), %xmm7
    movups  16(ctx), %xmm6
    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7

    pxor    iv, %xmm1               // obuf[0] ^= *iv;
    movups  (ibuf), iv              // ibuf[0]
    pxor    iv, %xmm2               // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv            // *iv = ibuf[1]

    movups  %xmm1, (obuf)           // write obuf[0]
    movups  %xmm2, 16(obuf)         // write obuf[1]

    add     $32, ibuf               // ibuf += 2
    add     $32, obuf               // obuf += 2
    test    $1, num_blk             // check whether a residual block remains
    je      L_HW_cbc_done           // if num_blk == 0, no need for residual processing code

    movups  (ibuf), %xmm2           // tmp = ibuf

#if defined __x86_64__
    aesdeclast  %xmm13, %xmm2
    movups  80(ctx), %xmm1
    movups  64(ctx), %xmm1
    movups  48(ctx), %xmm1
    movups  32(ctx), %xmm1
    movups  16(ctx), %xmm1
    aesdeclast  %xmm1, %xmm2

    pxor    iv, %xmm2               // *obuf ^= *iv;
    movups  (ibuf), iv              // *iv = *ibuf;
    movups  %xmm2, (obuf)           // write *obuf
    // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done

    jl      L_HW_cbc_done           // if num_blk < 1, early return

    // aes-192 decrypt expanded keys
    movups  192(ctx), %xmm3
    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
#if defined __x86_64__
    movups  112(ctx), %xmm8
    movups  96(ctx), %xmm9
    movups  80(ctx), %xmm10
    movups  64(ctx), %xmm11
    movups  48(ctx), %xmm12
    movups  32(ctx), %xmm13
    movups  16(ctx), %xmm14
    movups  (ctx), %xmm15
    // performs 4-block decryption in an iteration to exploit parallelism in decrypt
    //
    // while ((num_blk -= 4) >= 0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }
    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code

#if defined __x86_64__

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14        // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15        // tmp = 4th ibuf

    // aes_decrypt; for x86_64, the expanded keys are already stored in xmm3-xmm13
    // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards
    // round 0 for 4 blocks

    // round 1 for 4 blocks
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    // round 2 for 4 blocks
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    // round 3 for 4 blocks
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    // round 4 for 4 blocks
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    // round 5 for 4 blocks
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    // round 6 for 4 blocks
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    // round 7 for 4 blocks
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // round 8 for 4 blocks
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // round 9 for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  16(ctx), %xmm12         // xmm12 <- round-B key (dynamic reload)

    // round A for 4 blocks
    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15

    movups  (ctx), %xmm13           // xmm13 <- last-round key (dynamic reload)

    // round B for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    movups  48(ctx), %xmm12         // restore %xmm12 to its original key

    // round C (last) for 4 blocks
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15

    movups  32(ctx), %xmm13         // restore %xmm13 to its original key
    pxor    iv, %xmm1               // obuf[0] ^= *iv;
    movups  (ibuf), iv              // ibuf[0]
    pxor    iv, %xmm2               // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv            // ibuf[1]
    pxor    iv, %xmm14              // obuf[2] ^= ibuf[1]
    movups  32(ibuf), iv            // ibuf[2]
    pxor    iv, %xmm15              // obuf[3] ^= ibuf[2]
    movups  48(ibuf), iv            // *iv = ibuf[3]

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm14, 32(obuf)        // write 3rd obuf
    movups  %xmm15, 48(obuf)        // write 4th obuf

    add     $64, ibuf               // ibuf += 4;
    add     $64, obuf               // obuf += 4;
    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, prepare to return

    movups  16(ctx), %xmm14         // restore %xmm14 to its key
    movups  (ctx), %xmm15           // restore %xmm15 to its key
    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4         // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5         // tmp = 4th ibuf

    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
    movups  80(ctx), %xmm6
    movups  64(ctx), %xmm7
    movups  48(ctx), %xmm6
    movups  32(ctx), %xmm7
    movups  16(ctx), %xmm6
    movups  0(ctx), %xmm7

    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5
    pxor    iv, %xmm1               // 1st obuf ^= iv;
    movups  (ibuf), iv              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // 2nd obuf ^= iv;
    movups  16(ibuf), iv            // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4               // 3rd obuf ^= iv;
    movups  32(ibuf), iv            // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5               // 4th obuf ^= iv;
    movups  48(ibuf), iv            // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm4, 32(obuf)         // write 3rd obuf
    movups  %xmm5, 48(obuf)         // write 4th obuf
    add     $64, ibuf               // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf               // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing
    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
    // per-block aes_decrypt_cbc loop

    movups  (ibuf), %xmm2           // tmp = ibuf

#if defined __x86_64__
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdeclast  %xmm15, %xmm2
    movups  112(ctx), %xmm1
    movups  96(ctx), %xmm1
    movups  80(ctx), %xmm1
    movups  64(ctx), %xmm1
    movups  48(ctx), %xmm1
    movups  32(ctx), %xmm1
    movups  16(ctx), %xmm1
    aesdeclast  %xmm1, %xmm2
    pxor    iv, %xmm2               // obuf ^= iv;
    movups  (ibuf), iv              // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)           // write obuf

    add     $16, ibuf               // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf               // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop
    // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done

    movups  224(ctx), %xmm3
    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
#if defined __x86_64__
    movups  144(ctx), %xmm8
    movups  128(ctx), %xmm9
    movups  112(ctx), %xmm10
    movups  96(ctx), %xmm11
    movups  80(ctx), %xmm12
    movups  64(ctx), %xmm13
    movups  48(ctx), %xmm14
    movups  32(ctx), %xmm15
    // movups 16(ctx), %xmm14
    // movups (ctx), %xmm15
#if defined __x86_64__

    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14        // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15        // tmp = 4th ibuf

    // aes_decrypt; for x86_64, the expanded keys are already stored in xmm3-xmm13
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  48(ctx), %xmm12         // xmm12 <- next key (dynamic reload)

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  32(ctx), %xmm13         // xmm13 <- next key (dynamic reload)

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  16(ctx), %xmm12         // xmm12 <- next key (dynamic reload)

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  (ctx), %xmm13           // xmm13 <- last-round key

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  80(ctx), %xmm12         // restore %xmm12 to its original key

    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15
    movups  64(ctx), %xmm13         // restore %xmm13 to its original key
    pxor    iv, %xmm1               // obuf ^= iv;
    movups  (ibuf), iv              // memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // obuf ^= iv;
    movups  16(ibuf), iv            // memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm14              // obuf ^= iv;
    movups  32(ibuf), iv            // memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm15              // obuf ^= iv;
    movups  48(ibuf), iv            // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm14, 32(obuf)        // write 3rd obuf
    movups  %xmm15, 48(obuf)        // write 4th obuf

    add     $64, ibuf               // ibuf += AES_BLOCK_SIZE*4;
    add     $64, obuf               // obuf += AES_BLOCK_SIZE*4;
    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing

    movups  48(ctx), %xmm14         // restore %xmm14 to its key
    movups  32(ctx), %xmm15         // restore %xmm15 to its key
    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4         // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5         // tmp = 4th ibuf
    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  208(ctx), %xmm6
    movups  192(ctx), %xmm7
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
    movups  80(ctx), %xmm6
    movups  64(ctx), %xmm7
    movups  48(ctx), %xmm6
    movups  32(ctx), %xmm7
    movups  16(ctx), %xmm6
    movups  0(ctx), %xmm7

    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5
    pxor    iv, %xmm1               // 1st obuf ^= iv;
    movups  (ibuf), iv              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // 2nd obuf ^= iv;
    movups  16(ibuf), iv            // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4               // 3rd obuf ^= iv;
    movups  32(ibuf), iv            // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5               // 4th obuf ^= iv;
    movups  48(ibuf), iv            // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm4, 32(obuf)         // write 3rd obuf
    movups  %xmm5, 48(obuf)         // write 4th obuf
    add     $64, ibuf               // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf               // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0 (4 or more blocks left), repeat the loop
9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing

    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
    movups  (ibuf), %xmm2           // tmp = ibuf

#if defined __x86_64__
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdec  %xmm15, %xmm2
    movups  144(ctx), %xmm1
    movups  128(ctx), %xmm1
    movups  112(ctx), %xmm1
    movups  96(ctx), %xmm1
    movups  80(ctx), %xmm1
    movups  64(ctx), %xmm1
    movups  48(ctx), %xmm1
    movups  32(ctx), %xmm1
    movups  16(ctx), %xmm1
    aesdeclast  %xmm1, %xmm2
    pxor    iv, %xmm2               // obuf ^= iv;
    movups  (ibuf), iv              // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)           // write obuf

    add     $16, ibuf               // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf               // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    // --------- END of aes_decrypt_cbc_hw -------------------