This file "aesxts.s" provides x86_64 / i386 optimizations of the following functions:

0. xts_mult_x_on_xmm7 : a code macro that is used throughout all other functions
1. void xts_mult_x(uint8_t *I);
2. int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx);
3. int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim);
4. int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx);
5. int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim);

This file should be compiled together with xtsClearC.c.

Functions 1, 2, and 4 are meant to replace the corresponding C functions in xtsClearC.c on x86_64/i386.
Functions 3 and 5 are given only here (no C code is available); they are called from xts_encrypt/xts_decrypt (xtsClearC.c).
- C code for functions 3 and 5 could be added later to ease porting to other architectures.
#ifdef KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

#define CRYPT_OK 0 // cannot include "crypt.h", in which CRYPT_OK is defined in an enum
The following macro is used throughout the functions in this file.
It is the core operation of the function xts_mult_x defined in xtsClearC.c.

upon entry, %xmm7 = the input tweak (128-bit),
on return, %xmm7 = the updated tweak (128-bit)
the macro uses %xmm1/%xmm2/%ecx in the computation
the operation can be described as follows :
0. let x = %xmm7; // 128-bit little-endian input
1. x = rotate_left(x,1); // rotate left by 1 bit
2. if (x&1) x ^= 0x0000...0086; // if the least significant bit = 1, least significant byte ^= 0x86;

SSE does not support a shift of a whole 128-bit xmm register, so the rotate is synthesized by:
1. shifting the two quad words in parallel (one shift for the two bottom 63-bit parts, one for the two leading bits), and
2. combining the shifted quad words to form the 128-bit shifted result.

Used : %xmm1/%xmm2/%ecx

The macro works for both x86_64 and i386.
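
For porting or verification, the update this macro performs can also be written in portable C. The following is only an illustrative sketch (the reference C code lives in xtsClearC.c); it assumes a little-endian host, which holds on x86, and folds the rotate-plus-0x86 trick above back into the equivalent shift-left-by-1 followed by a conditional xor with 0x87:

#include <stdint.h>
#include <string.h>

/* sketch of what xts_mult_x_on_xmm7 computes: multiply the 128-bit tweak by x
   in GF(2^128), i.e. shift left by 1 bit and, if the bit shifted out of the top
   was set, xor 0x87 into the least significant byte */
static void xts_mult_x_sketch(uint8_t *I)
{
    uint64_t lo, hi;
    memcpy(&lo, I, 8);                 /* low  quad word (little-endian host assumed) */
    memcpy(&hi, I + 8, 8);             /* high quad word */

    uint64_t carry = hi >> 63;         /* the bit that falls off the top */
    hi = (hi << 1) | (lo >> 63);
    lo = (lo << 1) ^ (carry ? 0x87 : 0);

    memcpy(I, &lo, 8);
    memcpy(I + 8, &hi, 8);
}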
.macro xts_mult_x_on_xmm7 // input : x = %xmm7, MS = most significant, LS = least significant
movaps %xmm7, %xmm1 // %xmm1 = a copy of x
movaps %xmm7, %xmm2 // %xmm2 = a copy of x
psllq $$1, %xmm7 // 1-bit left shift of 2 quad words (x1<<1, x0<<1), zero-filled
psrlq $$63, %xmm1 // 2 leading bits, each in the least significant bit of a quad word
psrad $$31, %xmm2 // the MS 32-bit will be either 0 or -1, depending on the MS bit of x
pshufd $$0xc6, %xmm1, %xmm1 // switch the positions of the 2 leading bits
pshufd $$0x03, %xmm2, %xmm2 // the LS 32-bit will be either 0 or -1, depending on the MS bit of x
por %xmm1, %xmm7 // we finally have %xmm7 = rotate_left(x,1);
movl $$0x86, %ecx // the potential byte to xor into the bottom byte
movd %ecx, %xmm1 // copy it into the low 32 bits of %xmm1, the rest are 0
pand %xmm2, %xmm1 // %xmm1 = 0 or 0x86, depending on the MS bit of x
pxor %xmm1, %xmm7 // rotate_left(x,1) ^= 0 or 0x86 depending on the MS bit of x
function : void xts_mult_x(uint8_t *I);

1. load (__m128*) (I) into xmm7
2. invoke the macro xts_mult_x_on_xmm7 (input/output in xmm7, uses xmm1/xmm2/ecx)
3. save the output (%xmm7) to the memory pointed to by I

input : 16-byte memory pointed to by I
output : same 16-byte memory pointed to by I

if kernel code, xmm1/xmm2/xmm7 are saved and restored
other used registers : eax/ecx
#if defined __x86_64__
#define I %rdi // 1st argument in %rdi for x86_64

mov 4(%esp), %eax // 1st argument on the stack, at offset 4 past the return address, for i386

// if KERNEL code, allocate stack memory and save xmm1/xmm2/xmm7

#if defined __x86_64__
sub $0x38, sp // 8-byte alignment + 3 * 16 bytes

sub $0x3c, sp // 12-byte alignment + 3 * 16 bytes

// load, compute, and save
movups (I), %xmm7 // load the input tweak (128-bit) into %xmm7
xts_mult_x_on_xmm7 // the macro (also used elsewhere) updates %xmm7 with the output
movups %xmm7, (I) // save the xts_mult_x output

// if KERNEL code, restore xmm1/xmm2/xmm7 and deallocate stack memory

#if defined __x86_64__
add $0x38, sp // 8-byte alignment + 3 * 16 bytes

add $0x3c, sp // 12-byte alignment + 3 * 16 bytes
The following is the x86_64/i386 assembly implementation of

int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx);

Its C code implementation is given in xtsClearC.c.

All pointers P/C/T point to a block of 16 bytes. In the following description, P/C/T represent 128-bit data.

The operation of tweak_crypt

2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err;

The following is the assembly implementation flow

1. save used xmm registers (xmm1/xmm7) if kernel code
2. load xmm1 = P, xmm7 = T
5. call aes_encrypt(C,C,ctx); note that it uses aesni if available, and the xmm registers return intact
7. xmm1 = C = C^T = xmm1 ^ xmm7
9. update T (in xmm7) via the xts_mult_x macro
a. restore xmm registers (xmm1/xmm7) if kernel code
b. return CRYPT_OK (in eax)

Note: used xmm registers : xmm1/xmm2/xmm7 (xmm2 is used inside the xts_mult_x macro)
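
Put together, the flow above corresponds roughly to the following C (a sketch for reference only; the authoritative C version is the one in xtsClearC.c, and the function and type names are taken from the prototype above):

int tweak_crypt_sketch(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx)
{
    int err, i;

    for (i = 0; i < 16; i++) C[i] = P[i] ^ T[i];   /* C = P ^ T         */

    err = aes_encrypt(C, C, ctx);                  /* encrypt in place  */
    if (err != CRYPT_OK) return err;

    for (i = 0; i < 16; i++) C[i] ^= T[i];         /* C ^= T            */

    xts_mult_x(T);                                 /* advance the tweak */
    return CRYPT_OK;
}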
// push registers onto the stack for local use

// allocate stack memory for local use
sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)

// load the calling arguments
mov 8(%ebp), %eax // P, we need this only briefly, so eax is fine
mov 12(%ebp), %edi // C
mov 16(%ebp), %ebx // T
mov 20(%ebp), %esi // ctx

// x86_64 calling argument order : rdi/rsi/rdx/rcx/r8

// push registers onto the stack for local use

// allocate stack memory for local use; if kernel code, we need to save/restore xmm registers
sub $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386

// load the calling arguments, releasing rdi/rsi/rdx/rcx/r8, as we need to call aes_encrypt
// if kernel, save used xmm registers

movups (P), %xmm1 // P
movups (T), %xmm7 // T

// set up calling arguments for aes_encrypt
mov ctx, 8(%esp) // ctx

pxor %xmm7, %xmm1 // C = P ^ T
movups %xmm1, (C) // save C into memory

call _aes_encrypt // err = aes_encrypt(C,C,ctx);

cmp $CRYPT_OK, %eax // check err == CRYPT_OK
jne 9f // if err != CRYPT_OK, exit

movups (C), %xmm1 // load xmm1 = C
pxor %xmm7, %xmm1 // C ^= T
movups %xmm1, (C) // write xmm1 to C; xmm1 is now free and will be clobbered by the following macro

xts_mult_x_on_xmm7 // update T (in xmm7)

movups %xmm7, (T) // write xmm7 to T

// restore used xmm registers if this is for kernel

// free stack memory and restore callee-saved registers
add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)

add $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386

// return; eax/rax already holds the return value
The following is the x86_64/i386 assembly implementation of

int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim);

TODO : its C code implementation is yet to be provided in xtsClearC.c (for the benefit of porting to other ISAs)
This function is the grouped version of tweak_crypt() above, so the xmm register save/restore only needs
to happen once for all grouped blocks.

The implementation probes __cpu_capabilities to detect whether aesni (the hw-aes instructions) is available.
If aesni is available, the code branches to an optimized path that uses aesni.

The optimized aesni code operates as follows:

while (at least 4 consecutive blocks remain) {

    apply the xts_mult_x macro 4 times and write the 4 tweaks to the stack (16-byte aligned)

    perform 4 C = P ^ T; // T is on the 16-byte aligned stack

    perform 4 aes_encrypt (all aes_encrypt instructions interleaved to achieve better throughput)

    perform 4 C = C ^ T // T is on the 16-byte aligned stack

}

The code then falls through to the scalar code, which sequentially performs what tweak_crypt does

2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err;

Note: used xmm registers :
xmm0-xmm5, xmm7 if aesni is available
xmm0-xmm4, xmm7 if aesni is not available.
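
As a starting point for the TODO above, a minimal scalar C version could look like the sketch below (it omits the 4-block aesni fast path, which is purely a performance optimization; names follow the prototypes quoted in this file):

int tweak_crypt_group_sketch(const uint8_t *P, uint8_t *C, uint8_t *T,
                             aesedp_encrypt_ctx *ctx, uint32_t lim)
{
    while (lim--) {
        int err = tweak_crypt(P, C, T, ctx);   /* C = P^T, encrypt, C ^= T, advance T */
        if (err != CRYPT_OK) return err;
        P += 16;
        C += 16;
    }
    return CRYPT_OK;
}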
.globl _tweak_crypt_group

// push callee-saved registers for local use

// allocate stack memory for local use and/or xmm register save for kernel code
sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni
// 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_encrypt) no aesni

// transfer calling arguments
mov 20(%ebp), %eax // ctx
mov 12(%ebp), %edi // C
mov 16(%ebp), %ebx // T
mov 8(%ebp), %esi // P
mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_encrypt

// push callee-saved registers for local use

// allocate stack memory for local use and/or xmm register save for kernel code
sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386)
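// Rough map of the frame just allocated, inferred from the offsets used below
// (offsets relative to sp after the sub; this is a reconstruction, not from the original source):
//   0(sp)    - 15(sp)   : scratch / calling arguments for the aes routines (i386 path)
//   16(sp)   - 79(sp)   : tweak1..tweak4, the 4 pre-computed tweaks (16-byte aligned)
//   0x50(sp) - 0xaf(sp) : save area for xmm0-xmm4/xmm7 (kernel code only)
//   0xb0(sp) - 0xbf(sp) : save area for xmm5 (kernel code, aesni path only)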
// rdi/rsi/rdx/rcx/r8
// transfer calling arguments

movaps %xmm0, 0x50(sp)
movaps %xmm1, 0x60(sp)
movaps %xmm2, 0x70(sp)
movaps %xmm3, 0x80(sp)
movaps %xmm4, 0x90(sp)
movaps %xmm7, 0xa0(sp)

// probe __cpu_capabilities to detect aesni
#if defined __x86_64__
movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
mov (%rax), %eax // %eax = __cpu_capabilities

leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
mov (%eax), %eax // %eax = __cpu_capabilities

movl _COMM_PAGE_CPU_CAPABILITIES, %eax

test $(kHasAES), %eax
je L_crypt_group_sw // if aesni not available, jump to sw-based implementation
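
// In C, the probe above amounts to roughly the following check (a sketch only;
// kHasAES comes from cpu_capabilities.h, and referencing the capabilities word
// from C as _cpu_capabilities is an assumption):
//     extern int _cpu_capabilities;
//     if (!(_cpu_capabilities & kHasAES)) { /* take the sw-based path */ }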
// aesni-based implementation

sub $4, lim // pre-decrement lim by 4
jl 9f // if lim < 4, skip the following code

movups (T), %xmm7 // xmm7 is the tweak before encrypting every 4 blocks

movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5

// derive 4 tweaks using the xts_mult_x macro, and save them on aligned stack space
// xmm7 will be the tweak for the next 4-block iteration

#define tweak1 16(sp)
#define tweak2 32(sp)
#define tweak3 48(sp)
#define tweak4 64(sp)

movaps %xmm7, tweak1 // save 1st tweak on stack
xts_mult_x_on_xmm7 // compute 2nd tweak
movaps %xmm7, tweak2 // save 2nd tweak on stack
xts_mult_x_on_xmm7 // compute 3rd tweak
movaps %xmm7, tweak3 // save 3rd tweak on stack
xts_mult_x_on_xmm7 // compute 4th tweak
movaps %xmm7, tweak4 // save 4th tweak on stack
xts_mult_x_on_xmm7 // compute 1st tweak for the next iteration

// 4 interleaved aes_encrypt

mov 8(sp), %ecx // ctx

mov 240(ctx), %eax // aes length field (160/192/224, i.e. 16 * number of rounds)

cmp $160, %eax // AES-128 ?

cmp $192, %eax // AES-192 ?

cmp $224, %eax // AES-256 ?

mov $-1, %eax // error : unsupported aes length
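
// The dispatch above keys off a length field stored at byte offset 240 of the
// context; 160/192/224 bytes correspond to 16 bytes of expanded key per round
// for 10/12/14 rounds (AES-128/192/256). In C terms the idea is roughly
// (field name hypothetical):
//     switch (ctx->key_length) {       /* 16 * number of rounds */
//     case 160: /* AES-128 */ break;
//     case 192: /* AES-192 */ break;
//     case 224: /* AES-256 */ break;
//     default:  return -1;             /* unsupported aes length */
//     }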
movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5

// definitions, macros, and constructs for the 4-block hw-aes-encrypt
// the following key definitions will also be used in tweak_uncrypt_group
#define key7 112(ctx)
#define key8 128(ctx)
#define key9 144(ctx)
#define keyA 160(ctx)
#define keyB 176(ctx)
#define keyC 192(ctx)
#define keyD 208(ctx)
#define keyE 224(ctx)

#define aeslast aesenclast

// all aes encrypt operations start with the following sequence
.macro aes_common_part

// all aes encrypt operations end with the following 4 instructions

aes_common_part // encrypt common part
aes_last // encrypt ending part

aes_common_part // encrypt common part
// 10 extra instructions in between the common and ending parts
aes_last // encrypt ending part

aes_common_part // encrypt common part
// 20 extra instructions in between the common and ending parts
aes_last // encrypt ending part
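
// What the 4-way interleaving buys: the four blocks move through each AES round
// back to back, so the latency of aesenc on one block is hidden behind the other
// three. In intrinsics form one round of the interleave looks roughly like this
// (illustrative sketch only, using _mm_aesenc_si128 from <wmmintrin.h>):
//     b0 = _mm_aesenc_si128(b0, rk);
//     b1 = _mm_aesenc_si128(b1, rk);
//     b2 = _mm_aesenc_si128(b2, rk);
//     b3 = _mm_aesenc_si128(b3, rk);
// with _mm_aesenclast_si128 used for the final round.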
160: // AES-128 encrypt

192: // AES-192 encrypt

224: // AES-256 encrypt

movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5

xor %eax, %eax // to return CRYPT_OK
add $4, lim // post-increment lim by 4
je 9f // if lim==0, branch to prepare to return

movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop

sub $1, lim // pre-decrement lim by 1
jl 1f // if lim < 1, branch to prepare to return

movups (P), %xmm0 // P

// prepare for calling aes_encrypt

// ctx was prepared previously in the preamble

pxor %xmm7, %xmm0 // C = P ^ T
movups %xmm0, (C) // save C into memory

call _aes_encrypt_xmm_no_save // err = aes_encrypt(C,C,ctx);

cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
jne 9f // if err != CRYPT_OK, branch to exit with error

movups (C), %xmm0 // load xmm0 with C
pxor %xmm7, %xmm0 // C ^= T
movups %xmm0, (C) // save output C

jge 0b // if (lim>0) repeat the scalar loop

1: movups %xmm7, (T) // save final tweak

// if kernel, restore used xmm registers
movaps 0x50(sp), %xmm0
movaps 0x60(sp), %xmm1
movaps 0x70(sp), %xmm2
movaps 0x80(sp), %xmm3
movaps 0x90(sp), %xmm4
movaps 0xa0(sp), %xmm7

add $(12+16*8+16*4), %esp

add $(8+16*8+16*5), %rsp
The following is the x86_64/i386 assembly implementation of

int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx);

Its C code implementation is given in xtsClearC.c.

All pointers C/P/T point to a block of 16 bytes. In the following description, C/P/T represent 128-bit data.

The operation of tweak_uncrypt

2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err;

The following is the assembly implementation flow

1. save used xmm registers (xmm1/xmm7) if kernel code
2. load xmm1 = C, xmm7 = T
5. call aes_decrypt(P,P,ctx); note that it uses aesni if available, and the xmm registers return intact
7. xmm1 = P = P^T = xmm1 ^ xmm7
9. update T (in xmm7) via the xts_mult_x macro
a. restore xmm registers (xmm1/xmm7) if kernel code
b. return CRYPT_OK (in eax)

Note: used xmm registers : xmm1/xmm2/xmm7 (xmm2 is used inside the xts_mult_x macro)
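
The decrypt direction mirrors the tweak_crypt sketch given earlier; roughly (again only a sketch, with names taken from the prototype above):

int tweak_uncrypt_sketch(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx)
{
    int err, i;

    for (i = 0; i < 16; i++) P[i] = C[i] ^ T[i];   /* P = C ^ T         */

    err = aes_decrypt(P, P, ctx);                  /* decrypt in place  */
    if (err != CRYPT_OK) return err;

    for (i = 0; i < 16; i++) P[i] ^= T[i];         /* P ^= T            */

    xts_mult_x(T);                                 /* advance the tweak */
    return CRYPT_OK;
}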
.globl _tweak_uncrypt

// push registers onto the stack for local use

// allocate stack memory for local use
sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)

// load the calling arguments
mov 8(%ebp), %eax // C, we need this only briefly, so eax is fine
mov 12(%ebp), %edi // P
mov 16(%ebp), %ebx // T
mov 20(%ebp), %esi // ctx

// x86_64 calling argument order : rdi/rsi/rdx/rcx/r8

// push registers onto the stack for local use

// allocate stack memory for local use; if kernel code, we need to save/restore xmm registers
sub $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386

// load the calling arguments, releasing rdi/rsi/rdx/rcx/r8, as we need to call aes_decrypt

// if kernel, save used xmm registers

movups (C), %xmm1 // C
movups (T), %xmm7 // T

// set up calling arguments for aes_decrypt
mov ctx, 8(%esp) // ctx

pxor %xmm7, %xmm1 // P = C ^ T
movups %xmm1, (P) // save P into memory

call _aes_decrypt // err = aes_decrypt(P,P,ctx);

cmp $CRYPT_OK, %eax // check err == CRYPT_OK
jne 9f // if err != CRYPT_OK, exit

movups (P), %xmm1 // load xmm1 = P
pxor %xmm7, %xmm1 // P ^= T
movups %xmm1, (P) // write xmm1 to P; xmm1 is now free and will be clobbered by the following macro

xts_mult_x_on_xmm7 // update T (in xmm7)

movups %xmm7, (T) // write xmm7 to T

// restore used xmm registers if this is for kernel

// free stack memory and restore callee-saved registers
add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)

add $4*16, %rsp // only need 3*16, add 16 extra to make the xmm save/restore common with i386

// return; eax/rax already holds the return value
The following is the x86_64/i386 assembly implementation of

int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim);

TODO : its C code implementation is yet to be provided in xtsClearC.c (for the benefit of porting to other ISAs)
This function is the grouped version of tweak_uncrypt() above, so the xmm register save/restore only needs
to happen once for all grouped blocks.

The implementation probes __cpu_capabilities to detect whether aesni (the hw-aes instructions) is available.
If aesni is available, the code branches to an optimized path that uses aesni.

The optimized aesni code operates as follows:

while (at least 4 consecutive blocks remain) {

    apply the xts_mult_x macro 4 times and write the 4 tweaks to the stack (16-byte aligned)

    perform 4 P = C ^ T; // T is on the 16-byte aligned stack

    perform 4 aes_decrypt (all aes_decrypt instructions interleaved to achieve better throughput)

    perform 4 P = P ^ T // T is on the 16-byte aligned stack

}

The code then falls through to the scalar code, which sequentially performs what tweak_uncrypt does

2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err;

Note: used xmm registers :
xmm0-xmm5, xmm7 if aesni is available
xmm0-xmm4, xmm7 if aesni is not available.
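
As with the encrypt side, a minimal scalar C version of this routine could look like the sketch below (no aesni fast path; names follow the prototypes quoted in this file):

int tweak_uncrypt_group_sketch(const uint8_t *C, uint8_t *P, uint8_t *T,
                               aesedp_decrypt_ctx *ctx, uint32_t lim)
{
    while (lim--) {
        int err = tweak_uncrypt(C, P, T, ctx);   /* P = C^T, decrypt, P ^= T, advance T */
        if (err != CRYPT_OK) return err;
        C += 16;
        P += 16;
    }
    return CRYPT_OK;
}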
.globl _tweak_uncrypt_group
_tweak_uncrypt_group:

// push callee-saved registers for local use

// allocate stack memory for local use and/or xmm register save for kernel code
sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni
// 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_decrypt) no aesni

// transfer calling arguments
mov 20(%ebp), %eax // ctx
mov 12(%ebp), %edi // P
mov 16(%ebp), %ebx // T
mov 8(%ebp), %esi // C
mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_decrypt

// push callee-saved registers for local use

// allocate stack memory for local use and/or xmm register save for kernel code
sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386)

// rdi/rsi/rdx/rcx/r8
// transfer calling arguments

movaps %xmm0, 0x50(sp)
movaps %xmm1, 0x60(sp)
movaps %xmm2, 0x70(sp)
movaps %xmm3, 0x80(sp)
movaps %xmm4, 0x90(sp)
movaps %xmm7, 0xa0(sp)
// probe __cpu_capabilities to detect aesni
#if defined __x86_64__
movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
mov (%rax), %eax // %eax = __cpu_capabilities

leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
mov (%eax), %eax // %eax = __cpu_capabilities

movl _COMM_PAGE_CPU_CAPABILITIES, %eax

test $(kHasAES), %eax
je L_uncrypt_group_sw // if aesni not available, jump to sw-based implementation

// aesni-based implementation

sub $4, lim // pre-decrement lim by 4
jl 9f // if lim < 4, skip the following code

movups (T), %xmm7 // xmm7 is the tweak before decrypting every 4 blocks

movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5

// derive 4 tweaks using the xts_mult_x macro, and save them on aligned stack space
// xmm7 will be the tweak for the next 4-block iteration

#define tweak1 16(sp)
#define tweak2 32(sp)
#define tweak3 48(sp)
#define tweak4 64(sp)

movaps %xmm7, tweak1 // save 1st tweak on stack
xts_mult_x_on_xmm7 // compute 2nd tweak
movaps %xmm7, tweak2 // save 2nd tweak on stack
xts_mult_x_on_xmm7 // compute 3rd tweak
movaps %xmm7, tweak3 // save 3rd tweak on stack
xts_mult_x_on_xmm7 // compute 4th tweak
movaps %xmm7, tweak4 // save 4th tweak on stack
xts_mult_x_on_xmm7 // compute 1st tweak for the next iteration

// 4 interleaved aes_decrypt

#if defined __i386__
mov 8(sp), %ecx // ctx

mov 240(ctx), %eax // aes length field (160/192/224, i.e. 16 * number of rounds)

cmp $160, %eax // AES-128 ?

cmp $192, %eax // AES-192 ?

cmp $224, %eax // AES-256 ?

mov $-1, %eax // error : unsupported aes length

movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
// definitions and macros to construct the hw-aes-decrypt
// will reuse the previously defined key0 = (ctx), key1 = 16(ctx), ....

#define aeslast aesdeclast

.macro aes_decrypt_common

aeslast %xmm4, %xmm0
aeslast %xmm4, %xmm1
aeslast %xmm4, %xmm2
aeslast %xmm4, %xmm3

160: // AES-128 decrypt

192: // AES-192 decrypt

224: // AES-256 decrypt

movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5

xor %eax, %eax // to return CRYPT_OK
add $4, lim // post-increment lim by 4
je 9f // if lim==0, branch to prepare to return

movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop
sub $1, lim // pre-decrement lim by 1
jl 1f // if lim < 1, branch to prepare to return

movups (C), %xmm0 // C

// prepare for calling aes_decrypt
#if defined __i386__

// ctx was prepared previously in the preamble

mov ctx, %rdx // ctx

pxor %xmm7, %xmm0 // P = C ^ T
movups %xmm0, (P) // save P into memory

call _aes_decrypt_xmm_no_save // err = aes_decrypt(P,P,ctx);

cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
jne 9f // if err != CRYPT_OK, branch to exit with error

movups (P), %xmm0 // load xmm0 with P
pxor %xmm7, %xmm0 // P ^= T
movups %xmm0, (P) // save output P

add $16, C // next C
add $16, P // next P
sub $1, lim // lim--
jge 0b // if (lim>0) repeat the scalar loop

1: movups %xmm7, (T) // save final tweak

// if kernel, restore used xmm registers
movaps 0x50(sp), %xmm0
movaps 0x60(sp), %xmm1
movaps 0x70(sp), %xmm2
movaps 0x80(sp), %xmm3
movaps 0x90(sp), %xmm4
movaps 0xa0(sp), %xmm7

#if defined __i386__
add $(12+16*8+16*4), %esp

add $(8+16*8+16*5), %rsp