xnu-1699.24.23.tar.gz

author Apple <opensource@apple.com>

Thu, 2 Feb 2012 16:16:40 +0000 (16:16 +0000)

committer Apple <opensource@apple.com>

Thu, 2 Feb 2012 16:16:40 +0000 (16:16 +0000)
author Apple <opensource@apple.com>
Thu, 2 Feb 2012 16:16:40 +0000 (16:16 +0000)
committer Apple <opensource@apple.com>
Thu, 2 Feb 2012 16:16:40 +0000 (16:16 +0000)
diff --git a/bsd/crypto/aes/gen/aesopt.h b/bsd/crypto/aes/gen/aesopt.h

index fc28e4a48a93b28a70ef004f330dddd14b063802..a00794865bdb862af594e39ef6f066bfe71afcaf 100644 (file)
--- a/bsd/crypto/aes/gen/aesopt.h
+++ b/bsd/crypto/aes/gen/aesopt.h
@@ -283,9 +283,6 @@
      assembler code routines for encryption and decryption with the C code
      only providing key scheduling
  */
-#if 0 && !defined(AES_ASM)
-#define AES_ASM
-#endif
  
  /*  3. BYTE ORDER WITHIN 32 BIT WORDS
  
@@ -316,15 +313,7 @@
  
      NOTE: Assembler code versions rely on PLATFORM_BYTE_ORDER being set
  */
-#if 1 || defined(AES_ASM)
  #define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
-#elif 0
-#define ALGORITHM_BYTE_ORDER BRG_LITTLE_ENDIAN
-#elif 0
-#define ALGORITHM_BYTE_ORDER BRG_BIG_ENDIAN
-#else
-#error The algorithm byte order is not defined
-#endif
  
  /*  4. FAST INPUT/OUTPUT OPERATIONS.
  
@@ -342,9 +331,6 @@
      assumed that access to byte arrays as if they are arrays of 32-bit
      words will not cause problems when such accesses are misaligned.
  */
-#if 0 && !defined(_MSC_VER)
-#define SAFE_IO
-#endif
  
  /*  5. LOOP UNROLLING
  
@@ -429,9 +415,6 @@
      it seems to sometimes cause trouble for the VC++ version 6 compiler.
  */
  
-#if 0 && defined(_MSC_VER) && (_MSC_VER >= 1300)
-#define TABLE_ALIGN 64
-#endif
  
  /*  10. INTERNAL TABLE CONFIGURATION
  
diff --git a/bsd/crypto/aes/i386/aes_modes_hw.s b/bsd/crypto/aes/i386/aes_modes_hw.s

index c9702eaec5d4cdaa859833f99663d0c2076c9349..b9e35085c9c182c6578f57893888dfc149ee27b2 100644 (file)
--- a/bsd/crypto/aes/i386/aes_modes_hw.s
+++ b/bsd/crypto/aes/i386/aes_modes_hw.s
@@ -1,1622 +1,1623 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software in both source and binary
- form is allowed (with or without changes) provided that:
-
-   1. distributions of this source code include the above copyright
-      notice, this list of conditions and the following disclaimer;
-
-   2. distributions in binary form include the above copyright
-      notice, this list of conditions and the following disclaimer
-      in the documentation and/or other associated materials;
-
-   3. the copyright holder's name is not used to endorse products
-      built using this software without specific written permission.
-
- ALTERNATIVELY, provided that this notice is retained in full, this product
- may be distributed under the terms of the GNU General Public License (GPL),
- in which case the provisions of the GPL apply INSTEAD OF those given above.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue 31/01/2006
-
- These subroutines implement multiple-block AES modes for ECB, CBC, CFB,
- OFB and CTR encryption.  The code provides support for the VIA Advanced
- Cryptography Engine (ACE).
-
- NOTE: In the following subroutines, the AES contexts (ctx) must be
- 16 byte aligned if VIA ACE is being used
-*/
-
-/* ---------------------------------------------------------------------------------------------------------------- 
-
-       aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
-
-       For simplicity, I am assuming all variables are in 128-bit data type.
-
-       aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
-       {
-               while(num_blk--) {
-                       *iv ^= *ibuf++;
-                       aes_encrypt(iv, iv, ctx);
-                       *obuf++ = *iv;
-               }
-               return 0;
-       }
-
-       The following is an implementation of this function using Intel AES-NI.
-       This function, _aes_encrypt_cbc_hw, SHOULD NOT be called directly.
-       Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
-       to this AES-NI-based function if it detects that AES-NI is available.
-       Calling this function blindly WILL cause a CRASH on systems without AES-NI support.
-
-       Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks
-       are serially chained. This prevents us from arranging several blocks for encryption in parallel.
-
-   ----------------------------------------------------------------------------------------------------------------*/
-
-       .text
-       .align  4,0x90
-
-       // _aes_encrypt_cbc_hw: CBC-encrypt num_blk 16-byte blocks with aesenc/aesenclast.
-       //
-       //   argument   i386 (stack)   x86_64
-       //   ibuf       8(%ebp)        %rdi
-       //   in_iv      12(%ebp)       %rsi
-       //   num_blk    16(%ebp)       %rdx   (only the low 32 bits are examined, as a signed value)
-       //   obuf       20(%ebp)       %rcx
-       //   ctx        24(%ebp)       %r8
-       //
-       // Returns in %eax: 0 when one of the three key-size paths completes (including the
-       // num_blk < 1 early exit), -1 when the word at 240(ctx) is none of 160/192/224.
-       // in_iv is only read: the final chaining value is NOT stored back through it.
-       // Register roles: xmm0 = chaining value (iv), xmm1 = scratch (input block or a
-       // streamed round key), xmm2 and up = round keys cached for the whole loop.
-       .globl  _aes_encrypt_cbc_hw
-_aes_encrypt_cbc_hw:
-
-       // build a frame and save the callee-saved registers that will hold the arguments
-#if    defined __i386__
-
-       push    %ebp
-       movl    %esp, %ebp
-       push    %ebx                                    // will hold ibuf
-       push    %edi                                    // will hold obuf
-
-       #define sp      %esp
-
-#else  // __x86_64__
-
-       push    %rbp
-       mov             %rsp, %rbp
-       push    %rbx                                    // will hold ibuf
-       push    %r13                                    // will hold num_blk
-       push    %r14                                    // will hold obuf
-       push    %r15                                    // will hold ctx
-
-       #define sp      %rsp
-
-#endif
-
-       // kernel builds only: spill every xmm register this routine may touch so that
-       // the exit path at L_error can restore them before returning
-#ifdef KERNEL
-
-#if defined __i386__
-       sub             $(8*16), %esp                   // 8 x 16 bytes: xmm0-xmm7
-#else
-       sub             $(16*16), %rsp          // 16 x 16 bytes: xmm0-xmm15
-#endif
-
-       // movaps needs a 16-byte-aligned address, so these stores rely on sp being
-       // 16-byte aligned after the pushes and the sub above.
-       // NOTE(review): that in turn depends on the caller's stack alignment -- confirm.
-       movaps  %xmm0, (sp)
-       movaps  %xmm1, 16(sp)
-       movaps  %xmm2, 32(sp)
-       movaps  %xmm3, 48(sp)
-       movaps  %xmm4, 64(sp)
-       movaps  %xmm5, 80(sp)
-       movaps  %xmm6, 96(sp)
-       movaps  %xmm7, 112(sp)
-#if defined    __x86_64__
-       movaps  %xmm8, 16*8(sp)
-       movaps  %xmm9, 16*9(sp)
-       movaps  %xmm10, 16*10(sp)
-       movaps  %xmm11, 16*11(sp)
-       movaps  %xmm12, 16*12(sp)
-       movaps  %xmm13, 16*13(sp)
-       movaps  %xmm14, 16*14(sp)
-       movaps  %xmm15, 16*15(sp)
-#endif // __x86_64__
-
-#endif // KERNEL
-
-       // iv names the register carrying the CBC chaining value across iterations
-       #define iv      %xmm0
-
-#ifdef __i386__
-
-       // i386: all five arguments are fetched from the caller's stack frame
-       mov             12(%ebp), %eax                  // in_iv (eax is reused below, in_iv is not kept)
-       mov             24(%ebp), %edx                  // ctx
-       movups  (%eax), iv                              // iv = *in_iv (unaligned load)
-       mov             8(%ebp), %ebx                   // ibuf
-       mov             16(%ebp), %ecx                  // num_blk
-       mov             20(%ebp), %edi                  // obuf
-
-       #define ibuf    %ebx
-       #define obuf    %edi
-       #define num_blk %ecx    
-       #define ctx             %edx
-
-#else
-
-       // x86_64: move the register arguments into the callee-saved registers pushed above
-       mov             %rdi, %rbx                              // ibuf
-       movups  (%rsi), iv                              // iv = *in_iv (unaligned load)
-       mov             %rdx, %r13                              // num_blk (used through %r13d only)
-       mov             %rcx, %r14                              // obuf
-       mov             %r8, %r15                               // ctx
-
-       #define ibuf    %rbx
-       #define num_blk %r13d
-       #define obuf    %r14    
-       #define ctx             %r15
-
-#endif
-
-       // dispatch on the 32-bit word at byte offset 240 of ctx; each accepted value equals
-       // the byte offset of the last round key that the matching path loads from ctx
-       mov             240(ctx), %eax                  // key-size selector
-       cmp             $160, %eax                              // aes-128 encrypt ?
-       je              L_encrypt_128
-       cmp             $192, %eax                              // aes-192 encrypt ?
-       je              L_encrypt_192
-       cmp             $224, %eax                              // aes-256 encrypt ?
-       je              L_encrypt_256
-       mov             $-1, %eax                               // unrecognised selector: return -1
-       jmp             L_error                                 // L_error is past the "xor %eax, %eax", so -1 survives
-
-       //
-       // aes-128 encrypt_cbc operation, up to L_HW_cbc_done
-       //
-
-L_encrypt_128:
-
-       cmp             $1, num_blk                             // signed compare of the block count against 1
-       jl              L_HW_cbc_done                   // num_blk < 1: nothing to do, return 0
-
-       // cache round keys in registers: key0-key5 on both architectures,
-       // key6-keyA as well on x86_64 where xmm8-xmm12 exist
-       movups  (ctx), %xmm2                    // key0
-       movups  16(ctx), %xmm3                  // key1
-       movups  32(ctx), %xmm4                  // key2
-       movups  48(ctx), %xmm5                  // key3
-       movups  64(ctx), %xmm6                  // key4
-       movups  80(ctx), %xmm7                  // key5
-#if defined    __x86_64__
-       movups  96(ctx), %xmm8                  // key6
-       movups  112(ctx), %xmm9                 // key7
-       movups  128(ctx), %xmm10                // key8
-       movups  144(ctx), %xmm11                // key9
-       movups  160(ctx), %xmm12                // keyA
-#endif
-
-       // while (num_blk--) {
-       //                      *iv ^= *ibuf++;
-       //                      aes_encrypt(iv, iv, ctx);
-       //                      *obuf++ = *iv;
-       // }
-0:
-       movups  (ibuf), %xmm1                           // xmm1 = *ibuf (input block)
-       pxor    %xmm2, iv                                       // round 0 (xor with key0); xor commutes, so issuing it before the next line is equivalent
-       pxor    %xmm1, iv                                       // *iv ^= *ibuf
-
-       // rounds 1-9 use aesenc, round 10 uses aesenclast
-    aesenc  %xmm3, iv
-    aesenc  %xmm4, iv
-    aesenc  %xmm5, iv
-    aesenc  %xmm6, iv
-    aesenc  %xmm7, iv
-#if defined    __x86_64__
-    aesenc  %xmm8, iv
-    aesenc  %xmm9, iv
-    aesenc  %xmm10, iv
-    aesenc  %xmm11, iv
-    aesenclast  %xmm12, iv
-#else
-       // i386: key6-keyA are streamed from ctx through xmm1 on every iteration
-       movups  96(ctx), %xmm1                          // key6
-    aesenc  %xmm1, iv
-       movups  112(ctx), %xmm1                         // key7
-    aesenc  %xmm1, iv
-       movups  128(ctx), %xmm1                         // key8
-    aesenc  %xmm1, iv
-       movups  144(ctx), %xmm1                         // key9
-    aesenc  %xmm1, iv
-       movups  160(ctx), %xmm1                         // keyA
-    aesenclast  %xmm1, iv
-#endif
-
-       movups  iv, (obuf)                                      // *obuf = *iv; iv stays live as the next chaining value
-       add             $16, obuf                                       // obuf++;
-       add             $16, ibuf                                       // ibuf++;
-       sub             $1, num_blk                                     // num_blk --
-       jg              0b                                                      // if num_blk > 0, repeat the loop
-
-       // common success exit: every encrypt/decrypt 128/192/256 path branches here when done
-
-L_HW_cbc_done:
-
-       xor             %eax, %eax                              // return value 0 (CRYPT_OK)
-
-L_error:
-
-       // shared epilogue; %eax already holds the return value (0 or -1) and is not touched below
-       // if kernel, restore xmm registers
-#ifdef KERNEL 
-       movaps  0(sp), %xmm0
-       movaps  16(sp), %xmm1
-       movaps  32(sp), %xmm2
-       movaps  48(sp), %xmm3
-       movaps  64(sp), %xmm4
-       movaps  80(sp), %xmm5
-       movaps  96(sp), %xmm6
-       movaps  112(sp), %xmm7
-#if defined    __x86_64__
-       movaps  16*8(sp), %xmm8
-       movaps  16*9(sp), %xmm9
-       movaps  16*10(sp), %xmm10
-       movaps  16*11(sp), %xmm11
-       movaps  16*12(sp), %xmm12
-       movaps  16*13(sp), %xmm13
-       movaps  16*14(sp), %xmm14
-       movaps  16*15(sp), %xmm15
-#endif // __x86_64__
-#endif // KERNEL
-
-       // release the xmm save area (kernel only), pop the callee-saved registers in the
-       // reverse of the prologue's push order, and return
-#if    defined __i386__
-#ifdef KERNEL
-       add             $(8*16), %esp
-#endif
-       pop             %edi
-       pop             %ebx
-#else
-#ifdef KERNEL
-       add             $(16*16), %rsp                  // drop the xmm0-xmm15 save area
-#endif
-       pop             %r15
-       pop             %r14
-       pop             %r13
-       pop             %rbx
-#endif
-       leave                                                   // restore the stack pointer and frame pointer saved by the prologue
-       ret
-
-       //
-       // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
-       //
-
-L_encrypt_192:
-
-       cmp             $1, num_blk                             // signed compare of the block count against 1
-       jl              L_HW_cbc_done                   // num_blk < 1: nothing to do, return 0
-
-       // cache round keys: key0-key5 on both architectures, key6-keyC as well on x86_64
-       movups  (ctx), %xmm2                    // key0
-       movups  16(ctx), %xmm3                  // key1
-       movups  32(ctx), %xmm4                  // key2
-       movups  48(ctx), %xmm5                  // key3
-       movups  64(ctx), %xmm6                  // key4
-       movups  80(ctx), %xmm7                  // key5
-#if defined    __x86_64__
-       movups  96(ctx), %xmm8                  // key6
-       movups  112(ctx), %xmm9                 // key7
-       movups  128(ctx), %xmm10                // key8
-       movups  144(ctx), %xmm11                // key9
-       movups  160(ctx), %xmm12                // keyA
-       movups  176(ctx), %xmm13                // keyB
-       movups  192(ctx), %xmm14                // keyC
-#endif
-       
-       // while (num_blk--) {
-       //                      *iv ^= *ibuf++;
-       //                      aes_encrypt(iv, iv, ctx);
-       //                      *obuf++ = *iv;
-       // }
-0:
-       movups  (ibuf), %xmm1                   // xmm1 = *ibuf (input block)
-       pxor    %xmm1, iv                               // *iv ^= *ibuf
-
-       // aes_encrypt(iv, iv, ctx): round 0 is the xor with key0, rounds 1-11 use aesenc,
-       // round 12 uses aesenclast
-
-       pxor    %xmm2, iv
-    aesenc  %xmm3, iv
-    aesenc  %xmm4, iv
-    aesenc  %xmm5, iv
-    aesenc  %xmm6, iv
-    aesenc  %xmm7, iv
-#if defined    __x86_64__
-    aesenc  %xmm8, iv
-    aesenc  %xmm9, iv
-    aesenc  %xmm10, iv
-    aesenc  %xmm11, iv
-    aesenc  %xmm12, iv
-    aesenc  %xmm13, iv
-    aesenclast  %xmm14, iv
-#else
-       // i386: key6-keyC are streamed from ctx through xmm1 on every iteration
-       movups  96(ctx), %xmm1                  // key6
-    aesenc  %xmm1, iv
-       movups  112(ctx), %xmm1                 // key7
-    aesenc  %xmm1, iv
-       movups  128(ctx), %xmm1                 // key8
-    aesenc  %xmm1, iv
-       movups  144(ctx), %xmm1                 // key9
-    aesenc  %xmm1, iv
-       movups  160(ctx), %xmm1                 // keyA
-    aesenc  %xmm1, iv
-       movups  176(ctx), %xmm1                 // keyB
-    aesenc  %xmm1, iv
-       movups  192(ctx), %xmm1                 // keyC
-    aesenclast  %xmm1, iv
-#endif
-
-       movups  iv, (obuf)                              // *obuf = *iv; iv stays live as the next chaining value
-       add             $16, ibuf                               // ibuf++
-       add             $16, obuf                               // obuf++
-
-       sub             $1, num_blk                             // num_blk --
-       jg              0b                                              // if num_blk > 0, repeat the loop
-
-       jmp             L_HW_cbc_done                   // share with the common exit code
-
-       //
-       // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
-       //
-
-L_encrypt_256:
-
-       cmp             $1, num_blk                             // signed compare of the block count against 1
-       jl              L_HW_cbc_done                   // num_blk < 1: nothing to do, return 0
-
-       // cache round keys: key0-key5 on both architectures, key6-keyD as well on x86_64
-       movups  (ctx), %xmm2                    // key0
-       movups  16(ctx), %xmm3                  // key1
-       movups  32(ctx), %xmm4                  // key2
-       movups  48(ctx), %xmm5                  // key3
-       movups  64(ctx), %xmm6                  // key4
-       movups  80(ctx), %xmm7                  // key5
-#if defined    __x86_64__
-       movups  96(ctx), %xmm8                  // key6
-       movups  112(ctx), %xmm9                 // key7
-       movups  128(ctx), %xmm10                // key8
-       movups  144(ctx), %xmm11                // key9
-       movups  160(ctx), %xmm12                // keyA
-       movups  176(ctx), %xmm13                // keyB
-       movups  192(ctx), %xmm14                // keyC
-       movups  208(ctx), %xmm15                // keyD
-       // keyE (224(ctx)) is deliberately not cached here: xmm0 (iv), xmm1 (scratch) and
-       // xmm2-xmm15 (key0-keyD) are all in use, so the loop reloads keyE into xmm1 each time
-#endif
-
-       // while (num_blk--) {
-       //                      *iv ^= *ibuf++;
-       //                      aes_encrypt(iv, iv, ctx);
-       //                      *obuf++ = *iv;
-       // }
-0:
-       movups  (ibuf), %xmm1                   // xmm1 = *ibuf (input block)
-       pxor    %xmm1, iv                               // *iv ^= *ibuf
-       
-       // aes_encrypt(iv, iv, ctx): round 0 is the xor with key0, rounds 1-13 use aesenc,
-       // round 14 uses aesenclast
-       pxor    %xmm2, iv
-    aesenc  %xmm3, iv
-    aesenc  %xmm4, iv
-    aesenc  %xmm5, iv
-    aesenc  %xmm6, iv
-    aesenc  %xmm7, iv
-#if defined    __x86_64__
-       movups  224(ctx), %xmm1                 // keyE; xmm1 is free again now that the input block is folded into iv
-    aesenc  %xmm8, iv
-    aesenc  %xmm9, iv
-    aesenc  %xmm10, iv
-    aesenc  %xmm11, iv
-    aesenc  %xmm12, iv
-    aesenc  %xmm13, iv
-    aesenc  %xmm14, iv
-    aesenc  %xmm15, iv
-    aesenclast  %xmm1, iv
-#else
-       // i386: key6-keyE are streamed from ctx through xmm1 on every iteration
-       movups  96(ctx), %xmm1                  // key6
-    aesenc  %xmm1, iv
-       movups  112(ctx), %xmm1                 // key7
-    aesenc  %xmm1, iv
-       movups  128(ctx), %xmm1                 // key8
-    aesenc  %xmm1, iv
-       movups  144(ctx), %xmm1                 // key9
-    aesenc  %xmm1, iv
-       movups  160(ctx), %xmm1                 // keyA
-    aesenc  %xmm1, iv
-       movups  176(ctx), %xmm1                 // keyB
-    aesenc  %xmm1, iv
-       movups  192(ctx), %xmm1                 // keyC
-    aesenc  %xmm1, iv
-       movups  208(ctx), %xmm1                 // keyD
-    aesenc  %xmm1, iv
-       movups  224(ctx), %xmm1                 // keyE
-    aesenclast  %xmm1, iv
-#endif
-
-       movups  iv, (obuf)                              // *obuf = *iv; iv stays live as the next chaining value
-       add             $16, ibuf                               // ibuf++
-       add             $16, obuf                               // obuf++
-
-       sub             $1, num_blk                             // num_blk --
-       jg              0b                                              // if num_blk > 0, repeat the loop
-
-       jmp             L_HW_cbc_done                   // share with the common exit code
-
-
-
-       //
-       // --------- END of aes_encrypt_cbc_hw  -------------------
-       //
-
-
-/* ---------------------------------------------------------------------------------------------------------------- 
-
-       aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
-
-       For simplicity, I am assuming all variables are in 128-bit data type.
-
-       aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
-       {
-               while(num_blk--) {
-                       aes_decrypt(ibuf, obuf, ctx);
-                       *obuf++ ^= *iv;
-                       *iv = *ibuf++;
-               }
-               return 0;
-       }
-
-       The following is an implementation of this function using Intel AES-NI.
-       This function, _aes_decrypt_cbc_hw, SHOULD NOT be called directly.
-       Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
-       to this AES-NI-based function if it detects that AES-NI is available.
-       Calling this function blindly WILL cause a CRASH on systems without AES-NI support.
-
-       Note that, unlike encryption, the aes_decrypt of one block does not depend on the result of any other block.
-       This gives the opportunity to arrange several aes_decrypt operations in parallel to speed up the code.
-       This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55)
-       The following assembly code exploits this idea to achieve a ~1.4x speed-up in aes_decrypt_cbc.
-
-       Example C code for packing 4 blocks in an iteration is shown as follows:
-
-               while ((num_blk-=4)>=0) {
-
-                       // the following 4 functions can be interleaved to exploit parallelism
-                       aes_decrypt(ibuf, obuf, ctx);
-                       aes_decrypt(ibuf+1, obuf+1, ctx);
-                       aes_decrypt(ibuf+2, obuf+2, ctx);
-                       aes_decrypt(ibuf+3, obuf+3, ctx);
-
-                       obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
-                       *iv = ibuf[3];          ibuf += 4;      obuf += 4;
-               }
-               num_blk+=4;
-
-   ----------------------------------------------------------------------------------------------------------------*/
-
-       .text
-       .align  4,0x90
-       .globl  _aes_decrypt_cbc_hw
-_aes_decrypt_cbc_hw:
-
-       // push/save registers for local use
-#if    defined __i386__
-
-       push    %ebp
-       movl    %esp, %ebp
-       push    %ebx                                    // ibuf
-       push    %edi                                    // obuf
-
-       #define sp      %esp
-
-#else  // __x86_64__
-
-       push    %rbp
-       mov             %rsp, %rbp
-       push    %rbx
-       push    %r13
-       push    %r14
-       push    %r15
-
-       #define sp      %rsp
-
-#endif
-
-
-       // if kernel, allocate stack space to save xmm registers
-#ifdef KERNEL
-#if defined __i386__
-       sub             $(8*16), %esp
-#else
-       sub             $(16*16), %rsp
-#endif
-       movaps  %xmm0, (sp)
-       movaps  %xmm1, 16(sp)
-       movaps  %xmm2, 32(sp)
-       movaps  %xmm3, 48(sp)
-       movaps  %xmm4, 64(sp)
-       movaps  %xmm5, 80(sp)
-       movaps  %xmm6, 96(sp)
-       movaps  %xmm7, 112(sp)
-#if defined    __x86_64__
-       movaps  %xmm8, 16*8(sp)
-       movaps  %xmm9, 16*9(sp)
-       movaps  %xmm10, 16*10(sp)
-       movaps  %xmm11, 16*11(sp)
-       movaps  %xmm12, 16*12(sp)
-       movaps  %xmm13, 16*13(sp)
-       movaps  %xmm14, 16*14(sp)
-       movaps  %xmm15, 16*15(sp)
-#endif // __x86_64__
-#endif
-
-       #undef  iv
-       #define iv      %xmm0
-
-#if defined    __i386__
-       mov             12(%ebp), %eax                  // in_iv
-       mov             24(%ebp), %edx                  // ctx
-       movups  (%eax), iv                              // iv = in_iv   
-       mov             8(%ebp), %ebx                   // ibuf
-       mov             16(%ebp), %ecx                  // num_blk
-       mov             20(%ebp), %edi                  // obuf
-
-       #define ibuf    %ebx
-       #define obuf    %edi
-       #define num_blk %ecx    
-       #define ctx             %edx
-
-#else  //      __x86_64__, rdi/rsi/rdx/rcx/r8
-
-       mov             %rdi, %rbx                              // ibuf
-       movups  (%rsi), iv                              // iv = in_iv
-       mov             %rdx, %r13                              // num_blk
-       mov             %rcx, %r14                              // obuf
-       mov             %r8, %r15                               // ctx  
-
-       #define ibuf    %rbx
-       #define num_blk %r13d
-       #define obuf    %r14    
-       #define ctx             %r15
-
-#endif
-
-       mov             240(ctx), %eax                  // aes length
-       cmp             $160, %eax                              // aes-128 decrypt
-       je              L_decrypt_128
-       cmp             $192, %eax                              // aes-192 decrypt
-       je              L_decrypt_192
-       cmp             $224, %eax                              // aes-256 decrypt
-       je              L_decrypt_256
-
-       mov             $-1, %eax                               // wrong aes length, to return -1
-       jmp             L_error                                 // early exit due to wrong aes length
-
-
-       //
-       // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
-       //
-
-L_decrypt_128:
-
-       cmp             $1, num_blk
-       jl              L_HW_cbc_done                   // if num_blk < 1, early return
-
-       // aes-128 decrypt expanded keys
-       movups  160(ctx), %xmm3
-       movups  144(ctx), %xmm4
-       movups  128(ctx), %xmm5
-       movups  112(ctx), %xmm6
-       movups  96(ctx), %xmm7
-#if defined    __x86_64__
-       movups  80(ctx), %xmm8
-       movups  64(ctx), %xmm9
-       movups  48(ctx), %xmm10
-       movups  32(ctx), %xmm11
-       movups  16(ctx), %xmm12
-       movups  0(ctx), %xmm13
-#endif
-
-       // performs 4 block decryption in an iteration to exploit decrypt in parallel
-
-       //              while ((num_blk-=4)>=0) {
-       //                      aes_decrypt(ibuf, obuf, ctx);
-       //                      aes_decrypt(ibuf+1, obuf+1, ctx);
-       //                      aes_decrypt(ibuf+2, obuf+2, ctx);
-       //                      aes_decrypt(ibuf+3, obuf+3, ctx);
-       //                      obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
-       //                      *iv = ibuf[3]; ibuf += 4; obuf += 4;
-       //              }
-
-       sub             $4, num_blk                                     // pre decrement num_blk by 4
-       jl              9f                                                      // if num_blk < 4, skip the per-4-blocks processing code
-
-0:
-
-
-#if defined    __x86_64__
-
-       movups  (ibuf), %xmm1                           // tmp = 1st ibuf
-       movups  16(ibuf), %xmm2                         // tmp = 2nd ibuf
-       movups  32(ibuf), %xmm14                        // tmp = 3rd ibuf
-       movups  48(ibuf), %xmm15                        // tmp = 4th ibuf
-
-       // for x86_64, the expanded keys are already stored in xmm3-xmm13
-
-       // aes-128 decrypt round 0 per 4 blocks
-       pxor    %xmm3, %xmm1
-       pxor    %xmm3, %xmm2
-       pxor    %xmm3, %xmm14
-       pxor    %xmm3, %xmm15
-
-       // aes-128 decrypt round 1 per 4 blocks
-    aesdec  %xmm4, %xmm1
-    aesdec  %xmm4, %xmm2
-    aesdec  %xmm4, %xmm14
-    aesdec  %xmm4, %xmm15
-
-       // aes-128 decrypt round 2 per 4 blocks
-    aesdec  %xmm5, %xmm1
-    aesdec  %xmm5, %xmm2
-    aesdec  %xmm5, %xmm14
-    aesdec  %xmm5, %xmm15
-
-       // aes-128 decrypt round 3 per 4 blocks
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm14
-    aesdec  %xmm6, %xmm15
-
-       // aes-128 decrypt round 4 per 4 blocks
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm14
-    aesdec  %xmm7, %xmm15
-
-       // aes-128 decrypt round 5 per 4 blocks
-    aesdec  %xmm8, %xmm1
-    aesdec  %xmm8, %xmm2
-    aesdec  %xmm8, %xmm14
-    aesdec  %xmm8, %xmm15
-
-       // aes-128 decrypt round 6 per 4 blocks
-    aesdec  %xmm9, %xmm1
-    aesdec  %xmm9, %xmm2
-    aesdec  %xmm9, %xmm14
-    aesdec  %xmm9, %xmm15
-
-       // aes-128 decrypt round 7 per 4 blocks
-    aesdec  %xmm10, %xmm1
-    aesdec  %xmm10, %xmm2
-    aesdec  %xmm10, %xmm14
-    aesdec  %xmm10, %xmm15
-
-       // aes-128 decrypt round 8 per 4 blocks
-    aesdec  %xmm11, %xmm1
-    aesdec  %xmm11, %xmm2
-    aesdec  %xmm11, %xmm14
-    aesdec  %xmm11, %xmm15
-
-       // aes-128 decrypt round 9 per 4 blocks
-    aesdec  %xmm12, %xmm1
-    aesdec  %xmm12, %xmm2
-    aesdec  %xmm12, %xmm14
-    aesdec  %xmm12, %xmm15
-
-       // aes-128 decrypt round 10 (last) per 4 blocks
-    aesdeclast  %xmm13, %xmm1
-    aesdeclast  %xmm13, %xmm2
-    aesdeclast  %xmm13, %xmm14
-    aesdeclast  %xmm13, %xmm15
-
-       pxor    iv, %xmm1                               // obuf[0] ^= *iv; 
-       movups  (ibuf), iv                              // ibuf[0]
-       pxor    iv, %xmm2                               // obuf[1] ^= ibuf[0]; 
-       movups  16(ibuf), iv                    // ibuf[1]
-       pxor    iv, %xmm14                              // obuf[2] ^= ibuf[1]; 
-       movups  32(ibuf), iv                    // ibuf[2] 
-       pxor    iv, %xmm15                              // obuf[3] ^= obuf[2]; 
-       movups  48(ibuf), iv                    // *iv = ibuf[3]
-
-       movups  %xmm1, (obuf)                   // write 1st obuf
-       movups  %xmm2, 16(obuf)                 // write 2nd obuf
-       movups  %xmm14, 32(obuf)                // write 3rd obuf
-       movups  %xmm15, 48(obuf)                // write 4th obuf
-
-
-#else
-
-       // aes_decrypt_cbc per 4 blocks using aes-128 for i386
-       // xmm1/xmm2/xmm4/xmm5 used for obuf per block
-       // xmm3 = key0
-       // xmm0 = iv
-       // xmm6/xmm7 dynamically load with other expanded keys
-
-       movups  (ibuf), %xmm1                   // tmp = 1st ibuf
-       movups  16(ibuf), %xmm2                 // tmp = 2nd ibuf
-       movups  32(ibuf), %xmm4                 // tmp = 3rd ibuf
-       movups  48(ibuf), %xmm5                 // tmp = 4th ibuf
-
-       // aes_decrypt
-       // for i386, sequentially load expanded keys into xmm6/xmm7
-
-       movups  144(ctx), %xmm6                 // key1
-
-       // aes-128 decrypt round 0 per 4 blocks
-       pxor    %xmm3, %xmm1
-       pxor    %xmm3, %xmm2
-       pxor    %xmm3, %xmm4
-       pxor    %xmm3, %xmm5
-
-       movups  128(ctx), %xmm7                 // key2
-
-       // aes-128 decrypt round 1 per 4 blocks
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  112(ctx), %xmm6                 // key3
-
-       // aes-128 decrypt round 2 per 4 blocks
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  96(ctx), %xmm7                  // key4
-
-       // aes-128 decrypt round 3 per 4 blocks
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  80(ctx), %xmm6                  // key5
-
-       // aes-128 decrypt round 4 per 4 blocks
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  64(ctx), %xmm7                  // key6
-
-       // aes-128 decrypt round 5 per 4 blocks
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  48(ctx), %xmm6                  // key7
-
-       // aes-128 decrypt round 6 per 4 blocks
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  32(ctx), %xmm7                  // key8
-
-       // aes-128 decrypt round 7 per 4 blocks
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  16(ctx), %xmm6                  // key9
-
-       // aes-128 decrypt round 8 per 4 blocks
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  0(ctx), %xmm7                   // keyA
-
-       // aes-128 decrypt round 9 per 4 blocks
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       // aes-128 decrypt round 10 (last) per 4 blocks
-    aesdeclast  %xmm7, %xmm1
-    aesdeclast  %xmm7, %xmm2
-    aesdeclast  %xmm7, %xmm4
-    aesdeclast  %xmm7, %xmm5
-
-       pxor    iv, %xmm1                               // 1st obuf ^= iv; 
-       movups  (ibuf), iv                              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm2                               // 2nd obuf ^= iv; 
-       movups  16(ibuf), iv                    // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm4                               // 3rd obuf ^= iv; 
-       movups  32(ibuf), iv                    // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm5                               // 4th obuf ^= iv; 
-       movups  48(ibuf), iv                    // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
-
-       movups  %xmm1, (obuf)                   // write 1st obuf
-       movups  %xmm2, 16(obuf)                 // write 2nd obuf
-       movups  %xmm4, 32(obuf)                 // write 3rd obuf
-       movups  %xmm5, 48(obuf)                 // write 4th obuf
-#endif
-
-       add             $64, ibuf                               // ibuf += 4; 
-       add             $64, obuf                               // obuf += 4;   
-
-       sub             $4, num_blk                             // num_blk -= 4
-       jge             0b                                              // if num_blk > 0, repeat the loop
-
-9:     add             $4, num_blk                             // post incremtn num_blk by 4
-       je              L_HW_cbc_done                   // if num_blk == 0, no need for forthur processing code
-
-#if defined    __i386__
-       // updated as they might be needed as expanded keys in the remaining
-       movups  144(ctx), %xmm4
-       movups  128(ctx), %xmm5
-       movups  112(ctx), %xmm6
-       movups  96(ctx), %xmm7
-#endif
-
-       test    $2, num_blk                             // check whether num_blk has 2 blocks
-       je              9f                                              // if num_blk & 2 == 0, skip the per-pair processing code
-
-       // do the remaining 2 blocks together
-
-       movups  (ibuf), %xmm1                           // tmp = 1st ibuf
-       movups  16(ibuf), %xmm2                         // tmp = 2nd ibuf
-
-       // aes_decrypt
-       pxor    %xmm3, %xmm1
-       pxor    %xmm3, %xmm2
-    aesdec  %xmm4, %xmm1
-    aesdec  %xmm4, %xmm2
-    aesdec  %xmm5, %xmm1
-    aesdec  %xmm5, %xmm2
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-#if defined    __x86_64__
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm8, %xmm1
-    aesdec  %xmm8, %xmm2
-    aesdec  %xmm9, %xmm1
-    aesdec  %xmm9, %xmm2
-    aesdec  %xmm10, %xmm1
-    aesdec  %xmm10, %xmm2
-    aesdec  %xmm11, %xmm1
-    aesdec  %xmm11, %xmm2
-    aesdec  %xmm12, %xmm1
-    aesdec  %xmm12, %xmm2
-    aesdeclast  %xmm13, %xmm1
-    aesdeclast  %xmm13, %xmm2
-#else
-       movups  80(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-       movups  64(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-       movups  48(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-       movups  32(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-       movups  16(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-       movups  0(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdeclast  %xmm7, %xmm1
-    aesdeclast  %xmm7, %xmm2
-       movups  112(ctx), %xmm6
-       movups  96(ctx), %xmm7
-#endif
-
-       pxor    iv, %xmm1                               // obuf[0] ^= *iv; 
-       movups  (ibuf), iv                              // ibuf[0]
-       pxor    iv, %xmm2                               // obuf[1] ^= ibuf[0]
-       movups  16(ibuf), iv                    // *iv = ibuf[1]
-
-       movups  %xmm1, (obuf)                   // write obuf[0]
-       movups  %xmm2, 16(obuf)                 // write obuf[1]
-
-       add             $32, ibuf                               // ibuf += 2
-       add             $32, obuf                               // obuf += 2
-
-9:
-       test    $1, num_blk                             // check whether num_blk has residual 1 block
-       je              L_HW_cbc_done                   // if num_blk == 0, no need for residual processing code
-       
-       movups  (ibuf), %xmm2                           // tmp = ibuf
-       // aes_decrypt
-       pxor    %xmm3, %xmm2
-    aesdec  %xmm4, %xmm2
-    aesdec  %xmm5, %xmm2
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm7, %xmm2
-#if defined    __x86_64__
-    aesdec  %xmm8, %xmm2
-    aesdec  %xmm9, %xmm2
-    aesdec  %xmm10, %xmm2
-    aesdec  %xmm11, %xmm2
-    aesdec  %xmm12, %xmm2
-    aesdeclast  %xmm13, %xmm2
-#else
-       movups  80(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  64(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  48(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  32(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  16(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  (ctx), %xmm1
-    aesdeclast  %xmm1, %xmm2
-#endif
-
-       pxor    iv, %xmm2                       // *obuf ^= *iv; 
-       movups  (ibuf), iv                      // *iv = *ibuf;
-       movups  %xmm2, (obuf)           // write *obuf
-
-       jmp             L_HW_cbc_done
-
-       //
-       // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
-       //
-
-L_decrypt_192:
-
-       cmp             $1, num_blk
-       jl              L_HW_cbc_done                   // if num_blk < 1, early return
-
-       // aes-192 decryp expanded keys
-       movups  192(ctx), %xmm3
-       movups  176(ctx), %xmm4
-       movups  160(ctx), %xmm5
-       movups  144(ctx), %xmm6
-       movups  128(ctx), %xmm7
-#if defined    __x86_64__
-       movups  112(ctx), %xmm8
-       movups  96(ctx), %xmm9
-       movups  80(ctx), %xmm10
-       movups  64(ctx), %xmm11
-       movups  48(ctx), %xmm12
-       movups  32(ctx), %xmm13
-       movups  16(ctx), %xmm14
-       movups  (ctx), %xmm15
-#endif
-
-       // performs 4 block decryption in an iteration to exploit decrypt in parallel
-
-       //              while ((num_blk-=4)>=0) {
-       //                      aes_decrypt(ibuf, obuf, ctx);
-       //                      aes_decrypt(ibuf+1, obuf+1, ctx);
-       //                      aes_decrypt(ibuf+2, obuf+2, ctx);
-       //                      aes_decrypt(ibuf+3, obuf+3, ctx);
-       //                      obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
-       //                      *iv = ibuf[3]; ibuf += 4; obuf += 4;
-       //              }
-
-       sub             $4, num_blk                                     // pre decrement num_blk by 4
-       jl              9f                                                      // if num_blk < 4, skip the per-4-blocks processing code
-0:
-
-#if defined    __x86_64__
-
-       movups  (ibuf), %xmm1                           // tmp = 1st ibuf
-       movups  16(ibuf), %xmm2                         // tmp = 2nd ibuf
-       movups  32(ibuf), %xmm14                        // tmp = 3rd ibuf
-       movups  48(ibuf), %xmm15                        // tmp = 4th ibuf
-
-       // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
-       // use %xmm12/%xmm13 ts dynamic keys in the middle, restored afterwards
-
-       // round 0 for 4 blocks
-       pxor    %xmm3, %xmm1
-       pxor    %xmm3, %xmm2
-       pxor    %xmm3, %xmm14
-       pxor    %xmm3, %xmm15
-
-       // round 1 for 4 blocks
-    aesdec  %xmm4, %xmm1
-    aesdec  %xmm4, %xmm2
-    aesdec  %xmm4, %xmm14
-    aesdec  %xmm4, %xmm15
-
-       // round 2 for 4 blocks
-    aesdec  %xmm5, %xmm1
-    aesdec  %xmm5, %xmm2
-    aesdec  %xmm5, %xmm14
-    aesdec  %xmm5, %xmm15
-
-       // round 3 for 4 blocks
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm14
-    aesdec  %xmm6, %xmm15
-
-       // round 4 for 4 blocks
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm14
-    aesdec  %xmm7, %xmm15
-
-       // round 5 for 4 blocks
-    aesdec  %xmm8, %xmm1
-    aesdec  %xmm8, %xmm2
-    aesdec  %xmm8, %xmm14
-    aesdec  %xmm8, %xmm15
-
-       // round 6 for 4 blocks
-    aesdec  %xmm9, %xmm1
-    aesdec  %xmm9, %xmm2
-    aesdec  %xmm9, %xmm14
-    aesdec  %xmm9, %xmm15
-
-       // round 7 for 4 blocks
-    aesdec  %xmm10, %xmm1
-    aesdec  %xmm10, %xmm2
-    aesdec  %xmm10, %xmm14
-    aesdec  %xmm10, %xmm15
-
-       // round 8 for 4 blocks
-    aesdec  %xmm11, %xmm1
-    aesdec  %xmm11, %xmm2
-    aesdec  %xmm11, %xmm14
-    aesdec  %xmm11, %xmm15
-
-       // round 9 for 4 blocks
-    aesdec  %xmm12, %xmm1
-    aesdec  %xmm12, %xmm2
-    aesdec  %xmm12, %xmm14
-    aesdec  %xmm12, %xmm15
-
-       movups  16(ctx), %xmm12
-
-       // round A for 4 blocks
-    aesdec  %xmm13, %xmm1
-    aesdec  %xmm13, %xmm2
-    aesdec  %xmm13, %xmm14
-    aesdec  %xmm13, %xmm15
-
-       movups  (ctx), %xmm13
-
-       // round B for 4 blocks
-    aesdec  %xmm12, %xmm1
-    aesdec  %xmm12, %xmm2
-    aesdec  %xmm12, %xmm14
-    aesdec  %xmm12, %xmm15
-
-       movups  48(ctx), %xmm12         // restore %xmm12 to its original key
-
-       // round C (last) for 4 blocks
-    aesdeclast  %xmm13, %xmm1
-    aesdeclast  %xmm13, %xmm2
-    aesdeclast  %xmm13, %xmm14
-    aesdeclast  %xmm13, %xmm15
-
-       movups  32(ctx), %xmm13         // restore %xmm13 to its original key
-
-       pxor    iv, %xmm1                               // obuf[0] ^= *iv; 
-       movups  (ibuf), iv                              // ibuf[0]
-       pxor    iv, %xmm2                               // obuf[1] ^= ibuf[0] 
-       movups  16(ibuf), iv                    // ibuf[1]
-       pxor    iv, %xmm14                              // obuf[2] ^= ibuf[1] 
-       movups  32(ibuf), iv                    // ibuf[2] 
-       pxor    iv, %xmm15                              // obuf[3] ^= ibuf[2] 
-       movups  48(ibuf), iv                    // *iv = ibuf[3] 
-
-       movups  %xmm1, (obuf)                   // write 1st obuf
-       movups  %xmm2, 16(obuf)                 // write 2nd obuf
-       movups  %xmm14, 32(obuf)                // write 3rd obuf
-       movups  %xmm15, 48(obuf)                // write 4th obuf
-
-       add             $64, ibuf                               // ibuf += 4; 
-       add             $64, obuf                               // obuf += 4;   
-
-       sub             $4, num_blk                             // num_blk -= 4
-       jge             0b                                              // if num_blk > 0, repeat the loop
-
-9:     add             $4, num_blk                             // post incremtn num_blk by 4
-       je              L_HW_cbc_done                   // if num_blk == 0, prepare to return 
-
-       movups  16(ctx), %xmm14                 // restore %xmm14 to its key
-       movups  (ctx), %xmm15                   // restore %xmm15 to its key
-
-#else
-
-       movups  (ibuf), %xmm1                   // tmp = 1st ibuf
-       movups  16(ibuf), %xmm2                 // tmp = 2nd ibuf
-       movups  32(ibuf), %xmm4                 // tmp = 3rd ibuf
-       movups  48(ibuf), %xmm5                 // tmp = 4th ibuf
-
-       // aes_decrypt
-       // for i386, sequentially load expanded keys into xmm6/xmm7
-       movups  176(ctx), %xmm6
-       pxor    %xmm3, %xmm1
-       pxor    %xmm3, %xmm2
-       pxor    %xmm3, %xmm4
-       pxor    %xmm3, %xmm5
-
-       movups  160(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  144(ctx), %xmm6
-       aesdec    %xmm7, %xmm1
-       aesdec    %xmm7, %xmm2
-       aesdec    %xmm7, %xmm4
-       aesdec    %xmm7, %xmm5
-
-       movups  128(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  112(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  96(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  80(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  64(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  48(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  32(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  16(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  0(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-    aesdeclast  %xmm7, %xmm1
-    aesdeclast  %xmm7, %xmm2
-    aesdeclast  %xmm7, %xmm4
-    aesdeclast  %xmm7, %xmm5
-
-       pxor    iv, %xmm1                               // 1st obuf ^= iv; 
-       movups  (ibuf), iv                              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm2                               // 2nd obuf ^= iv; 
-       movups  16(ibuf), iv                    // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm4                               // 3rd obuf ^= iv; 
-       movups  32(ibuf), iv                    // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm5                               // 4th obuf ^= iv; 
-       movups  48(ibuf), iv                    // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
-       movups  %xmm1, (obuf)                   // write 1st obuf
-       movups  %xmm2, 16(obuf)                 // write 2nd obuf
-       movups  %xmm4, 32(obuf)                 // write 3rd obuf
-       movups  %xmm5, 48(obuf)                 // write 4th obuf
-
-       add             $64, ibuf                               // ibuf += AES_BLOCK_SIZE * 4; 
-       add             $64, obuf                               // obuf += AES_BLOCK_SIZE * 4;  
-
-       sub             $4, num_blk                             // num_blk -= 4
-       jge             0b                                              // if num_blk > 0, repeat the loop
-
-
-9:     add             $4, num_blk                             //      post incremtn num_blk by 4
-       je              L_HW_cbc_done                   // if num_blk == 0, no need for forthur processing code
-
-       movups  176(ctx), %xmm4
-       movups  160(ctx), %xmm5
-       movups  144(ctx), %xmm6
-       movups  128(ctx), %xmm7
-
-#endif
-
-       // per-block aes_decrypt_cbc loop
-
-0:
-       movups  (ibuf), %xmm2                           // tmp = ibuf
-
-       // aes_decrypt
-       pxor    %xmm3, %xmm2
-    aesdec  %xmm4, %xmm2
-    aesdec  %xmm5, %xmm2
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm7, %xmm2
-#if defined    __x86_64__
-    aesdec  %xmm8, %xmm2
-    aesdec  %xmm9, %xmm2
-    aesdec  %xmm10, %xmm2
-    aesdec  %xmm11, %xmm2
-    aesdec  %xmm12, %xmm2
-    aesdec  %xmm13, %xmm2
-    aesdec  %xmm14, %xmm2
-    aesdeclast  %xmm15, %xmm2
-#else
-       movups  112(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  96(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  80(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  64(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  48(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  32(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  16(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  (ctx), %xmm1
-    aesdeclast  %xmm1, %xmm2
-#endif
-
-       pxor    iv, %xmm2                       // obuf ^= iv; 
-       movups  (ibuf), iv                      // memcpy(iv, tmp, AES_BLOCK_SIZE);
-
-       movups  %xmm2, (obuf)           // write obuf
-
-       add             $16, ibuf                               // ibuf += AES_BLOCK_SIZE; 
-       add             $16, obuf                               // obuf += AES_BLOCK_SIZE;      
-       sub             $1, num_blk                             // num_blk --
-       jg              0b                                              // if num_blk > 0, repeat the loop
-
-       jmp             L_HW_cbc_done
-
-       //
-       // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
-       //
-
-L_decrypt_256:
-
-       cmp             $1, num_blk
-       jl              L_HW_cbc_done   
-
-       movups  224(ctx), %xmm3
-       movups  208(ctx), %xmm4
-       movups  192(ctx), %xmm5
-       movups  176(ctx), %xmm6
-       movups  160(ctx), %xmm7
-#if defined    __x86_64__
-       movups  144(ctx), %xmm8
-       movups  128(ctx), %xmm9
-       movups  112(ctx), %xmm10
-       movups  96(ctx), %xmm11
-       movups  80(ctx), %xmm12
-       movups  64(ctx), %xmm13
-       movups  48(ctx), %xmm14
-       movups  32(ctx), %xmm15
-//     movups  16(ctx), %xmm14
-//     movups  (ctx), %xmm15
-#endif
-
-#if defined    __x86_64__
-
-       sub             $4, num_blk                                     // pre decrement num_blk by 4
-       jl              9f                                                      // if num_blk < 4, skip the per-4-blocks processing code
-0:
-       movups  (ibuf), %xmm1                           // tmp = 1st ibuf
-       movups  16(ibuf), %xmm2                         // tmp = 2nd ibuf
-       movups  32(ibuf), %xmm14                        // tmp = 3rd ibuf
-       movups  48(ibuf), %xmm15                        // tmp = 4th ibuf
-
-       // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
-       pxor    %xmm3, %xmm1
-       pxor    %xmm3, %xmm2
-       pxor    %xmm3, %xmm14
-       pxor    %xmm3, %xmm15
-
-    aesdec  %xmm4, %xmm1
-    aesdec  %xmm4, %xmm2
-    aesdec  %xmm4, %xmm14
-    aesdec  %xmm4, %xmm15
-
-    aesdec  %xmm5, %xmm1
-    aesdec  %xmm5, %xmm2
-    aesdec  %xmm5, %xmm14
-    aesdec  %xmm5, %xmm15
-
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm14
-    aesdec  %xmm6, %xmm15
-
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm14
-    aesdec  %xmm7, %xmm15
-
-    aesdec  %xmm8, %xmm1
-    aesdec  %xmm8, %xmm2
-    aesdec  %xmm8, %xmm14
-    aesdec  %xmm8, %xmm15
-
-    aesdec  %xmm9, %xmm1
-    aesdec  %xmm9, %xmm2
-    aesdec  %xmm9, %xmm14
-    aesdec  %xmm9, %xmm15
-
-    aesdec  %xmm10, %xmm1
-    aesdec  %xmm10, %xmm2
-    aesdec  %xmm10, %xmm14
-    aesdec  %xmm10, %xmm15
-
-    aesdec  %xmm11, %xmm1
-    aesdec  %xmm11, %xmm2
-    aesdec  %xmm11, %xmm14
-    aesdec  %xmm11, %xmm15
-
-    aesdec  %xmm12, %xmm1
-    aesdec  %xmm12, %xmm2
-    aesdec  %xmm12, %xmm14
-    aesdec  %xmm12, %xmm15
-       movups  48(ctx), %xmm12
-
-    aesdec  %xmm13, %xmm1
-    aesdec  %xmm13, %xmm2
-    aesdec  %xmm13, %xmm14
-    aesdec  %xmm13, %xmm15
-       movups  32(ctx), %xmm13
-
-    aesdec  %xmm12, %xmm1
-    aesdec  %xmm12, %xmm2
-    aesdec  %xmm12, %xmm14
-    aesdec  %xmm12, %xmm15
-       movups  16(ctx), %xmm12
-
-    aesdec  %xmm13, %xmm1
-    aesdec  %xmm13, %xmm2
-    aesdec  %xmm13, %xmm14
-    aesdec  %xmm13, %xmm15
-       movups  (ctx), %xmm13
-
-    aesdec  %xmm12, %xmm1
-    aesdec  %xmm12, %xmm2
-    aesdec  %xmm12, %xmm14
-    aesdec  %xmm12, %xmm15
-       movups  80(ctx), %xmm12
-
-    aesdeclast  %xmm13, %xmm1
-    aesdeclast  %xmm13, %xmm2
-    aesdeclast  %xmm13, %xmm14
-    aesdeclast  %xmm13, %xmm15
-       movups  64(ctx), %xmm13
-
-       pxor    iv, %xmm1                               // obuf ^= iv; 
-       movups  (ibuf), iv                              // memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm2                               // obuf ^= iv; 
-       movups  16(ibuf), iv                    // memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm14                              // obuf ^= iv; 
-       movups  32(ibuf), iv                    // memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm15                              // obuf ^= iv; 
-       movups  48(ibuf), iv                    // memcpy(iv, tmp, AES_BLOCK_SIZE);
-
-       movups  %xmm1, (obuf)                   // write 1st obuf
-       movups  %xmm2, 16(obuf)                 // write 2nd obuf
-       movups  %xmm14, 32(obuf)                // write 3rd obuf
-       movups  %xmm15, 48(obuf)                // write 4th obuf
-
-       add             $64, ibuf                               // ibuf += AES_BLOCK_SIZE*4; 
-       add             $64, obuf                               // obuf += AES_BLOCK_SIZE*4;    
-
-       sub             $4, num_blk                             // num_blk -= 4
-       jge             0b                                              // if num_blk > 0, repeat the loop
-
-9:     add             $4, num_blk                             //      post incremtn num_blk by 4
-       je              L_HW_cbc_done                   // if num_blk == 0, no need for forthur processing code
-
-       movups  48(ctx), %xmm14
-       movups  32(ctx), %xmm15
-
-#else
-
-       sub             $4, num_blk                             // pre decrement num_blk by 4
-       jl              9f                                              // if num_blk < 4, skip the per-pair processing code
-0:
-       movups  (ibuf), %xmm1                   // tmp = 1st ibuf
-       movups  16(ibuf), %xmm2                 // tmp = 2nd ibuf
-       movups  32(ibuf), %xmm4                 // tmp = 3rd ibuf
-       movups  48(ibuf), %xmm5                 // tmp = 4th ibuf
-
-       // aes_decrypt
-       // for i386, sequentially load expanded keys into xmm6/xmm7
-       movups  208(ctx), %xmm6
-       pxor    %xmm3, %xmm1
-       pxor    %xmm3, %xmm2
-       pxor    %xmm3, %xmm4
-       pxor    %xmm3, %xmm5
-
-       movups  192(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  176(ctx), %xmm6
-       aesdec  %xmm7, %xmm1
-       aesdec  %xmm7, %xmm2
-       aesdec  %xmm7, %xmm4
-       aesdec  %xmm7, %xmm5
-
-       movups  160(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  144(ctx), %xmm6
-       aesdec  %xmm7, %xmm1
-       aesdec  %xmm7, %xmm2
-       aesdec  %xmm7, %xmm4
-       aesdec  %xmm7, %xmm5
-
-       movups  128(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  112(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  96(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  80(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  64(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  48(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  32(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-       movups  16(ctx), %xmm6
-    aesdec  %xmm7, %xmm1
-    aesdec  %xmm7, %xmm2
-    aesdec  %xmm7, %xmm4
-    aesdec  %xmm7, %xmm5
-
-       movups  0(ctx), %xmm7
-    aesdec  %xmm6, %xmm1
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm6, %xmm4
-    aesdec  %xmm6, %xmm5
-
-    aesdeclast  %xmm7, %xmm1
-    aesdeclast  %xmm7, %xmm2
-    aesdeclast  %xmm7, %xmm4
-    aesdeclast  %xmm7, %xmm5
-
-       pxor    iv, %xmm1                               // 1st obuf ^= iv; 
-       movups  (ibuf), iv                              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm2                               // 2nd obuf ^= iv; 
-       movups  16(ibuf), iv                    // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm4                               // 3rd obuf ^= iv; 
-       movups  32(ibuf), iv                    // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
-       pxor    iv, %xmm5                               // 4th obuf ^= iv; 
-       movups  48(ibuf), iv                    // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
-       movups  %xmm1, (obuf)                   // write 1st obuf
-       movups  %xmm2, 16(obuf)                 // write 2nd obuf
-       movups  %xmm4, 32(obuf)                 // write 3rd obuf
-       movups  %xmm5, 48(obuf)                 // write 4th obuf
-
-       add             $64, ibuf                               // ibuf += AES_BLOCK_SIZE * 4; 
-       add             $64, obuf                               // obuf += AES_BLOCK_SIZE * 4;  
-
-       sub             $4, num_blk                             // num_blk -= 4
-       jge             0b                                              // if num_blk > 0, repeat the loop
-
-
-9:     add             $4, num_blk                             //      post incremtn num_blk by 4
-       je              L_HW_cbc_done                   // if num_blk == 0, no need for forthur processing code
-
-       movups  208(ctx), %xmm4
-       movups  192(ctx), %xmm5
-       movups  176(ctx), %xmm6
-       movups  160(ctx), %xmm7
-
-#endif
-
-0:
-       movups  (ibuf), %xmm2                           // tmp = ibuf
-
-       // aes_decrypt
-       pxor    %xmm3, %xmm2
-    aesdec  %xmm4, %xmm2
-    aesdec  %xmm5, %xmm2
-    aesdec  %xmm6, %xmm2
-    aesdec  %xmm7, %xmm2
-#if defined    __x86_64__
-    aesdec  %xmm8, %xmm2
-    aesdec  %xmm9, %xmm2
-    aesdec  %xmm10, %xmm2
-    aesdec  %xmm11, %xmm2
-    aesdec  %xmm12, %xmm2
-    aesdec  %xmm13, %xmm2
-    aesdec  %xmm14, %xmm2
-    aesdec  %xmm15, %xmm2
-#else
-       movups  144(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  128(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  112(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  96(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  80(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  64(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  48(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  32(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-#endif
-       movups  16(ctx), %xmm1
-    aesdec  %xmm1, %xmm2
-       movups  (ctx), %xmm1
-    aesdeclast  %xmm1, %xmm2
-
-       pxor    iv, %xmm2                       // obuf ^= iv; 
-       movups  (ibuf), iv                      // memcpy(iv, tmp, AES_BLOCK_SIZE);
-
-       movups  %xmm2, (obuf)           // write obuf
-
-       add             $16, ibuf                               // ibuf += AES_BLOCK_SIZE; 
-       add             $16, obuf                               // obuf += AES_BLOCK_SIZE;      
-       sub             $1, num_blk                             // num_blk --
-       jg              0b                                              // if num_blk > 0, repeat the loop
-
-       jmp             L_HW_cbc_done
-
-       //
-       // --------- END of aes_decrypt_cbc_hw  -------------------
-       //
+/*\r
+ ---------------------------------------------------------------------------\r
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.\r
+\r
+ LICENSE TERMS\r
+\r
+ The free distribution and use of this software in both source and binary\r
+ form is allowed (with or without changes) provided that:\r
+\r
+   1. distributions of this source code include the above copyright\r
+      notice, this list of conditions and the following disclaimer;\r
+\r
+   2. distributions in binary form include the above copyright\r
+      notice, this list of conditions and the following disclaimer\r
+      in the documentation and/or other associated materials;\r
+\r
+   3. the copyright holder's name is not used to endorse products\r
+      built using this software without specific written permission.\r
+\r
+ ALTERNATIVELY, provided that this notice is retained in full, this product\r
+ may be distributed under the terms of the GNU General Public License (GPL),\r
+ in which case the provisions of the GPL apply INSTEAD OF those given above.\r
+\r
+ DISCLAIMER\r
+\r
+ This software is provided 'as is' with no explicit or implied warranties\r
+ in respect of its properties, including, but not limited to, correctness\r
+ and/or fitness for purpose.\r
+ ---------------------------------------------------------------------------\r
+ Issue 31/01/2006\r
+\r
+ These subroutines implement multiple block AES modes for ECB, CBC, CFB,\r
+ OFB and CTR encryption.  The code provides support for the VIA Advanced \r
+ Cryptography Engine (ACE).\r
+\r
+ NOTE: In the following subroutines, the AES contexts (ctx) must be\r
+ 16 byte aligned if VIA ACE is being used\r
+*/\r
+\r
+\r
+/* ---------------------------------------------------------------------------------------------------------------- \r
+\r
+       aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :\r
+\r
+       For simplicity, I am assuming all variables are in 128-bit data type.\r
+\r
+       aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)\r
+       {\r
+               while(num_blk--) {\r
+                       *iv ^= *ibuf++;\r
+                       aes_encrypt(iv, iv, ctx);\r
+                       *obuf++ = *iv;\r
+               }\r
+               return 0;\r
+       }\r
+\r
+       The following is an implementation of this function using Intel AESNI.\r
+       This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. \r
+       Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch\r
+       to this aesni-based function if it detects that aesni is available.\r
+       Calling this function blindly WILL cause a CRASH on systems with no aesni support. \r
+\r
+       Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks\r
+       are serially chained. This prevents us from arranging several blocks for encryption in parallel.\r
+\r
+   ----------------------------------------------------------------------------------------------------------------*/\r
+\r
+       .text\r
+       .align  4,0x90\r
+       .globl  _aes_encrypt_cbc_hw\r
+_aes_encrypt_cbc_hw:\r
+\r
+       // push/save registers for local use\r
+#if    defined __i386__\r
+\r
+       push    %ebp\r
+       movl    %esp, %ebp\r
+       push    %ebx\r
+       push    %edi\r
+\r
+       #define sp      %esp\r
+\r
+#else  // __x86_64__\r
+\r
+       push    %rbp\r
+       mov             %rsp, %rbp\r
+       push    %rbx\r
+       push    %r13\r
+       push    %r14\r
+       push    %r15\r
+\r
+       #define sp      %rsp\r
+\r
+#endif\r
+\r
+       // if this is kernel code, need to save used xmm registers\r
+#ifdef KERNEL\r
+\r
+#if defined __i386__\r
+       sub             $(8*16), %esp                   // for possible xmm0-xmm7 save/restore\r
+#else\r
+       sub             $(16*16), %rsp          // xmm0-xmm15 save/restore      \r
+#endif\r
+\r
+       movaps  %xmm0, (sp)\r
+       movaps  %xmm1, 16(sp)\r
+       movaps  %xmm2, 32(sp)\r
+       movaps  %xmm3, 48(sp)\r
+       movaps  %xmm4, 64(sp)\r
+       movaps  %xmm5, 80(sp)\r
+       movaps  %xmm6, 96(sp)\r
+       movaps  %xmm7, 112(sp)\r
+#if defined    __x86_64__\r
+       movaps  %xmm8, 16*8(sp)\r
+       movaps  %xmm9, 16*9(sp)\r
+       movaps  %xmm10, 16*10(sp)\r
+       movaps  %xmm11, 16*11(sp)\r
+       movaps  %xmm12, 16*12(sp)\r
+       movaps  %xmm13, 16*13(sp)\r
+       movaps  %xmm14, 16*14(sp)\r
+       movaps  %xmm15, 16*15(sp)\r
+#endif // __x86_64__\r
+\r
+#endif // KERNEL\r
+\r
+       #define iv      %xmm0\r
+\r
+#ifdef __i386__\r
+\r
+       mov             12(%ebp), %eax                  // in_iv\r
+       mov             24(%ebp), %edx                  // ctx\r
+       movups  (%eax), iv                              // iv = in_iv   \r
+       mov             8(%ebp), %ebx                   // ibuf\r
+       mov             16(%ebp), %ecx                  // num_blk\r
+       mov             20(%ebp), %edi                  // obuf\r
+\r
+       #define ibuf    %ebx\r
+       #define obuf    %edi\r
+       #define num_blk %ecx    \r
+       #define ctx             %edx\r
+\r
+#else\r
+\r
+       mov             %rdi, %rbx                              // ibuf\r
+       movups  (%rsi), iv                              // iv = in_iv\r
+       mov             %rdx, %r13                              // num_blk\r
+       mov             %rcx, %r14                              // obuf\r
+       mov             %r8, %r15                               // ctx  \r
+\r
+       #define ibuf    %rbx\r
+       #define num_blk %r13d\r
+       #define obuf    %r14    \r
+       #define ctx             %r15\r
+\r
+#endif\r
+\r
+       mov             240(ctx), %eax                  // aes length\r
+       cmp             $160, %eax                              // aes-128 encrypt ?\r
+       je              L_encrypt_128\r
+       cmp             $192, %eax                              // aes-192 encrypt ?\r
+       je              L_encrypt_192\r
+       cmp             $224, %eax                              // aes-256 encrypt ?\r
+       je              L_encrypt_256\r
+       mov             $-1, %eax                               // return error\r
+       jmp             L_error \r
+\r
+       // aes-128 encrypt_cbc operation (11 round keys, ctx+0 .. ctx+160), falls through to L_HW_cbc_done\r
+       // x86_64 : key0-keyA stay resident in xmm2-xmm12 across the loop\r
+       // i386   : key0-key5 stay in xmm2-xmm7, key6-keyA are reloaded through xmm1 for every block\r
+\r
+L_encrypt_128:\r
+\r
+       cmp             $1, num_blk                             // check number of blocks\r
+       jl              L_HW_cbc_done                   // if it is less than 1, there is nothing to do\r
+\r
+       movups  (ctx), %xmm2                    // key0\r
+       movups  16(ctx), %xmm3                  // key1\r
+       movups  32(ctx), %xmm4                  // key2\r
+       movups  48(ctx), %xmm5                  // key3\r
+       movups  64(ctx), %xmm6                  // key4\r
+       movups  80(ctx), %xmm7                  // key5\r
+#if defined    __x86_64__\r
+       movups  96(ctx), %xmm8                  // key6\r
+       movups  112(ctx), %xmm9                 // key7\r
+       movups  128(ctx), %xmm10                // key8\r
+       movups  144(ctx), %xmm11                // key9\r
+       movups  160(ctx), %xmm12                // keyA\r
+#endif\r
+\r
+       // while (num_blk--) {\r
+       //                      *iv ^= *ibuf++;\r
+       //                      aes_encrypt(iv, iv, ctx);\r
+       //                      *obuf++ = *iv;\r
+       // }\r
+0:\r
+       movups  (ibuf), %xmm1                           // *ibuf\r
+       pxor    %xmm2, iv                                       // round 0 (xor with key0), the 1st instruction inside aes_encrypt\r
+       pxor    %xmm1, iv                                       // *iv ^= *ibuf (the order of the two xors does not matter)\r
+\r
+       // finishing up the rest of aes_encrypt : 9 aesenc rounds (key1-key9) followed by aesenclast (keyA)\r
+    aesenc  %xmm3, iv\r
+    aesenc  %xmm4, iv\r
+    aesenc  %xmm5, iv\r
+    aesenc  %xmm6, iv\r
+    aesenc  %xmm7, iv\r
+#if defined    __x86_64__\r
+    aesenc  %xmm8, iv\r
+    aesenc  %xmm9, iv\r
+    aesenc  %xmm10, iv\r
+    aesenc  %xmm11, iv\r
+    aesenclast  %xmm12, iv\r
+#else\r
+       movups  96(ctx), %xmm1                          // key6\r
+    aesenc  %xmm1, iv\r
+       movups  112(ctx), %xmm1                         // key7\r
+    aesenc  %xmm1, iv\r
+       movups  128(ctx), %xmm1                         // key8\r
+    aesenc  %xmm1, iv\r
+       movups  144(ctx), %xmm1                         // key9\r
+    aesenc  %xmm1, iv\r
+       movups  160(ctx), %xmm1                         // keyA\r
+    aesenclast  %xmm1, iv\r
+#endif\r
+\r
+       movups  iv, (obuf)                                      // *obuf = *iv; iv stays in its register as the next chaining value\r
+       add             $16, obuf                                       // obuf++;\r
+       add             $16, ibuf                                       // ibuf++;\r
+       sub             $1, num_blk                                     // num_blk --\r
+       jg              0b                                                      // if num_blk > 0, repeat the loop\r
+\r
+       // common exit : every other case (encrypt/decrypt 128/192/256) branches to L_HW_cbc_done when it is done\r
+\r
+L_HW_cbc_done:\r
+\r
+       xor             %eax, %eax                              // return value 0, to return CRYPT_OK\r
+\r
+L_error:\r
+\r
+       // L_error is entered with the return value already in eax (-1 for a wrong aes length); if kernel, restore xmm registers\r
+#ifdef KERNEL \r
+       movaps  0(sp), %xmm0\r
+       movaps  16(sp), %xmm1\r
+       movaps  32(sp), %xmm2\r
+       movaps  48(sp), %xmm3\r
+       movaps  64(sp), %xmm4\r
+       movaps  80(sp), %xmm5\r
+       movaps  96(sp), %xmm6\r
+       movaps  112(sp), %xmm7\r
+#if defined    __x86_64__\r
+       movaps  16*8(sp), %xmm8\r
+       movaps  16*9(sp), %xmm9\r
+       movaps  16*10(sp), %xmm10\r
+       movaps  16*11(sp), %xmm11\r
+       movaps  16*12(sp), %xmm12\r
+       movaps  16*13(sp), %xmm13\r
+       movaps  16*14(sp), %xmm14\r
+       movaps  16*15(sp), %xmm15\r
+#endif // __x86_64__\r
+#endif // KERNEL\r
+\r
+       // release the xmm save area (kernel only), pop the callee-saved registers in reverse push order, and return \r
+#if    defined __i386__\r
+#ifdef KERNEL\r
+       add             $(8*16), %esp\r
+#endif\r
+       pop             %edi\r
+       pop             %ebx\r
+#else\r
+#ifdef KERNEL\r
+       add             $(16*16), %rsp  \r
+#endif\r
+       pop             %r15\r
+       pop             %r14\r
+       pop             %r13\r
+       pop             %rbx\r
+#endif\r
+       leave\r
+       ret\r
+\r
+       // aes-192 encrypt_cbc operation (13 round keys, ctx+0 .. ctx+192), after completion, branch to L_HW_cbc_done\r
+       // x86_64 : key0-keyC stay resident in xmm2-xmm14 across the loop\r
+       // i386   : key0-key5 stay in xmm2-xmm7, key6-keyC are reloaded through xmm1 for every block\r
+\r
+L_encrypt_192:\r
+\r
+       cmp             $1, num_blk                             // check number of blocks\r
+       jl              L_HW_cbc_done                   // if it is less than 1, there is nothing to do\r
+\r
+       movups  (ctx), %xmm2                    // key0\r
+       movups  16(ctx), %xmm3                  // key1\r
+       movups  32(ctx), %xmm4                  // key2\r
+       movups  48(ctx), %xmm5                  // key3\r
+       movups  64(ctx), %xmm6                  // key4\r
+       movups  80(ctx), %xmm7                  // key5\r
+#if defined    __x86_64__\r
+       movups  96(ctx), %xmm8                  // key6\r
+       movups  112(ctx), %xmm9                 // key7\r
+       movups  128(ctx), %xmm10                // key8\r
+       movups  144(ctx), %xmm11                // key9\r
+       movups  160(ctx), %xmm12                // keyA\r
+       movups  176(ctx), %xmm13                // keyB\r
+       movups  192(ctx), %xmm14                // keyC\r
+#endif\r
+       \r
+       // while (num_blk--) {\r
+       //                      *iv ^= *ibuf++;\r
+       //                      aes_encrypt(iv, iv, ctx);\r
+       //                      *obuf++ = *iv;\r
+       // }\r
+0:\r
+       movups  (ibuf), %xmm1                   // *ibuf\r
+       pxor    %xmm1, iv                               // *iv ^= *ibuf\r
+\r
+       // aes_encrypt(iv, iv, ctx); round 0 xor with key0, 11 aesenc rounds (key1-keyB), aesenclast with keyC\r
+\r
+       pxor    %xmm2, iv\r
+    aesenc  %xmm3, iv\r
+    aesenc  %xmm4, iv\r
+    aesenc  %xmm5, iv\r
+    aesenc  %xmm6, iv\r
+    aesenc  %xmm7, iv\r
+#if defined    __x86_64__\r
+    aesenc  %xmm8, iv\r
+    aesenc  %xmm9, iv\r
+    aesenc  %xmm10, iv\r
+    aesenc  %xmm11, iv\r
+    aesenc  %xmm12, iv\r
+    aesenc  %xmm13, iv\r
+    aesenclast  %xmm14, iv\r
+#else\r
+       movups  96(ctx), %xmm1\r
+    aesenc  %xmm1, iv\r
+       movups  112(ctx), %xmm1\r
+    aesenc  %xmm1, iv\r
+       movups  128(ctx), %xmm1\r
+    aesenc  %xmm1, iv\r
+       movups  144(ctx), %xmm1\r
+    aesenc  %xmm1, iv\r
+       movups  160(ctx), %xmm1\r
+    aesenc  %xmm1, iv\r
+       movups  176(ctx), %xmm1\r
+    aesenc  %xmm1, iv\r
+       movups  192(ctx), %xmm1\r
+    aesenclast  %xmm1, iv\r
+#endif\r
+\r
+       movups  iv, (obuf)                              // *obuf = *iv; iv stays in its register as the next chaining value\r
+       add             $16, ibuf                               // ibuf++\r
+       add             $16, obuf                               // obuf++\r
+\r
+       sub             $1, num_blk                             // num_blk --\r
+       jg              0b                                              // if num_blk > 0, repeat the loop\r
+\r
+       jmp             L_HW_cbc_done                   // share with the common exit code\r
+\r
+       // aes-256 encrypt_cbc operation (15 round keys, ctx+0 .. ctx+224), after completion, branch to L_HW_cbc_done\r
+       // x86_64 : key0-keyD stay resident in xmm2-xmm15, keyE is reloaded through xmm1 for every block\r
+       // i386   : key0-key5 stay in xmm2-xmm7, key6-keyE are reloaded through xmm1 for every block\r
+\r
+L_encrypt_256:\r
+\r
+       cmp             $1, num_blk                             // check number of blocks\r
+       jl              L_HW_cbc_done                   // if it is less than 1, there is nothing to do\r
+\r
+       movups  (ctx), %xmm2                    // key0\r
+       movups  16(ctx), %xmm3                  // key1\r
+       movups  32(ctx), %xmm4                  // key2\r
+       movups  48(ctx), %xmm5                  // key3\r
+       movups  64(ctx), %xmm6                  // key4\r
+       movups  80(ctx), %xmm7                  // key5\r
+#if defined    __x86_64__\r
+       movups  96(ctx), %xmm8                  // key6\r
+       movups  112(ctx), %xmm9                 // key7\r
+       movups  128(ctx), %xmm10                // key8\r
+       movups  144(ctx), %xmm11                // key9\r
+       movups  160(ctx), %xmm12                // keyA\r
+       movups  176(ctx), %xmm13                // keyB\r
+       movups  192(ctx), %xmm14                // keyC\r
+       movups  208(ctx), %xmm15                // keyD\r
+       // keyE (224(ctx)) is not preloaded here : xmm1 is also the *ibuf temporary, so keyE is loaded inside the loop\r
+#endif\r
+\r
+       // while (num_blk--) {\r
+       //                      *iv ^= *ibuf++;\r
+       //                      aes_encrypt(iv, iv, ctx);\r
+       //                      *obuf++ = *iv;\r
+       // }\r
+0:\r
+       movups  (ibuf), %xmm1                   // *ibuf\r
+       pxor    %xmm1, iv                               // *iv ^= *ibuf\r
+       \r
+       // aes_encrypt(iv, iv, ctx); round 0 xor with key0, 13 aesenc rounds (key1-keyD), aesenclast with keyE\r
+       pxor    %xmm2, iv\r
+    aesenc  %xmm3, iv\r
+    aesenc  %xmm4, iv\r
+    aesenc  %xmm5, iv\r
+    aesenc  %xmm6, iv\r
+    aesenc  %xmm7, iv\r
+#if defined    __x86_64__\r
+       movups  224(ctx), %xmm1                 // keyE, loaded now that xmm1 no longer holds *ibuf\r
+    aesenc  %xmm8, iv\r
+    aesenc  %xmm9, iv\r
+    aesenc  %xmm10, iv\r
+    aesenc  %xmm11, iv\r
+    aesenc  %xmm12, iv\r
+    aesenc  %xmm13, iv\r
+    aesenc  %xmm14, iv\r
+    aesenc  %xmm15, iv\r
+    aesenclast  %xmm1, iv\r
+#else\r
+       movups  96(ctx), %xmm1                  // key6\r
+    aesenc  %xmm1, iv\r
+       movups  112(ctx), %xmm1                 // key7\r
+    aesenc  %xmm1, iv\r
+       movups  128(ctx), %xmm1                 // key8\r
+    aesenc  %xmm1, iv\r
+       movups  144(ctx), %xmm1                 // key9\r
+    aesenc  %xmm1, iv\r
+       movups  160(ctx), %xmm1                 // keyA\r
+    aesenc  %xmm1, iv\r
+       movups  176(ctx), %xmm1                 // keyB\r
+    aesenc  %xmm1, iv\r
+       movups  192(ctx), %xmm1                 // keyC\r
+    aesenc  %xmm1, iv\r
+       movups  208(ctx), %xmm1                 // keyD\r
+    aesenc  %xmm1, iv\r
+       movups  224(ctx), %xmm1                 // keyE\r
+    aesenclast  %xmm1, iv\r
+#endif\r
+\r
+       movups  iv, (obuf)                              // *obuf = *iv; iv stays in its register as the next chaining value\r
+       add             $16, ibuf                               // ibuf++\r
+       add             $16, obuf                               // obuf++\r
+\r
+       sub             $1, num_blk                             // num_blk --\r
+       jg              0b                                              // if num_blk > 0, repeat the loop\r
+\r
+       jmp             L_HW_cbc_done                   // share with the common exit code\r
+\r
+\r
+\r
+       //\r
+       // --------- END of aes_encrypt_cbc_hw  -------------------\r
+       //\r
+\r
+\r
+/* ---------------------------------------------------------------------------------------------------------------- \r
+\r
+       aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :\r
+\r
+       For simplicity, I am assuming all variables are in 128-bit data type.\r
+\r
+       aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)\r
+       {\r
+               while(num_blk--) {\r
+                       aes_decrypt(ibuf, obuf, ctx);\r
+                       *obuf++ ^= *iv;\r
+                       *iv = *ibuf++;\r
+               }\r
+               return 0;\r
+       }\r
+\r
+       The following is an implementation of this function using Intel AESNI.\r
+       This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. \r
+       Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch\r
+       to this aesni-based function should it detect that aesni is available.\r
+       Blindly calling this function will SURELY cause a CRASH on systems with no aesni support. \r
+\r
+       Note that the aes_decrypt operations of different blocks do not depend on each other.\r
+       This gives the opportunity to arrange aes_decrypt operations in parallel to speed up the code.\r
+       This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55)\r
+       The following assembly code exploits this idea to achieve a ~1.4x speed up in aes_decrypt_cbc.\r
+\r
+       Example C code for packing 4 blocks in an iteration is shown as follows:\r
+\r
+               while ((num_blk-=4)>=0) {\r
+\r
+                       // the following 4 functions can be interleaved to exploit parallelism\r
+                       aes_decrypt(ibuf, obuf, ctx);\r
+                       aes_decrypt(ibuf+1, obuf+1, ctx);\r
+                       aes_decrypt(ibuf+2, obuf+2, ctx);\r
+                       aes_decrypt(ibuf+3, obuf+3, ctx);\r
+\r
+                       obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];\r
+                       *iv = ibuf[3];          ibuf += 4;      obuf += 4;\r
+               }\r
+               num_blk+=4;\r
+\r
+   ----------------------------------------------------------------------------------------------------------------*/\r
+\r
+       .text\r
+       .align  4,0x90\r
+       .globl  _aes_decrypt_cbc_hw\r
+_aes_decrypt_cbc_hw:\r
+\r
+       // push/save registers for local use\r
+#if    defined __i386__\r
+\r
+       push    %ebp\r
+       movl    %esp, %ebp\r
+       push    %ebx                                    // ibuf\r
+       push    %edi                                    // obuf\r
+\r
+       #define sp      %esp\r
+\r
+#else  // __x86_64__\r
+\r
+       push    %rbp\r
+       mov             %rsp, %rbp\r
+       push    %rbx\r
+       push    %r13\r
+       push    %r14\r
+       push    %r15\r
+\r
+       #define sp      %rsp\r
+\r
+#endif\r
+\r
+\r
+       // if kernel, allocate stack space to save xmm registers\r
+#ifdef KERNEL\r
+#if defined __i386__\r
+       sub             $(8*16), %esp\r
+#else\r
+       sub             $(16*16), %rsp\r
+#endif\r
+       movaps  %xmm0, (sp)\r
+       movaps  %xmm1, 16(sp)\r
+       movaps  %xmm2, 32(sp)\r
+       movaps  %xmm3, 48(sp)\r
+       movaps  %xmm4, 64(sp)\r
+       movaps  %xmm5, 80(sp)\r
+       movaps  %xmm6, 96(sp)\r
+       movaps  %xmm7, 112(sp)\r
+#if defined    __x86_64__\r
+       movaps  %xmm8, 16*8(sp)\r
+       movaps  %xmm9, 16*9(sp)\r
+       movaps  %xmm10, 16*10(sp)\r
+       movaps  %xmm11, 16*11(sp)\r
+       movaps  %xmm12, 16*12(sp)\r
+       movaps  %xmm13, 16*13(sp)\r
+       movaps  %xmm14, 16*14(sp)\r
+       movaps  %xmm15, 16*15(sp)\r
+#endif // __x86_64__\r
+#endif\r
+\r
+       #undef  iv\r
+       #define iv      %xmm0\r
+\r
+#if defined    __i386__\r
+       mov             12(%ebp), %eax                  // in_iv\r
+       mov             24(%ebp), %edx                  // ctx\r
+       movups  (%eax), iv                              // iv = in_iv   \r
+       mov             8(%ebp), %ebx                   // ibuf\r
+       mov             16(%ebp), %ecx                  // num_blk\r
+       mov             20(%ebp), %edi                  // obuf\r
+\r
+       #define ibuf    %ebx\r
+       #define obuf    %edi\r
+       #define num_blk %ecx    \r
+       #define ctx             %edx\r
+\r
+#else  //      __x86_64__, rdi/rsi/rdx/rcx/r8\r
+\r
+       mov             %rdi, %rbx                              // ibuf\r
+       movups  (%rsi), iv                              // iv = in_iv\r
+       mov             %rdx, %r13                              // num_blk\r
+       mov             %rcx, %r14                              // obuf\r
+       mov             %r8, %r15                               // ctx  \r
+\r
+       #define ibuf    %rbx\r
+       #define num_blk %r13d\r
+       #define obuf    %r14    \r
+       #define ctx             %r15\r
+\r
+#endif\r
+\r
+       mov             240(ctx), %eax                  // aes length\r
+       cmp             $160, %eax                              // aes-128 decrypt\r
+       je              L_decrypt_128\r
+       cmp             $192, %eax                              // aes-192 decrypt\r
+       je              L_decrypt_192\r
+       cmp             $224, %eax                              // aes-256 decrypt\r
+       je              L_decrypt_256\r
+\r
+       mov             $-1, %eax                               // wrong aes length, to return -1\r
+       jmp             L_error                                 // early exit due to wrong aes length\r
+\r
+\r
+       // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
+       // round keys are applied from 160(ctx) down to 0(ctx); xmm3 holds the round-0 key 160(ctx) throughout\r
+       // blocks are processed 4 at a time, then a remaining pair, then a remaining single block\r
+\r
+L_decrypt_128:\r
+\r
+       cmp             $1, num_blk\r
+       jl              L_HW_cbc_done                   // if num_blk < 1, early return\r
+\r
+       // aes-128 decrypt expanded keys (x86_64 : all 11 in xmm3-xmm13, i386 : the first 5 in xmm3-xmm7)\r
+       movups  160(ctx), %xmm3\r
+       movups  144(ctx), %xmm4\r
+       movups  128(ctx), %xmm5\r
+       movups  112(ctx), %xmm6\r
+       movups  96(ctx), %xmm7\r
+#if defined    __x86_64__\r
+       movups  80(ctx), %xmm8\r
+       movups  64(ctx), %xmm9\r
+       movups  48(ctx), %xmm10\r
+       movups  32(ctx), %xmm11\r
+       movups  16(ctx), %xmm12\r
+       movups  0(ctx), %xmm13\r
+#endif\r
+\r
+       // performs 4 block decryption in an iteration to exploit decrypt in parallel\r
+\r
+       //              while ((num_blk-=4)>=0) {\r
+       //                      aes_decrypt(ibuf, obuf, ctx);\r
+       //                      aes_decrypt(ibuf+1, obuf+1, ctx);\r
+       //                      aes_decrypt(ibuf+2, obuf+2, ctx);\r
+       //                      aes_decrypt(ibuf+3, obuf+3, ctx);\r
+       //                      obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];\r
+       //                      *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
+       //              }\r
+\r
+       sub             $4, num_blk                                     // pre decrement num_blk by 4\r
+       jl              9f                                                      // if num_blk < 4, skip the per-4-blocks processing code\r
+\r
+0:\r
+\r
+\r
+#if defined    __x86_64__\r
+\r
+       movups  (ibuf), %xmm1                           // tmp = 1st ibuf\r
+       movups  16(ibuf), %xmm2                         // tmp = 2nd ibuf\r
+       movups  32(ibuf), %xmm14                        // tmp = 3rd ibuf\r
+       movups  48(ibuf), %xmm15                        // tmp = 4th ibuf\r
+\r
+       // for x86_64, the expanded keys are already stored in xmm3-xmm13\r
+\r
+       // aes-128 decrypt round 0 per 4 blocks\r
+       pxor    %xmm3, %xmm1\r
+       pxor    %xmm3, %xmm2\r
+       pxor    %xmm3, %xmm14\r
+       pxor    %xmm3, %xmm15\r
+\r
+       // aes-128 decrypt round 1 per 4 blocks\r
+    aesdec  %xmm4, %xmm1\r
+    aesdec  %xmm4, %xmm2\r
+    aesdec  %xmm4, %xmm14\r
+    aesdec  %xmm4, %xmm15\r
+\r
+       // aes-128 decrypt round 2 per 4 blocks\r
+    aesdec  %xmm5, %xmm1\r
+    aesdec  %xmm5, %xmm2\r
+    aesdec  %xmm5, %xmm14\r
+    aesdec  %xmm5, %xmm15\r
+\r
+       // aes-128 decrypt round 3 per 4 blocks\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm14\r
+    aesdec  %xmm6, %xmm15\r
+\r
+       // aes-128 decrypt round 4 per 4 blocks\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm14\r
+    aesdec  %xmm7, %xmm15\r
+\r
+       // aes-128 decrypt round 5 per 4 blocks\r
+    aesdec  %xmm8, %xmm1\r
+    aesdec  %xmm8, %xmm2\r
+    aesdec  %xmm8, %xmm14\r
+    aesdec  %xmm8, %xmm15\r
+\r
+       // aes-128 decrypt round 6 per 4 blocks\r
+    aesdec  %xmm9, %xmm1\r
+    aesdec  %xmm9, %xmm2\r
+    aesdec  %xmm9, %xmm14\r
+    aesdec  %xmm9, %xmm15\r
+\r
+       // aes-128 decrypt round 7 per 4 blocks\r
+    aesdec  %xmm10, %xmm1\r
+    aesdec  %xmm10, %xmm2\r
+    aesdec  %xmm10, %xmm14\r
+    aesdec  %xmm10, %xmm15\r
+\r
+       // aes-128 decrypt round 8 per 4 blocks\r
+    aesdec  %xmm11, %xmm1\r
+    aesdec  %xmm11, %xmm2\r
+    aesdec  %xmm11, %xmm14\r
+    aesdec  %xmm11, %xmm15\r
+\r
+       // aes-128 decrypt round 9 per 4 blocks\r
+    aesdec  %xmm12, %xmm1\r
+    aesdec  %xmm12, %xmm2\r
+    aesdec  %xmm12, %xmm14\r
+    aesdec  %xmm12, %xmm15\r
+\r
+       // aes-128 decrypt round 10 (last) per 4 blocks\r
+    aesdeclast  %xmm13, %xmm1\r
+    aesdeclast  %xmm13, %xmm2\r
+    aesdeclast  %xmm13, %xmm14\r
+    aesdeclast  %xmm13, %xmm15\r
+\r
+       pxor    iv, %xmm1                               // obuf[0] ^= *iv; \r
+       movups  (ibuf), iv                              // ibuf[0]\r
+       pxor    iv, %xmm2                               // obuf[1] ^= ibuf[0]; \r
+       movups  16(ibuf), iv                    // ibuf[1]\r
+       pxor    iv, %xmm14                              // obuf[2] ^= ibuf[1]; \r
+       movups  32(ibuf), iv                    // ibuf[2] \r
+       pxor    iv, %xmm15                              // obuf[3] ^= ibuf[2]; \r
+       movups  48(ibuf), iv                    // *iv = ibuf[3]\r
+\r
+       movups  %xmm1, (obuf)                   // write 1st obuf (all ibuf re-reads above precede these stores)\r
+       movups  %xmm2, 16(obuf)                 // write 2nd obuf\r
+       movups  %xmm14, 32(obuf)                // write 3rd obuf\r
+       movups  %xmm15, 48(obuf)                // write 4th obuf\r
+\r
+\r
+#else\r
+\r
+       // aes_decrypt_cbc per 4 blocks using aes-128 for i386\r
+       // xmm1/xmm2/xmm4/xmm5 used for obuf per block\r
+       // xmm3 = key0 (160(ctx), the round-0 key)\r
+       // xmm0 = iv\r
+       // xmm6/xmm7 dynamically load with other expanded keys, alternating so that the next key is loaded one round ahead\r
+\r
+       movups  (ibuf), %xmm1                   // tmp = 1st ibuf\r
+       movups  16(ibuf), %xmm2                 // tmp = 2nd ibuf\r
+       movups  32(ibuf), %xmm4                 // tmp = 3rd ibuf\r
+       movups  48(ibuf), %xmm5                 // tmp = 4th ibuf\r
+\r
+       // aes_decrypt\r
+       // for i386, sequentially load expanded keys into xmm6/xmm7\r
+\r
+       movups  144(ctx), %xmm6                 // key1\r
+\r
+       // aes-128 decrypt round 0 per 4 blocks\r
+       pxor    %xmm3, %xmm1\r
+       pxor    %xmm3, %xmm2\r
+       pxor    %xmm3, %xmm4\r
+       pxor    %xmm3, %xmm5\r
+\r
+       movups  128(ctx), %xmm7                 // key2\r
+\r
+       // aes-128 decrypt round 1 per 4 blocks\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  112(ctx), %xmm6                 // key3\r
+\r
+       // aes-128 decrypt round 2 per 4 blocks\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  96(ctx), %xmm7                  // key4\r
+\r
+       // aes-128 decrypt round 3 per 4 blocks\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  80(ctx), %xmm6                  // key5\r
+\r
+       // aes-128 decrypt round 4 per 4 blocks\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  64(ctx), %xmm7                  // key6\r
+\r
+       // aes-128 decrypt round 5 per 4 blocks\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  48(ctx), %xmm6                  // key7\r
+\r
+       // aes-128 decrypt round 6 per 4 blocks\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  32(ctx), %xmm7                  // key8\r
+\r
+       // aes-128 decrypt round 7 per 4 blocks\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  16(ctx), %xmm6                  // key9\r
+\r
+       // aes-128 decrypt round 8 per 4 blocks\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  0(ctx), %xmm7                   // keyA\r
+\r
+       // aes-128 decrypt round 9 per 4 blocks\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       // aes-128 decrypt round 10 (last) per 4 blocks\r
+    aesdeclast  %xmm7, %xmm1\r
+    aesdeclast  %xmm7, %xmm2\r
+    aesdeclast  %xmm7, %xmm4\r
+    aesdeclast  %xmm7, %xmm5\r
+\r
+       pxor    iv, %xmm1                               // obuf[0] ^= *iv; \r
+       movups  (ibuf), iv                              // iv = ibuf[0] (re-read from memory)\r
+       pxor    iv, %xmm2                               // obuf[1] ^= ibuf[0]; \r
+       movups  16(ibuf), iv                    // iv = ibuf[1]\r
+       pxor    iv, %xmm4                               // obuf[2] ^= ibuf[1]; \r
+       movups  32(ibuf), iv                    // iv = ibuf[2]\r
+       pxor    iv, %xmm5                               // obuf[3] ^= ibuf[2]; \r
+       movups  48(ibuf), iv                    // *iv = ibuf[3], the chaining value for the next blocks\r
+\r
+       movups  %xmm1, (obuf)                   // write 1st obuf (all ibuf re-reads above precede these stores)\r
+       movups  %xmm2, 16(obuf)                 // write 2nd obuf\r
+       movups  %xmm4, 32(obuf)                 // write 3rd obuf\r
+       movups  %xmm5, 48(obuf)                 // write 4th obuf\r
+#endif\r
+\r
+       add             $64, ibuf                               // ibuf += 4; \r
+       add             $64, obuf                               // obuf += 4;   \r
+\r
+       sub             $4, num_blk                             // num_blk -= 4\r
+       jge             0b                                              // if num_blk >= 0 (at least 4 more blocks), repeat the loop\r
+\r
+9:     add             $4, num_blk                             // post increment num_blk by 4 (undo the pre decrement)\r
+       je              L_HW_cbc_done                   // if num_blk == 0, no need for further processing code\r
+\r
+#if defined    __i386__\r
+       // xmm4-xmm7 may have been clobbered by the per-4-blocks loop, reload them as expanded keys for the remaining blocks\r
+       movups  144(ctx), %xmm4\r
+       movups  128(ctx), %xmm5\r
+       movups  112(ctx), %xmm6\r
+       movups  96(ctx), %xmm7\r
+#endif\r
+\r
+       test    $2, num_blk                             // check whether num_blk has 2 blocks\r
+       je              9f                                              // if (num_blk & 2) == 0, skip the per-pair processing code\r
+\r
+       // do the remaining 2 blocks together\r
+\r
+       movups  (ibuf), %xmm1                           // tmp = 1st ibuf\r
+       movups  16(ibuf), %xmm2                         // tmp = 2nd ibuf\r
+\r
+       // aes_decrypt on both blocks, i386 alternates xmm6/xmm7 for the keys at 80(ctx) .. 0(ctx)\r
+       pxor    %xmm3, %xmm1\r
+       pxor    %xmm3, %xmm2\r
+    aesdec  %xmm4, %xmm1\r
+    aesdec  %xmm4, %xmm2\r
+    aesdec  %xmm5, %xmm1\r
+    aesdec  %xmm5, %xmm2\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+#if defined    __x86_64__\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm8, %xmm1\r
+    aesdec  %xmm8, %xmm2\r
+    aesdec  %xmm9, %xmm1\r
+    aesdec  %xmm9, %xmm2\r
+    aesdec  %xmm10, %xmm1\r
+    aesdec  %xmm10, %xmm2\r
+    aesdec  %xmm11, %xmm1\r
+    aesdec  %xmm11, %xmm2\r
+    aesdec  %xmm12, %xmm1\r
+    aesdec  %xmm12, %xmm2\r
+    aesdeclast  %xmm13, %xmm1\r
+    aesdeclast  %xmm13, %xmm2\r
+#else\r
+       movups  80(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+       movups  64(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+       movups  48(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+       movups  32(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+       movups  16(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+       movups  0(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdeclast  %xmm7, %xmm1\r
+    aesdeclast  %xmm7, %xmm2\r
+       movups  112(ctx), %xmm6\r
+       movups  96(ctx), %xmm7\r
+#endif\r
+\r
+       pxor    iv, %xmm1                               // obuf[0] ^= *iv; \r
+       movups  (ibuf), iv                              // ibuf[0]\r
+       pxor    iv, %xmm2                               // obuf[1] ^= ibuf[0]\r
+       movups  16(ibuf), iv                    // *iv = ibuf[1]\r
+\r
+       movups  %xmm1, (obuf)                   // write obuf[0]\r
+       movups  %xmm2, 16(obuf)                 // write obuf[1]\r
+\r
+       add             $32, ibuf                               // ibuf += 2\r
+       add             $32, obuf                               // obuf += 2\r
+\r
+9:\r
+       test    $1, num_blk                             // check whether num_blk has residual 1 block\r
+       je              L_HW_cbc_done                   // if (num_blk & 1) == 0, no need for residual processing code\r
+       \r
+       movups  (ibuf), %xmm2                           // tmp = ibuf\r
+       // aes_decrypt on the single block, i386 loads the keys at 80(ctx) .. 0(ctx) through xmm1\r
+       pxor    %xmm3, %xmm2\r
+    aesdec  %xmm4, %xmm2\r
+    aesdec  %xmm5, %xmm2\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm7, %xmm2\r
+#if defined    __x86_64__\r
+    aesdec  %xmm8, %xmm2\r
+    aesdec  %xmm9, %xmm2\r
+    aesdec  %xmm10, %xmm2\r
+    aesdec  %xmm11, %xmm2\r
+    aesdec  %xmm12, %xmm2\r
+    aesdeclast  %xmm13, %xmm2\r
+#else\r
+       movups  80(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  64(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  48(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  32(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  16(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  (ctx), %xmm1\r
+    aesdeclast  %xmm1, %xmm2\r
+#endif\r
+\r
+       pxor    iv, %xmm2                       // *obuf ^= *iv; \r
+       movups  (ibuf), iv                      // *iv = *ibuf;\r
+       movups  %xmm2, (obuf)           // write *obuf\r
+\r
+       jmp             L_HW_cbc_done\r
+\r
+       //\r
+       // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
+       //\r
+\r
+L_decrypt_192:\r
+\r
+       cmp             $1, num_blk\r
+       jl              L_HW_cbc_done                   // if num_blk < 1, early return\r
+\r
+       // aes-192 decrypt expanded keys\r
+       movups  192(ctx), %xmm3\r
+       movups  176(ctx), %xmm4\r
+       movups  160(ctx), %xmm5\r
+       movups  144(ctx), %xmm6\r
+       movups  128(ctx), %xmm7\r
+#if defined    __x86_64__\r
+       movups  112(ctx), %xmm8\r
+       movups  96(ctx), %xmm9\r
+       movups  80(ctx), %xmm10\r
+       movups  64(ctx), %xmm11\r
+       movups  48(ctx), %xmm12\r
+       movups  32(ctx), %xmm13\r
+       movups  16(ctx), %xmm14\r
+       movups  (ctx), %xmm15\r
+#endif\r
+\r
+       // performs 4 block decryption in an iteration to exploit decrypt in parallel\r
+\r
+       //              while ((num_blk-=4)>=0) {\r
+       //                      aes_decrypt(ibuf, obuf, ctx);\r
+       //                      aes_decrypt(ibuf+1, obuf+1, ctx);\r
+       //                      aes_decrypt(ibuf+2, obuf+2, ctx);\r
+       //                      aes_decrypt(ibuf+3, obuf+3, ctx);\r
+       //                      obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];\r
+       //                      *iv = ibuf[3]; ibuf += 4; obuf += 4;\r
+       //              }\r
+\r
+       sub             $4, num_blk                                     // pre decrement num_blk by 4\r
+       jl              9f                                                      // if num_blk < 4, skip the per-4-blocks processing code\r
+0:\r
+\r
+#if defined    __x86_64__\r
+\r
+       movups  (ibuf), %xmm1                           // tmp = 1st ibuf\r
+       movups  16(ibuf), %xmm2                         // tmp = 2nd ibuf\r
+       movups  32(ibuf), %xmm14                        // tmp = 3rd ibuf\r
+       movups  48(ibuf), %xmm15                        // tmp = 4th ibuf\r
+\r
+       // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13\r
+       // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards\r
+\r
+       // round 0 for 4 blocks\r
+       pxor    %xmm3, %xmm1\r
+       pxor    %xmm3, %xmm2\r
+       pxor    %xmm3, %xmm14\r
+       pxor    %xmm3, %xmm15\r
+\r
+       // round 1 for 4 blocks\r
+    aesdec  %xmm4, %xmm1\r
+    aesdec  %xmm4, %xmm2\r
+    aesdec  %xmm4, %xmm14\r
+    aesdec  %xmm4, %xmm15\r
+\r
+       // round 2 for 4 blocks\r
+    aesdec  %xmm5, %xmm1\r
+    aesdec  %xmm5, %xmm2\r
+    aesdec  %xmm5, %xmm14\r
+    aesdec  %xmm5, %xmm15\r
+\r
+       // round 3 for 4 blocks\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm14\r
+    aesdec  %xmm6, %xmm15\r
+\r
+       // round 4 for 4 blocks\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm14\r
+    aesdec  %xmm7, %xmm15\r
+\r
+       // round 5 for 4 blocks\r
+    aesdec  %xmm8, %xmm1\r
+    aesdec  %xmm8, %xmm2\r
+    aesdec  %xmm8, %xmm14\r
+    aesdec  %xmm8, %xmm15\r
+\r
+       // round 6 for 4 blocks\r
+    aesdec  %xmm9, %xmm1\r
+    aesdec  %xmm9, %xmm2\r
+    aesdec  %xmm9, %xmm14\r
+    aesdec  %xmm9, %xmm15\r
+\r
+       // round 7 for 4 blocks\r
+    aesdec  %xmm10, %xmm1\r
+    aesdec  %xmm10, %xmm2\r
+    aesdec  %xmm10, %xmm14\r
+    aesdec  %xmm10, %xmm15\r
+\r
+       // round 8 for 4 blocks\r
+    aesdec  %xmm11, %xmm1\r
+    aesdec  %xmm11, %xmm2\r
+    aesdec  %xmm11, %xmm14\r
+    aesdec  %xmm11, %xmm15\r
+\r
+       // round 9 for 4 blocks\r
+    aesdec  %xmm12, %xmm1\r
+    aesdec  %xmm12, %xmm2\r
+    aesdec  %xmm12, %xmm14\r
+    aesdec  %xmm12, %xmm15\r
+\r
+       movups  16(ctx), %xmm12\r
+\r
+       // round A for 4 blocks\r
+    aesdec  %xmm13, %xmm1\r
+    aesdec  %xmm13, %xmm2\r
+    aesdec  %xmm13, %xmm14\r
+    aesdec  %xmm13, %xmm15\r
+\r
+       movups  (ctx), %xmm13\r
+\r
+       // round B for 4 blocks\r
+    aesdec  %xmm12, %xmm1\r
+    aesdec  %xmm12, %xmm2\r
+    aesdec  %xmm12, %xmm14\r
+    aesdec  %xmm12, %xmm15\r
+\r
+       movups  48(ctx), %xmm12         // restore %xmm12 to its original key\r
+\r
+       // round C (last) for 4 blocks\r
+    aesdeclast  %xmm13, %xmm1\r
+    aesdeclast  %xmm13, %xmm2\r
+    aesdeclast  %xmm13, %xmm14\r
+    aesdeclast  %xmm13, %xmm15\r
+\r
+       movups  32(ctx), %xmm13         // restore %xmm13 to its original key\r
+\r
+       pxor    iv, %xmm1                               // obuf[0] ^= *iv; \r
+       movups  (ibuf), iv                              // ibuf[0]\r
+       pxor    iv, %xmm2                               // obuf[1] ^= ibuf[0] \r
+       movups  16(ibuf), iv                    // ibuf[1]\r
+       pxor    iv, %xmm14                              // obuf[2] ^= ibuf[1] \r
+       movups  32(ibuf), iv                    // ibuf[2] \r
+       pxor    iv, %xmm15                              // obuf[3] ^= ibuf[2] \r
+       movups  48(ibuf), iv                    // *iv = ibuf[3] \r
+\r
+       movups  %xmm1, (obuf)                   // write 1st obuf\r
+       movups  %xmm2, 16(obuf)                 // write 2nd obuf\r
+       movups  %xmm14, 32(obuf)                // write 3rd obuf\r
+       movups  %xmm15, 48(obuf)                // write 4th obuf\r
+\r
+       add             $64, ibuf                               // ibuf += 4; \r
+       add             $64, obuf                               // obuf += 4;   \r
+\r
+       sub             $4, num_blk                             // num_blk -= 4\r
+       jge             0b                                              // if num_blk >= 0 (at least 4 blocks remain), repeat the loop\r
+\r
+9:     add             $4, num_blk                             // post increment num_blk by 4\r
+       je              L_HW_cbc_done                   // if num_blk == 0, prepare to return \r
+\r
+       movups  16(ctx), %xmm14                 // restore %xmm14 to its key\r
+       movups  (ctx), %xmm15                   // restore %xmm15 to its key\r
+\r
+#else\r
+\r
+       movups  (ibuf), %xmm1                   // tmp = 1st ibuf\r
+       movups  16(ibuf), %xmm2                 // tmp = 2nd ibuf\r
+       movups  32(ibuf), %xmm4                 // tmp = 3rd ibuf\r
+       movups  48(ibuf), %xmm5                 // tmp = 4th ibuf\r
+\r
+       // aes_decrypt\r
+       // for i386, sequentially load expanded keys into xmm6/xmm7\r
+       movups  176(ctx), %xmm6\r
+       pxor    %xmm3, %xmm1\r
+       pxor    %xmm3, %xmm2\r
+       pxor    %xmm3, %xmm4\r
+       pxor    %xmm3, %xmm5\r
+\r
+       movups  160(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  144(ctx), %xmm6\r
+       aesdec    %xmm7, %xmm1\r
+       aesdec    %xmm7, %xmm2\r
+       aesdec    %xmm7, %xmm4\r
+       aesdec    %xmm7, %xmm5\r
+\r
+       movups  128(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  112(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  96(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  80(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  64(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  48(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  32(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  16(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  0(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+    aesdeclast  %xmm7, %xmm1\r
+    aesdeclast  %xmm7, %xmm2\r
+    aesdeclast  %xmm7, %xmm4\r
+    aesdeclast  %xmm7, %xmm5\r
+\r
+       pxor    iv, %xmm1                               // 1st obuf ^= iv; \r
+       movups  (ibuf), iv                              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       pxor    iv, %xmm2                               // 2nd obuf ^= iv; \r
+       movups  16(ibuf), iv                    // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       pxor    iv, %xmm4                               // 3rd obuf ^= iv; \r
+       movups  32(ibuf), iv                    // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       pxor    iv, %xmm5                               // 4th obuf ^= iv; \r
+       movups  48(ibuf), iv                    // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       movups  %xmm1, (obuf)                   // write 1st obuf\r
+       movups  %xmm2, 16(obuf)                 // write 2nd obuf\r
+       movups  %xmm4, 32(obuf)                 // write 3rd obuf\r
+       movups  %xmm5, 48(obuf)                 // write 4th obuf\r
+\r
+       add             $64, ibuf                               // ibuf += AES_BLOCK_SIZE * 4; \r
+       add             $64, obuf                               // obuf += AES_BLOCK_SIZE * 4;  \r
+\r
+       sub             $4, num_blk                             // num_blk -= 4\r
+       jge             0b                                              // if num_blk >= 0 (at least 4 blocks remain), repeat the loop\r
+\r
+\r
+9:     add             $4, num_blk                             //      post increment num_blk by 4\r
+       je              L_HW_cbc_done                   // if num_blk == 0, no need for further processing code\r
+\r
+       movups  176(ctx), %xmm4\r
+       movups  160(ctx), %xmm5\r
+       movups  144(ctx), %xmm6\r
+       movups  128(ctx), %xmm7\r
+\r
+#endif\r
+\r
+       // per-block aes_decrypt_cbc loop\r
+\r
+0:\r
+       movups  (ibuf), %xmm2                           // tmp = ibuf\r
+\r
+       // aes_decrypt\r
+       pxor    %xmm3, %xmm2\r
+    aesdec  %xmm4, %xmm2\r
+    aesdec  %xmm5, %xmm2\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm7, %xmm2\r
+#if defined    __x86_64__\r
+    aesdec  %xmm8, %xmm2\r
+    aesdec  %xmm9, %xmm2\r
+    aesdec  %xmm10, %xmm2\r
+    aesdec  %xmm11, %xmm2\r
+    aesdec  %xmm12, %xmm2\r
+    aesdec  %xmm13, %xmm2\r
+    aesdec  %xmm14, %xmm2\r
+    aesdeclast  %xmm15, %xmm2\r
+#else\r
+       movups  112(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  96(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  80(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  64(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  48(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  32(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  16(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  (ctx), %xmm1\r
+    aesdeclast  %xmm1, %xmm2\r
+#endif\r
+\r
+       pxor    iv, %xmm2                       // obuf ^= iv; \r
+       movups  (ibuf), iv                      // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+\r
+       movups  %xmm2, (obuf)           // write obuf\r
+\r
+       add             $16, ibuf                               // ibuf += AES_BLOCK_SIZE; \r
+       add             $16, obuf                               // obuf += AES_BLOCK_SIZE;      \r
+       sub             $1, num_blk                             // num_blk --\r
+       jg              0b                                              // if num_blk > 0, repeat the loop\r
+\r
+       jmp             L_HW_cbc_done\r
+\r
+       //\r
+       // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done\r
+       //\r
+\r
+L_decrypt_256:\r
+\r
+       cmp             $1, num_blk\r
+       jl              L_HW_cbc_done   \r
+\r
+       movups  224(ctx), %xmm3\r
+       movups  208(ctx), %xmm4\r
+       movups  192(ctx), %xmm5\r
+       movups  176(ctx), %xmm6\r
+       movups  160(ctx), %xmm7\r
+#if defined    __x86_64__\r
+       movups  144(ctx), %xmm8\r
+       movups  128(ctx), %xmm9\r
+       movups  112(ctx), %xmm10\r
+       movups  96(ctx), %xmm11\r
+       movups  80(ctx), %xmm12\r
+       movups  64(ctx), %xmm13\r
+       movups  48(ctx), %xmm14\r
+       movups  32(ctx), %xmm15\r
+//     movups  16(ctx), %xmm14\r
+//     movups  (ctx), %xmm15\r
+#endif\r
+\r
+#if defined    __x86_64__\r
+\r
+       sub             $4, num_blk                                     // pre decrement num_blk by 4\r
+       jl              9f                                                      // if num_blk < 4, skip the per-4-blocks processing code\r
+0:\r
+       movups  (ibuf), %xmm1                           // tmp = 1st ibuf\r
+       movups  16(ibuf), %xmm2                         // tmp = 2nd ibuf\r
+       movups  32(ibuf), %xmm14                        // tmp = 3rd ibuf\r
+       movups  48(ibuf), %xmm15                        // tmp = 4th ibuf\r
+\r
+       // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13\r
+       pxor    %xmm3, %xmm1\r
+       pxor    %xmm3, %xmm2\r
+       pxor    %xmm3, %xmm14\r
+       pxor    %xmm3, %xmm15\r
+\r
+    aesdec  %xmm4, %xmm1\r
+    aesdec  %xmm4, %xmm2\r
+    aesdec  %xmm4, %xmm14\r
+    aesdec  %xmm4, %xmm15\r
+\r
+    aesdec  %xmm5, %xmm1\r
+    aesdec  %xmm5, %xmm2\r
+    aesdec  %xmm5, %xmm14\r
+    aesdec  %xmm5, %xmm15\r
+\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm14\r
+    aesdec  %xmm6, %xmm15\r
+\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm14\r
+    aesdec  %xmm7, %xmm15\r
+\r
+    aesdec  %xmm8, %xmm1\r
+    aesdec  %xmm8, %xmm2\r
+    aesdec  %xmm8, %xmm14\r
+    aesdec  %xmm8, %xmm15\r
+\r
+    aesdec  %xmm9, %xmm1\r
+    aesdec  %xmm9, %xmm2\r
+    aesdec  %xmm9, %xmm14\r
+    aesdec  %xmm9, %xmm15\r
+\r
+    aesdec  %xmm10, %xmm1\r
+    aesdec  %xmm10, %xmm2\r
+    aesdec  %xmm10, %xmm14\r
+    aesdec  %xmm10, %xmm15\r
+\r
+    aesdec  %xmm11, %xmm1\r
+    aesdec  %xmm11, %xmm2\r
+    aesdec  %xmm11, %xmm14\r
+    aesdec  %xmm11, %xmm15\r
+\r
+    aesdec  %xmm12, %xmm1\r
+    aesdec  %xmm12, %xmm2\r
+    aesdec  %xmm12, %xmm14\r
+    aesdec  %xmm12, %xmm15\r
+       movups  48(ctx), %xmm12\r
+\r
+    aesdec  %xmm13, %xmm1\r
+    aesdec  %xmm13, %xmm2\r
+    aesdec  %xmm13, %xmm14\r
+    aesdec  %xmm13, %xmm15\r
+       movups  32(ctx), %xmm13\r
+\r
+    aesdec  %xmm12, %xmm1\r
+    aesdec  %xmm12, %xmm2\r
+    aesdec  %xmm12, %xmm14\r
+    aesdec  %xmm12, %xmm15\r
+       movups  16(ctx), %xmm12\r
+\r
+    aesdec  %xmm13, %xmm1\r
+    aesdec  %xmm13, %xmm2\r
+    aesdec  %xmm13, %xmm14\r
+    aesdec  %xmm13, %xmm15\r
+       movups  (ctx), %xmm13\r
+\r
+    aesdec  %xmm12, %xmm1\r
+    aesdec  %xmm12, %xmm2\r
+    aesdec  %xmm12, %xmm14\r
+    aesdec  %xmm12, %xmm15\r
+       movups  80(ctx), %xmm12\r
+\r
+    aesdeclast  %xmm13, %xmm1\r
+    aesdeclast  %xmm13, %xmm2\r
+    aesdeclast  %xmm13, %xmm14\r
+    aesdeclast  %xmm13, %xmm15\r
+       movups  64(ctx), %xmm13\r
+\r
+       pxor    iv, %xmm1                               // obuf ^= iv; \r
+       movups  (ibuf), iv                              // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       pxor    iv, %xmm2                               // obuf ^= iv; \r
+       movups  16(ibuf), iv                    // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       pxor    iv, %xmm14                              // obuf ^= iv; \r
+       movups  32(ibuf), iv                    // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       pxor    iv, %xmm15                              // obuf ^= iv; \r
+       movups  48(ibuf), iv                    // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+\r
+       movups  %xmm1, (obuf)                   // write 1st obuf\r
+       movups  %xmm2, 16(obuf)                 // write 2nd obuf\r
+       movups  %xmm14, 32(obuf)                // write 3rd obuf\r
+       movups  %xmm15, 48(obuf)                // write 4th obuf\r
+\r
+       add             $64, ibuf                               // ibuf += AES_BLOCK_SIZE*4; \r
+       add             $64, obuf                               // obuf += AES_BLOCK_SIZE*4;    \r
+\r
+       sub             $4, num_blk                             // num_blk -= 4\r
+       jge             0b                                              // if num_blk >= 0 (at least 4 blocks remain), repeat the loop\r
+\r
+9:     add             $4, num_blk                             //      post increment num_blk by 4\r
+       je              L_HW_cbc_done                   // if num_blk == 0, no need for further processing code\r
+\r
+       movups  48(ctx), %xmm14\r
+       movups  32(ctx), %xmm15\r
+\r
+#else\r
+\r
+       sub             $4, num_blk                             // pre decrement num_blk by 4\r
+       jl              9f                                              // if num_blk < 4, skip the per-4-blocks processing code\r
+0:\r
+       movups  (ibuf), %xmm1                   // tmp = 1st ibuf\r
+       movups  16(ibuf), %xmm2                 // tmp = 2nd ibuf\r
+       movups  32(ibuf), %xmm4                 // tmp = 3rd ibuf\r
+       movups  48(ibuf), %xmm5                 // tmp = 4th ibuf\r
+\r
+       // aes_decrypt\r
+       // for i386, sequentially load expanded keys into xmm6/xmm7\r
+       movups  208(ctx), %xmm6\r
+       pxor    %xmm3, %xmm1\r
+       pxor    %xmm3, %xmm2\r
+       pxor    %xmm3, %xmm4\r
+       pxor    %xmm3, %xmm5\r
+\r
+       movups  192(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  176(ctx), %xmm6\r
+       aesdec  %xmm7, %xmm1\r
+       aesdec  %xmm7, %xmm2\r
+       aesdec  %xmm7, %xmm4\r
+       aesdec  %xmm7, %xmm5\r
+\r
+       movups  160(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  144(ctx), %xmm6\r
+       aesdec  %xmm7, %xmm1\r
+       aesdec  %xmm7, %xmm2\r
+       aesdec  %xmm7, %xmm4\r
+       aesdec  %xmm7, %xmm5\r
+\r
+       movups  128(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  112(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  96(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  80(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  64(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  48(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  32(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+       movups  16(ctx), %xmm6\r
+    aesdec  %xmm7, %xmm1\r
+    aesdec  %xmm7, %xmm2\r
+    aesdec  %xmm7, %xmm4\r
+    aesdec  %xmm7, %xmm5\r
+\r
+       movups  0(ctx), %xmm7\r
+    aesdec  %xmm6, %xmm1\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm6, %xmm4\r
+    aesdec  %xmm6, %xmm5\r
+\r
+    aesdeclast  %xmm7, %xmm1\r
+    aesdeclast  %xmm7, %xmm2\r
+    aesdeclast  %xmm7, %xmm4\r
+    aesdeclast  %xmm7, %xmm5\r
+\r
+       pxor    iv, %xmm1                               // 1st obuf ^= iv; \r
+       movups  (ibuf), iv                              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       pxor    iv, %xmm2                               // 2nd obuf ^= iv; \r
+       movups  16(ibuf), iv                    // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       pxor    iv, %xmm4                               // 3rd obuf ^= iv; \r
+       movups  32(ibuf), iv                    // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       pxor    iv, %xmm5                               // 4th obuf ^= iv; \r
+       movups  48(ibuf), iv                    // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+       movups  %xmm1, (obuf)                   // write 1st obuf\r
+       movups  %xmm2, 16(obuf)                 // write 2nd obuf\r
+       movups  %xmm4, 32(obuf)                 // write 3rd obuf\r
+       movups  %xmm5, 48(obuf)                 // write 4th obuf\r
+\r
+       add             $64, ibuf                               // ibuf += AES_BLOCK_SIZE * 4; \r
+       add             $64, obuf                               // obuf += AES_BLOCK_SIZE * 4;  \r
+\r
+       sub             $4, num_blk                             // num_blk -= 4\r
+       jge             0b                                              // if num_blk >= 0 (at least 4 blocks remain), repeat the loop\r
+\r
+\r
+9:     add             $4, num_blk                             //      post increment num_blk by 4\r
+       je              L_HW_cbc_done                   // if num_blk == 0, no need for further processing code\r
+\r
+       movups  208(ctx), %xmm4\r
+       movups  192(ctx), %xmm5\r
+       movups  176(ctx), %xmm6\r
+       movups  160(ctx), %xmm7\r
+\r
+#endif\r
+\r
+0:\r
+       movups  (ibuf), %xmm2                           // tmp = ibuf\r
+\r
+       // aes_decrypt\r
+       pxor    %xmm3, %xmm2\r
+    aesdec  %xmm4, %xmm2\r
+    aesdec  %xmm5, %xmm2\r
+    aesdec  %xmm6, %xmm2\r
+    aesdec  %xmm7, %xmm2\r
+#if defined    __x86_64__\r
+    aesdec  %xmm8, %xmm2\r
+    aesdec  %xmm9, %xmm2\r
+    aesdec  %xmm10, %xmm2\r
+    aesdec  %xmm11, %xmm2\r
+    aesdec  %xmm12, %xmm2\r
+    aesdec  %xmm13, %xmm2\r
+    aesdec  %xmm14, %xmm2\r
+    aesdec  %xmm15, %xmm2\r
+#else\r
+       movups  144(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  128(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  112(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  96(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  80(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  64(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  48(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  32(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+#endif\r
+       movups  16(ctx), %xmm1\r
+    aesdec  %xmm1, %xmm2\r
+       movups  (ctx), %xmm1\r
+    aesdeclast  %xmm1, %xmm2\r
+\r
+       pxor    iv, %xmm2                       // obuf ^= iv; \r
+       movups  (ibuf), iv                      // memcpy(iv, tmp, AES_BLOCK_SIZE);\r
+\r
+       movups  %xmm2, (obuf)           // write obuf\r
+\r
+       add             $16, ibuf                               // ibuf += AES_BLOCK_SIZE; \r
+       add             $16, obuf                               // obuf += AES_BLOCK_SIZE;      \r
+       sub             $1, num_blk                             // num_blk --\r
+       jg              0b                                              // if num_blk > 0, repeat the loop\r
+\r
+       jmp             L_HW_cbc_done\r
+\r
+       //\r
+       // --------- END of aes_decrypt_cbc_hw  -------------------\r
+       //\r
diff --git a/bsd/crypto/aes/test/ReadMe.txt b/bsd/crypto/aes/test/ReadMe.txt

deleted file mode 100644 (file)

index 1329e84..0000000
--- a/bsd/crypto/aes/test/ReadMe.txt
+++ /dev/null
@@ -1,97 +0,0 @@
-This directory contains file and shell scripts 
-
-       tstaes.c
-       makegenarm.sh
-       makegenx86.sh
-       makeoptx86.sh
-
-that can be used to build executables. These executable are used to validate the implementation
-and to benchmark the performance of the aes functions in the kernel. This directory also serves
-as a development environment for porting of the aes functions to any new architectures.
-
-On xnu-1699.20.6 (from which we add this work), the generic aes source code sits at bsd/crypto/aes/gen. The x86_64 
-and i386 architectural optimization is given in bsd/crypto/aes/i386.
-
-After making some code corrections (aes.h and most assembly code in i386), now you can build a test executable
-that is functionally equivalent to aes in the kernel code.
-
-To generate a test executable for the aes in x86_64/i386 kernel,
-
-       $ makeoptx86.sh
-
-This will build a test executable tstaesoptx86 (x86_64/i386). The executable will automatically detects the 
-CPU clock rates. You specify the number of iterations and the number of 16-byte blocks for simulation. 
-The executable generates (random number) the test data, and calls aes_encrypt_cbc to encrypt the plain data
-into cipher data, and then calls aes_decrypt_cbc to decrypt cipher into decrypted data. Afterwards, it compares
-the decrypted data against the plain data. Should there be a mismatch, the code breaks and exit. 
-Otherwise, it measures the times the system spends on the 2 functions under test. Afterwards, it prints out
-the performance profiling data.
-
-On K5,
-
-$ tstaesoptx86 1000 2560
-device max CPU clock rate = 2659.00 MHz
-40960 bytes per cbc call
- aes_encrypt_cbc : time elapsed =   220.24 usecs,  177.37 MBytes/sec,    14.30 cycles/byte
-  best iteration : time elapsed =   218.30 usecs,  178.94 MBytes/sec,    14.17 cycles/byte
- worst iteration : time elapsed =   286.14 usecs,  136.51 MBytes/sec,    18.58 cycles/byte
-
- aes_decrypt_cbc : time elapsed =   199.85 usecs,  195.46 MBytes/sec,    12.97 cycles/byte
-  best iteration : time elapsed =   198.17 usecs,  197.12 MBytes/sec,    12.86 cycles/byte
- worst iteration : time elapsed =   228.12 usecs,  171.23 MBytes/sec,    14.81 cycles/byte
-
-On K5B (with aesni)
-
-$ tstaesoptx86 1000 256    
-device max CPU clock rate = 2400.00 MHz
-4096 bytes per cbc call
- aes_encrypt_cbc : time elapsed =     6.69 usecs,  583.67 MBytes/sec,     3.92 cycles/byte
-  best iteration : time elapsed =     6.38 usecs,  612.46 MBytes/sec,     3.74 cycles/byte
- worst iteration : time elapsed =     9.72 usecs,  401.96 MBytes/sec,     5.69 cycles/byte
-
- aes_decrypt_cbc : time elapsed =     2.05 usecs, 1902.65 MBytes/sec,     1.20 cycles/byte
-  best iteration : time elapsed =     1.96 usecs, 1997.06 MBytes/sec,     1.15 cycles/byte
- worst iteration : time elapsed =     4.60 usecs,  849.00 MBytes/sec,     2.70 cycles/byte
-
-You can also build a test executable using the generic source code for the i386/x86_64 architecture.
-
-       $ makegenx86.sh
-
-When run on K5,
-
-$ tstaesgenx86 1000 2560   
-device max CPU clock rate = 2659.00 MHz
-40960 bytes per cbc call
- aes_encrypt_cbc : time elapsed =   278.05 usecs,  140.49 MBytes/sec,    18.05 cycles/byte
-  best iteration : time elapsed =   274.63 usecs,  142.24 MBytes/sec,    17.83 cycles/byte
- worst iteration : time elapsed =   309.70 usecs,  126.13 MBytes/sec,    20.10 cycles/byte
-
- aes_decrypt_cbc : time elapsed =   265.43 usecs,  147.17 MBytes/sec,    17.23 cycles/byte
-  best iteration : time elapsed =   262.20 usecs,  148.98 MBytes/sec,    17.02 cycles/byte
- worst iteration : time elapsed =   296.19 usecs,  131.88 MBytes/sec,    19.23 cycles/byte
-
-We can see the current AES implementation in the x86_64 kernel has been improved from 17.83/17.02
-down to 14.12/12.86 cycles/byte for aes_encrypt_cbc and aes_decrypt_cbc, respectively.
-
-
- --------- iOS ---------
-
-Similarly, you can build a test executable for the aes in the armv7 kernel (which uses the generic source code)
-
-       $ makegenarm.sh
-
-Note that you need the iOS SDK installed. We can then copy this executable to iOS devices for simulation.
-
-On N88,
-
-iPhone:~ root# ./tstaesgenarm 1000 2560
-device max CPU clock rate = 600.00 MHz
-40960 bytes per cbc call
- aes_encrypt_cbc : time elapsed =  2890.18 usecs,   13.52 MBytes/sec,    42.34 cycles/byte
-  best iteration : time elapsed =  2692.00 usecs,   14.51 MBytes/sec,    39.43 cycles/byte
- worst iteration : time elapsed = 18248.33 usecs,    2.14 MBytes/sec,   267.31 cycles/byte
-
- aes_decrypt_cbc : time elapsed =  3078.20 usecs,   12.69 MBytes/sec,    45.09 cycles/byte
-  best iteration : time elapsed =  2873.33 usecs,   13.59 MBytes/sec,    42.09 cycles/byte
- worst iteration : time elapsed =  9664.79 usecs,    4.04 MBytes/sec,   141.57 cycles/byte
-
diff --git a/bsd/crypto/aes/test/makegenx86.sh b/bsd/crypto/aes/test/makegenx86.sh

deleted file mode 100755 (executable)

index ea4de6f..0000000
--- a/bsd/crypto/aes/test/makegenx86.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/ksh
-
-cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aescrypt.c -o aescrypt.o
-cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aeskey.c -o aeskey.o
-cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aestab.c -o aestab.o
-
-cc -arch i386 -arch x86_64 -Os tstaes.c aescrypt.o aeskey.o aestab.o -o tstaesgenx86
-rm -fr aescrypt.o aeskey.o aestab.o
diff --git a/bsd/crypto/aes/test/makeoptx86.sh b/bsd/crypto/aes/test/makeoptx86.sh

deleted file mode 100755 (executable)

index 3732e03..0000000
--- a/bsd/crypto/aes/test/makeoptx86.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/ksh
-
-cc -c -Os -arch i386 -arch x86_64 ../i386/AES.s -o AES.o
-cc -c -Os -arch i386 -arch x86_64 ../i386/aes_crypt_hw.s -o aes_crypt_hw.o
-cc -c -Os -arch i386 -arch x86_64 ../i386/aes_key_hw.s -o aes_key_hw.o
-cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_asm.s -o aes_modes_asm.o
-cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_hw.s -o aes_modes_hw.o
-
-cc -Os -arch i386 -arch x86_64 tstaes.c AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o -o tstaesoptx86
-rm -fr AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o
diff --git a/bsd/crypto/aes/test/tstaes.c b/bsd/crypto/aes/test/tstaes.c

deleted file mode 100644 (file)

index 9d186ee..0000000
--- a/bsd/crypto/aes/test/tstaes.c
+++ /dev/null
@@ -1,131 +0,0 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "../aes.h"
-#include <mach/mach_time.h>
-#include <sys/sysctl.h>
-
-
-aes_encrypt_ctx        encrypt_ctx;
-aes_decrypt_ctx        decrypt_ctx;
-
-size_t getFreq()
-{
-    int mib[2];
-    size_t cpufreq, len;
-    mib[0] = CTL_HW;
-    mib[1] = HW_CPU_FREQ;
-    len = sizeof(cpufreq);
-
-    sysctl(mib, 2, &cpufreq, &len, NULL, 0);
-
-    return  cpufreq;
-}
-
-
-uint32_t       cpu_freq;
-
-main(int argc, char **argv)
-{
-
-       char    *plain;
-       char    *cipher;
-       char    *decrypt;
-
-uint32_t       ITERATIONS;
-uint32_t       NUM_BLOCKS;
-uint32_t       data_size;
-
-       char    key[32];
-       char    iv[16];
-       int             checksum=0;
-       int             i, j, iterations;
-       uint64_t    t0, t1, t2, sum=0, max_time=0, min_time=-1, sum1=0, max_time1=0, min_time1=-1;
-    float       time, time_max, time_min, time1, time_max1, time_min1;
-
-       cpu_freq = getFreq();
-
-       if (cpu_freq == 0) {
-               fprintf(stderr, "this appears to be an iPhone device, where cpu_freq can not be detected. set to 800MHz.\n");
-               cpu_freq = 800000000;
-       } else {
-               fprintf(stderr, "device max CPU clock rate = %.2f MHz\n", cpu_freq/1.e6);
-       }
-
-    mach_timebase_info_data_t info;
-    kern_return_t err = mach_timebase_info( &info );
-
-       if (argc!=3) {
-               fprintf(stderr, "usage : %s iterations num_16bytes_block\n", argv[0]);
-               exit(1);
-       }
-       ITERATIONS = atoi(argv[1]);
-       NUM_BLOCKS = atoi(argv[2]);
-       data_size = 16*NUM_BLOCKS;
-
-       plain = malloc(data_size);
-       cipher = malloc(data_size);
-       decrypt = malloc(data_size);
-
-       if ((plain==NULL) || (cipher==NULL) || (decrypt==NULL)) {
-               fprintf(stderr,"malloc error.\n");
-               exit(1);
-       }
-
-       for (i=0;i<data_size;i++) plain[i] = random();
-       for (i=0;i<32;i++) key[i] = random();
-       for (i=0;i<16;i++) iv[i] = random();
-
-       aes_encrypt_key128(key, &encrypt_ctx);
-       aes_decrypt_key128(key, &decrypt_ctx);
-
-       for (iterations=0;iterations<ITERATIONS;iterations++) {
-               t0 = mach_absolute_time();
-
-               // encrypt
-               aes_encrypt_cbc(plain, iv, NUM_BLOCKS, cipher, &encrypt_ctx);
-
-               t1 = mach_absolute_time();
-
-               // decrypt
-               aes_decrypt_cbc(cipher, iv, NUM_BLOCKS, decrypt, &decrypt_ctx);
-
-               t2 = mach_absolute_time();
-
-               for (i=0;i<(16*NUM_BLOCKS);i++) if (plain[i]!=decrypt[i]) {
-                               fprintf(stderr,"error : decrypt != plain. i = %d\n", i);
-                               exit(1);
-               }
-               sum += (t1-t0);
-               sum1 += (t2-t1);
-               t2-=t1;
-               t1-=t0;
-               if (t1>max_time) max_time = t1;
-        if (t1<min_time) min_time = t1;
-               if (t2>max_time1) max_time1 = t2;
-        if (t2<min_time1) min_time1 = t2;
-       }
-
-       time = sum * 1e-9* ((double) info.numer)/((double) info.denom);
-       time_max = max_time * 1e-9* ((double) info.numer)/((double) info.denom);
-    time_min = min_time * 1e-9* ((double) info.numer)/((double) info.denom);
-
-       time1 = sum1 * 1e-9* ((double) info.numer)/((double) info.denom);
-       time_max1 = max_time1 * 1e-9* ((double) info.numer)/((double) info.denom);
-    time_min1 = min_time1 * 1e-9* ((double) info.numer)/((double) info.denom);
-
-       printf("%d bytes per cbc call\n", data_size);
-       printf(" aes_encrypt_cbc : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time/ITERATIONS,data_size*ITERATIONS/1024./1024./time, time*1.*cpu_freq/ITERATIONS/data_size);
-       printf("  best iteration : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time_min,data_size/1024./1024./time_min, time_min*1.*cpu_freq/data_size);
-    printf(" worst iteration : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time_max,data_size/1024./1024./time_max, time_max*1.*cpu_freq/data_size);
-
-       printf("\n");
-
-       printf(" aes_decrypt_cbc : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time1/ITERATIONS,data_size*ITERATIONS/1024./1024./time1, time1*1.*cpu_freq/ITERATIONS/data_size);
-       printf("  best iteration : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time_min1,data_size/1024./1024./time_min1, time_min1*1.*cpu_freq/data_size);
-    printf(" worst iteration : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time_max1,data_size/1024./1024./time_max1, time_max1*1.*cpu_freq/data_size);
-
-       free(plain);
-       free(cipher);
-       free(decrypt);
-}
diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c

index 8703ecb9bf0a5e6f3ff9213a8291e17e5a32491d..016df24e0f6ee39cfeee98fe22151838680d49c5 100644 (file)
--- a/bsd/hfs/hfs_cnode.c
+++ b/bsd/hfs/hfs_cnode.c
@@ -77,7 +77,7 @@ int hfs_set_backingstore (struct vnode *vp, int val) {
         int err = 0;
         
         cp = VTOC(vp);
-       if (vnode_isdir(vp)) {
+       if (!vnode_isreg(vp) && !vnode_isdir(vp)) {
                 return EINVAL;
         }
  
@@ -113,7 +113,7 @@ int hfs_is_backingstore (struct vnode *vp, int *val) {
         struct cnode *cp = NULL;
         int err = 0;
  
-       if (!vnode_isreg(vp)) {
+       if (!vnode_isreg(vp) && !vnode_isdir(vp)) {
                 *val = 0;
                 return 0;
         }
diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c

index 0bac8a3eb2be303bbb233c7468040775067e20a9..adf02520b5e3648d66bbe69508189c6d3a1865cc 100644 (file)
--- a/bsd/hfs/hfs_vfsops.c
+++ b/bsd/hfs/hfs_vfsops.c
@@ -3966,6 +3966,9 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
         hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
         HFS_MOUNT_UNLOCK(hfsmp, TRUE);
  
+       /* Start with a clean journal. */
+       hfs_journal_flush(hfsmp, TRUE);
+
         /*
          * Enclose changes inside a transaction.
          */
@@ -4244,6 +4247,9 @@ out:
         }
         if (transaction_begun) {
                 hfs_end_transaction(hfsmp);
+               hfs_journal_flush(hfsmp, FALSE);
+               /* Just to be sure, sync all data to the disk */
+               (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
         }
  
         return MacToVFSError(error);
diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c

index f2c9c8711c89d69ad9b6906a7121860411bd2cf0..e1f693be2084cac4139ce2502a5324da78942eb7 100644 (file)
--- a/bsd/kern/kern_sysctl.c
+++ b/bsd/kern/kern_sysctl.c
@@ -3321,6 +3321,12 @@ SYSCTL_QUAD(_vm, OID_AUTO, global_no_user_wire_amount, CTLFLAG_RW | CTLFLAG_LOCK
  SYSCTL_QUAD(_vm, OID_AUTO, global_user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_global_user_wire_limit, "");
  SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_user_wire_limit, "");
  
+extern int vm_map_copy_overwrite_aligned_src_not_internal;
+extern int vm_map_copy_overwrite_aligned_src_not_symmetric;
+extern int vm_map_copy_overwrite_aligned_src_large;
+SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_not_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_copy_overwrite_aligned_src_not_internal, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_not_symmetric, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_copy_overwrite_aligned_src_not_symmetric, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_large, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_copy_overwrite_aligned_src_large, 0, "");
  
  
  /*
diff --git a/bsd/kern/mach_process.c b/bsd/kern/mach_process.c

index 9aba89b96e25d0f56c3f4afe12193e9b43f169e3..5294122ff3498f23868f0b2b54a293937ed8bf53 100644 (file)
--- a/bsd/kern/mach_process.c
+++ b/bsd/kern/mach_process.c
@@ -129,10 +129,6 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval)
         AUDIT_ARG(value32, uap->data);
  
         if (uap->req == PT_DENY_ATTACH) {
-#if (DEVELOPMENT || DEBUG) && defined(__arm__)
-               if (PE_i_can_has_debugger(NULL))
-                       return(0);
-#endif
                 proc_lock(p);
                 if (ISSET(p->p_lflag, P_LTRACED)) {
                         proc_unlock(p);
diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c

index 521de769e9c1b439b716f5215872e0668e8bb7fc..8a6356d5a2435982d4732963a3ed30c2fd1ca3bf 100644 (file)
--- a/bsd/kern/uipc_syscalls.c
+++ b/bsd/kern/uipc_syscalls.c
@@ -1847,22 +1847,25 @@ sockargs(struct mbuf **mp, user_addr_t data, int buflen, int type)
         struct mbuf *m;
         int error;
  
-       int alloc_buflen = buflen;
+       size_t alloc_buflen = (size_t)buflen;
+       
+       if(alloc_buflen > INT_MAX/2) 
+               return (EINVAL);
  #ifdef __LP64__
         /* The fd's in the buffer must expand to be pointers, thus we need twice as much space */
         if(type == MT_CONTROL)
                 alloc_buflen = ((buflen - sizeof(struct cmsghdr))*2) + sizeof(struct cmsghdr);
  #endif
-       if ((u_int)alloc_buflen > MLEN) {
-               if (type == MT_SONAME && (u_int)alloc_buflen <= 112)
+       if (alloc_buflen > MLEN) {
+               if (type == MT_SONAME && alloc_buflen <= 112)
                         alloc_buflen = MLEN;            /* unix domain compat. hack */
-               else if ((u_int)alloc_buflen > MCLBYTES)
+               else if (alloc_buflen > MCLBYTES)
                         return (EINVAL);
         }
         m = m_get(M_WAIT, type);
         if (m == NULL)
                 return (ENOBUFS);
-       if ((u_int)alloc_buflen > MLEN) {
+       if (alloc_buflen > MLEN) {
                 MCLGET(m, M_WAIT);
                 if ((m->m_flags & M_EXT) == 0) {
                         m_free(m);
diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h

index 0d9cff9197f31aa84e2d0466411fea89fdfc5ff0..8259186d03ae9e8a1684f5f63925465da9ca58d1 100644 (file)
--- a/bsd/libkern/libkern.h
+++ b/bsd/libkern/libkern.h
@@ -213,15 +213,6 @@ clz(unsigned int num)
         );
         return 31 ^ result;
  
-#elif __arm__ && !__thumb__ && defined(_ARM_ARCH_5)
-       unsigned int result;
-       __asm__ volatile(
-               "clz %0, %1"
-               : "=r" (result)
-               : "r" (num)
-       );
-
-       return result;
  #else
         return num?__builtin_clz(num):__builtin_clz(0);
  #endif
diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c

index 4bb6e1c28da0d4e6df4a1f28056b583cdbebfbd3..833b8ca34e753c69eb77618a779e96b277ed0786 100644 (file)
--- a/bsd/net/ntstat.c
+++ b/bsd/net/ntstat.c
@@ -1248,8 +1248,7 @@ nstat_idle_check(
                                 removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED;
                                 removed.hdr.context = 0;
                                 removed.srcref = dead->srcref;
-                               errno_t result = ctl_enqueuedata(control->kctl, control->unit, &removed, sizeof(removed), CTL_DATA_EOR);
-                               if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result);
+                               (void)ctl_enqueuedata(control->kctl, control->unit, &removed, sizeof(removed), CTL_DATA_EOR);
                                 
                                 // Put this on the list to release later
                                 dead->next = dead_list;
@@ -1318,8 +1317,7 @@ nstat_control_cleanup_source(
                 removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED;
                 removed.hdr.context = 0;
                 removed.srcref = src->srcref;
-               errno_t result = ctl_enqueuedata(state->kctl, state->unit, &removed, sizeof(removed), CTL_DATA_EOR);
-               if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result);
+               (void)ctl_enqueuedata(state->kctl, state->unit, &removed, sizeof(removed), CTL_DATA_EOR);
         }
         
         // Cleanup the source if we found it.
@@ -1551,7 +1549,6 @@ nstat_control_handle_add_request(
         
         if (result != 0)
         {
-               printf("nstat_lookup_entry failed: %d\n", result);
                 return result;
         }
         
@@ -1785,10 +1782,6 @@ nstat_control_handle_query_request(
                         if (result == 0)
                         {
                                 result = ctl_enqueuedata(state->kctl, state->unit, &counts, sizeof(counts), CTL_DATA_EOR);
-                               if (result != 0)
-                               {
-                                       printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result);
-                               }
                         }
                         else
                         {
diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c

index 1fcafd5837e2d3c40ba8402207870989153de134..f32cef303be01d9d7ddbbc70286fcad2d069ae4b 100644 (file)
--- a/bsd/netinet/in_cksum.c
+++ b/bsd/netinet/in_cksum.c
@@ -141,38 +141,6 @@ in_pseudo(u_int a, u_int b, u_int c)
  
  }
  
-#if defined(__arm__) && __ARM_ARCH__ >= 6
-
-extern int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum);
-
-u_int16_t
-inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip,
-    unsigned int len)
-{
-       u_int32_t sum = 0;
-
-       /* sanity check */
-       if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.len < skip + len) {
-               panic("inet_cksum: mbuf len (%d) < off+len (%d+%d)\n",
-                   m->m_pkthdr.len, skip, len);
-       }
-
-       /* include pseudo header checksum? */
-       if (nxt != 0) {
-               struct ip *iph;
-
-               if (m->m_len < sizeof (struct ip))
-                       panic("inet_cksum: bad mbuf chain");
-
-               iph = mtod(m, struct ip *);
-               sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr,
-                   htonl(len + nxt));
-       }
-
-       return (cpu_in_cksum(m, len, skip, sum));
-}
-
-#else
  
  u_int16_t
  inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip,
@@ -304,4 +272,3 @@ skip_start:
         return (~sum & 0xffff);
  }
  
-#endif
diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c

index c6415031996def71ef4815a49d9c3344b1d3077b..2052493abeba91c5253c09a2b3a1a512f1fc1b04 100644 (file)
--- a/bsd/netinet6/esp_input.c
+++ b/bsd/netinet6/esp_input.c
@@ -440,8 +440,8 @@ noreplaycheck:
                     seq >= sav->replay->lastseq)  {
                         struct udphdr *encap_uh = (__typeof__(encap_uh))((caddr_t)ip + off);
                         if (encap_uh->uh_sport &&
-                           encap_uh->uh_sport != sav->remote_ike_port) {
-                               sav->remote_ike_port = encap_uh->uh_sport;
+                           ntohs(encap_uh->uh_sport) != sav->remote_ike_port) {
+                               sav->remote_ike_port = ntohs(encap_uh->uh_sport);
                         }
                 }
                 ip = esp4_input_strip_UDP_encap(m, off);
diff --git a/bsd/netinet6/in6_cksum.c b/bsd/netinet6/in6_cksum.c

index f0352eb728863d338db24a9bdc9dfdd599b93d5a..77dd7e1afb2e7e5ab55b767d42f656bb09213ab1 100644 (file)
--- a/bsd/netinet6/in6_cksum.c
+++ b/bsd/netinet6/in6_cksum.c
@@ -131,91 +131,6 @@
  #include <machine/endian.h>
  
  
-#if defined(__arm__) && __ARM_ARCH__ >= 6
-extern int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum);
-
-u_int16_t
-inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off,
-    unsigned int len)
-{
-       union {
-               uint16_t words[16];
-               struct {
-                       struct in6_addr ip6_src;
-                       struct in6_addr ip6_dst;
-               } addrs;
-       } u;
-       const struct in6_addr *in6_src;
-       const struct in6_addr *in6_dst;
-       const struct ip6_hdr *ip6;
-       uint32_t sum;
-       const uint16_t *w;
-       const char *cp;
-
-       if (off < sizeof (struct ip6_hdr))
-               panic("inet6_cksum: offset too short for IPv6 header");
-       if (m->m_len < sizeof (struct ip6_hdr))
-               panic("inet6_cksum: mbuf too short for IPv6 header");
-
-       if (nxt == 0)
-               return (cpu_in_cksum(m, len, off, 0));
-
-       /*
-        * Compute the equivalent of:
-        * struct ip6_hdr_pseudo ip6;
-        *
-        * bzero(sizeof (*ip6));
-        * ip6.ip6ph_nxt = nxt;
-        * ip6.ip6ph_len = htonl(len);
-        * ipv6.ip6ph_src = mtod(m, struct ip6_hdr *)->ip6_src;
-        * in6_clearscope(&ip6->ip6ph_src);
-        * ipv6.ip6ph_dst = mtod(m, struct ip6_hdr *)->ip6_dst;
-        * in6_clearscope(&ip6->ip6ph_dst);
-        * sum = one_add(&ip6);
-        */
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-       sum = ((len & 0xffff) + ((len >> 16) & 0xffff) + nxt) << 8;
-#else
-       sum = (len & 0xffff) + ((len >> 16) & 0xffff) + nxt;
-#endif
-       cp = mtod(m, const char *);
-       w = (const uint16_t *)(cp + offsetof(struct ip6_hdr, ip6_src));
-       ip6 = (const void *)cp;
-       if ((uintptr_t)w % 2 == 0) {
-               in6_src = &ip6->ip6_src;
-               in6_dst = &ip6->ip6_dst;
-       } else {
-               memcpy(&u, &ip6->ip6_src, 32);
-               w = u.words;
-               in6_src = &u.addrs.ip6_src;
-               in6_dst = &u.addrs.ip6_dst;
-       }
-
-       sum += w[0];
-       if (!IN6_IS_SCOPE_EMBED(in6_src))
-               sum += w[1];
-       sum += w[2];
-       sum += w[3];
-       sum += w[4];
-       sum += w[5];
-       sum += w[6];
-       sum += w[7];
-       w += 8;
-       sum += w[0];
-       if (!IN6_IS_SCOPE_EMBED(in6_dst))
-               sum += w[1];
-       sum += w[2];
-       sum += w[3];
-       sum += w[4];
-       sum += w[5];
-       sum += w[6];
-       sum += w[7];
-
-       return (cpu_in_cksum(m, len, off, sum));
-}
-
-#else
  
  /*
   * Checksum routine for Internet Protocol family headers (Portable Version).
@@ -445,4 +360,3 @@ inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off,
         return (~sum & 0xffff);
  }
  
-#endif
diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c

index 7a0323fde8191f3d361c2874dc657bf4f17ab7ee..484e47c2b156e2445d49efd1097e9cc60f6acc97 100644 (file)
--- a/bsd/nfs/nfs_vfsops.c
+++ b/bsd/nfs/nfs_vfsops.c
@@ -1575,8 +1575,12 @@ nfs_convert_old_nfs_args(mount_t mp, user_addr_t data, vfs_context_t ctx, int ar
         /* copy socket address */
         if (inkernel)
                 bcopy(CAST_DOWN(void *, args.addr), &ss, args.addrlen);
-       else
-               error = copyin(args.addr, &ss, args.addrlen);
+       else {
+               if ((size_t)args.addrlen > sizeof (struct sockaddr_storage))
+                       error = EINVAL;
+               else
+                       error = copyin(args.addr, &ss, args.addrlen);
+       }
         nfsmout_if(error);
         ss.ss_len = args.addrlen;
  
diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c

index 0e8bd67ddabfb1fa4b9550f12a700defd50730ea..2bccd5bb33c138990272a68e6a6f75251354f802 100644 (file)
--- a/bsd/vfs/vfs_cluster.c
+++ b/bsd/vfs/vfs_cluster.c
@@ -231,7 +231,7 @@ uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3);
   * before we issue a synchronous write 
   */
  #define HARD_THROTTLE_MAXCNT   0
-#define HARD_THROTTLE_MAXSIZE  (32 * 1024)
+#define HARD_THROTTLE_MAXSIZE  (256 * 1024)
  
  int hard_throttle_on_root = 0;
  struct timeval priority_IO_timestamp_for_root;
diff --git a/config/MasterVersion b/config/MasterVersion

index 15cb149c858997432361fef40b069d87f5ae8e4d..b5a6d2aac369f7c4e766864f691f252c38339f55 100644 (file)
--- a/config/MasterVersion
+++ b/config/MasterVersion
@@ -1,4 +1,4 @@
-11.2.0
+11.3.0
  
  # The first line of this file contains the master version number for the kernel.
  # All other instances of the kernel version in xnu are derived from this file.
diff --git a/kgmacros b/kgmacros

index edb1e35dbe89a924bfec03734f7f92b110977b4f..a2c6879f82ad2eaf238e1de2b40292de84613fcd 100644 (file)
--- a/kgmacros
+++ b/kgmacros
@@ -2445,13 +2445,13 @@ define zprint_one
      set $kgm_zone = (struct zone *)$arg0
  
      showptr $kgm_zone
-    printf "  %6d ",$kgm_zone->count
+    printf "  %8d ",$kgm_zone->count
      printf "%8x ",$kgm_zone->cur_size
      printf "%8x ",$kgm_zone->max_size
-    printf "%6d ",$kgm_zone->elem_size
+    printf "%8d ",$kgm_zone->elem_size
      printf "%8x ",$kgm_zone->alloc_size
-       printf " %8d ",$kgm_zone->num_allocs
-       printf "%8d ",$kgm_zone->num_frees
+       printf " %16ld ",$kgm_zone->num_allocs
+       printf "%16ld ",$kgm_zone->num_frees
      printf "%s ",$kgm_zone->zone_name
  
      if ($kgm_zone->exhaustible)
@@ -2473,7 +2473,7 @@ end
  define zprint
      printf "ZONE      "
      showptrhdrpad
-    printf "   COUNT   TOT_SZ   MAX_SZ ELT_SZ ALLOC_SZ TOT_ALLOC TOT_FREE NAME\n"
+    printf "     COUNT   TOT_SZ   MAX_SZ   ELT_SZ ALLOC_SZ         TOT_ALLOC         TOT_FREE NAME\n"
      set $kgm_zone_ptr = (struct zone *)first_zone
      while ($kgm_zone_ptr != 0)
          zprint_one $kgm_zone_ptr
@@ -9714,12 +9714,13 @@ define zstack
                 printf "\n--------------- "
  
                 if (zrecords[$index].z_opcode == 1)
-                       printf "ALLOC "
+                       printf "ALLOC  "
                 else
-                       printf "FREE "
+                       printf "FREE  "
                 end
  
-               printf " 0x%x : index %d  :  ztime %d -------------\n", zrecords[$index].z_element, $index, zrecords[$index].z_time
+               showptr zrecords[$index].z_element
+               printf " : index %d  :  ztime %d -------------\n", $index, zrecords[$index].z_time
  
                 set $frame = 0
  
diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h

index 662021550e093264312da1dccc700604e291635d..cb2f9896a0a77f76a87f2cbf862a00ebb183ae37 100644 (file)
--- a/libkern/libkern/c++/OSMetaClass.h
+++ b/libkern/libkern/c++/OSMetaClass.h
@@ -60,8 +60,6 @@ class OSSerialize;
  #if defined(__LP64__)
  /*! @parseOnly */
  #define APPLE_KEXT_LEGACY_ABI  0
-#elif defined(__arm__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define APPLE_KEXT_LEGACY_ABI  0
  #else
  #define APPLE_KEXT_LEGACY_ABI  1
  #endif
diff --git a/libsyscall/wrappers/remove-counter.c b/libsyscall/wrappers/remove-counter.c

index d6a2846d8dc2dabdb8c15122412ced3bb939fc7c..fe41f2757fff32ebb95ecd41a0cfdc72a0214c16 100644 (file)
--- a/libsyscall/wrappers/remove-counter.c
+++ b/libsyscall/wrappers/remove-counter.c
@@ -31,19 +31,11 @@ static int32_t __remove_counter = 0;
  
  __uint64_t
  __get_remove_counter(void) {
-#if defined(__arm__) && !defined(_ARM_ARCH_6)
-       return __remove_counter;
-#else
         return __sync_add_and_fetch(&__remove_counter, 0);
-#endif
  }
  
  void
  __inc_remove_counter(void)
  {
-#if defined(__arm__) && !defined(_ARM_ARCH_6)
-       __remove_counter++;
-#else
         __sync_add_and_fetch(&__remove_counter, 1);
-#endif
  }
diff --git a/osfmk/i386/i386_lock.s b/osfmk/i386/i386_lock.s

index 9ea9f982bc2f38a013ee0c4d05748cfd9ff256ef..0f7bdba3a57ad65b81a968764df74180b58904ff 100644 (file)
--- a/osfmk/i386/i386_lock.s
+++ b/osfmk/i386/i386_lock.s
@@ -214,9 +214,7 @@
  #define PREEMPTION_DISABLE                             \
         incl    %gs:CPU_PREEMPTION_LEVEL
  
-#if MACH_LDEBUG || 1
  #define        PREEMPTION_LEVEL_DEBUG 1        
-#endif
  #if    PREEMPTION_LEVEL_DEBUG
  #define        PREEMPTION_ENABLE                               \
         decl    %gs:CPU_PREEMPTION_LEVEL        ;       \
diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c

index 2e1fbe691ef05d475ebe5f7457a2904afe54059f..604bc202f79afb47653d6f44027974565c72259e 100644 (file)
--- a/osfmk/vm/vm_map.c
+++ b/osfmk/vm/vm_map.c
@@ -1253,6 +1253,7 @@ vm_map_find_space(
         }
         *address = start;
  
+       assert(start < end);
         new_entry->vme_start = start;
         new_entry->vme_end = end;
         assert(page_aligned(new_entry->vme_start));
@@ -1868,6 +1869,7 @@ StartAgain: ;
                          *      new range.
                          */
                         map->size += (end - entry->vme_end);
+                       assert(entry->vme_start < end);
                         entry->vme_end = end;
                         vm_map_store_update_first_free(map, map->first_free);
                         RETURN(KERN_SUCCESS);
@@ -2971,7 +2973,7 @@ vm_map_clip_unnest(
   *     the specified address; if necessary,
   *     it splits the entry into two.
   */
-static void
+void
  vm_map_clip_start(
         vm_map_t        map,
         vm_map_entry_t  entry,
@@ -3038,7 +3040,9 @@ _vm_map_clip_start(
         vm_map_entry_copy_full(new_entry, entry);
  
         new_entry->vme_end = start;
+       assert(new_entry->vme_start < new_entry->vme_end);
         entry->offset += (start - entry->vme_start);
+       assert(start < entry->vme_end);
         entry->vme_start = start;
  
         _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
@@ -3057,7 +3061,7 @@ _vm_map_clip_start(
   *     the specified address; if necessary,
   *     it splits the entry into two.
   */
-static void
+void
  vm_map_clip_end(
         vm_map_t        map,
         vm_map_entry_t  entry,
@@ -3128,8 +3132,10 @@ _vm_map_clip_end(
         new_entry = _vm_map_entry_create(map_header);
         vm_map_entry_copy_full(new_entry, entry);
  
+       assert(entry->vme_start < end);
         new_entry->vme_start = entry->vme_end = end;
         new_entry->offset += (end - entry->vme_start);
+       assert(new_entry->vme_start < new_entry->vme_end);
  
         _vm_map_store_entry_link(map_header, entry, new_entry);
  
@@ -5876,6 +5882,12 @@ start_overwrite:
                                 copy->type = VM_MAP_COPY_ENTRY_LIST;
                                 copy->offset = new_offset;
  
+                               /*
+                                * XXX FBDP
+                                * this does not seem to deal with
+                                * the VM map store (R&B tree)
+                                */
+
                                 total_size -= copy_size;
                                 copy_size = 0;
                                 /* put back remainder of copy in container */
@@ -6520,6 +6532,10 @@ vm_map_copy_overwrite_unaligned(
   *     to the above pass and make sure that no wiring is involved.
   */
  
+int vm_map_copy_overwrite_aligned_src_not_internal = 0;
+int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
+int vm_map_copy_overwrite_aligned_src_large = 0;
+
  static kern_return_t
  vm_map_copy_overwrite_aligned(
         vm_map_t        dst_map,
@@ -6624,6 +6640,26 @@ vm_map_copy_overwrite_aligned(
                                 continue;
                         }
  
+#if !CONFIG_EMBEDDED
+#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024)        /* 64 MB */
+#define __TRADEOFF1_COPY_SIZE (128 * 1024)     /* 128 KB */
+                       if (copy_entry->object.vm_object != VM_OBJECT_NULL &&
+                           copy_entry->object.vm_object->vo_size >= __TRADEOFF1_OBJ_SIZE &&
+                           copy_size <= __TRADEOFF1_COPY_SIZE) {
+                               /*
+                                * Virtual vs. Physical copy tradeoff #1.
+                                *
+                                * Copying only a few pages out of a large
+                                * object:  do a physical copy instead of
+                                * a virtual copy, to avoid possibly keeping
+                                * the entire large object alive because of
+                                * those few copy-on-write pages.
+                                */
+                               vm_map_copy_overwrite_aligned_src_large++;
+                               goto slow_copy;
+                       }
+#endif /* !CONFIG_EMBEDDED */
+
                         if (entry->alias >= VM_MEMORY_MALLOC &&
                             entry->alias <= VM_MEMORY_MALLOC_LARGE_REUSED) {
                                 vm_object_t new_object, new_shadow;
@@ -6637,6 +6673,10 @@ vm_map_copy_overwrite_aligned(
                                         vm_object_lock_shared(new_object);
                                 }
                                 while (new_object != VM_OBJECT_NULL &&
+#if !CONFIG_EMBEDDED
+                                      !new_object->true_share &&
+                                      new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
+#endif /* !CONFIG_EMBEDDED */
                                        new_object->internal) {
                                         new_shadow = new_object->shadow;
                                         if (new_shadow == VM_OBJECT_NULL) {
@@ -6657,9 +6697,24 @@ vm_map_copy_overwrite_aligned(
                                                  * let's go off the optimized
                                                  * path...
                                                  */
+                                               vm_map_copy_overwrite_aligned_src_not_internal++;
                                                 vm_object_unlock(new_object);
                                                 goto slow_copy;
                                         }
+#if !CONFIG_EMBEDDED
+                                       if (new_object->true_share ||
+                                           new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
+                                               /*
+                                                * Same if there's a "true_share"
+                                                * object in the shadow chain, or
+                                                * an object with a non-default
+                                                * (SYMMETRIC) copy strategy.
+                                                */
+                                               vm_map_copy_overwrite_aligned_src_not_symmetric++;
+                                               vm_object_unlock(new_object);
+                                               goto slow_copy;
+                                       }
+#endif /* !CONFIG_EMBEDDED */
                                         vm_object_unlock(new_object);
                                 }
                                 /*
@@ -6752,6 +6807,14 @@ vm_map_copy_overwrite_aligned(
                         kern_return_t           r;
  
                 slow_copy:
+                       if (entry->needs_copy) {
+                               vm_object_shadow(&entry->object.vm_object,
+                                                &entry->offset,
+                                                (entry->vme_end -
+                                                 entry->vme_start));
+                               entry->needs_copy = FALSE;
+                       }
+
                         dst_object = entry->object.vm_object;
                         dst_offset = entry->offset;
  
@@ -6838,7 +6901,8 @@ vm_map_copy_overwrite_aligned(
  
                         start += copy_size;
                         vm_map_lock(dst_map);
-                       if (version.main_timestamp == dst_map->timestamp) {
+                       if (version.main_timestamp == dst_map->timestamp &&
+                           copy_size != 0) {
                                 /* We can safely use saved tmp_entry value */
  
                                 vm_map_clip_end(dst_map, tmp_entry, start);
@@ -7910,6 +7974,7 @@ vm_map_copyin_common(
                 tmp_entry->vme_end = copy_addr + 
                         (tmp_entry->vme_end - tmp_entry->vme_start);
                 tmp_entry->vme_start = copy_addr;
+               assert(tmp_entry->vme_start < tmp_entry->vme_end);
                 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
                 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
         }
@@ -10000,6 +10065,7 @@ vm_map_simplify_entry(
             (this_entry->is_shared == FALSE)
                 ) {
                 _vm_map_store_entry_unlink(&map->hdr, prev_entry);
+               assert(prev_entry->vme_start < this_entry->vme_end);
                 this_entry->vme_start = prev_entry->vme_start;
                 this_entry->offset = prev_entry->offset;
                 if (prev_entry->is_sub_map) {
@@ -11086,6 +11152,7 @@ vm_map_entry_insert(
         new_entry->vme_end = end;
         assert(page_aligned(new_entry->vme_start));
         assert(page_aligned(new_entry->vme_end));
+       assert(new_entry->vme_start < new_entry->vme_end);
  
         new_entry->object.vm_object = object;
         new_entry->offset = offset;
@@ -11288,6 +11355,7 @@ vm_map_remap_extract(
  
                 new_entry->vme_start = map_address;
                 new_entry->vme_end = map_address + tmp_size;
+               assert(new_entry->vme_start < new_entry->vme_end);
                 new_entry->inheritance = inheritance;
                 new_entry->offset = offset;
  
@@ -13203,3 +13271,85 @@ out:
         vm_map_unlock(map);
  }
  #endif
+
+#if !CONFIG_EMBEDDED
+/*
+ * vm_map_entry_should_cow_for_true_share:
+ *
+ * Determines if the map entry should be clipped and setup for copy-on-write
+ * to avoid applying "true_share" to a large VM object when only a subset is
+ * targeted.
+ *
+ * For now, we target only the map entries created for the Objective C
+ * Garbage Collector, which initially have the following properties:
+ *     - alias == VM_MEMORY_MALLOC
+ *     - wired_count == 0
+ *     - !needs_copy
+ * and a VM object with:
+ *     - internal
+ *     - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
+ *     - !true_share
+ *     - vo_size == ANON_CHUNK_SIZE
+ */
+boolean_t
+vm_map_entry_should_cow_for_true_share(
+       vm_map_entry_t  entry)
+{
+       vm_object_t     object;
+
+       if (entry->is_sub_map) {
+               /* entry does not point at a VM object */
+               return FALSE;
+       }
+
+       if (entry->needs_copy) {
+               /* already set for copy_on_write: done! */
+               return FALSE;
+       }
+
+       if (entry->alias != VM_MEMORY_MALLOC) {
+               /* not tagged as an ObjectiveC's Garbage Collector entry */
+               return FALSE;
+       }
+
+       if (entry->wired_count) {
+               /* wired: can't change the map entry... */
+               return FALSE;
+       }
+
+       object = entry->object.vm_object;
+
+       if (object == VM_OBJECT_NULL) {
+               /* no object yet... */
+               return FALSE;
+       }
+
+       if (!object->internal) {
+               /* not an internal object */
+               return FALSE;
+       }
+
+       if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
+               /* not the default copy strategy */
+               return FALSE;
+       }
+
+       if (object->true_share) {
+               /* already true_share: too late to avoid it */
+               return FALSE;
+       }
+
+       if (object->vo_size != ANON_CHUNK_SIZE) {
+               /* not an object created for the ObjC Garbage Collector */
+               return FALSE;
+       }
+
+       /*
+        * All the criteria match: we have a large object being targeted for "true_share".
+        * To limit the adverse side-effects linked with "true_share", tell the caller to
+        * try and avoid setting up the entire object for "true_share" by clipping the
+        * targeted range and setting it up for copy-on-write.
+        */
+       return TRUE;
+}
+#endif /* !CONFIG_EMBEDDED */
diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h

index d278598582c6fe03e105ef59c839a0bc601844d1..d8ab731e9157db01391f5424417b6ef21687647e 100644 (file)
--- a/osfmk/vm/vm_map.h
+++ b/osfmk/vm/vm_map.h
@@ -468,6 +468,19 @@ extern kern_return_t vm_map_find_space(
                                 int                     flags,
                                 vm_map_entry_t          *o_entry);      /* OUT */
  
+extern void vm_map_clip_start(
+       vm_map_t        map,
+       vm_map_entry_t  entry,
+       vm_map_offset_t endaddr);
+extern void vm_map_clip_end(
+       vm_map_t        map,
+       vm_map_entry_t  entry,
+       vm_map_offset_t endaddr);
+#if !CONFIG_EMBEDDED
+extern boolean_t vm_map_entry_should_cow_for_true_share(
+       vm_map_entry_t  entry);
+#endif /* !CONFIG_EMBEDDED */
+
  /* Lookup map entry containing or the specified address in the given map */
  extern boolean_t       vm_map_lookup_entry(
                                 vm_map_t                map,
diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c

index 58148a96475935f257adedf52987277bc8f8a399..ccfcd062fe550b562a7dff3957d6dc95025a00c9 100644 (file)
--- a/osfmk/vm/vm_map_store.c
+++ b/osfmk/vm/vm_map_store.c
@@ -101,6 +101,7 @@ void        vm_map_store_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_
  void
  _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_where, vm_map_entry_t entry)
  {
+       assert(entry->vme_start < entry->vme_end);
         vm_map_store_entry_link_ll(mapHdr, after_where, entry);
  #ifdef VM_MAP_STORE_USE_RB
         vm_map_store_entry_link_rb(mapHdr, after_where, entry);
diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c

index 1c0138d82975577c01addda09628be7dc0d55660..2f7d54e3c1d05b002742f6367b3b89505ad6adef 100644 (file)
--- a/osfmk/vm/vm_object.c
+++ b/osfmk/vm/vm_object.c
@@ -3894,6 +3894,10 @@ vm_object_shadow(
         register vm_object_t    result;
  
         source = *object;
+       assert(source != VM_OBJECT_NULL);
+       if (source == VM_OBJECT_NULL)
+               return FALSE;
+
  #if 0
         /*
          * XXX FBDP
diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c

index acf4d64bd21732a4cc4f8a3c37be5f2cfe2b3a0b..0761db5ef7d56dff14dcfa13d52c2d5144980662 100644 (file)
--- a/osfmk/vm/vm_pageout.c
+++ b/osfmk/vm/vm_pageout.c
@@ -3956,10 +3956,30 @@ REDISCOVER_ENTRY:
  
                         return KERN_SUCCESS;
                 }
+
+               if (entry->is_sub_map) {
+                       vm_map_t        submap;
+
+                       submap = entry->object.sub_map;
+                       local_start = entry->vme_start;
+                       local_offset = entry->offset;
+
+                       vm_map_reference(submap);
+                       vm_map_unlock_read(map);
+
+                       ret = vm_map_create_upl(submap, 
+                                               local_offset + (offset - local_start), 
+                                               upl_size, upl, page_list, count, flags);
+                       vm_map_deallocate(submap);
+
+                       return ret;
+               }
+
                 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
                         if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE)
                                         *upl_size = MAX_UPL_SIZE * PAGE_SIZE;
                 }
+
                 /*
                  *      Create an object if necessary.
                  */
@@ -3978,6 +3998,42 @@ REDISCOVER_ENTRY:
                                 vm_map_unlock_read(map);
                                 return KERN_PROTECTION_FAILURE;
                         }
+
+#if !CONFIG_EMBEDDED
+                       local_object = entry->object.vm_object;
+                       if (vm_map_entry_should_cow_for_true_share(entry) &&
+                           local_object->vo_size > *upl_size &&
+                           *upl_size != 0) {
+                               vm_prot_t       prot;
+
+                               /*
+                                * Set up the targeted range for copy-on-write to avoid
+                                * applying true_share/copy_delay to the entire object.
+                                */
+
+                               if (vm_map_lock_read_to_write(map)) {
+                                       goto REDISCOVER_ENTRY;
+                               }
+
+                               vm_map_clip_start(map, entry, vm_map_trunc_page(offset));
+                               vm_map_clip_end(map, entry, vm_map_round_page(offset + *upl_size));
+                               prot = entry->protection & ~VM_PROT_WRITE;
+                               if (override_nx(map, entry->alias) && prot)
+                                       prot |= VM_PROT_EXECUTE;
+                               vm_object_pmap_protect(local_object,
+                                                      entry->offset,
+                                                      entry->vme_end - entry->vme_start,
+                                                      ((entry->is_shared || map->mapped)
+                                                       ? PMAP_NULL
+                                                       : map->pmap),
+                                                      entry->vme_start,
+                                                      prot);
+                               entry->needs_copy = TRUE;
+
+                               vm_map_lock_write_to_read(map);
+                       }
+#endif /* !CONFIG_EMBEDDED */
+
                         if (entry->needs_copy)  {
                                 /*
                                  * Honor copy-on-write for COPY_SYMMETRIC
@@ -4012,23 +4068,6 @@ REDISCOVER_ENTRY:
                                 goto REDISCOVER_ENTRY;
                         }
                 }
-               if (entry->is_sub_map) {
-                       vm_map_t        submap;
-
-                       submap = entry->object.sub_map;
-                       local_start = entry->vme_start;
-                       local_offset = entry->offset;
-
-                       vm_map_reference(submap);
-                       vm_map_unlock_read(map);
-
-                       ret = vm_map_create_upl(submap, 
-                                               local_offset + (offset - local_start), 
-                                               upl_size, upl, page_list, count, flags);
-                       vm_map_deallocate(submap);
-
-                       return ret;
-               }
                 if (sync_cow_data) {
                         if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
                                 local_object = entry->object.vm_object;
diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c

index de18c16a1f3e14d0afddcda7f88e17a61ccbe8a8..8271d71b2c1aa9499cbb151103c9aba7bb2fb6ee 100644 (file)
--- a/osfmk/vm/vm_user.c
+++ b/osfmk/vm/vm_user.c
@@ -1833,6 +1833,8 @@ mach_make_memory_entry_64(
         vm_prot_t               original_protections, mask_protections;
         unsigned int            wimg_mode;
  
+       boolean_t               force_shadow = FALSE;
+
         if (((permission & 0x00FF0000) &
              ~(MAP_MEM_ONLY |
                MAP_MEM_NAMED_CREATE |
@@ -2173,6 +2175,35 @@ redo_lookup:
                         }
                 }
  
+#if !CONFIG_EMBEDDED
+               if (vm_map_entry_should_cow_for_true_share(map_entry) &&
+                   object->vo_size > map_size &&
+                   map_size != 0) {
+                       /*
+                        * Set up the targeted range for copy-on-write to
+                        * limit the impact of "true_share"/"copy_delay" to
+                        * that range instead of the entire VM object...
+                        */
+                       
+                       vm_object_unlock(object);
+                       if (vm_map_lock_read_to_write(target_map)) {
+                               vm_object_deallocate(object);
+                               target_map = original_map;
+                               goto redo_lookup;
+                       }
+
+                       vm_map_clip_start(target_map, map_entry, vm_map_trunc_page(offset));
+                       vm_map_clip_end(target_map, map_entry, vm_map_round_page(offset) + map_size);
+                       force_shadow = TRUE;
+
+                       map_size = map_entry->vme_end - map_entry->vme_start;
+                       total_size = map_size;
+
+                       vm_map_lock_write_to_read(target_map);
+                       vm_object_lock(object);
+               }
+#endif /* !CONFIG_EMBEDDED */
+
                 if(object->internal) {
                         /* vm_map_lookup_locked will create a shadow if   */
                         /* needs_copy is set but does not check for the   */
@@ -2180,9 +2211,11 @@ redo_lookup:
                         /* set up an object which will not be pulled from */
                         /* under us.  */
  
-                       if ((map_entry->needs_copy  || object->shadowed ||
-                            (object->vo_size > total_size))
-                                       && !object->true_share) {
+                       if (force_shadow ||
+                           ((map_entry->needs_copy  ||
+                             object->shadowed ||
+                             (object->vo_size > total_size)) &&
+                            !object->true_share)) {
                                 /*
                                  * We have to unlock the VM object before
                                  * trying to upgrade the VM map lock, to
diff --git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s

index fe6cb1295e553ec76693a639bcdd33164778ed28..50bc8b99110389c4b88b3dd2f63b371f99044d03 100644 (file)
--- a/osfmk/x86_64/idt64.s
+++ b/osfmk/x86_64/idt64.s
@@ -268,14 +268,13 @@ L_32bit_dispatch: /* 32-bit user task */
         mov     %eax, R32_EIP(%rsp)
         mov     ISC32_RFLAGS(%rsp), %eax
         mov     %eax, R32_EFLAGS(%rsp)
-       mov     ISC32_CS(%rsp), %esi            /* %esi := %cs for later */
-
-       mov     %esi, R32_CS(%rsp)
         mov     ISC32_RSP(%rsp), %eax
         mov     %eax, R32_UESP(%rsp)
         mov     ISC32_SS(%rsp), %eax
         mov     %eax, R32_SS(%rsp)
  L_32bit_dispatch_after_fault:
+       mov     ISC32_CS(%rsp), %esi            /* %esi := %cs for later */
+       mov     %esi, R32_CS(%rsp)
         mov     ISC32_TRAPNO(%rsp), %ebx        /* %ebx := trapno for later */
         mov     %ebx, R32_TRAPNO(%rsp)
         mov     ISC32_ERR(%rsp), %eax
diff --git a/security/mac_base.c b/security/mac_base.c

index 1b67d3c0e47055b56d90bee5638bed3e0ba9324a..33dd044578e772d20f6118353239eec72ac8db25 100644 (file)
--- a/security/mac_base.c
+++ b/security/mac_base.c
@@ -167,9 +167,6 @@ SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, CTLFLAG_RW | CTLFLAG_LOCKED,
         &mac_label_mbufs, 0, "Label all MBUFs");
  #endif
  
-#if !defined(CONFIG_MACF_ALWAYS_LABEL_MBUF) && 0
-static int     mac_labelmbufs = 0;
-#endif
  
  /*
   * Flag to indicate whether or not we should allocate label storage for
@@ -744,26 +741,6 @@ mac_policy_removefrom_labellist(mac_policy_handle_t handle)
  static void
  mac_policy_updateflags(void)
  {
-#if !defined(CONFIG_MACF_ALWAYS_LABEL_MBUF) && 0 /* port to new list style */
-
-       struct mac_policy_conf *tmpc;
-       int labelmbufs;
-
-       mac_policy_assert_exclusive();
-
-       labelmbufs = 0;
-
-       /* XXX - convert to new list structure */
-       LIST_FOREACH(tmpc, &mac_static_policy_list, mpc_list) {
-               if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS)
-                       labelmbufs++;
-       }
-       LIST_FOREACH(tmpc, &mac_policy_list, mpc_list) {
-               if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS)
-                       labelmbufs++;
-       }
-       mac_labelmbufs = (labelmbufs != 0);
-#endif
  }
  
  static __inline void
author	Apple <opensource@apple.com>
	Thu, 2 Feb 2012 16:16:40 +0000 (16:16 +0000)
committer	Apple <opensource@apple.com>
	Thu, 2 Feb 2012 16:16:40 +0000 (16:16 +0000)
bsd/crypto/aes/gen/aesopt.h		patch \| blob \| blame \| history
bsd/crypto/aes/i386/aes_modes_hw.s		patch \| blob \| blame \| history
bsd/crypto/aes/test/ReadMe.txt	[deleted file]	patch \| blob \| blame \| history
bsd/crypto/aes/test/makegenx86.sh	[deleted file]	patch \| blob \| blame \| history
bsd/crypto/aes/test/makeoptx86.sh	[deleted file]	patch \| blob \| blame \| history
bsd/crypto/aes/test/tstaes.c	[deleted file]	patch \| blob \| blame \| history
bsd/hfs/hfs_cnode.c		patch \| blob \| blame \| history
bsd/hfs/hfs_vfsops.c		patch \| blob \| blame \| history
bsd/kern/kern_sysctl.c		patch \| blob \| blame \| history
bsd/kern/mach_process.c		patch \| blob \| blame \| history
bsd/kern/uipc_syscalls.c		patch \| blob \| blame \| history
bsd/libkern/libkern.h		patch \| blob \| blame \| history
bsd/net/ntstat.c		patch \| blob \| blame \| history
bsd/netinet/in_cksum.c		patch \| blob \| blame \| history
bsd/netinet6/esp_input.c		patch \| blob \| blame \| history
bsd/netinet6/in6_cksum.c		patch \| blob \| blame \| history
bsd/nfs/nfs_vfsops.c		patch \| blob \| blame \| history
bsd/vfs/vfs_cluster.c		patch \| blob \| blame \| history
config/MasterVersion		patch \| blob \| blame \| history
kgmacros		patch \| blob \| blame \| history
libkern/libkern/c++/OSMetaClass.h		patch \| blob \| blame \| history
libsyscall/wrappers/remove-counter.c		patch \| blob \| blame \| history
osfmk/i386/i386_lock.s		patch \| blob \| blame \| history
osfmk/vm/vm_map.c		patch \| blob \| blame \| history
osfmk/vm/vm_map.h		patch \| blob \| blame \| history
osfmk/vm/vm_map_store.c		patch \| blob \| blame \| history
osfmk/vm/vm_object.c		patch \| blob \| blame \| history
osfmk/vm/vm_pageout.c		patch \| blob \| blame \| history
osfmk/vm/vm_user.c		patch \| blob \| blame \| history
osfmk/x86_64/idt64.s		patch \| blob \| blame \| history
security/mac_base.c		patch \| blob \| blame \| history