From e2d2fc5c71f7d145cba7267989251af45e3bb5ba Mon Sep 17 00:00:00 2001 From: Apple Date: Thu, 2 Feb 2012 16:16:40 +0000 Subject: [PATCH] xnu-1699.24.23.tar.gz --- bsd/crypto/aes/gen/aesopt.h | 17 - bsd/crypto/aes/i386/aes_modes_hw.s | 3245 +++++++++++++------------- bsd/crypto/aes/test/ReadMe.txt | 97 - bsd/crypto/aes/test/makegenx86.sh | 8 - bsd/crypto/aes/test/makeoptx86.sh | 10 - bsd/crypto/aes/test/tstaes.c | 131 -- bsd/hfs/hfs_cnode.c | 4 +- bsd/hfs/hfs_vfsops.c | 6 + bsd/kern/kern_sysctl.c | 6 + bsd/kern/mach_process.c | 4 - bsd/kern/uipc_syscalls.c | 13 +- bsd/libkern/libkern.h | 9 - bsd/net/ntstat.c | 11 +- bsd/netinet/in_cksum.c | 33 - bsd/netinet6/esp_input.c | 4 +- bsd/netinet6/in6_cksum.c | 86 - bsd/nfs/nfs_vfsops.c | 8 +- bsd/vfs/vfs_cluster.c | 2 +- config/MasterVersion | 2 +- kgmacros | 17 +- libkern/libkern/c++/OSMetaClass.h | 2 - libsyscall/wrappers/remove-counter.c | 8 - osfmk/i386/i386_lock.s | 2 - osfmk/vm/vm_map.c | 156 +- osfmk/vm/vm_map.h | 13 + osfmk/vm/vm_map_store.c | 1 + osfmk/vm/vm_object.c | 4 + osfmk/vm/vm_pageout.c | 73 +- osfmk/vm/vm_user.c | 39 +- osfmk/x86_64/idt64.s | 5 +- security/mac_base.c | 23 - 31 files changed, 1931 insertions(+), 2108 deletions(-) delete mode 100644 bsd/crypto/aes/test/ReadMe.txt delete mode 100755 bsd/crypto/aes/test/makegenx86.sh delete mode 100755 bsd/crypto/aes/test/makeoptx86.sh delete mode 100644 bsd/crypto/aes/test/tstaes.c diff --git a/bsd/crypto/aes/gen/aesopt.h b/bsd/crypto/aes/gen/aesopt.h index fc28e4a48..a00794865 100644 --- a/bsd/crypto/aes/gen/aesopt.h +++ b/bsd/crypto/aes/gen/aesopt.h @@ -283,9 +283,6 @@ assembler code routines for encryption and decryption with the C code only providing key scheduling */ -#if 0 && !defined(AES_ASM) -#define AES_ASM -#endif /* 3. 
BYTE ORDER WITHIN 32 BIT WORDS @@ -316,15 +313,7 @@ NOTE: Assembler code versions rely on PLATFORM_BYTE_ORDER being set */ -#if 1 || defined(AES_ASM) #define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER -#elif 0 -#define ALGORITHM_BYTE_ORDER BRG_LITTLE_ENDIAN -#elif 0 -#define ALGORITHM_BYTE_ORDER BRG_BIG_ENDIAN -#else -#error The algorithm byte order is not defined -#endif /* 4. FAST INPUT/OUTPUT OPERATIONS. @@ -342,9 +331,6 @@ assumed that access to byte arrays as if they are arrays of 32-bit words will not cause problems when such accesses are misaligned. */ -#if 0 && !defined(_MSC_VER) -#define SAFE_IO -#endif /* 5. LOOP UNROLLING @@ -429,9 +415,6 @@ it seems to sometimes cause trouble for the VC++ version 6 compiler. */ -#if 0 && defined(_MSC_VER) && (_MSC_VER >= 1300) -#define TABLE_ALIGN 64 -#endif /* 10. INTERNAL TABLE CONFIGURATION diff --git a/bsd/crypto/aes/i386/aes_modes_hw.s b/bsd/crypto/aes/i386/aes_modes_hw.s index c9702eaec..b9e35085c 100644 --- a/bsd/crypto/aes/i386/aes_modes_hw.s +++ b/bsd/crypto/aes/i386/aes_modes_hw.s @@ -1,1622 +1,1623 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. 
- - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 31/01/2006 - - These subroutines implement multiple block AES modes for ECB, CBC, CFB, - OFB and CTR encryption, The code provides support for the VIA Advanced - Cryptography Engine (ACE). - - NOTE: In the following subroutines, the AES contexts (ctx) must be - 16 byte aligned if VIA ACE is being used -*/ - -/* ---------------------------------------------------------------------------------------------------------------- - - aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : - - For simplicity, I am assuming all variables are in 128-bit data type. - - aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx) - { - while(num_blk--) { - *iv ^= *ibuf++; - aes_encrypt(iv, iv, ctx); - *obuf++ = *iv; - } - return 0; - } - - The following is an implementation of this function using Intel AESNI. - This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. - Developer should still call _aes_encrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch - to this aesni-based function should it detecs that aesni is available. - Blindly call this function SURELY will cause a CRASH on systems with no aesni support. - - Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks - are serially chained. This prevents us from arranging several blocks for encryption in parallel. 
- - ----------------------------------------------------------------------------------------------------------------*/ - - .text - .align 4,0x90 - .globl _aes_encrypt_cbc_hw -_aes_encrypt_cbc_hw: - - // push/save registers for local use -#if defined __i386__ - - push %ebp - movl %esp, %ebp - push %ebx - push %edi - - #define sp %esp - -#else // __x86_64__ - - push %rbp - mov %rsp, %rbp - push %rbx - push %r13 - push %r14 - push %r15 - - #define sp %rsp - -#endif - - // if this is kernel code, need to save used xmm registers -#ifdef KERNEL - -#if defined __i386__ - sub $(8*16), %esp // for possible xmm0-xmm7 save/restore -#else - sub $(16*16), %rsp // xmm0-xmm15 save/restore -#endif - - movaps %xmm0, (sp) - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm3, 48(sp) - movaps %xmm4, 64(sp) - movaps %xmm5, 80(sp) - movaps %xmm6, 96(sp) - movaps %xmm7, 112(sp) -#if defined __x86_64__ - movaps %xmm8, 16*8(sp) - movaps %xmm9, 16*9(sp) - movaps %xmm10, 16*10(sp) - movaps %xmm11, 16*11(sp) - movaps %xmm12, 16*12(sp) - movaps %xmm13, 16*13(sp) - movaps %xmm14, 16*14(sp) - movaps %xmm15, 16*15(sp) -#endif // __x86_64__ - -#endif // KERNEL - - #define iv %xmm0 - -#ifdef __i386__ - - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), iv // iv = in_iv - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %ecx // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %ecx - #define ctx %edx - -#else - - mov %rdi, %rbx // ibuf - movups (%rsi), iv // iv = in_iv - mov %rdx, %r13 // num_blk - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - - mov 240(ctx), %eax // aes length - cmp $160, %eax // aes-128 encrypt ? - je L_encrypt_128 - cmp $192, %eax // aes-192 encrypt ? - je L_encrypt_192 - cmp $224, %eax // aes-256 encrypt ? 
- je L_encrypt_256 - mov $-1, %eax // return error - jmp L_error - - // - // aes-128 encrypt_cbc operation, up to L_HW_cbc_done - // - -L_encrypt_128: - - cmp $1, num_blk // check number of block - jl L_HW_cbc_done // should it be less than 1, nothing to do - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm2, iv // 1st instruction inside aes_encrypt - pxor %xmm1, iv // *iv ^= *ibuf - - // finishing up the rest of aes_encrypt - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenclast %xmm12, iv -#else - movups 96(ctx), %xmm1 // key6 - aesenc %xmm1, iv - movups 112(ctx), %xmm1 // key7 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 // key8 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 // key9 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 // keyA - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, obuf // obuf++; - add $16, ibuf // ibuf++; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - // the following will be branched to from all other cases (encrypt/decrypt 128/192/256) - -L_HW_cbc_done: - - xor %eax, %eax // to return CRYPT_OK - -L_error: - - // if kernel, restore xmm registers -#ifdef KERNEL - movaps 0(sp), %xmm0 - movaps 16(sp), %xmm1 - movaps 32(sp), %xmm2 - movaps 48(sp), %xmm3 - movaps 64(sp), %xmm4 - movaps 80(sp), %xmm5 - movaps 96(sp), %xmm6 - movaps 112(sp), %xmm7 -#if 
defined __x86_64__ - movaps 16*8(sp), %xmm8 - movaps 16*9(sp), %xmm9 - movaps 16*10(sp), %xmm10 - movaps 16*11(sp), %xmm11 - movaps 16*12(sp), %xmm12 - movaps 16*13(sp), %xmm13 - movaps 16*14(sp), %xmm14 - movaps 16*15(sp), %xmm15 -#endif // __x86_64__ -#endif // KERNEL - - // release used stack memory, restore used callee-saved registers, and return -#if defined __i386__ -#ifdef KERNEL - add $(8*16), %esp -#endif - pop %edi - pop %ebx -#else -#ifdef KERNEL - add $(16*16), %rsp -#endif - pop %r15 - pop %r14 - pop %r13 - pop %rbx -#endif - leave - ret - - // - // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_encrypt_192: - - cmp $1, num_blk // check number of block - jl L_HW_cbc_done // should it be less than 1, nothing to do - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA - movups 176(ctx), %xmm13 // keyB - movups 192(ctx), %xmm14 // keyC -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm1, iv // *iv ^= ibuf - - // aes_encrypt(iv, iv, ctx); - - pxor %xmm2, iv - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenc %xmm12, iv - aesenc %xmm13, iv - aesenclast %xmm14, iv -#else - movups 96(ctx), %xmm1 - aesenc %xmm1, iv - movups 112(ctx), %xmm1 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 - aesenc %xmm1, iv - movups 176(ctx), %xmm1 - aesenc %xmm1, iv - movups 
192(ctx), %xmm1 - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, ibuf // ibuf++ - add $16, obuf // obuf++ - - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done // share with the common exit code - - // - // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_encrypt_256: - - cmp $1, num_blk // check number of block - jl L_HW_cbc_done // should it be less than 1, nothing to do - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA - movups 176(ctx), %xmm13 // keyB - movups 192(ctx), %xmm14 // keyC - movups 208(ctx), %xmm15 // keyD - // movups 224(ctx), %xmm1 // keyE -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm1, iv // *iv ^= ibuf - - // aes_encrypt(iv, iv, ctx); - pxor %xmm2, iv - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - movups 224(ctx), %xmm1 // keyE - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenc %xmm12, iv - aesenc %xmm13, iv - aesenc %xmm14, iv - aesenc %xmm15, iv - aesenclast %xmm1, iv -#else - movups 96(ctx), %xmm1 // key6 - aesenc %xmm1, iv - movups 112(ctx), %xmm1 // key7 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 // key8 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 // key9 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 // keyA - aesenc %xmm1, iv - movups 176(ctx), %xmm1 // keyB - aesenc %xmm1, iv - movups 192(ctx), %xmm1 // keyC - aesenc %xmm1, iv - movups 208(ctx), %xmm1 // keyD - aesenc %xmm1, iv - 
movups 224(ctx), %xmm1 // keyE - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, ibuf // ibuf++ - add $16, obuf // obuf++ - - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done // share with the common exit code - - - - // - // --------- END of aes_encrypt_cbc_hw ------------------- - // - - -/* ---------------------------------------------------------------------------------------------------------------- - - aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : - - For simplicity, I am assuming all variables are in 128-bit data type. - - aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx) - { - while(num_blk--) { - aes_decrypt(ibuf, obuf, ctx); - *obuf++ ^= *iv; - *iv = *ibuf++; - } - return 0; - } - - The following is an implementation of this function using Intel AESNI. - This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. - Developer should still call _aes_decrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch - to this aesni-based function should it detecs that aesni is available. - Blindly call this function SURELY will cause a CRASH on systems with no aesni support. - - Note that the decryption operation is not related over blocks. - This gives opportunity of arranging aes_decrypt operations in parallel to speed up code. - This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) - The following assembly code exploits this idea to achieve ~ 1.4 speed up in aes_decrypt_cbc. 
- - Example C code for packing 4 blocks in an iteration is shown as follows: - - while ((num_blk-=4)>=0) { - - // the following 4 functions can be interleaved to exploit parallelism - aes_decrypt(ibuf, obuf, ctx); - aes_decrypt(ibuf+1, obuf+1, ctx); - aes_decrypt(ibuf+2, obuf+2, ctx); - aes_decrypt(ibuf+3, obuf+3, ctx); - - obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - *iv = ibuf[3]; ibuf += 4; obuf += 4; - } - num_blk+=4; - - ----------------------------------------------------------------------------------------------------------------*/ - - .text - .align 4,0x90 - .globl _aes_decrypt_cbc_hw -_aes_decrypt_cbc_hw: - - // push/save registers for local use -#if defined __i386__ - - push %ebp - movl %esp, %ebp - push %ebx // ibuf - push %edi // obuf - - #define sp %esp - -#else // __x86_64__ - - push %rbp - mov %rsp, %rbp - push %rbx - push %r13 - push %r14 - push %r15 - - #define sp %rsp - -#endif - - - // if kernel, allocate stack space to save xmm registers -#ifdef KERNEL -#if defined __i386__ - sub $(8*16), %esp -#else - sub $(16*16), %rsp -#endif - movaps %xmm0, (sp) - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm3, 48(sp) - movaps %xmm4, 64(sp) - movaps %xmm5, 80(sp) - movaps %xmm6, 96(sp) - movaps %xmm7, 112(sp) -#if defined __x86_64__ - movaps %xmm8, 16*8(sp) - movaps %xmm9, 16*9(sp) - movaps %xmm10, 16*10(sp) - movaps %xmm11, 16*11(sp) - movaps %xmm12, 16*12(sp) - movaps %xmm13, 16*13(sp) - movaps %xmm14, 16*14(sp) - movaps %xmm15, 16*15(sp) -#endif // __x86_64__ -#endif - - #undef iv - #define iv %xmm0 - -#if defined __i386__ - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), iv // iv = in_iv - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %ecx // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %ecx - #define ctx %edx - -#else // __x86_64__, rdi/rsi/rdx/rcx/r8 - - mov %rdi, %rbx // ibuf - movups (%rsi), iv // iv = in_iv - mov %rdx, %r13 // 
num_blk - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - - mov 240(ctx), %eax // aes length - cmp $160, %eax // aes-128 decrypt - je L_decrypt_128 - cmp $192, %eax // aes-192 decrypt - je L_decrypt_192 - cmp $224, %eax // aes-256 decrypt - je L_decrypt_256 - - mov $-1, %eax // wrong aes length, to return -1 - jmp L_error // early exit due to wrong aes length - - - // - // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_128: - - cmp $1, num_blk - jl L_HW_cbc_done // if num_blk < 1, early return - - // aes-128 decrypt expanded keys - movups 160(ctx), %xmm3 - movups 144(ctx), %xmm4 - movups 128(ctx), %xmm5 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#if defined __x86_64__ - movups 80(ctx), %xmm8 - movups 64(ctx), %xmm9 - movups 48(ctx), %xmm10 - movups 32(ctx), %xmm11 - movups 16(ctx), %xmm12 - movups 0(ctx), %xmm13 -#endif - - // performs 4 block decryption in an iteration to exploit decrypt in parallel - - // while ((num_blk-=4)>=0) { - // aes_decrypt(ibuf, obuf, ctx); - // aes_decrypt(ibuf+1, obuf+1, ctx); - // aes_decrypt(ibuf+2, obuf+2, ctx); - // aes_decrypt(ibuf+3, obuf+3, ctx); - // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - // *iv = ibuf[3]; ibuf += 4; obuf += 4; - // } - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code - -0: - - -#if defined __x86_64__ - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // for x86_64, the expanded keys are already stored in xmm3-xmm13 - - // aes-128 decrypt round 0 per 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - // aes-128 decrypt round 1 per 4 blocks - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, 
%xmm14 - aesdec %xmm4, %xmm15 - - // aes-128 decrypt round 2 per 4 blocks - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - // aes-128 decrypt round 3 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - // aes-128 decrypt round 4 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - // aes-128 decrypt round 5 per 4 blocks - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - // aes-128 decrypt round 6 per 4 blocks - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - // aes-128 decrypt round 7 per 4 blocks - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - // aes-128 decrypt round 8 per 4 blocks - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - // aes-128 decrypt round 9 per 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - // aes-128 decrypt round 10 (last) per 4 blocks - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; - movups 16(ibuf), iv // ibuf[1] - pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; - movups 32(ibuf), iv // ibuf[2] - pxor iv, %xmm15 // obuf[3] ^= obuf[2]; - movups 48(ibuf), iv // *iv = ibuf[3] - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - -#else - - // aes_decrypt_cbc per 4 blocks using aes-128 for i386 - // xmm1/xmm2/xmm4/xmm5 used for obuf per block - // xmm3 = key0 - // xmm0 = iv - // xmm6/xmm7 dynamically load with other expanded keys - - movups (ibuf), %xmm1 // tmp = 1st ibuf - 
movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - - movups 144(ctx), %xmm6 // key1 - - // aes-128 decrypt round 0 per 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 128(ctx), %xmm7 // key2 - - // aes-128 decrypt round 1 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 // key3 - - // aes-128 decrypt round 2 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 // key4 - - // aes-128 decrypt round 3 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 // key5 - - // aes-128 decrypt round 4 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 // key6 - - // aes-128 decrypt round 5 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 // key7 - - // aes-128 decrypt round 6 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 // key8 - - // aes-128 decrypt round 7 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 // key9 - - // aes-128 decrypt round 8 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 // keyA - - // aes-128 decrypt round 9 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - // aes-128 decrypt round 10 (last) per 4 blocks - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - 
aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf -#endif - - add $64, ibuf // ibuf += 4; - add $64, obuf // obuf += 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - -#if defined __i386__ - // updated as they might be needed as expanded keys in the remaining - movups 144(ctx), %xmm4 - movups 128(ctx), %xmm5 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#endif - - test $2, num_blk // check whether num_blk has 2 blocks - je 9f // if num_blk & 2 == 0, skip the per-pair processing code - - // do the remaining 2 blocks together - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - - // aes_decrypt - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 -#if defined __x86_64__ - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 -#else - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, 
%xmm2 - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#endif - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0] - movups 16(ibuf), iv // *iv = ibuf[1] - - movups %xmm1, (obuf) // write obuf[0] - movups %xmm2, 16(obuf) // write obuf[1] - - add $32, ibuf // ibuf += 2 - add $32, obuf // obuf += 2 - -9: - test $1, num_blk // check whether num_blk has residual 1 block - je L_HW_cbc_done // if num_blk == 0, no need for residual processing code - - movups (ibuf), %xmm2 // tmp = ibuf - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdeclast %xmm13, %xmm2 -#else - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 -#endif - - pxor iv, %xmm2 // *obuf ^= *iv; - movups (ibuf), iv // *iv = *ibuf; - movups %xmm2, (obuf) // write *obuf - - jmp L_HW_cbc_done - - // - // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_192: - - cmp $1, num_blk - jl L_HW_cbc_done // if num_blk < 1, early return - - // aes-192 decryp expanded keys - movups 192(ctx), %xmm3 - movups 176(ctx), %xmm4 - movups 160(ctx), %xmm5 - movups 144(ctx), %xmm6 - movups 128(ctx), %xmm7 -#if defined __x86_64__ - movups 112(ctx), %xmm8 - movups 96(ctx), %xmm9 - movups 
80(ctx), %xmm10 - movups 64(ctx), %xmm11 - movups 48(ctx), %xmm12 - movups 32(ctx), %xmm13 - movups 16(ctx), %xmm14 - movups (ctx), %xmm15 -#endif - - // performs 4 block decryption in an iteration to exploit decrypt in parallel - - // while ((num_blk-=4)>=0) { - // aes_decrypt(ibuf, obuf, ctx); - // aes_decrypt(ibuf+1, obuf+1, ctx); - // aes_decrypt(ibuf+2, obuf+2, ctx); - // aes_decrypt(ibuf+3, obuf+3, ctx); - // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - // *iv = ibuf[3]; ibuf += 4; obuf += 4; - // } - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code -0: - -#if defined __x86_64__ - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 - // use %xmm12/%xmm13 ts dynamic keys in the middle, restored afterwards - - // round 0 for 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - // round 1 for 4 blocks - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - // round 2 for 4 blocks - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - // round 3 for 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - // round 4 for 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - // round 5 for 4 blocks - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - // round 6 for 4 blocks - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - // round 7 for 4 blocks - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - // round 8 for 4 blocks - aesdec 
%xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - // round 9 for 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - movups 16(ctx), %xmm12 - - // round A for 4 blocks - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - - movups (ctx), %xmm13 - - // round B for 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - movups 48(ctx), %xmm12 // restore %xmm12 to its original key - - // round C (last) for 4 blocks - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - - movups 32(ctx), %xmm13 // restore %xmm13 to its original key - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0] - movups 16(ibuf), iv // ibuf[1] - pxor iv, %xmm14 // obuf[2] ^= ibuf[1] - movups 32(ibuf), iv // ibuf[2] - pxor iv, %xmm15 // obuf[3] ^= ibuf[2] - movups 48(ibuf), iv // *iv = ibuf[3] - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += 4; - add $64, obuf // obuf += 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, prepare to return - - movups 16(ctx), %xmm14 // restore %xmm14 to its key - movups (ctx), %xmm15 // restore %xmm15 to its key - -#else - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - movups 176(ctx), %xmm6 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 160(ctx), %xmm7 - 
aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 144(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 128(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; - add $64, obuf // obuf += AES_BLOCK_SIZE * 4; - - sub 
$4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - - movups 176(ctx), %xmm4 - movups 160(ctx), %xmm5 - movups 144(ctx), %xmm6 - movups 128(ctx), %xmm7 - -#endif - - // per-block aes_decrypt_cbc loop - -0: - movups (ibuf), %xmm2 // tmp = ibuf - - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdec %xmm13, %xmm2 - aesdec %xmm14, %xmm2 - aesdeclast %xmm15, %xmm2 -#else - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 -#endif - - pxor iv, %xmm2 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm2, (obuf) // write obuf - - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done - - // - // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_256: - - cmp $1, num_blk - jl L_HW_cbc_done - - movups 224(ctx), %xmm3 - movups 208(ctx), %xmm4 - movups 192(ctx), %xmm5 - movups 176(ctx), %xmm6 - movups 160(ctx), %xmm7 -#if defined __x86_64__ - movups 144(ctx), %xmm8 - movups 128(ctx), %xmm9 - movups 112(ctx), %xmm10 - movups 96(ctx), %xmm11 - movups 80(ctx), %xmm12 - movups 64(ctx), %xmm13 - movups 48(ctx), %xmm14 - movups 32(ctx), %xmm15 -// movups 16(ctx), %xmm14 -// movups (ctx), %xmm15 -#endif - -#if 
defined __x86_64__ - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code -0: - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 48(ctx), %xmm12 - - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - movups 32(ctx), %xmm13 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 16(ctx), %xmm12 - - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - movups (ctx), %xmm13 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 80(ctx), %xmm12 - - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - movups 64(ctx), %xmm13 - - pxor iv, %xmm1 // obuf ^= iv; - 
movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // obuf ^= iv; - movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm14 // obuf ^= iv; - movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm15 // obuf ^= iv; - movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; - add $64, obuf // obuf += AES_BLOCK_SIZE*4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - - movups 48(ctx), %xmm14 - movups 32(ctx), %xmm15 - -#else - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-pair processing code -0: - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - movups 208(ctx), %xmm6 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 192(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 176(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 160(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 144(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 128(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 
- aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; - add $64, obuf // obuf += AES_BLOCK_SIZE * 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - - movups 208(ctx), %xmm4 - movups 192(ctx), %xmm5 - movups 176(ctx), %xmm6 - movups 160(ctx), %xmm7 - -#endif - -0: - movups (ibuf), %xmm2 // tmp = ibuf - - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec 
%xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdec %xmm13, %xmm2 - aesdec %xmm14, %xmm2 - aesdec %xmm15, %xmm2 -#else - movups 144(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 128(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 -#endif - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 - - pxor iv, %xmm2 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm2, (obuf) // write obuf - - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done - - // - // --------- END of aes_decrypt_cbc_hw ------------------- - // +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. 
+ + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 31/01/2006 + + These subroutines implement multiple block AES modes for ECB, CBC, CFB, + OFB and CTR encryption. The code provides support for the VIA Advanced + Cryptography Engine (ACE). + + NOTE: In the following subroutines, the AES contexts (ctx) must be + 16 byte aligned if VIA ACE is being used +*/ + + +/* ---------------------------------------------------------------------------------------------------------------- + + aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : + + For simplicity, assume that all variables are of a 128-bit data type. + + aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx) + { + while(num_blk--) { + *iv ^= *ibuf++; + aes_encrypt(iv, iv, ctx); + *obuf++ = *iv; + } + return 0; + } + + The following is an implementation of this function using Intel AESNI. + This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. + Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch + to this aesni-based function should it detect that aesni is available. + Blindly calling this function will SURELY cause a CRASH on systems with no aesni support. + + Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks + are serially chained. This prevents us from arranging several blocks for encryption in parallel. 
+ + ----------------------------------------------------------------------------------------------------------------*/ + + .text + .align 4,0x90 + .globl _aes_encrypt_cbc_hw +_aes_encrypt_cbc_hw: + + // set up the frame pointer and save the callee-saved registers used below +#if defined __i386__ + + push %ebp + movl %esp, %ebp + push %ebx + push %edi + + #define sp %esp + +#else // __x86_64__ + + push %rbp + mov %rsp, %rbp + push %rbx + push %r13 + push %r14 + push %r15 + + #define sp %rsp + +#endif + + // in kernel builds, spill the xmm registers to the stack here; they are reloaded before returning +#ifdef KERNEL + +#if defined __i386__ + sub $(8*16), %esp // room for xmm0-xmm7 (8 x 16 bytes) +#else + sub $(16*16), %rsp // room for xmm0-xmm15 (16 x 16 bytes) +#endif + + movaps %xmm0, (sp) + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm3, 48(sp) + movaps %xmm4, 64(sp) + movaps %xmm5, 80(sp) + movaps %xmm6, 96(sp) + movaps %xmm7, 112(sp) +#if defined __x86_64__ + movaps %xmm8, 16*8(sp) + movaps %xmm9, 16*9(sp) + movaps %xmm10, 16*10(sp) + movaps %xmm11, 16*11(sp) + movaps %xmm12, 16*12(sp) + movaps %xmm13, 16*13(sp) + movaps %xmm14, 16*14(sp) + movaps %xmm15, 16*15(sp) +#endif // __x86_64__ + +#endif // KERNEL + + #define iv %xmm0 + +#ifdef __i386__ + + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), iv // iv = *in_iv (kept in %xmm0; this routine never stores it back to in_iv) + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %ecx // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %ecx + #define ctx %edx + +#else + + mov %rdi, %rbx // ibuf + movups (%rsi), iv // iv = *in_iv (kept in %xmm0; this routine never stores it back to in_iv) + mov %rdx, %r13 // num_blk (only the low 32 bits, %r13d, are used below) + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + mov 240(ctx), %eax // the value at byte offset 240 of ctx selects the key size + cmp $160, %eax // aes-128 encrypt ? 
+ je L_encrypt_256 + mov $-1, %eax // unrecognized key length: return -1 + jmp L_error + + // + // aes-128 encrypt_cbc operation, up to L_HW_cbc_done + // + +L_encrypt_128: + + cmp $1, num_blk // check the number of blocks + jl L_HW_cbc_done // if it is less than 1 (signed compare), there is nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm2, iv // 1st step of aes_encrypt (xor with key0); it commutes with the xor below + pxor %xmm1, iv // *iv ^= *ibuf + + // finishing up the rest of aes_encrypt + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenclast %xmm12, iv +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, obuf // obuf++; + add $16, ibuf // ibuf++; + sub $1, num_blk // num_blk-- + jg 0b // if num_blk > 0, repeat the loop + + // the following is the common exit, branched to from all other cases (encrypt/decrypt 128/192/256) + +L_HW_cbc_done: + + xor %eax, %eax // return value 0 (CRYPT_OK) + +L_error: + + // if kernel, restore xmm registers +#ifdef KERNEL + movaps 0(sp), %xmm0 + movaps 16(sp), %xmm1 + movaps 32(sp), %xmm2 + movaps 48(sp), %xmm3 + movaps 64(sp), %xmm4 + movaps 80(sp), %xmm5 + movaps 96(sp), %xmm6 + movaps 112(sp), %xmm7 +#if 
defined __x86_64__ + movaps 16*8(sp), %xmm8 + movaps 16*9(sp), %xmm9 + movaps 16*10(sp), %xmm10 + movaps 16*11(sp), %xmm11 + movaps 16*12(sp), %xmm12 + movaps 16*13(sp), %xmm13 + movaps 16*14(sp), %xmm14 + movaps 16*15(sp), %xmm15 +#endif // __x86_64__ +#endif // KERNEL + + // release the stack space used, restore the callee-saved registers, and return +#if defined __i386__ +#ifdef KERNEL + add $(8*16), %esp +#endif + pop %edi + pop %ebx +#else +#ifdef KERNEL + add $(16*16), %rsp +#endif + pop %r15 + pop %r14 + pop %r13 + pop %rbx +#endif + leave + ret + + // + // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_encrypt_192: + + cmp $1, num_blk // check the number of blocks + jl L_HW_cbc_done // if it is less than 1 (signed compare), there is nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA + movups 176(ctx), %xmm13 // keyB + movups 192(ctx), %xmm14 // keyC +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm1, iv // *iv ^= *ibuf + + // aes_encrypt(iv, iv, ctx); + + pxor %xmm2, iv // xor with key0 + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenc %xmm12, iv + aesenc %xmm13, iv + aesenclast %xmm14, iv +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenc %xmm1, iv + movups 176(ctx), %xmm1 // keyB + aesenc %xmm1, iv + movups 
192(ctx), %xmm1 // keyC + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, ibuf // ibuf++ + add $16, obuf // obuf++ + + sub $1, num_blk // num_blk-- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done // share the common exit code + + // + // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_encrypt_256: + + cmp $1, num_blk // check the number of blocks + jl L_HW_cbc_done // if it is less than 1 (signed compare), there is nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA + movups 176(ctx), %xmm13 // keyB + movups 192(ctx), %xmm14 // keyC + movups 208(ctx), %xmm15 // keyD + // keyE at 224(ctx) is not preloaded here: it is loaded into %xmm1 inside the loop, after %xmm1 has been used for *ibuf +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm1, iv // *iv ^= *ibuf + + // aes_encrypt(iv, iv, ctx); + pxor %xmm2, iv // xor with key0 + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + movups 224(ctx), %xmm1 // keyE (reloaded every iteration because %xmm1 also serves as the *ibuf scratch register) + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenc %xmm12, iv + aesenc %xmm13, iv + aesenc %xmm14, iv + aesenc %xmm15, iv + aesenclast %xmm1, iv +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenc %xmm1, iv + movups 176(ctx), %xmm1 // keyB + aesenc %xmm1, iv + movups 192(ctx), %xmm1 // keyC + aesenc %xmm1, iv + movups 208(ctx), %xmm1 // keyD + aesenc %xmm1, iv + 
movups 224(ctx), %xmm1 // keyE + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, ibuf // ibuf++ + add $16, obuf // obuf++ + + sub $1, num_blk // num_blk-- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done // share the common exit code + + + + // + // --------- END of aes_encrypt_cbc_hw ------------------- + // + + +/* ---------------------------------------------------------------------------------------------------------------- + + aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : + + For simplicity, assume that all variables are of a 128-bit data type. + + aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx) + { + while(num_blk--) { + aes_decrypt(ibuf, obuf, ctx); + *obuf++ ^= *iv; + *iv = *ibuf++; + } + return 0; + } + + The following is an implementation of this function using Intel AESNI. + This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. + Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch + to this aesni-based function should it detect that aesni is available. + Blindly calling this function will SURELY cause a CRASH on systems with no aesni support. + + Note that the aes_decrypt of one block does not depend on the aes_decrypt of any other block. + This gives the opportunity to arrange aes_decrypt operations in parallel to speed up the code. + This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55). + The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc. 
+ + Example C code for packing 4 blocks in an iteration is shown as follows: + + while ((num_blk-=4)>=0) { + + // the following 4 functions can be interleaved to exploit parallelism + aes_decrypt(ibuf, obuf, ctx); + aes_decrypt(ibuf+1, obuf+1, ctx); + aes_decrypt(ibuf+2, obuf+2, ctx); + aes_decrypt(ibuf+3, obuf+3, ctx); + + obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + *iv = ibuf[3]; ibuf += 4; obuf += 4; + } + num_blk+=4; + + ----------------------------------------------------------------------------------------------------------------*/ + + .text + .align 4,0x90 + .globl _aes_decrypt_cbc_hw +_aes_decrypt_cbc_hw: + + // push/save registers for local use +#if defined __i386__ + + push %ebp + movl %esp, %ebp + push %ebx // ibuf + push %edi // obuf + + #define sp %esp + +#else // __x86_64__ + + push %rbp + mov %rsp, %rbp + push %rbx + push %r13 + push %r14 + push %r15 + + #define sp %rsp + +#endif + + + // if kernel, allocate stack space to save xmm registers +#ifdef KERNEL +#if defined __i386__ + sub $(8*16), %esp +#else + sub $(16*16), %rsp +#endif + movaps %xmm0, (sp) + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm3, 48(sp) + movaps %xmm4, 64(sp) + movaps %xmm5, 80(sp) + movaps %xmm6, 96(sp) + movaps %xmm7, 112(sp) +#if defined __x86_64__ + movaps %xmm8, 16*8(sp) + movaps %xmm9, 16*9(sp) + movaps %xmm10, 16*10(sp) + movaps %xmm11, 16*11(sp) + movaps %xmm12, 16*12(sp) + movaps %xmm13, 16*13(sp) + movaps %xmm14, 16*14(sp) + movaps %xmm15, 16*15(sp) +#endif // __x86_64__ +#endif + + #undef iv + #define iv %xmm0 + +#if defined __i386__ + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), iv // iv = in_iv + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %ecx // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %ecx + #define ctx %edx + +#else // __x86_64__, rdi/rsi/rdx/rcx/r8 + + mov %rdi, %rbx // ibuf + movups (%rsi), iv // iv = in_iv + mov %rdx, %r13 // 
num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + mov 240(ctx), %eax // aes length + cmp $160, %eax // aes-128 decrypt + je L_decrypt_128 + cmp $192, %eax // aes-192 decrypt + je L_decrypt_192 + cmp $224, %eax // aes-256 decrypt + je L_decrypt_256 + + mov $-1, %eax // wrong aes length, to return -1 + jmp L_error // early exit due to wrong aes length + + + // + // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_128: + + cmp $1, num_blk + jl L_HW_cbc_done // if num_blk < 1, early return + + // aes-128 decrypt expanded keys + movups 160(ctx), %xmm3 + movups 144(ctx), %xmm4 + movups 128(ctx), %xmm5 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#if defined __x86_64__ + movups 80(ctx), %xmm8 + movups 64(ctx), %xmm9 + movups 48(ctx), %xmm10 + movups 32(ctx), %xmm11 + movups 16(ctx), %xmm12 + movups 0(ctx), %xmm13 +#endif + + // performs 4 block decryption in an iteration to exploit decrypt in parallel + + // while ((num_blk-=4)>=0) { + // aes_decrypt(ibuf, obuf, ctx); + // aes_decrypt(ibuf+1, obuf+1, ctx); + // aes_decrypt(ibuf+2, obuf+2, ctx); + // aes_decrypt(ibuf+3, obuf+3, ctx); + // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + // *iv = ibuf[3]; ibuf += 4; obuf += 4; + // } + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code + +0: + + +#if defined __x86_64__ + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // for x86_64, the expanded keys are already stored in xmm3-xmm13 + + // aes-128 decrypt round 0 per 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + // aes-128 decrypt round 1 per 4 blocks + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, 
%xmm14 + aesdec %xmm4, %xmm15 + + // aes-128 decrypt round 2 per 4 blocks + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + // aes-128 decrypt round 3 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + // aes-128 decrypt round 4 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + // aes-128 decrypt round 5 per 4 blocks + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + // aes-128 decrypt round 6 per 4 blocks + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + // aes-128 decrypt round 7 per 4 blocks + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + // aes-128 decrypt round 8 per 4 blocks + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + // aes-128 decrypt round 9 per 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + // aes-128 decrypt round 10 (last) per 4 blocks + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; + movups 16(ibuf), iv // ibuf[1] + pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; + movups 32(ibuf), iv // ibuf[2] + pxor iv, %xmm15 // obuf[3] ^= obuf[2]; + movups 48(ibuf), iv // *iv = ibuf[3] + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + +#else + + // aes_decrypt_cbc per 4 blocks using aes-128 for i386 + // xmm1/xmm2/xmm4/xmm5 used for obuf per block + // xmm3 = key0 + // xmm0 = iv + // xmm6/xmm7 dynamically load with other expanded keys + + movups (ibuf), %xmm1 // tmp = 1st ibuf + 
movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + + movups 144(ctx), %xmm6 // key1 + + // aes-128 decrypt round 0 per 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 128(ctx), %xmm7 // key2 + + // aes-128 decrypt round 1 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 // key3 + + // aes-128 decrypt round 2 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 // key4 + + // aes-128 decrypt round 3 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 // key5 + + // aes-128 decrypt round 4 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 // key6 + + // aes-128 decrypt round 5 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 // key7 + + // aes-128 decrypt round 6 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 // key8 + + // aes-128 decrypt round 7 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 // key9 + + // aes-128 decrypt round 8 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 // keyA + + // aes-128 decrypt round 9 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + // aes-128 decrypt round 10 (last) per 4 blocks + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + 
aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf +#endif + + add $64, ibuf // ibuf += 4; + add $64, obuf // obuf += 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk > 0, repeat the loop + +9: add $4, num_blk // post incremtn num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code + +#if defined __i386__ + // updated as they might be needed as expanded keys in the remaining + movups 144(ctx), %xmm4 + movups 128(ctx), %xmm5 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#endif + + test $2, num_blk // check whether num_blk has 2 blocks + je 9f // if num_blk & 2 == 0, skip the per-pair processing code + + // do the remaining 2 blocks together + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + + // aes_decrypt + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 +#if defined __x86_64__ + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 +#else + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, 
%xmm2 + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#endif + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0] + movups 16(ibuf), iv // *iv = ibuf[1] + + movups %xmm1, (obuf) // write obuf[0] + movups %xmm2, 16(obuf) // write obuf[1] + + add $32, ibuf // ibuf += 2 + add $32, obuf // obuf += 2 + +9: + test $1, num_blk // check whether num_blk has residual 1 block + je L_HW_cbc_done // if num_blk == 0, no need for residual processing code + + movups (ibuf), %xmm2 // tmp = ibuf + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdeclast %xmm13, %xmm2 +#else + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 +#endif + + pxor iv, %xmm2 // *obuf ^= *iv; + movups (ibuf), iv // *iv = *ibuf; + movups %xmm2, (obuf) // write *obuf + + jmp L_HW_cbc_done + + // + // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_192: + + cmp $1, num_blk + jl L_HW_cbc_done // if num_blk < 1, early return + + // aes-192 decryp expanded keys + movups 192(ctx), %xmm3 + movups 176(ctx), %xmm4 + movups 160(ctx), %xmm5 + movups 144(ctx), %xmm6 + movups 128(ctx), %xmm7 +#if defined __x86_64__ + movups 112(ctx), %xmm8 + movups 96(ctx), %xmm9 + movups 
80(ctx), %xmm10 + movups 64(ctx), %xmm11 + movups 48(ctx), %xmm12 + movups 32(ctx), %xmm13 + movups 16(ctx), %xmm14 + movups (ctx), %xmm15 +#endif + + // performs 4 block decryption in an iteration to exploit decrypt in parallel + + // while ((num_blk-=4)>=0) { + // aes_decrypt(ibuf, obuf, ctx); + // aes_decrypt(ibuf+1, obuf+1, ctx); + // aes_decrypt(ibuf+2, obuf+2, ctx); + // aes_decrypt(ibuf+3, obuf+3, ctx); + // obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + // *iv = ibuf[3]; ibuf += 4; obuf += 4; + // } + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + +#if defined __x86_64__ + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 + // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards + + // round 0 for 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + // round 1 for 4 blocks + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm14 + aesdec %xmm4, %xmm15 + + // round 2 for 4 blocks + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + // round 3 for 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + // round 4 for 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + // round 5 for 4 blocks + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + // round 6 for 4 blocks + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + // round 7 for 4 blocks + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + // round 8 for 4 blocks + aesdec
%xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + // round 9 for 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + movups 16(ctx), %xmm12 + + // round A for 4 blocks + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + + movups (ctx), %xmm13 + + // round B for 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + movups 48(ctx), %xmm12 // restore %xmm12 to its original key + + // round C (last) for 4 blocks + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + + movups 32(ctx), %xmm13 // restore %xmm13 to its original key + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0] + movups 16(ibuf), iv // ibuf[1] + pxor iv, %xmm14 // obuf[2] ^= ibuf[1] + movups 32(ibuf), iv // ibuf[2] + pxor iv, %xmm15 // obuf[3] ^= ibuf[2] + movups 48(ibuf), iv // *iv = ibuf[3] + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += 4; + add $64, obuf // obuf += 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0, repeat the loop + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, prepare to return + + movups 16(ctx), %xmm14 // restore %xmm14 to its key + movups (ctx), %xmm15 // restore %xmm15 to its key + +#else + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + movups 176(ctx), %xmm6 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 160(ctx), %xmm7 +
aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 144(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 128(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; + add $64, obuf // obuf += AES_BLOCK_SIZE * 4; + + sub 
$4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0, repeat the loop + + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for further processing code + + movups 176(ctx), %xmm4 + movups 160(ctx), %xmm5 + movups 144(ctx), %xmm6 + movups 128(ctx), %xmm7 + +#endif + + // per-block aes_decrypt_cbc loop + +0: + movups (ibuf), %xmm2 // tmp = ibuf + + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdec %xmm13, %xmm2 + aesdec %xmm14, %xmm2 + aesdeclast %xmm15, %xmm2 +#else + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 +#endif + + pxor iv, %xmm2 // obuf ^= iv; + movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm2, (obuf) // write obuf + + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done + + // + // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_256: + + cmp $1, num_blk + jl L_HW_cbc_done + + movups 224(ctx), %xmm3 + movups 208(ctx), %xmm4 + movups 192(ctx), %xmm5 + movups 176(ctx), %xmm6 + movups 160(ctx), %xmm7 +#if defined __x86_64__ + movups 144(ctx), %xmm8 + movups 128(ctx), %xmm9 + movups 112(ctx), %xmm10 + movups 96(ctx), %xmm11 + movups 80(ctx), %xmm12 + movups 64(ctx), %xmm13 + movups 48(ctx), %xmm14 + movups 32(ctx), %xmm15 +// movups 16(ctx), %xmm14 +// movups (ctx), %xmm15 +#endif + +#if
defined __x86_64__ + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm14 + aesdec %xmm4, %xmm15 + + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 48(ctx), %xmm12 + + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + movups 32(ctx), %xmm13 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 16(ctx), %xmm12 + + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + movups (ctx), %xmm13 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 80(ctx), %xmm12 + + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + movups 64(ctx), %xmm13 + + pxor iv, %xmm1 // obuf ^= iv; + 
movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // obuf ^= iv; + movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm14 // obuf ^= iv; + movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm15 // obuf ^= iv; + movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; + add $64, obuf // obuf += AES_BLOCK_SIZE*4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0, repeat the loop + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for further processing code + + movups 48(ctx), %xmm14 + movups 32(ctx), %xmm15 + +#else + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + movups 208(ctx), %xmm6 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 192(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 176(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 160(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 144(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 128(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4
+ aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; + add $64, obuf // obuf += AES_BLOCK_SIZE * 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0, repeat the loop + + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for further processing code + + movups 208(ctx), %xmm4 + movups 192(ctx), %xmm5 + movups 176(ctx), %xmm6 + movups 160(ctx), %xmm7 + +#endif + +0: + movups (ibuf), %xmm2 // tmp = ibuf + + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec
%xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdec %xmm13, %xmm2 + aesdec %xmm14, %xmm2 + aesdec %xmm15, %xmm2 +#else + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 +#endif + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 + + pxor iv, %xmm2 // obuf ^= iv; + movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm2, (obuf) // write obuf + + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done + + // + // --------- END of aes_decrypt_cbc_hw ------------------- + // diff --git a/bsd/crypto/aes/test/ReadMe.txt b/bsd/crypto/aes/test/ReadMe.txt deleted file mode 100644 index 1329e84be..000000000 --- a/bsd/crypto/aes/test/ReadMe.txt +++ /dev/null @@ -1,97 +0,0 @@ -This directory contains file and shell scripts - - tstaes.c - makegenarm.sh - makegenx86.sh - makeoptx86.sh - -that can be used to build executables. These executable are used to validate the implementation -and to benchmark the performance of the aes functions in the kernel. This directory also serves -as a development environment for porting of the aes functions to any new architectures. - -On xnu-1699.20.6 (from which we add this work), the generic aes source code sits at bsd/crypto/aes/gen. The x86_64 -and i386 architectural optimization is given in bsd/crypto/aes/i386. 
- -After making some code corrections (aes.h and most assembly code in i386), now you can build a test executable -that is functionally equivalent to aes in the kernel code. - -To generate a test executable for the aes in x86_64/i386 kernel, - - $ makeoptx86.sh - -This will build a test executable tstaesoptx86 (x86_64/i386). The executable will automatically detects the -CPU clock rates. You specify the number of iterations and the number of 16-byte blocks for simulation. -The executable generates (random number) the test data, and calls aes_encrypt_cbc to encrypt the plain data -into cipher data, and then calls aes_decrypt_cbc to decrypt cipher into decrypted data. Afterwards, it compares -the decrypted data against the plain data. Should there be a mismatch, the code breaks and exit. -Otherwise, it measures the times the system spends on the 2 functions under test. Afterwards, it prints out -the performance profiling data. - -On K5, - -$ tstaesoptx86 1000 2560 -device max CPU clock rate = 2659.00 MHz -40960 bytes per cbc call - aes_encrypt_cbc : time elapsed = 220.24 usecs, 177.37 MBytes/sec, 14.30 cycles/byte - best iteration : time elapsed = 218.30 usecs, 178.94 MBytes/sec, 14.17 cycles/byte - worst iteration : time elapsed = 286.14 usecs, 136.51 MBytes/sec, 18.58 cycles/byte - - aes_decrypt_cbc : time elapsed = 199.85 usecs, 195.46 MBytes/sec, 12.97 cycles/byte - best iteration : time elapsed = 198.17 usecs, 197.12 MBytes/sec, 12.86 cycles/byte - worst iteration : time elapsed = 228.12 usecs, 171.23 MBytes/sec, 14.81 cycles/byte - -On K5B (with aesni) - -$ tstaesoptx86 1000 256 -device max CPU clock rate = 2400.00 MHz -4096 bytes per cbc call - aes_encrypt_cbc : time elapsed = 6.69 usecs, 583.67 MBytes/sec, 3.92 cycles/byte - best iteration : time elapsed = 6.38 usecs, 612.46 MBytes/sec, 3.74 cycles/byte - worst iteration : time elapsed = 9.72 usecs, 401.96 MBytes/sec, 5.69 cycles/byte - - aes_decrypt_cbc : time elapsed = 2.05 usecs, 1902.65 MBytes/sec, 1.20 
cycles/byte - best iteration : time elapsed = 1.96 usecs, 1997.06 MBytes/sec, 1.15 cycles/byte - worst iteration : time elapsed = 4.60 usecs, 849.00 MBytes/sec, 2.70 cycles/byte - -You can also build a test executable using the generic source code for the i386/x86_64 architecture. - - $ makegenx86.sh - -When run on K5, - -$ tstaesgenx86 1000 2560 -device max CPU clock rate = 2659.00 MHz -40960 bytes per cbc call - aes_encrypt_cbc : time elapsed = 278.05 usecs, 140.49 MBytes/sec, 18.05 cycles/byte - best iteration : time elapsed = 274.63 usecs, 142.24 MBytes/sec, 17.83 cycles/byte - worst iteration : time elapsed = 309.70 usecs, 126.13 MBytes/sec, 20.10 cycles/byte - - aes_decrypt_cbc : time elapsed = 265.43 usecs, 147.17 MBytes/sec, 17.23 cycles/byte - best iteration : time elapsed = 262.20 usecs, 148.98 MBytes/sec, 17.02 cycles/byte - worst iteration : time elapsed = 296.19 usecs, 131.88 MBytes/sec, 19.23 cycles/byte - -We can see the current AES implementation in the x86_64 kernel has been improved from 17.83/17.02 -down to 14.12/12.86 cycles/byte for aes_encrypt_cbc and aes_decrypt_cbc, respectively. - - - --------- iOS --------- - -Similarly, you can build a test executable for the aes in the armv7 kernel (which uses the generic source code) - - $ makegenarm.sh - -Note that you need the iOS SDK installed. We can then copy this executable to iOS devices for simulation. 
- -On N88, - -iPhone:~ root# ./tstaesgenarm 1000 2560 -device max CPU clock rate = 600.00 MHz -40960 bytes per cbc call - aes_encrypt_cbc : time elapsed = 2890.18 usecs, 13.52 MBytes/sec, 42.34 cycles/byte - best iteration : time elapsed = 2692.00 usecs, 14.51 MBytes/sec, 39.43 cycles/byte - worst iteration : time elapsed = 18248.33 usecs, 2.14 MBytes/sec, 267.31 cycles/byte - - aes_decrypt_cbc : time elapsed = 3078.20 usecs, 12.69 MBytes/sec, 45.09 cycles/byte - best iteration : time elapsed = 2873.33 usecs, 13.59 MBytes/sec, 42.09 cycles/byte - worst iteration : time elapsed = 9664.79 usecs, 4.04 MBytes/sec, 141.57 cycles/byte - diff --git a/bsd/crypto/aes/test/makegenx86.sh b/bsd/crypto/aes/test/makegenx86.sh deleted file mode 100755 index ea4de6f63..000000000 --- a/bsd/crypto/aes/test/makegenx86.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/ksh - -cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aescrypt.c -o aescrypt.o -cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aeskey.c -o aeskey.o -cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aestab.c -o aestab.o - -cc -arch i386 -arch x86_64 -Os tstaes.c aescrypt.o aeskey.o aestab.o -o tstaesgenx86 -rm -fr aescrypt.o aeskey.o aestab.o diff --git a/bsd/crypto/aes/test/makeoptx86.sh b/bsd/crypto/aes/test/makeoptx86.sh deleted file mode 100755 index 3732e037f..000000000 --- a/bsd/crypto/aes/test/makeoptx86.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/ksh - -cc -c -Os -arch i386 -arch x86_64 ../i386/AES.s -o AES.o -cc -c -Os -arch i386 -arch x86_64 ../i386/aes_crypt_hw.s -o aes_crypt_hw.o -cc -c -Os -arch i386 -arch x86_64 ../i386/aes_key_hw.s -o aes_key_hw.o -cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_asm.s -o aes_modes_asm.o -cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_hw.s -o aes_modes_hw.o - -cc -Os -arch i386 -arch x86_64 tstaes.c AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o -o tstaesoptx86 -rm -fr AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o diff 
--git a/bsd/crypto/aes/test/tstaes.c b/bsd/crypto/aes/test/tstaes.c deleted file mode 100644 index 9d186ee77..000000000 --- a/bsd/crypto/aes/test/tstaes.c +++ /dev/null @@ -1,131 +0,0 @@ - -#include -#include -#include "../aes.h" -#include -#include - - -aes_encrypt_ctx encrypt_ctx; -aes_decrypt_ctx decrypt_ctx; - -size_t getFreq() -{ - int mib[2]; - size_t cpufreq, len; - mib[0] = CTL_HW; - mib[1] = HW_CPU_FREQ; - len = sizeof(cpufreq); - - sysctl(mib, 2, &cpufreq, &len, NULL, 0); - - return cpufreq; -} - - -uint32_t cpu_freq; - -main(int argc, char **argv) -{ - - char *plain; - char *cipher; - char *decrypt; - -uint32_t ITERATIONS; -uint32_t NUM_BLOCKS; -uint32_t data_size; - - char key[32]; - char iv[16]; - int checksum=0; - int i, j, iterations; - uint64_t t0, t1, t2, sum=0, max_time=0, min_time=-1, sum1=0, max_time1=0, min_time1=-1; - float time, time_max, time_min, time1, time_max1, time_min1; - - cpu_freq = getFreq(); - - if (cpu_freq == 0) { - fprintf(stderr, "this appears to be an iPhone device, where cpu_freq can not be detected. set to 800MHz.\n"); - cpu_freq = 800000000; - } else { - fprintf(stderr, "device max CPU clock rate = %.2f MHz\n", cpu_freq/1.e6); - } - - mach_timebase_info_data_t info; - kern_return_t err = mach_timebase_info( &info ); - - if (argc!=3) { - fprintf(stderr, "usage : %s iterations num_16bytes_block\n", argv[0]); - exit(1); - } - ITERATIONS = atoi(argv[1]); - NUM_BLOCKS = atoi(argv[2]); - data_size = 16*NUM_BLOCKS; - - plain = malloc(data_size); - cipher = malloc(data_size); - decrypt = malloc(data_size); - - if ((plain==NULL) || (cipher==NULL) || (decrypt==NULL)) { - fprintf(stderr,"malloc error.\n"); - exit(1); - } - - for (i=0;imax_time) max_time = t1; - if (t1max_time1) max_time1 = t2; - if (t2hfs_flags |= HFS_RESIZE_IN_PROGRESS; HFS_MOUNT_UNLOCK(hfsmp, TRUE); + /* Start with a clean journal. */ + hfs_journal_flush(hfsmp, TRUE); + /* * Enclose changes inside a transaction. 
*/ @@ -4244,6 +4247,9 @@ out: } if (transaction_begun) { hfs_end_transaction(hfsmp); + hfs_journal_flush(hfsmp, FALSE); + /* Just to be sure, sync all data to the disk */ + (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context); } return MacToVFSError(error); diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index f2c9c8711..e1f693be2 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -3321,6 +3321,12 @@ SYSCTL_QUAD(_vm, OID_AUTO, global_no_user_wire_amount, CTLFLAG_RW | CTLFLAG_LOCK SYSCTL_QUAD(_vm, OID_AUTO, global_user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_global_user_wire_limit, ""); SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_user_wire_limit, ""); +extern int vm_map_copy_overwrite_aligned_src_not_internal; +extern int vm_map_copy_overwrite_aligned_src_not_symmetric; +extern int vm_map_copy_overwrite_aligned_src_large; +SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_not_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_copy_overwrite_aligned_src_not_internal, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_not_symmetric, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_copy_overwrite_aligned_src_not_symmetric, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_large, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_copy_overwrite_aligned_src_large, 0, ""); /* diff --git a/bsd/kern/mach_process.c b/bsd/kern/mach_process.c index 9aba89b96..5294122ff 100644 --- a/bsd/kern/mach_process.c +++ b/bsd/kern/mach_process.c @@ -129,10 +129,6 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval) AUDIT_ARG(value32, uap->data); if (uap->req == PT_DENY_ATTACH) { -#if (DEVELOPMENT || DEBUG) && defined(__arm__) - if (PE_i_can_has_debugger(NULL)) - return(0); -#endif proc_lock(p); if (ISSET(p->p_lflag, P_LTRACED)) { proc_unlock(p); diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index 521de769e..8a6356d5a 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -1847,22 +1847,25 
@@ sockargs(struct mbuf **mp, user_addr_t data, int buflen, int type) struct mbuf *m; int error; - int alloc_buflen = buflen; + size_t alloc_buflen = (size_t)buflen; + + if(alloc_buflen > INT_MAX/2) + return (EINVAL); #ifdef __LP64__ /* The fd's in the buffer must expand to be pointers, thus we need twice as much space */ if(type == MT_CONTROL) alloc_buflen = ((buflen - sizeof(struct cmsghdr))*2) + sizeof(struct cmsghdr); #endif - if ((u_int)alloc_buflen > MLEN) { - if (type == MT_SONAME && (u_int)alloc_buflen <= 112) + if (alloc_buflen > MLEN) { + if (type == MT_SONAME && alloc_buflen <= 112) alloc_buflen = MLEN; /* unix domain compat. hack */ - else if ((u_int)alloc_buflen > MCLBYTES) + else if (alloc_buflen > MCLBYTES) return (EINVAL); } m = m_get(M_WAIT, type); if (m == NULL) return (ENOBUFS); - if ((u_int)alloc_buflen > MLEN) { + if (alloc_buflen > MLEN) { MCLGET(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h index 0d9cff919..8259186d0 100644 --- a/bsd/libkern/libkern.h +++ b/bsd/libkern/libkern.h @@ -213,15 +213,6 @@ clz(unsigned int num) ); return 31 ^ result; -#elif __arm__ && !__thumb__ && defined(_ARM_ARCH_5) - unsigned int result; - __asm__ volatile( - "clz %0, %1" - : "=r" (result) - : "r" (num) - ); - - return result; #else return num?__builtin_clz(num):__builtin_clz(0); #endif diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c index 4bb6e1c28..833b8ca34 100644 --- a/bsd/net/ntstat.c +++ b/bsd/net/ntstat.c @@ -1248,8 +1248,7 @@ nstat_idle_check( removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; removed.hdr.context = 0; removed.srcref = dead->srcref; - errno_t result = ctl_enqueuedata(control->kctl, control->unit, &removed, sizeof(removed), CTL_DATA_EOR); - if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); + (void)ctl_enqueuedata(control->kctl, control->unit, &removed, sizeof(removed), CTL_DATA_EOR); // Put this on the list to release later 
dead->next = dead_list; @@ -1318,8 +1317,7 @@ nstat_control_cleanup_source( removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; removed.hdr.context = 0; removed.srcref = src->srcref; - errno_t result = ctl_enqueuedata(state->kctl, state->unit, &removed, sizeof(removed), CTL_DATA_EOR); - if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); + (void)ctl_enqueuedata(state->kctl, state->unit, &removed, sizeof(removed), CTL_DATA_EOR); } // Cleanup the source if we found it. @@ -1551,7 +1549,6 @@ nstat_control_handle_add_request( if (result != 0) { - printf("nstat_lookup_entry failed: %d\n", result); return result; } @@ -1785,10 +1782,6 @@ nstat_control_handle_query_request( if (result == 0) { result = ctl_enqueuedata(state->kctl, state->unit, &counts, sizeof(counts), CTL_DATA_EOR); - if (result != 0) - { - printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); - } } else { diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c index 1fcafd583..f32cef303 100644 --- a/bsd/netinet/in_cksum.c +++ b/bsd/netinet/in_cksum.c @@ -141,38 +141,6 @@ in_pseudo(u_int a, u_int b, u_int c) } -#if defined(__arm__) && __ARM_ARCH__ >= 6 - -extern int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum); - -u_int16_t -inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, - unsigned int len) -{ - u_int32_t sum = 0; - - /* sanity check */ - if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.len < skip + len) { - panic("inet_cksum: mbuf len (%d) < off+len (%d+%d)\n", - m->m_pkthdr.len, skip, len); - } - - /* include pseudo header checksum? 
*/ - if (nxt != 0) { - struct ip *iph; - - if (m->m_len < sizeof (struct ip)) - panic("inet_cksum: bad mbuf chain"); - - iph = mtod(m, struct ip *); - sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, - htonl(len + nxt)); - } - - return (cpu_in_cksum(m, len, skip, sum)); -} - -#else u_int16_t inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, @@ -304,4 +272,3 @@ skip_start: return (~sum & 0xffff); } -#endif diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c index c64150319..2052493ab 100644 --- a/bsd/netinet6/esp_input.c +++ b/bsd/netinet6/esp_input.c @@ -440,8 +440,8 @@ noreplaycheck: seq >= sav->replay->lastseq) { struct udphdr *encap_uh = (__typeof__(encap_uh))((caddr_t)ip + off); if (encap_uh->uh_sport && - encap_uh->uh_sport != sav->remote_ike_port) { - sav->remote_ike_port = encap_uh->uh_sport; + ntohs(encap_uh->uh_sport) != sav->remote_ike_port) { + sav->remote_ike_port = ntohs(encap_uh->uh_sport); } } ip = esp4_input_strip_UDP_encap(m, off); diff --git a/bsd/netinet6/in6_cksum.c b/bsd/netinet6/in6_cksum.c index f0352eb72..77dd7e1af 100644 --- a/bsd/netinet6/in6_cksum.c +++ b/bsd/netinet6/in6_cksum.c @@ -131,91 +131,6 @@ #include -#if defined(__arm__) && __ARM_ARCH__ >= 6 -extern int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum); - -u_int16_t -inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, - unsigned int len) -{ - union { - uint16_t words[16]; - struct { - struct in6_addr ip6_src; - struct in6_addr ip6_dst; - } addrs; - } u; - const struct in6_addr *in6_src; - const struct in6_addr *in6_dst; - const struct ip6_hdr *ip6; - uint32_t sum; - const uint16_t *w; - const char *cp; - - if (off < sizeof (struct ip6_hdr)) - panic("inet6_cksum: offset too short for IPv6 header"); - if (m->m_len < sizeof (struct ip6_hdr)) - panic("inet6_cksum: mbuf too short for IPv6 header"); - - if (nxt == 0) - return (cpu_in_cksum(m, len, off, 0)); - - /* - * Compute the equivalent of: - * struct ip6_hdr_pseudo 
ip6; - * - * bzero(sizeof (*ip6)); - * ip6.ip6ph_nxt = nxt; - * ip6.ip6ph_len = htonl(len); - * ipv6.ip6ph_src = mtod(m, struct ip6_hdr *)->ip6_src; - * in6_clearscope(&ip6->ip6ph_src); - * ipv6.ip6ph_dst = mtod(m, struct ip6_hdr *)->ip6_dst; - * in6_clearscope(&ip6->ip6ph_dst); - * sum = one_add(&ip6); - */ - -#if BYTE_ORDER == LITTLE_ENDIAN - sum = ((len & 0xffff) + ((len >> 16) & 0xffff) + nxt) << 8; -#else - sum = (len & 0xffff) + ((len >> 16) & 0xffff) + nxt; -#endif - cp = mtod(m, const char *); - w = (const uint16_t *)(cp + offsetof(struct ip6_hdr, ip6_src)); - ip6 = (const void *)cp; - if ((uintptr_t)w % 2 == 0) { - in6_src = &ip6->ip6_src; - in6_dst = &ip6->ip6_dst; - } else { - memcpy(&u, &ip6->ip6_src, 32); - w = u.words; - in6_src = &u.addrs.ip6_src; - in6_dst = &u.addrs.ip6_dst; - } - - sum += w[0]; - if (!IN6_IS_SCOPE_EMBED(in6_src)) - sum += w[1]; - sum += w[2]; - sum += w[3]; - sum += w[4]; - sum += w[5]; - sum += w[6]; - sum += w[7]; - w += 8; - sum += w[0]; - if (!IN6_IS_SCOPE_EMBED(in6_dst)) - sum += w[1]; - sum += w[2]; - sum += w[3]; - sum += w[4]; - sum += w[5]; - sum += w[6]; - sum += w[7]; - - return (cpu_in_cksum(m, len, off, sum)); -} - -#else /* * Checksum routine for Internet Protocol family headers (Portable Version). 
@@ -445,4 +360,3 @@ inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, return (~sum & 0xffff); } -#endif diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index 7a0323fde..484e47c2b 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -1575,8 +1575,12 @@ nfs_convert_old_nfs_args(mount_t mp, user_addr_t data, vfs_context_t ctx, int ar /* copy socket address */ if (inkernel) bcopy(CAST_DOWN(void *, args.addr), &ss, args.addrlen); - else - error = copyin(args.addr, &ss, args.addrlen); + else { + if ((size_t)args.addrlen > sizeof (struct sockaddr_storage)) + error = EINVAL; + else + error = copyin(args.addr, &ss, args.addrlen); + } nfsmout_if(error); ss.ss_len = args.addrlen; diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 0e8bd67dd..2bccd5bb3 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -231,7 +231,7 @@ uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); * before we issue a synchronous write */ #define HARD_THROTTLE_MAXCNT 0 -#define HARD_THROTTLE_MAXSIZE (32 * 1024) +#define HARD_THROTTLE_MAXSIZE (256 * 1024) int hard_throttle_on_root = 0; struct timeval priority_IO_timestamp_for_root; diff --git a/config/MasterVersion b/config/MasterVersion index 15cb149c8..b5a6d2aac 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -11.2.0 +11.3.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
diff --git a/kgmacros b/kgmacros index edb1e35db..a2c6879f8 100644 --- a/kgmacros +++ b/kgmacros @@ -2445,13 +2445,13 @@ define zprint_one set $kgm_zone = (struct zone *)$arg0 showptr $kgm_zone - printf " %6d ",$kgm_zone->count + printf " %8d ",$kgm_zone->count printf "%8x ",$kgm_zone->cur_size printf "%8x ",$kgm_zone->max_size - printf "%6d ",$kgm_zone->elem_size + printf "%8d ",$kgm_zone->elem_size printf "%8x ",$kgm_zone->alloc_size - printf " %8d ",$kgm_zone->num_allocs - printf "%8d ",$kgm_zone->num_frees + printf " %16ld ",$kgm_zone->num_allocs + printf "%16ld ",$kgm_zone->num_frees printf "%s ",$kgm_zone->zone_name if ($kgm_zone->exhaustible) @@ -2473,7 +2473,7 @@ end define zprint printf "ZONE " showptrhdrpad - printf " COUNT TOT_SZ MAX_SZ ELT_SZ ALLOC_SZ TOT_ALLOC TOT_FREE NAME\n" + printf " COUNT TOT_SZ MAX_SZ ELT_SZ ALLOC_SZ TOT_ALLOC TOT_FREE NAME\n" set $kgm_zone_ptr = (struct zone *)first_zone while ($kgm_zone_ptr != 0) zprint_one $kgm_zone_ptr @@ -9714,12 +9714,13 @@ define zstack printf "\n--------------- " if (zrecords[$index].z_opcode == 1) - printf "ALLOC " + printf "ALLOC " else - printf "FREE " + printf "FREE " end - printf " 0x%x : index %d : ztime %d -------------\n", zrecords[$index].z_element, $index, zrecords[$index].z_time + showptr zrecords[$index].z_element + printf " : index %d : ztime %d -------------\n", $index, zrecords[$index].z_time set $frame = 0 diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h index 662021550..cb2f9896a 100644 --- a/libkern/libkern/c++/OSMetaClass.h +++ b/libkern/libkern/c++/OSMetaClass.h @@ -60,8 +60,6 @@ class OSSerialize; #if defined(__LP64__) /*! 
@parseOnly */ #define APPLE_KEXT_LEGACY_ABI 0 -#elif defined(__arm__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) -#define APPLE_KEXT_LEGACY_ABI 0 #else #define APPLE_KEXT_LEGACY_ABI 1 #endif diff --git a/libsyscall/wrappers/remove-counter.c b/libsyscall/wrappers/remove-counter.c index d6a2846d8..fe41f2757 100644 --- a/libsyscall/wrappers/remove-counter.c +++ b/libsyscall/wrappers/remove-counter.c @@ -31,19 +31,11 @@ static int32_t __remove_counter = 0; __uint64_t __get_remove_counter(void) { -#if defined(__arm__) && !defined(_ARM_ARCH_6) - return __remove_counter; -#else return __sync_add_and_fetch(&__remove_counter, 0); -#endif } void __inc_remove_counter(void) { -#if defined(__arm__) && !defined(_ARM_ARCH_6) - __remove_counter++; -#else __sync_add_and_fetch(&__remove_counter, 1); -#endif } diff --git a/osfmk/i386/i386_lock.s b/osfmk/i386/i386_lock.s index 9ea9f982b..0f7bdba3a 100644 --- a/osfmk/i386/i386_lock.s +++ b/osfmk/i386/i386_lock.s @@ -214,9 +214,7 @@ #define PREEMPTION_DISABLE \ incl %gs:CPU_PREEMPTION_LEVEL -#if MACH_LDEBUG || 1 #define PREEMPTION_LEVEL_DEBUG 1 -#endif #if PREEMPTION_LEVEL_DEBUG #define PREEMPTION_ENABLE \ decl %gs:CPU_PREEMPTION_LEVEL ; \ diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 2e1fbe691..604bc202f 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -1253,6 +1253,7 @@ vm_map_find_space( } *address = start; + assert(start < end); new_entry->vme_start = start; new_entry->vme_end = end; assert(page_aligned(new_entry->vme_start)); @@ -1868,6 +1869,7 @@ StartAgain: ; * new range. */ map->size += (end - entry->vme_end); + assert(entry->vme_start < end); entry->vme_end = end; vm_map_store_update_first_free(map, map->first_free); RETURN(KERN_SUCCESS); @@ -2971,7 +2973,7 @@ vm_map_clip_unnest( * the specified address; if necessary, * it splits the entry into two. 
*/ -static void +void vm_map_clip_start( vm_map_t map, vm_map_entry_t entry, @@ -3038,7 +3040,9 @@ _vm_map_clip_start( vm_map_entry_copy_full(new_entry, entry); new_entry->vme_end = start; + assert(new_entry->vme_start < new_entry->vme_end); entry->offset += (start - entry->vme_start); + assert(start < entry->vme_end); entry->vme_start = start; _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry); @@ -3057,7 +3061,7 @@ _vm_map_clip_start( * the specified address; if necessary, * it splits the entry into two. */ -static void +void vm_map_clip_end( vm_map_t map, vm_map_entry_t entry, @@ -3128,8 +3132,10 @@ _vm_map_clip_end( new_entry = _vm_map_entry_create(map_header); vm_map_entry_copy_full(new_entry, entry); + assert(entry->vme_start < end); new_entry->vme_start = entry->vme_end = end; new_entry->offset += (end - entry->vme_start); + assert(new_entry->vme_start < new_entry->vme_end); _vm_map_store_entry_link(map_header, entry, new_entry); @@ -5876,6 +5882,12 @@ start_overwrite: copy->type = VM_MAP_COPY_ENTRY_LIST; copy->offset = new_offset; + /* + * XXX FBDP + * this does not seem to deal with + * the VM map store (R&B tree) + */ + total_size -= copy_size; copy_size = 0; /* put back remainder of copy in container */ @@ -6520,6 +6532,10 @@ vm_map_copy_overwrite_unaligned( * to the above pass and make sure that no wiring is involved. */ +int vm_map_copy_overwrite_aligned_src_not_internal = 0; +int vm_map_copy_overwrite_aligned_src_not_symmetric = 0; +int vm_map_copy_overwrite_aligned_src_large = 0; + static kern_return_t vm_map_copy_overwrite_aligned( vm_map_t dst_map, @@ -6624,6 +6640,26 @@ vm_map_copy_overwrite_aligned( continue; } +#if !CONFIG_EMBEDDED +#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */ +#define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */ + if (copy_entry->object.vm_object != VM_OBJECT_NULL && + copy_entry->object.vm_object->vo_size >= __TRADEOFF1_OBJ_SIZE && + copy_size <= __TRADEOFF1_COPY_SIZE) { + /* + * Virtual vs. 
Physical copy tradeoff #1. + * + * Copying only a few pages out of a large + * object: do a physical copy instead of + * a virtual copy, to avoid possibly keeping + * the entire large object alive because of + * those few copy-on-write pages. + */ + vm_map_copy_overwrite_aligned_src_large++; + goto slow_copy; + } +#endif /* !CONFIG_EMBEDDED */ + if (entry->alias >= VM_MEMORY_MALLOC && entry->alias <= VM_MEMORY_MALLOC_LARGE_REUSED) { vm_object_t new_object, new_shadow; @@ -6637,6 +6673,10 @@ vm_map_copy_overwrite_aligned( vm_object_lock_shared(new_object); } while (new_object != VM_OBJECT_NULL && +#if !CONFIG_EMBEDDED + !new_object->true_share && + new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && +#endif /* !CONFIG_EMBEDDED */ new_object->internal) { new_shadow = new_object->shadow; if (new_shadow == VM_OBJECT_NULL) { @@ -6657,9 +6697,24 @@ vm_map_copy_overwrite_aligned( * let's go off the optimized * path... */ + vm_map_copy_overwrite_aligned_src_not_internal++; vm_object_unlock(new_object); goto slow_copy; } +#if !CONFIG_EMBEDDED + if (new_object->true_share || + new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { + /* + * Same if there's a "true_share" + * object in the shadow chain, or + * an object with a non-default + * (SYMMETRIC) copy strategy. 
+ */ + vm_map_copy_overwrite_aligned_src_not_symmetric++; + vm_object_unlock(new_object); + goto slow_copy; + } +#endif /* !CONFIG_EMBEDDED */ vm_object_unlock(new_object); } /* @@ -6752,6 +6807,14 @@ vm_map_copy_overwrite_aligned( kern_return_t r; slow_copy: + if (entry->needs_copy) { + vm_object_shadow(&entry->object.vm_object, + &entry->offset, + (entry->vme_end - + entry->vme_start)); + entry->needs_copy = FALSE; + } + dst_object = entry->object.vm_object; dst_offset = entry->offset; @@ -6838,7 +6901,8 @@ vm_map_copy_overwrite_aligned( start += copy_size; vm_map_lock(dst_map); - if (version.main_timestamp == dst_map->timestamp) { + if (version.main_timestamp == dst_map->timestamp && + copy_size != 0) { /* We can safely use saved tmp_entry value */ vm_map_clip_end(dst_map, tmp_entry, start); @@ -7910,6 +7974,7 @@ vm_map_copyin_common( tmp_entry->vme_end = copy_addr + (tmp_entry->vme_end - tmp_entry->vme_start); tmp_entry->vme_start = copy_addr; + assert(tmp_entry->vme_start < tmp_entry->vme_end); copy_addr += tmp_entry->vme_end - tmp_entry->vme_start; tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next; } @@ -10000,6 +10065,7 @@ vm_map_simplify_entry( (this_entry->is_shared == FALSE) ) { _vm_map_store_entry_unlink(&map->hdr, prev_entry); + assert(prev_entry->vme_start < this_entry->vme_end); this_entry->vme_start = prev_entry->vme_start; this_entry->offset = prev_entry->offset; if (prev_entry->is_sub_map) { @@ -11086,6 +11152,7 @@ vm_map_entry_insert( new_entry->vme_end = end; assert(page_aligned(new_entry->vme_start)); assert(page_aligned(new_entry->vme_end)); + assert(new_entry->vme_start < new_entry->vme_end); new_entry->object.vm_object = object; new_entry->offset = offset; @@ -11288,6 +11355,7 @@ vm_map_remap_extract( new_entry->vme_start = map_address; new_entry->vme_end = map_address + tmp_size; + assert(new_entry->vme_start < new_entry->vme_end); new_entry->inheritance = inheritance; new_entry->offset = offset; @@ -13203,3 +13271,85 @@ out: 
vm_map_unlock(map); } #endif + +#if !CONFIG_EMBEDDED +/* + * vm_map_entry_should_cow_for_true_share: + * + * Determines if the map entry should be clipped and setup for copy-on-write + * to avoid applying "true_share" to a large VM object when only a subset is + * targeted. + * + * For now, we target only the map entries created for the Objective C + * Garbage Collector, which initially have the following properties: + * - alias == VM_MEMORY_MALLOC + * - wired_count == 0 + * - !needs_copy + * and a VM object with: + * - internal + * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC + * - !true_share + * - vo_size == ANON_CHUNK_SIZE + */ +boolean_t +vm_map_entry_should_cow_for_true_share( + vm_map_entry_t entry) +{ + vm_object_t object; + + if (entry->is_sub_map) { + /* entry does not point at a VM object */ + return FALSE; + } + + if (entry->needs_copy) { + /* already set for copy_on_write: done! */ + return FALSE; + } + + if (entry->alias != VM_MEMORY_MALLOC) { + /* not tagged as an ObjectiveC's Garbage Collector entry */ + return FALSE; + } + + if (entry->wired_count) { + /* wired: can't change the map entry... */ + return FALSE; + } + + object = entry->object.vm_object; + + if (object == VM_OBJECT_NULL) { + /* no object yet... */ + return FALSE; + } + + if (!object->internal) { + /* not an internal object */ + return FALSE; + } + + if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { + /* not the default copy strategy */ + return FALSE; + } + + if (object->true_share) { + /* already true_share: too late to avoid it */ + return FALSE; + } + + if (object->vo_size != ANON_CHUNK_SIZE) { + /* not an object created for the ObjC Garbage Collector */ + return FALSE; + } + + /* + * All the criteria match: we have a large object being targeted for "true_share". 
+ * To limit the adverse side-effects linked with "true_share", tell the caller to + * try and avoid setting up the entire object for "true_share" by clipping the + * targeted range and setting it up for copy-on-write. + */ + return TRUE; +} +#endif /* !CONFIG_EMBEDDED */ diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index d27859858..d8ab731e9 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -468,6 +468,19 @@ extern kern_return_t vm_map_find_space( int flags, vm_map_entry_t *o_entry); /* OUT */ +extern void vm_map_clip_start( + vm_map_t map, + vm_map_entry_t entry, + vm_map_offset_t endaddr); +extern void vm_map_clip_end( + vm_map_t map, + vm_map_entry_t entry, + vm_map_offset_t endaddr); +#if !CONFIG_EMBEDDED +extern boolean_t vm_map_entry_should_cow_for_true_share( + vm_map_entry_t entry); +#endif /* !CONFIG_EMBEDDED */ + /* Lookup map entry containing or the specified address in the given map */ extern boolean_t vm_map_lookup_entry( vm_map_t map, diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c index 58148a964..ccfcd062f 100644 --- a/osfmk/vm/vm_map_store.c +++ b/osfmk/vm/vm_map_store.c @@ -101,6 +101,7 @@ void vm_map_store_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_ void _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_where, vm_map_entry_t entry) { + assert(entry->vme_start < entry->vme_end); vm_map_store_entry_link_ll(mapHdr, after_where, entry); #ifdef VM_MAP_STORE_USE_RB vm_map_store_entry_link_rb(mapHdr, after_where, entry); diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index 1c0138d82..2f7d54e3c 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -3894,6 +3894,10 @@ vm_object_shadow( register vm_object_t result; source = *object; + assert(source != VM_OBJECT_NULL); + if (source == VM_OBJECT_NULL) + return FALSE; + #if 0 /* * XXX FBDP diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index acf4d64bd..0761db5ef 100644 --- a/osfmk/vm/vm_pageout.c +++ 
b/osfmk/vm/vm_pageout.c @@ -3956,10 +3956,30 @@ REDISCOVER_ENTRY: return KERN_SUCCESS; } + + if (entry->is_sub_map) { + vm_map_t submap; + + submap = entry->object.sub_map; + local_start = entry->vme_start; + local_offset = entry->offset; + + vm_map_reference(submap); + vm_map_unlock_read(map); + + ret = vm_map_create_upl(submap, + local_offset + (offset - local_start), + upl_size, upl, page_list, count, flags); + vm_map_deallocate(submap); + + return ret; + } + if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) { if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE) *upl_size = MAX_UPL_SIZE * PAGE_SIZE; } + /* * Create an object if necessary. */ @@ -3978,6 +3998,42 @@ REDISCOVER_ENTRY: vm_map_unlock_read(map); return KERN_PROTECTION_FAILURE; } + +#if !CONFIG_EMBEDDED + local_object = entry->object.vm_object; + if (vm_map_entry_should_cow_for_true_share(entry) && + local_object->vo_size > *upl_size && + *upl_size != 0) { + vm_prot_t prot; + + /* + * Set up the targeted range for copy-on-write to avoid + * applying true_share/copy_delay to the entire object. + */ + + if (vm_map_lock_read_to_write(map)) { + goto REDISCOVER_ENTRY; + } + + vm_map_clip_start(map, entry, vm_map_trunc_page(offset)); + vm_map_clip_end(map, entry, vm_map_round_page(offset + *upl_size)); + prot = entry->protection & ~VM_PROT_WRITE; + if (override_nx(map, entry->alias) && prot) + prot |= VM_PROT_EXECUTE; + vm_object_pmap_protect(local_object, + entry->offset, + entry->vme_end - entry->vme_start, + ((entry->is_shared || map->mapped) + ? 
PMAP_NULL + : map->pmap), + entry->vme_start, + prot); + entry->needs_copy = TRUE; + + vm_map_lock_write_to_read(map); + } +#endif /* !CONFIG_EMBEDDED */ + if (entry->needs_copy) { /* * Honor copy-on-write for COPY_SYMMETRIC @@ -4012,23 +4068,6 @@ REDISCOVER_ENTRY: goto REDISCOVER_ENTRY; } } - if (entry->is_sub_map) { - vm_map_t submap; - - submap = entry->object.sub_map; - local_start = entry->vme_start; - local_offset = entry->offset; - - vm_map_reference(submap); - vm_map_unlock_read(map); - - ret = vm_map_create_upl(submap, - local_offset + (offset - local_start), - upl_size, upl, page_list, count, flags); - vm_map_deallocate(submap); - - return ret; - } if (sync_cow_data) { if (entry->object.vm_object->shadow || entry->object.vm_object->copy) { local_object = entry->object.vm_object; diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index de18c16a1..8271d71b2 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -1833,6 +1833,8 @@ mach_make_memory_entry_64( vm_prot_t original_protections, mask_protections; unsigned int wimg_mode; + boolean_t force_shadow = FALSE; + if (((permission & 0x00FF0000) & ~(MAP_MEM_ONLY | MAP_MEM_NAMED_CREATE | @@ -2173,6 +2175,35 @@ redo_lookup: } } +#if !CONFIG_EMBEDDED + if (vm_map_entry_should_cow_for_true_share(map_entry) && + object->vo_size > map_size && + map_size != 0) { + /* + * Set up the targeted range for copy-on-write to + * limit the impact of "true_share"/"copy_delay" to + * that range instead of the entire VM object... 
+ */ + + vm_object_unlock(object); + if (vm_map_lock_read_to_write(target_map)) { + vm_object_deallocate(object); + target_map = original_map; + goto redo_lookup; + } + + vm_map_clip_start(target_map, map_entry, vm_map_trunc_page(offset)); + vm_map_clip_end(target_map, map_entry, vm_map_round_page(offset) + map_size); + force_shadow = TRUE; + + map_size = map_entry->vme_end - map_entry->vme_start; + total_size = map_size; + + vm_map_lock_write_to_read(target_map); + vm_object_lock(object); + } +#endif /* !CONFIG_EMBEDDED */ + if(object->internal) { /* vm_map_lookup_locked will create a shadow if */ /* needs_copy is set but does not check for the */ @@ -2180,9 +2211,11 @@ redo_lookup: /* set up an object which will not be pulled from */ /* under us. */ - if ((map_entry->needs_copy || object->shadowed || - (object->vo_size > total_size)) - && !object->true_share) { + if (force_shadow || + ((map_entry->needs_copy || + object->shadowed || + (object->vo_size > total_size)) && + !object->true_share)) { /* * We have to unlock the VM object before * trying to upgrade the VM map lock, to diff --git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s index fe6cb1295..50bc8b991 100644 --- a/osfmk/x86_64/idt64.s +++ b/osfmk/x86_64/idt64.s @@ -268,14 +268,13 @@ L_32bit_dispatch: /* 32-bit user task */ mov %eax, R32_EIP(%rsp) mov ISC32_RFLAGS(%rsp), %eax mov %eax, R32_EFLAGS(%rsp) - mov ISC32_CS(%rsp), %esi /* %esi := %cs for later */ - - mov %esi, R32_CS(%rsp) mov ISC32_RSP(%rsp), %eax mov %eax, R32_UESP(%rsp) mov ISC32_SS(%rsp), %eax mov %eax, R32_SS(%rsp) L_32bit_dispatch_after_fault: + mov ISC32_CS(%rsp), %esi /* %esi := %cs for later */ + mov %esi, R32_CS(%rsp) mov ISC32_TRAPNO(%rsp), %ebx /* %ebx := trapno for later */ mov %ebx, R32_TRAPNO(%rsp) mov ISC32_ERR(%rsp), %eax diff --git a/security/mac_base.c b/security/mac_base.c index 1b67d3c0e..33dd04457 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -167,9 +167,6 @@ SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, 
CTLFLAG_RW | CTLFLAG_LOCKED, &mac_label_mbufs, 0, "Label all MBUFs"); #endif -#if !defined(CONFIG_MACF_ALWAYS_LABEL_MBUF) && 0 -static int mac_labelmbufs = 0; -#endif /* * Flag to indicate whether or not we should allocate label storage for @@ -744,26 +741,6 @@ mac_policy_removefrom_labellist(mac_policy_handle_t handle) static void mac_policy_updateflags(void) { -#if !defined(CONFIG_MACF_ALWAYS_LABEL_MBUF) && 0 /* port to new list style */ - - struct mac_policy_conf *tmpc; - int labelmbufs; - - mac_policy_assert_exclusive(); - - labelmbufs = 0; - - /* XXX - convert to new list structure */ - LIST_FOREACH(tmpc, &mac_static_policy_list, mpc_list) { - if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS) - labelmbufs++; - } - LIST_FOREACH(tmpc, &mac_policy_list, mpc_list) { - if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS) - labelmbufs++; - } - mac_labelmbufs = (labelmbufs != 0); -#endif } static __inline void -- 2.45.2