xnu-2050.7.9.tar.gz

[apple/xnu.git] / bsd / crypto / sha2 / intel / sha256.s
diff --git a/bsd/crypto/sha2/intel/sha256.s b/bsd/crypto/sha2/intel/sha256.s

deleted file mode 100644 (file)

index 59353ff..0000000
--- a/bsd/crypto/sha2/intel/sha256.s
+++ /dev/null
@@ -1,617 +0,0 @@
-/*
-       This file provides x86_64/i386 hand implementation of the following function
-
-       void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
-
-       which is a C function in sha2.c (from xnu).
-
-       The code first probes cpu_capabilities to detect whether SSSE3 is supported. If it is not, it branches to
-       SHA256_Transform_nossse3 (in a separate source file, sha256nossse3.s), which was cloned from this file
-       with all SSSE3 instructions replaced by SSE3-or-below instructions.
-
-       sha256 algorithm per block description:
-
-               1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte) 
-               2. load 8 digests a-h from ctx->state
-               3. for r = 0:15
-                               T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
-                               d += T1;
-                               h = T1 + Sigma0(a) + Maj(a,b,c)
-                               permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
-               4. for r = 16:63
-                               W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
-                               T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
-                               d += T1;
-                               h = T1 + Sigma0(a) + Maj(a,b,c)
-                               permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
-                               
-       In the assembly implementation: 
-               - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
-               - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
-               - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)
-
-       the implementation per block looks like
-
-       ----------------------------------------------------------------------------
-
-       load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 
-       pre_calculate and store W+K(0:15) in stack
-
-       load digests a-h from ctx->state;
-
-       for (r=0;r<48;r+=4) {
-               digests a-h update and permute round r:r+3
-               update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration 
-       }
-
-       for (r=48;r<64;r+=4) {
-               digests a-h update and permute round r:r+3
-       }
-
-       ctx->states += digests a-h;
-
-       ----------------------------------------------------------------------------
-
-       our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block 
-       into the last 16 rounds of its previous block:
-
-       ----------------------------------------------------------------------------
-
-       load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 
-       pre_calculate and store W+K(0:15) in stack
-
-L_loop:
-
-       load digests a-h from ctx->state;
-
-       for (r=0;r<48;r+=4) {
-               digests a-h update and permute round r:r+3
-               update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration 
-       }
-
-       num_blocks--;
-       if (num_blocks==0)      jmp L_final_block;
-
-       for (r=48;r<64;r+=4) {
-               digests a-h update and permute round r:r+3
-               load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3 
-               pre_calculate and store W+K([r:r+3]%16) in stack
-       }
-
-       ctx->states += digests a-h;
-
-       jmp     L_loop;
-
-L_final_block:
-
-       for (r=48;r<64;r+=4) {
-               digests a-h update and permute round r:r+3
-       }
-
-       ctx->states += digests a-h;
-
-       ------------------------------------------------------------------------
-
-       Apple CoreOS vector & numerics
-       cclee 8-3-10
-*/
-
-#if defined    KERNEL
-#include <i386/cpu_capabilities.h>
-#else
-#include <System/i386/cpu_capabilities.h>
-#endif
-
-       // associate variables with registers or memory
-
-#if defined    (__x86_64__)
-       #define sp                      %rsp
-       #define ctx                     %rdi            // 1st argument: SHA256_ctx *ctx
-       #define data            %rsi            // 2nd argument: char *data
-       #define num_blocks      %edx            // 3rd argument is an unsigned int: use the 32-bit view, the upper half of %rdx is not guaranteed to be zero
-
-       #define a                       %r8d            // all 8 working digests live in 32-bit GPRs on x86_64
-       #define b                       %r9d
-       #define c                       %r10d
-       #define d                       %r11d
-       #define e                       %r12d
-       #define f                       %r13d
-       #define g                       %r14d
-       #define h                       %r15d
-
-       #define K                       %rbx            // running pointer into the K256[] table
-       #define stack_size      (8+16*8+16+64)  // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)
-
-       #define L_aligned_bswap 64(sp)          // bswap : big-endian loading of 4-byte words
-       #define xmm_save        80(sp)                  // starting address for xmm save/restore
-#else
-       #define sp      %esp
-       #define stack_size      (12+16*8+16+16+64)      // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
-       #define ctx_addr        20+stack_size(sp)       // ret_addr + 4 registers = 20, 1st caller argument
-       #define data_addr       24+stack_size(sp)       // 2nd caller argument
-       #define num_blocks      28+stack_size(sp)       // 3rd caller argument
-
-       #define a       %ebx
-       #define b       %edx
-       #define c       64(sp)                                  // c, f and h do not fit in registers on i386 and live in stack slots
-       #define d       %ebp
-       #define e       %esi
-       #define f       68(sp)
-       #define g       %edi
-       #define h       72(sp)
-
-       #define K       76(sp)                                  // pointer to K256[] table
-       #define L_aligned_bswap 80(sp)          // bswap : big-endian loading of 4-byte words
-       #define xmm_save        96(sp)                  // starting address for xmm save/restore
-#endif
-
-       // 2 scratch registers shared by every scalar macro below (Ch, Maj, Sigma0, Sigma1, round)
-       #define t       %eax
-       #define s       %ecx
-
-       // a window (16 words) of the message schedule, 4 words per xmm register
-       #define W0      %xmm0
-       #define W1      %xmm1
-       #define W2      %xmm2
-       #define W3      %xmm3
-
-       // circular buffer for WK[(r:r+15)%16] at the bottom of the stack frame; x is pasted unparenthesized into (x&15)
-       #define WK(x)   (x&15)*4(sp)
-
-// #define Ch(x,y,z)   (((x) & (y)) ^ ((~(x)) & (z)))
-// usage: Ch x, y, z -- result is left in t; s is overwritten; the three operands are only read
-       .macro Ch
-       mov             $0, t           // t = x
-       mov             $0, s           // s = x
-       not             t                       // t = ~x
-       and             $1, s           // s = x & y
-       and             $2, t           // t = ~x & z
-       xor             s, t            // t = ((x) & (y)) ^ ((~(x)) & (z));
-       .endm
-
-// #define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
-// usage: Maj x, y, z -- result is left in t; s is overwritten; the three operands are only read
-       .macro  Maj
-       mov             $0, t           // t = x
-       mov             $1, s           // s = y
-       and             s, t            // t = x&y
-       and             $2, s           // s = y&z
-       xor             s, t            // t = (x&y) ^ (y&z)
-       mov             $2, s           // s = z
-       and             $0, s           // s = x&z
-       xor             s, t            // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) 
-       .endm
-
-/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
-// #define R(b,x)      ((x) >> (b))
-/* 32-bit Rotate-right (used in SHA-256): */
-// #define S32(b,x)    (((x) >> (b)) | ((x) << (32 - (b))))
-
-// #define sigma0_256(x)   (S32(7,  (x)) ^ S32(18, (x)) ^ R(3 ,   (x)))
-
-       // performs sigma0_256 in place on the 4 packed 32-bit words of one xmm register (the macro argument)
-       // clobbers xmm6/xmm7; each rotate is assembled from a right shift (xmm6) and a left shift (xmm7)
-       .macro  sigma0
-       movdqa  $0, %xmm6               // xmm6 = x, source of the right-shifted halves
-       movdqa  $0, %xmm7               // xmm7 = x, source of the left-shifted halves
-       psrld   $$3, $0                 // SHR3(x)
-       psrld   $$7, %xmm6              // x >> 7  : low part of ROTR7
-       pslld   $$14, %xmm7             // x << 14 : high part of ROTR18 (32-18)
-       pxor    %xmm6, $0
-       pxor    %xmm7, $0
-       psrld   $$11, %xmm6             // x >> 18 (7+11) : low part of ROTR18
-       pslld   $$11, %xmm7             // x << 25 (14+11) : high part of ROTR7 (32-7)
-       pxor    %xmm6, $0
-       pxor    %xmm7, $0               // result = ROTR7(x) ^ ROTR18(x) ^ SHR3(x)
-       .endm
-
-// #define sigma1_256(x)   (S32(17, (x)) ^ S32(19, (x)) ^ R(10,   (x)))
-
-       // performs sigma1_256 in place on the 4 packed 32-bit words of one xmm register (the macro argument)
-       // clobbers xmm6/xmm7; each rotate is assembled from a right shift (xmm6) and a left shift (xmm7)
-       .macro  sigma1
-       movdqa  $0, %xmm6               // xmm6 = x, source of the right-shifted halves
-       movdqa  $0, %xmm7               // xmm7 = x, source of the left-shifted halves
-       psrld   $$10, $0                // SHR10(x)
-       psrld   $$17, %xmm6             // x >> 17 : low part of ROTR17
-       pxor    %xmm6, $0
-       pslld   $$13, %xmm7             // x << 13 : high part of ROTR19 (32-19)
-       pxor    %xmm7, $0
-       psrld   $$2, %xmm6              // x >> 19 (17+2) : low part of ROTR19
-       pxor    %xmm6, $0
-       pslld   $$2, %xmm7              // x << 15 (13+2) : high part of ROTR17 (32-17)
-       pxor    %xmm7, $0               // result = ROTR17(x) ^ ROTR19(x) ^ SHR10(x)
-       .endm
-
-// #define Sigma0_256(x)   (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))
-// usage: Sigma0 x -- result is left in t; s is overwritten; x is only read
-       .macro  Sigma0
-       mov             $0, t                   // t = x
-       mov             $0, s                   // s = x
-       ror             $$2, t                  // S32(2,  (x))
-       ror             $$13, s                 // S32(13,  (x))
-       xor             s, t                    // S32(2,  (x)) ^ S32(13, (x))
-       ror             $$9, s                  // S32(22,  (x)), reached as 13+9 on the already rotated s
-       xor             s, t                    // t = (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))
-       .endm
-
-// #define Sigma1_256(x)   (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))
-// usage: Sigma1 x -- result is left in t; s is overwritten; x is only read
-       .macro  Sigma1
-       mov             $0, s                   // s = x
-       ror             $$6, s                  // S32(6,  (x))
-       mov             s, t                    // t = S32(6,  (x))
-       ror             $$5, s                  // S32(11, (x)), reached as 6+5
-       xor             s, t                    // S32(6,  (x)) ^ S32(11, (x))
-       ror             $$14, s                 // S32(25, (x)), reached as 11+14
-       xor             s, t                    // t = (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))
-       .endm
-
-       // per round digests update: args 0-7 are the operands playing a,b,c,d,e,f,g,h, arg 8 is the round number selecting the WK slot; clobbers t and s
-       .macro  round
-       Sigma1  $4                              // t = Sigma1(e)
-       add             t, $7                   // use h to store h+Sigma1(e)
-       Ch              $4, $5, $6              // t = Ch (e, f, g);
-       add             $7, t                   // t = h+Sigma1(e)+Ch(e,f,g);
-       add             WK($8), t               // t = T1 = h+Sigma1(e)+Ch(e,f,g)+K[r]+W[r] (the W+K sum is read from the stack buffer)
-       add             t, $3                   // d += T1;
-       mov             t, $7                   // h = T1
-       Sigma0  $0                              // t = Sigma0(a);
-       add             t, $7                   // h = T1 + Sigma0(a);
-       Maj             $0, $1, $2              // t = Maj(a,b,c)
-       add             t, $7                   // h = T1 + Sigma0(a) + Maj(a,b,c);                     
-       .endm
-
-       // per 4 rounds digests update and permutation: args 0-7 are a-h, arg 8 is the number of the first of the 4 rounds
-       // permutation is absorbed by rotating the roles of digests a-h: each round shifts the argument list right by one
-       .macro  rounds
-       round   $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
-       round   $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
-       round   $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
-       round   $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
-       .endm
-
-       // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future 
-       .macro  message_schedule
-       // args 0-3: the four W registers, oldest first (arg 0 is overwritten with the 4 new words); arg 4: round number of the WK slot; clobbers xmm4-xmm7 and advances K (t too on i386)
-       // 4 32-bit K256 words in xmm5
-#if defined    (__x86_64__)
-       movdqu  (K), %xmm5
-#else
-       mov             K, t
-       movdqu  (t), %xmm5 
-#endif 
-       add             $$16, K                         // advance K by 4 words (16 bytes) for the next invocation
-       movdqa  $1, %xmm4                       // W7:W4
-       palignr $$4, $0, %xmm4          // W4:W1
-       sigma0  %xmm4                           // sigma0(W4:W1)
-       movdqa  $3, %xmm6                       // W15:W12
-       paddd   %xmm4, $0                       // $0 = W3:W0 + sigma0(W4:W1) 
-       palignr $$4, $2, %xmm6          // W12:W9
-       paddd   %xmm6, $0                       // $0 = W12:W9 + sigma0(W4:W1) + W3:W0  
-       movdqa  $3, %xmm4                       // W15:W12
-       psrldq  $$8, %xmm4                      // 0,0,W15,W14  
-       sigma1  %xmm4                           // sigma1(0,0,W15,W14)
-       paddd   %xmm4, $0                       // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
-       movdqa  $0, %xmm4                       // W19-sigma1(W17), W18-sigma1(W16), W17, W16
-       pslldq  $$8, %xmm4                      // W17, W16, 0, 0
-       sigma1  %xmm4                           // sigma1(W17,W16,0,0): the upper two words need the two words just produced
-       paddd   %xmm4, $0                       // W19:W16
-       paddd   $0, %xmm5                       // WK
-       movdqa  %xmm5, WK($4)                   // store the 4 new W+K words into their slot of the stack circular buffer
-       .endm
-
-       // this macro is used in the last 16 rounds of a current block
-       // each invocation reads 4 4-byte words of the next block, loads them big-endian into one W register, computes WK[r:r+3]
-       // and save into stack to prepare for next block
-       // arg 0: index (0..3) of the 16-byte group within the next block and within K256[0:15]; arg 1: destination W register; clobbers xmm4 (t too on i386)
-       .macro  update_W_WK
-#if defined (__x86_64__)
-       movdqu  $0*16(data), $1         // read 4 4-byte words
-       pshufb  L_aligned_bswap, $1     // big-endian of each 4-byte word, W[r:r+3]
-       movdqu  $0*16(K), %xmm4         // K[r:r+3]
-#else
-       mov             data_addr, t
-       movdqu  $0*16(t), $1            // read 4 4-byte words
-       pshufb  L_aligned_bswap, $1     // big-endian of each 4-byte word, W[r:r+3]
-       mov             K, t
-       movdqu  $0*16(t), %xmm4         // K[r:r+3]
-#endif
-       paddd   $1, %xmm4                       // WK[r:r+3]
-       movdqa  %xmm4, WK($0*4)         // save WK[r:r+3] into stack circular buffer
-       .endm
-
-       .text
-
-#if defined (__x86_64__) || defined (__i386__)
-
-       .globl  _SHA256_Transform
-// void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks): x86_64 args in rdi/rsi/rdx, i386 args on the stack
-_SHA256_Transform:
-// the capability probe below only writes eax/rax and runs before any push or stack adjustment,
-// so the branch to the no-SSSE3 routine is taken with the stack and the arguments exactly as the caller left them
-       // detect SSSE3 and dispatch appropriate code branch
-       #if defined __x86_64__
-        movq    __cpu_capabilities@GOTPCREL(%rip), %rax         // %rax -> __cpu_capabilities
-        mov     (%rax), %eax                                    // %eax = __cpu_capabilities
-    #else       // i386
-        #if defined KERNEL
-            leal    __cpu_capabilities, %eax                    // %eax -> __cpu_capabilities
-            mov     (%eax), %eax                                // %eax = __cpu_capabilities
-        #else
-            mov    _COMM_PAGE_CPU_CAPABILITIES, %eax            // %eax = capability word read from the fixed address _COMM_PAGE_CPU_CAPABILITIES
-        #endif
-    #endif
-    test    $(kHasSupplementalSSE3), %eax                          // ZF set when the SSSE3 capability bit is clear
-    je      _SHA256_Transform_nossse3                              // branch to no-ssse3 code
-
-       // push callee-saved registers (NOTE(review): %rbp is saved on x86_64 although nothing in this file references it; presumably kept for stack alignment -- confirm)
-#if defined    (__x86_64__)
-       push    %rbp
-       push    %rbx
-       push    %r12
-       push    %r13
-       push    %r14
-       push    %r15
-#else
-    push    %ebp
-       push    %ebx
-    push    %esi
-    push    %edi
-#endif
-
-       // allocate stack space: WK buffer at 0(sp), then (i386 only) c/f/h/K slots, then L_aligned_bswap, then the xmm save area
-       sub             $stack_size, sp
-
-       // if kernel code, save used xmm registers (movdqa requires the save area, hence sp, to be 16-byte aligned here)
-#if    KERNEL
-       movdqa  %xmm0, 0*16+xmm_save
-       movdqa  %xmm1, 1*16+xmm_save
-       movdqa  %xmm2, 2*16+xmm_save
-       movdqa  %xmm3, 3*16+xmm_save
-       movdqa  %xmm4, 4*16+xmm_save
-       movdqa  %xmm5, 5*16+xmm_save
-       movdqa  %xmm6, 6*16+xmm_save
-       movdqa  %xmm7, 7*16+xmm_save
-#endif
-
-       // set up bswap parameters in the aligned stack space and pointer to table K256[]
-#if defined (__x86_64__)
-       lea             _K256(%rip), K
-       lea             L_bswap(%rip), %rax
-       movdqa  (%rax), %xmm0
-#else
-       lea             _K256, t
-       mov             t, K
-       lea             L_bswap, %eax
-       movdqa  (%eax), %xmm0
-#endif
-       movdqa  %xmm0, L_aligned_bswap  
-
-       // load W[0:15] of the first block into xmm0-xmm3 (unaligned loads, data may have any alignment)
-#if defined (__x86_64__)
-       movdqu  0*16(data), W0
-       movdqu  1*16(data), W1
-       movdqu  2*16(data), W2
-       movdqu  3*16(data), W3
-       add             $64, data                       // data -> next 64-byte block
-#else
-       mov             data_addr, t
-       movdqu  0*16(t), W0
-       movdqu  1*16(t), W1
-       movdqu  2*16(t), W2
-       movdqu  3*16(t), W3
-       add             $64, data_addr                  // the caller argument slot itself is advanced to the next 64-byte block
-#endif
-       pshufb  L_aligned_bswap, W0             // byte-reverse each 4-byte word (big-endian load)
-       pshufb  L_aligned_bswap, W1
-       pshufb  L_aligned_bswap, W2
-       pshufb  L_aligned_bswap, W3
-
-       // compute WK[0:15] = W[0:15] + K256[0:15] and save in stack
-#if defined (__x86_64__)
-       movdqu  0*16(K), %xmm4  
-       movdqu  1*16(K), %xmm5
-       movdqu  2*16(K), %xmm6  
-       movdqu  3*16(K), %xmm7
-#else
-       mov             K, t
-       movdqu  0*16(t), %xmm4  
-       movdqu  1*16(t), %xmm5
-       movdqu  2*16(t), %xmm6  
-       movdqu  3*16(t), %xmm7
-#endif
-       add             $64, K                          // K -> K256[16], the first constants message_schedule consumes
-       paddd   %xmm0, %xmm4
-       paddd   %xmm1, %xmm5
-       paddd   %xmm2, %xmm6
-       paddd   %xmm3, %xmm7
-       movdqa  %xmm4, WK(0)
-       movdqa  %xmm5, WK(4)
-       movdqa  %xmm6, WK(8)
-       movdqa  %xmm7, WK(12)
-
-L_loop:
-       // top of the per-block loop: W0-W3 and WK(0:15) already hold the current block, K points at K256[16]
-       // digests a-h = ctx->states;
-#if defined (__x86_64__)
-       mov             0*4(ctx), a
-       mov             1*4(ctx), b
-       mov             2*4(ctx), c
-       mov             3*4(ctx), d
-       mov             4*4(ctx), e
-       mov             5*4(ctx), f
-       mov             6*4(ctx), g
-       mov             7*4(ctx), h
-#else
-       mov             ctx_addr, t
-       mov     0*4(t), a
-       mov     1*4(t), b
-       mov     2*4(t), s                       // c, f and h are stack slots on i386, so they are copied through s
-       mov             s, c
-       mov     3*4(t), d
-       mov     4*4(t), e
-       mov     5*4(t), s
-       mov             s, f
-       mov     6*4(t), g
-       mov     7*4(t), s
-       mov             s, h
-#endif
-
-       // rounds 0:47 interleaved with W/WK update for rounds 16:63; a-d and e-h swap places every 4 rounds, the W registers rotate by one
-       rounds  a, b, c, d, e, f, g, h, 0
-       message_schedule W0,W1,W2,W3,16
-       rounds  e, f, g, h, a, b, c, d, 4 
-       message_schedule W1,W2,W3,W0,20
-       rounds  a, b, c, d, e, f, g, h, 8
-       message_schedule W2,W3,W0,W1,24
-       rounds  e, f, g, h, a, b, c, d, 12 
-       message_schedule W3,W0,W1,W2,28
-       rounds  a, b, c, d, e, f, g, h, 16
-       message_schedule W0,W1,W2,W3,32
-       rounds  e, f, g, h, a, b, c, d, 20 
-       message_schedule W1,W2,W3,W0,36
-       rounds  a, b, c, d, e, f, g, h, 24
-       message_schedule W2,W3,W0,W1,40
-       rounds  e, f, g, h, a, b, c, d, 28 
-       message_schedule W3,W0,W1,W2,44
-       rounds  a, b, c, d, e, f, g, h, 32
-       message_schedule W0,W1,W2,W3,48
-       rounds  e, f, g, h, a, b, c, d, 36 
-       message_schedule W1,W2,W3,W0,52
-       rounds  a, b, c, d, e, f, g, h, 40
-       message_schedule W2,W3,W0,W1,56
-       rounds  e, f, g, h, a, b, c, d, 44 
-       message_schedule W3,W0,W1,W2,60
-
-       // revert K to the beginning of K256[]: 256 = 64 (initial skip of K256[0:15]) + 12 message_schedule steps of 16 bytes
-#if defined __x86_64__
-       sub             $256, K
-#else
-       subl    $256, K
-#endif
-       // NOTE(review): on i386 num_blocks is a memory operand and the sub below has no size suffix, unlike the subl above -- confirm the assembler selects 32 bits
-       sub             $1, num_blocks                          // num_blocks--; there is no earlier check, so a count of 0 on entry still processes one block and then wraps
-       je              L_final_block                           // if final block, wrap up final rounds
-
-       // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15 
-       rounds  a, b, c, d, e, f, g, h, 48
-       update_W_WK     0, W0
-       rounds  e, f, g, h, a, b, c, d, 52 
-       update_W_WK     1, W1
-       rounds  a, b, c, d, e, f, g, h, 56
-       update_W_WK     2, W2
-       rounds  e, f, g, h, a, b, c, d, 60 
-       update_W_WK     3, W3
-       // each update_W_WK above overwrites only the 4 WK slots that the rounds just before it have already consumed
-       add             $64, K                          // K -> K256[16] again, as after the first-block setup
-#if defined (__x86_64__)
-       add             $64, data
-#else
-       add             $64, data_addr
-#endif
-
-       // ctx->states += digests a-h
-#if    defined (__x86_64__)
-       add             a, 0*4(ctx)
-       add             b, 1*4(ctx)
-       add             c, 2*4(ctx)
-       add             d, 3*4(ctx)
-       add             e, 4*4(ctx)
-       add             f, 5*4(ctx)
-       add             g, 6*4(ctx)
-       add             h, 7*4(ctx)
-#else
-       mov             ctx_addr, t
-       add             a, 0*4(t)
-       add             b, 1*4(t)
-       mov             c, s                            // c, f and h are stack slots on i386, so they are added through s
-       add             s, 2*4(t)
-       add             d, 3*4(t)
-       add             e, 4*4(t)
-       mov             f, s
-       add             s, 5*4(t)
-       add             g, 6*4(t)
-       mov             h, s
-       add             s, 7*4(t)
-#endif
-
-       jmp             L_loop                          // branch for next block, whose W and WK are already in place
-
-       // wrap up digest update round 48:63 for final block: same rounds as the loop path, but no further input is read
-L_final_block:
-       rounds  a, b, c, d, e, f, g, h, 48
-       rounds  e, f, g, h, a, b, c, d, 52 
-       rounds  a, b, c, d, e, f, g, h, 56
-       rounds  e, f, g, h, a, b, c, d, 60 
-
-       // ctx->states += digests a-h
-#if    defined (__x86_64__)
-       add             a, 0*4(ctx)
-       add             b, 1*4(ctx)
-       add             c, 2*4(ctx)
-       add             d, 3*4(ctx)
-       add             e, 4*4(ctx)
-       add             f, 5*4(ctx)
-       add             g, 6*4(ctx)
-       add             h, 7*4(ctx)
-#else
-       mov             ctx_addr, t
-       add             a, 0*4(t)
-       add             b, 1*4(t)
-       mov             c, s                            // c, f and h are stack slots on i386, so they are added through s
-       add             s, 2*4(t)
-       add             d, 3*4(t)
-       add             e, 4*4(t)
-       mov             f, s
-       add             s, 5*4(t)
-       add             g, 6*4(t)
-       mov             h, s
-       add             s, 7*4(t)
-#endif
-
-       // if kernel, restore xmm0-xmm7 from the same slots they were saved to in the prologue
-#if    KERNEL
-       movdqa  0*16+xmm_save, %xmm0
-       movdqa  1*16+xmm_save, %xmm1
-       movdqa  2*16+xmm_save, %xmm2
-       movdqa  3*16+xmm_save, %xmm3
-       movdqa  4*16+xmm_save, %xmm4
-       movdqa  5*16+xmm_save, %xmm5
-       movdqa  6*16+xmm_save, %xmm6
-       movdqa  7*16+xmm_save, %xmm7
-#endif
-
-       // free allocated stack memory
-       add             $stack_size, sp
-
-       // restore callee-saved registers, in the reverse order of the pushes in the prologue
-#if defined (__x86_64__)
-       pop             %r15
-       pop             %r14
-       pop             %r13
-       pop             %r12
-       pop             %rbx
-       pop             %rbp
-#else
-    pop                %edi
-    pop                %esi
-       pop             %ebx
-    pop                %ebp
-#endif
-
-       // return (the function is declared void; no return value is set up)
-       ret
-
-
-       .const
-       .align  4, 0x90                 // NOTE(review): presumably a power-of-two argument (16-byte alignment), which the movdqa load of L_bswap needs -- confirm
-// pshufb control mask: each .long lists source byte indices so that the 4 bytes of every 32-bit word are reversed
-L_bswap:
-    .long   0x00010203
-    .long   0x04050607
-    .long   0x08090a0b
-    .long   0x0c0d0e0f
-
-#endif         // x86_64/i386
-