--- /dev/null
+/*
+	This file provides a hand-coded x86_64/i386 implementation of the following function
+
+	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
+
+	which is a C function in sha2.c (from xnu).
+
+	The code first probes cpu_capabilities to detect whether ssse3 is supported. If not, it branches to
+	SHA256_Transform_nossse3 (in a separate source file, sha256nossse3.s), which was cloned from this file
+	with all ssse3 instructions replaced by sse3-or-below equivalents.
+
+ sha256 algorithm per block description:
+
+	1. W(0:15) = big-endian (per 4-byte word) loading of input data (64 bytes)
+ 2. load 8 digests a-h from ctx->state
+ 3. for r = 0:15
+ T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
+ d += T1;
+ h = T1 + Sigma0(a) + Maj(a,b,c)
+ permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
+ 4. for r = 16:63
+ W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
+ T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
+ d += T1;
+ h = T1 + Sigma0(a) + Maj(a,b,c)
+ permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
+
+ In the assembly implementation:
+	- a circular window of the message schedule W(r:r+15) is updated and stored in xmm0-xmm3
+	- the corresponding W+K(r:r+15) values are updated and stored in a circular buffer on the stack
+	- the 8 digests (a-h) are stored in GPRs or in 32-bit stack slots (all in GPRs for x86_64; c, f, and h in memory for i386)
+
+	the per-block implementation looks like
+
+ ----------------------------------------------------------------------------
+
+ load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
+ pre_calculate and store W+K(0:15) in stack
+
+ load digests a-h from ctx->state;
+
+ for (r=0;r<48;r+=4) {
+ digests a-h update and permute round r:r+3
+		update W([r:r+3]%16) and WK([r:r+3]%16) for use 16 rounds later
+ }
+
+ for (r=48;r<64;r+=4) {
+ digests a-h update and permute round r:r+3
+ }
+
+	ctx->state += digests a-h;
+
+ ----------------------------------------------------------------------------
+
+	our implementation (which allows multiple blocks per call) pipelines the loading of W/WK for the next block
+	into the last 16 rounds of the previous block:
+
+ ----------------------------------------------------------------------------
+
+ load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
+ pre_calculate and store W+K(0:15) in stack
+
+L_loop:
+
+ load digests a-h from ctx->state;
+
+ for (r=0;r<48;r+=4) {
+ digests a-h update and permute round r:r+3
+		update W([r:r+3]%16) and WK([r:r+3]%16) for use 16 rounds later
+ }
+
+ num_block--;
+ if (num_block==0) jmp L_last_block;
+
+ for (r=48;r<64;r+=4) {
+ digests a-h update and permute round r:r+3
+ load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
+ pre_calculate and store W+K([r:r+3]%16) in stack
+ }
+
+	ctx->state += digests a-h;
+
+ jmp L_loop;
+
+L_last_block:
+
+ for (r=48;r<64;r+=4) {
+ digests a-h update and permute round r:r+3
+ }
+
+	ctx->state += digests a-h;
+
+ ------------------------------------------------------------------------
+
+ Apple CoreOS vector & numerics
+ cclee 8-3-10
+*/
+
+#if defined KERNEL
+#include <i386/cpu_capabilities.h>
+#else
+#include <System/i386/cpu_capabilities.h>
+#endif
+
+ // associate variables with registers or memory
+
+#if defined (__x86_64__)
+ #define sp %rsp
+ #define ctx %rdi
+ #define data %rsi
+ #define num_blocks %rdx
+
+ #define a %r8d
+ #define b %r9d
+ #define c %r10d
+ #define d %r11d
+ #define e %r12d
+ #define f %r13d
+ #define g %r14d
+ #define h %r15d
+
+ #define K %rbx
+ #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)
+
+ #define L_aligned_bswap 64(sp) // bswap : big-endian loading of 4-byte words
+ #define xmm_save 80(sp) // starting address for xmm save/restore
+#else
+ #define sp %esp
+ #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
+ #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument
+ #define data_addr 24+stack_size(sp) // 2nd caller argument
+ #define num_blocks 28+stack_size(sp) // 3rd caller argument
+
+ #define a %ebx
+ #define b %edx
+ #define c 64(sp)
+ #define d %ebp
+ #define e %esi
+ #define f 68(sp)
+ #define g %edi
+ #define h 72(sp)
+
+ #define K 76(sp) // pointer to K256[] table
+ #define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words
+ #define xmm_save 96(sp) // starting address for xmm save/restore
+#endif
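+
+	// resulting stack layout: the WK(0:15) circular buffer sits at 0(sp); i386 additionally spills
+	// c/f/h and the K pointer just above it; then comes the 16-byte bswap mask (L_aligned_bswap),
+	// then the xmm save/restore area (xmm_save), plus padding that keeps sp 16-byte aligned so the
+	// movdqa accesses to these slots are legal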
+
+ // 2 local variables
+ #define t %eax
+ #define s %ecx
+
+	// a window (16 words) of the message schedule
+ #define W0 %xmm0
+ #define W1 %xmm1
+ #define W2 %xmm2
+ #define W3 %xmm3
+
+ // circular buffer for WK[(r:r+15)%16]
+ #define WK(x) (x&15)*4(sp)
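+	// e.g. WK(18) expands to (18&15)*4(sp) = 8(sp); slots are refilled (by message_schedule
+	// or update_W_WK) 16 rounds before the round that consumes them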
+
+// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
+
+ .macro Ch
+ mov $0, t // x
+ mov $0, s // x
+ not t // ~x
+ and $1, s // x & y
+ and $2, t // ~x & z
+ xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
+ .endm
+
+// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+ .macro Maj
+ mov $0, t // x
+ mov $1, s // y
+ and s, t // x&y
+ and $2, s // y&z
+ xor s, t // (x&y) ^ (y&z)
+ mov $2, s // z
+ and $0, s // (x&z)
+ xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+ .endm
+
+/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
+// #define R(b,x) ((x) >> (b))
+/* 32-bit Rotate-right (used in SHA-256): */
+// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b))))
+
+// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
+
+	// performs sigma0_256 on the 4 words of an xmm register
+ // use xmm6/xmm7 as intermediate registers
+ .macro sigma0
+ movdqa $0, %xmm6
+ movdqa $0, %xmm7
+ psrld $$3, $0 // SHR3(x)
+ psrld $$7, %xmm6 // part of ROTR7
+ pslld $$14, %xmm7 // part of ROTR18
+ pxor %xmm6, $0
+ pxor %xmm7, $0
+ psrld $$11, %xmm6 // part of ROTR18
+ pslld $$11, %xmm7 // part of ROTR7
+ pxor %xmm6, $0
+ pxor %xmm7, $0
+ .endm
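+
+	// rationale: ROTRn(x) = ((x) >> n) | ((x) << (32 - n)), and the two shifted halves share
+	// no bits, so the OR can be folded into the pxor chain; the sequence above thus computes
+	// sigma0(x) = (x>>3) ^ ((x>>7) ^ (x<<25)) ^ ((x>>18) ^ (x<<14))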
+
+// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
+
+	// performs sigma1_256 on the 4 words of an xmm register
+ // use xmm6/xmm7 as intermediate registers
+ .macro sigma1
+ movdqa $0, %xmm6
+ movdqa $0, %xmm7
+ psrld $$10, $0 // SHR10(x)
+ psrld $$17, %xmm6 // part of ROTR17
+ pxor %xmm6, $0
+ pslld $$13, %xmm7 // part of ROTR19
+ pxor %xmm7, $0
+ psrld $$2, %xmm6 // part of ROTR19
+ pxor %xmm6, $0
+ pslld $$2, %xmm7 // part of ROTR17
+ pxor %xmm7, $0
+ .endm
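+
+	// same shift/xor decomposition as sigma0:
+	// sigma1(x) = (x>>10) ^ ((x>>17) ^ (x<<15)) ^ ((x>>19) ^ (x<<13))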
+
+// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
+
+ .macro Sigma0
+ mov $0, t // x
+ mov $0, s // x
+ ror $$2, t // S32(2, (x))
+ ror $$13, s // S32(13, (x))
+ xor s, t // S32(2, (x)) ^ S32(13, (x))
+ ror $$9, s // S32(22, (x))
+ xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
+ .endm
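+
+	// note s is rotated incrementally: 13, then 9 more for a total of 22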
+
+// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
+
+ .macro Sigma1
+ mov $0, s // x
+ ror $$6, s // S32(6, (x))
+ mov s, t // S32(6, (x))
+ ror $$5, s // S32(11, (x))
+ xor s, t // S32(6, (x)) ^ S32(11, (x))
+ ror $$14, s // S32(25, (x))
+ xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
+ .endm
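+
+	// likewise incremental rotates of s: 6, then 5 more (11), then 14 more (25)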
+
+ // per round digests update
+ .macro round
+		Sigma1	$4					// t = Sigma1(e);
+ add t, $7 // use h to store h+Sigma1(e)
+ Ch $4, $5, $6 // t = Ch (e, f, g);
+ add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
+		add	WK($8), t			// t = T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
+ add t, $3 // d += T1;
+ mov t, $7 // h = T1
+ Sigma0 $0 // t = Sigma0(a);
+ add t, $7 // h = T1 + Sigma0(a);
+ Maj $0, $1, $2 // t = Maj(a,b,c)
+ add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c);
+ .endm
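+
+	// for reference, one round in C, with WK[r] = K[r] + W[r] precomputed on the stack:
+	//	T1 = h + Sigma1(e) + Ch(e,f,g) + WK[r];
+	//	d += T1;
+	//	h  = T1 + Sigma0(a) + Maj(a,b,c);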
+
+ // per 4 rounds digests update and permutation
+ // permutation is absorbed by rotating the roles of digests a-h
+ .macro rounds
+ round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
+ round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
+ round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
+ round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
+ .endm
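+
+	// the argument rotation advances by 4 names per invocation, returning to the identity
+	// every 8 rounds; the round loops below therefore simply alternate the two orders
+	// (a,b,c,d,e,f,g,h) and (e,f,g,h,a,b,c,d)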
+
+	// update 4 words of the message schedule W and W+K, for use 16 rounds ahead
+ .macro message_schedule
+
+ // 4 32-bit K256 words in xmm5
+#if defined (__x86_64__)
+	movdqu	(K), %xmm5
+	add	$$16, K				// K points to the next 4 K256 words for the next iteration
+#else
+	mov	K, t
+	movdqu	(t), %xmm5
+	addl	$$16, K				// K points to the next 4 K256 words for the next iteration
+#endif
+ movdqa $1, %xmm4 // W7:W4
+ palignr $$4, $0, %xmm4 // W4:W1
+ sigma0 %xmm4 // sigma0(W4:W1)
+ movdqa $3, %xmm6 // W15:W12
+ paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1)
+ palignr $$4, $2, %xmm6 // W12:W9
+ paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
+ movdqa $3, %xmm4 // W15:W12
+ psrldq $$8, %xmm4 // 0,0,W15,W14
+ sigma1 %xmm4 // sigma1(0,0,W15,W14)
+ paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
+ movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16
+ pslldq $$8, %xmm4 // W17, W16, 0, 0
+ sigma1 %xmm4 // sigma1(W17,W16,0,0)
+ paddd %xmm4, $0 // W19:W16
+ paddd $0, %xmm5 // WK
+ movdqa %xmm5, WK($4)
+ .endm
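+
+	// a rough C sketch of one invocation (r = 16, 20, ..., 60), with W[] and WK[] as
+	// 16-entry circular buffers:
+	//	for (i = 0; i < 4; i++) {
+	//		W[(r+i)&15] += sigma1(W[(r+i-2)&15]) + W[(r+i-7)&15] + sigma0(W[(r+i-15)&15]);
+	//		WK[(r+i)&15] = W[(r+i)&15] + K256[r+i];
+	//	}
+	// W[r+2]/W[r+3] depend on the freshly computed W[r]/W[r+1], which is why the sigma1
+	// term is applied in two halves above (low 2 words first, then high 2 words)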
+
+	// this macro is used in the last 16 rounds of the current block
+	// each invocation reads 4 4-byte words of the next message block, byte-swaps them into W[r:r+3],
+	// computes WK[r:r+3], and saves the result to the stack to prepare for the next block
+
+ .macro update_W_WK
+#if defined (__x86_64__)
+ movdqu $0*16(data), $1 // read 4 4-byte words
+ pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
+ movdqu $0*16(K), %xmm4 // K[r:r+3]
+#else
+ mov data_addr, t
+ movdqu $0*16(t), $1 // read 4 4-byte words
+ pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
+ mov K, t
+ movdqu $0*16(t), %xmm4 // K[r:r+3]
+#endif
+ paddd $1, %xmm4 // WK[r:r+3]
+ movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer
+ .endm
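+
+	// a rough C sketch, with i = $0 in 0..3 and data pointing at the next message block
+	// (load_bigendian32 is an illustrative helper, not a real function in this codebase):
+	//	for (j = 0; j < 4; j++) {
+	//		W[4*i+j]  = load_bigendian32(data + 16*i + 4*j);	// done by pshufb
+	//		WK[4*i+j] = W[4*i+j] + K256[4*i+j];
+	//	}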
+
+ .text
+
+#if defined (__x86_64__) || defined (__i386__)
+
+ .globl _SHA256_Transform
+
+_SHA256_Transform:
+
+
+ // detect SSSE3 and dispatch appropriate code branch
+ #if defined __x86_64__
+ movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
+ mov (%rax), %eax // %eax = __cpu_capabilities
+ #else // i386
+ #if defined KERNEL
+ leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
+ mov (%eax), %eax // %eax = __cpu_capabilities
+ #else
+ mov _COMM_PAGE_CPU_CAPABILITIES, %eax
+ #endif
+ #endif
+ test $(kHasSupplementalSSE3), %eax
+ je _SHA256_Transform_nossse3 // branch to no-ssse3 code
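+	// i.e. if ((__cpu_capabilities & kHasSupplementalSSE3) == 0) use the no-ssse3 variant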
+
+ // push callee-saved registers
+#if defined (__x86_64__)
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+#else
+ push %ebp
+ push %ebx
+ push %esi
+ push %edi
+#endif
+
+ // allocate stack space
+ sub $stack_size, sp
+
+ // if kernel code, save used xmm registers
+#if KERNEL
+ movdqa %xmm0, 0*16+xmm_save
+ movdqa %xmm1, 1*16+xmm_save
+ movdqa %xmm2, 2*16+xmm_save
+ movdqa %xmm3, 3*16+xmm_save
+ movdqa %xmm4, 4*16+xmm_save
+ movdqa %xmm5, 5*16+xmm_save
+ movdqa %xmm6, 6*16+xmm_save
+ movdqa %xmm7, 7*16+xmm_save
+#endif
+
+ // set up bswap parameters in the aligned stack space and pointer to table K256[]
+#if defined (__x86_64__)
+ lea _K256(%rip), K
+ lea L_bswap(%rip), %rax
+ movdqa (%rax), %xmm0
+#else
+ lea _K256, t
+ mov t, K
+ lea L_bswap, %eax
+ movdqa (%eax), %xmm0
+#endif
+ movdqa %xmm0, L_aligned_bswap
+
+ // load W[0:15] into xmm0-xmm3
+#if defined (__x86_64__)
+ movdqu 0*16(data), W0
+ movdqu 1*16(data), W1
+ movdqu 2*16(data), W2
+ movdqu 3*16(data), W3
+ add $64, data
+#else
+ mov data_addr, t
+ movdqu 0*16(t), W0
+ movdqu 1*16(t), W1
+ movdqu 2*16(t), W2
+ movdqu 3*16(t), W3
+	addl	$64, data_addr
+#endif
+ pshufb L_aligned_bswap, W0
+ pshufb L_aligned_bswap, W1
+ pshufb L_aligned_bswap, W2
+ pshufb L_aligned_bswap, W3
+
+ // compute WK[0:15] and save in stack
+#if defined (__x86_64__)
+ movdqu 0*16(K), %xmm4
+ movdqu 1*16(K), %xmm5
+ movdqu 2*16(K), %xmm6
+ movdqu 3*16(K), %xmm7
+#else
+ mov K, t
+ movdqu 0*16(t), %xmm4
+ movdqu 1*16(t), %xmm5
+ movdqu 2*16(t), %xmm6
+ movdqu 3*16(t), %xmm7
+#endif
+#if defined (__x86_64__)
+	add	$64, K
+#else
+	addl	$64, K
+#endif
+ paddd %xmm0, %xmm4
+ paddd %xmm1, %xmm5
+ paddd %xmm2, %xmm6
+ paddd %xmm3, %xmm7
+ movdqa %xmm4, WK(0)
+ movdqa %xmm5, WK(4)
+ movdqa %xmm6, WK(8)
+ movdqa %xmm7, WK(12)
+
+L_loop:
+
+	// digests a-h = ctx->state;
+#if defined (__x86_64__)
+ mov 0*4(ctx), a
+ mov 1*4(ctx), b
+ mov 2*4(ctx), c
+ mov 3*4(ctx), d
+ mov 4*4(ctx), e
+ mov 5*4(ctx), f
+ mov 6*4(ctx), g
+ mov 7*4(ctx), h
+#else
+ mov ctx_addr, t
+ mov 0*4(t), a
+ mov 1*4(t), b
+ mov 2*4(t), s
+ mov s, c
+ mov 3*4(t), d
+ mov 4*4(t), e
+ mov 5*4(t), s
+ mov s, f
+ mov 6*4(t), g
+ mov 7*4(t), s
+ mov s, h
+#endif
+
+ // rounds 0:47 interleaved with W/WK update for rounds 16:63
+ rounds a, b, c, d, e, f, g, h, 0
+ message_schedule W0,W1,W2,W3,16
+ rounds e, f, g, h, a, b, c, d, 4
+ message_schedule W1,W2,W3,W0,20
+ rounds a, b, c, d, e, f, g, h, 8
+ message_schedule W2,W3,W0,W1,24
+ rounds e, f, g, h, a, b, c, d, 12
+ message_schedule W3,W0,W1,W2,28
+ rounds a, b, c, d, e, f, g, h, 16
+ message_schedule W0,W1,W2,W3,32
+ rounds e, f, g, h, a, b, c, d, 20
+ message_schedule W1,W2,W3,W0,36
+ rounds a, b, c, d, e, f, g, h, 24
+ message_schedule W2,W3,W0,W1,40
+ rounds e, f, g, h, a, b, c, d, 28
+ message_schedule W3,W0,W1,W2,44
+ rounds a, b, c, d, e, f, g, h, 32
+ message_schedule W0,W1,W2,W3,48
+ rounds e, f, g, h, a, b, c, d, 36
+ message_schedule W1,W2,W3,W0,52
+ rounds a, b, c, d, e, f, g, h, 40
+ message_schedule W2,W3,W0,W1,56
+ rounds e, f, g, h, a, b, c, d, 44
+ message_schedule W3,W0,W1,W2,60
+
+ // revert K to the beginning of K256[]
+#if defined __x86_64__
+ sub $256, K
+#else
+ subl $256, K
+#endif
+
+#if defined (__x86_64__)
+	sub	$1, num_blocks				// num_blocks--
+#else
+	subl	$1, num_blocks				// num_blocks--
+#endif
+ je L_final_block // if final block, wrap up final rounds
+
+ // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
+ rounds a, b, c, d, e, f, g, h, 48
+ update_W_WK 0, W0
+ rounds e, f, g, h, a, b, c, d, 52
+ update_W_WK 1, W1
+ rounds a, b, c, d, e, f, g, h, 56
+ update_W_WK 2, W2
+ rounds e, f, g, h, a, b, c, d, 60
+ update_W_WK 3, W3
+
+#if defined (__x86_64__)
+	add	$64, K
+	add	$64, data
+#else
+	addl	$64, K
+	addl	$64, data_addr
+#endif
+
+	// ctx->state += digests a-h
+#if defined (__x86_64__)
+ add a, 0*4(ctx)
+ add b, 1*4(ctx)
+ add c, 2*4(ctx)
+ add d, 3*4(ctx)
+ add e, 4*4(ctx)
+ add f, 5*4(ctx)
+ add g, 6*4(ctx)
+ add h, 7*4(ctx)
+#else
+ mov ctx_addr, t
+ add a, 0*4(t)
+ add b, 1*4(t)
+ mov c, s
+ add s, 2*4(t)
+ add d, 3*4(t)
+ add e, 4*4(t)
+ mov f, s
+ add s, 5*4(t)
+ add g, 6*4(t)
+ mov h, s
+ add s, 7*4(t)
+#endif
+
+ jmp L_loop // branch for next block
+
+	// wrap up digest update for rounds 48:63 of the final block
+L_final_block:
+ rounds a, b, c, d, e, f, g, h, 48
+ rounds e, f, g, h, a, b, c, d, 52
+ rounds a, b, c, d, e, f, g, h, 56
+ rounds e, f, g, h, a, b, c, d, 60
+
+	// ctx->state += digests a-h
+#if defined (__x86_64__)
+ add a, 0*4(ctx)
+ add b, 1*4(ctx)
+ add c, 2*4(ctx)
+ add d, 3*4(ctx)
+ add e, 4*4(ctx)
+ add f, 5*4(ctx)
+ add g, 6*4(ctx)
+ add h, 7*4(ctx)
+#else
+ mov ctx_addr, t
+ add a, 0*4(t)
+ add b, 1*4(t)
+ mov c, s
+ add s, 2*4(t)
+ add d, 3*4(t)
+ add e, 4*4(t)
+ mov f, s
+ add s, 5*4(t)
+ add g, 6*4(t)
+ mov h, s
+ add s, 7*4(t)
+#endif
+
+ // if kernel, restore xmm0-xmm7
+#if KERNEL
+ movdqa 0*16+xmm_save, %xmm0
+ movdqa 1*16+xmm_save, %xmm1
+ movdqa 2*16+xmm_save, %xmm2
+ movdqa 3*16+xmm_save, %xmm3
+ movdqa 4*16+xmm_save, %xmm4
+ movdqa 5*16+xmm_save, %xmm5
+ movdqa 6*16+xmm_save, %xmm6
+ movdqa 7*16+xmm_save, %xmm7
+#endif
+
+ // free allocated stack memory
+ add $stack_size, sp
+
+ // restore callee-saved registers
+#if defined (__x86_64__)
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+#else
+ pop %edi
+ pop %esi
+ pop %ebx
+ pop %ebp
+#endif
+
+ // return
+ ret
+
+
+ .const
+ .align 4, 0x90
+
+L_bswap:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
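+
+	// pshufb control mask: within each 4-byte lane the source bytes are picked in order 3,2,1,0,
+	// converting each little-endian 32-bit load into the big-endian word value SHA-256 expects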
+
+#endif // x86_64/i386
+