xnu-1699.22.73.tar.gz

[apple/xnu.git] / libkern / crypto / intel / sha1edp.s
diff --git a/libkern/crypto/intel/sha1edp.s b/libkern/crypto/intel/sha1edp.s

new file mode 100644 (file)

index 0000000..80da81a
--- /dev/null
+++ b/libkern/crypto/intel/sha1edp.s
@@ -0,0 +1,1481 @@
+/*     sha1edp.s : this file provides optimized x86_64 and i386 implementation of the sha1 function
+       CoreOS - vector and numerics group
+       cclee   6-21-10
+       
+       The implementation is based on the principle described in an Intel online article
+       "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+       http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+
+
+       Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function
+
+void SHA1( int HASH[], int MESSAGE[] )
+{
+    int A[81], B[81], C[81], D[81], E[81];
+    int W[80];
+
+    int i, FN;
+
+    A[0] = HASH[0]; 
+    B[0] = HASH[1];
+    C[0] = HASH[2];
+    D[0] = HASH[3];
+    E[0] = HASH[4];
+
+    for ( i=0; i<80; ++i )
+    {
+        if ( i < 16 )
+            W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
+        else
+            W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
+
+        FN = F( i, B[i], C[i], D[i] );
+
+        A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
+        B[i+1] = A[i];
+        C[i+1] = ROTATE_LEFT( B[i], 30 );
+        D[i+1] = C[i];
+        E[i+1] = D[i];
+    }
+
+    HASH[0] += A[80];
+    HASH[1] += B[80];
+    HASH[2] += C[80];
+    HASH[3] += D[80];
+    HASH[4] += E[80];
+} 
+
+       For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
+
+       The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
+
+       1. done on 4 consecutive W[i] values in a single XMM register
+    W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
+    W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
+    W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
+    W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
+
+    2. this additional calculation unfortunately requires many additional operations
+    W[i+3] ^= W[i] rol 1
+
+    3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
+    W[i:i+3] += {K,K,K,K}
+
+       Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
+       The Dean Gaudet approach can be expressed as
+
+       1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
+       2. W[i+3] ^= W[i] rol 1
+       3. W0 += {K,K,K,K}
+
+       For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
+
+       1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2); 
+
+       Note:
+       1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
+       2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
+               i=0, W28,W24,...,W0
+               i=4, W24,W20,...,W28
+               i=8, W20,W16,...,W24
+               .
+               .
+               and so forth.
+       3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
+               a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation 
+               b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
+       4. we probe __cpu_capabilities to detect ssse3 support and dispatch code with ssse3 support when available.
+          If ssse3 is not supported, a suboptimal code (pshufb and palignr workaround) is dispatched. 
+
+*/
+
+/* the code can be compiled into single block (64 bytes) per call mode by setting Multiple_Blocks to 0 */
+#define        Multiple_Blocks 1
+
+#if defined (__x86_64__) || defined(__i386__)          // x86_64 or i386 architectures
+
+#if defined(__x86_64__)
+
+       // set up for x86_64
+       // on x86_64 the working pointers (K_BASE, HASH_PTR, BUFFER_PTR) and the block counter (cnt) are all registers
+#define        stack_size      (8+16*11+16*4)                                  // 8 (alignment) + x0-x10 + 4 128-bits for intermediate WK(t) storage
+#define        sp                      %rsp                                                    // unifying architectural stack pointer representation
+#define        ctx                     %rdi                                                    // 1st input argument, will move to HASH_PTR (%r9)
+#define        buf                     %rsi                                                    // 2nd input argument, will move to BUFFER_PTR (%r10) 
+#define        cnt                     %r11                                                    // will copy from the 3rd input argument (%rdx)
+#define K_BASE         %r8                                                             // an aligned pointer to point to shufb reference numbers of table of K values
+#define HASH_PTR       %r9                                                             // pointer to Hash values (A,B,C,D,E)
+#define BUFFER_PTR     %r10                                                    // pointer to input blocks 
+
+#else  // !__x86_64__
+
+       // set up for i386 
+       // on i386 HASH_PTR, BUFFER_PTR, cnt and K_BASE are sp-relative memory operands, not registers;
+       // macros that need them as an address first copy them into a scratch register (T1 or T2)
+#define stack_size     (12+16*2+16*11+16*4)                    // 12-bytes (alignment) + extra 2 + 3 (W24/W28/XMM_SHUFB_BSWAP) + 8 (xmm0-xmm7) + 4 (WK(t))
+#define        sp                      %esp                                                    // unifying architectural stack pointer representation
+#define HASH_PTR       stack_size+16+4(sp)                             // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp)
+#define BUFFER_PTR     stack_size+16+8(sp)                             // use 2nd input argument from caller function 
+#define cnt                    stack_size+16+12(sp)                    // use 3rd input argument from caller function
+#define K_BASE         stack_size-4(sp)                                // use for K_BASE
+
+#endif // __x86_64__
+
+// symbolizing registers or stack memory with algorithmic variables    W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support
+
+// W_TMP and W_TMP2 are the two xmm scratch registers shared by all W_PRECALC_* macros
+#define W_TMP          %xmm0
+#define W_TMP2         %xmm1
+// W0..W20 are xmm registers on both architectures
+#define W0     %xmm2
+#define W4     %xmm3
+#define W8     %xmm4
+#define W12    %xmm5
+#define W16    %xmm6
+#define W20    %xmm7
+#if defined(__x86_64__)
+#define W24    %xmm8
+#define W28    %xmm9
+#define XMM_SHUFB_BSWAP %xmm10                         // used only when ssse3 is supported
+#else  // defined (__i386__)
+// on i386 W24, W28 and XMM_SHUFB_BSWAP are 16-byte stack slots (memory operands), which is why
+// dedicated *_i386 macro variants exist for the steps that need W as a register destination
+#define W24     12*16(sp)
+#define W28     13*16(sp)
+#define XMM_SHUFB_BSWAP 14*16(sp)                      // used only when ssse3 is supported 
+#endif
+
+#define        xmov    movaps                                          // aligned 16-byte move
+#define        xmovu   movups                                          // unaligned 16-byte move
+
+// intermediate hash variables
+#define A %ecx
+#define B %esi
+#define C %edi
+#define D %ebp
+#define E %edx
+
+// temp variables
+#define T1 %eax
+#define T2 %ebx
+
+// WK(t) addresses the 4-byte slot holding W[t]+K : 16 slots starting at (sp), indexed by t modulo 16
+#define        WK(t)   (t&15)*4(sp)
+
+       // int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }
+       // arguments : $0 = B, $1 = C, $2 = D
+       // result in T1; the arguments themselves are left unmodified
+       .macro  F1
+       mov     $2, T1                  // T1 = D
+       xor     $1, T1                  // T1 = D ^ C
+       and     $0, T1                  // T1 = B & (C ^ D)
+       xor     $2, T1                  // T1 = D ^ (B & (C ^ D))
+       .endm
+
+       // int F2(int B, int C, int D) { return (D ^ B ^ C); }
+       // arguments : $0 = B, $1 = C, $2 = D
+       // result in T1; the arguments themselves are left unmodified
+       .macro  F2
+       mov     $0, T1                  // T1 = B
+       xor     $1, T1                  // T1 = B ^ C
+       xor     $2, T1                  // T1 = B ^ C ^ D
+       .endm
+
+       // int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
+       // computed here in the equivalent form (B & C) | (D & (B | C))
+       // arguments : $0 = B, $1 = C, $2 = D
+       // result in T1; T2 is used as scratch and is left holding (B & C)
+       .macro  F3
+       mov     $0, T2                  // T2 = B
+       and     $1, T2                  // T2 = B & C
+       mov     $0, T1                  // T1 = B
+       or      $1, T1                  // T1 = B | C
+       and     $2, T1                  // T1 = D & (B | C)
+       or      T2, T1                  // T1 = (B & C) | (D & (B | C))
+       .endm
+
+       // for i=60:79, F4 is identical to F2, so F4 is simply a preprocessor alias of the F2 macro
+       #define F4      F2
+
+
+       /*
+               i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
+
+               with ssse3 support, this is achieved via
+               for (i=0;i<16;i+=4) {
+                       1. W_TMP = new 16 bytes from MESSAGE[]
+                       2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W 
+                       3. W_TMP += {K,K,K,K};
+                       4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
+               }
+
+               each step is represented in one of the following 4 macro definitions
+
+       */
+
+       // step 0 (ssse3) of the i=0:15 precalc : unaligned load of 16 message bytes at byte offset $0*4 from BUFFER_PTR into W_TMP
+       // on i386 BUFFER_PTR is a stack operand, so it is first copied into T1 (T1 is clobbered there)
+       .macro  W_PRECALC_00_15_0_ssse3                 // input argument $0 : 0/4/8/12
+#if defined (__x86_64__)                                       // BUFFER_PTR is already an address register in x86_64
+       xmovu   $0*4(BUFFER_PTR), W_TMP                 // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
+#else                                                                          // BUFFER_PTR is from the argument set up in the caller
+       mov     BUFFER_PTR, T1                                  // T1 = BUFFER_PTR
+    xmovu  $0*4(T1), W_TMP                                     // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
+#endif
+       .endm
+
+       // step 1 (ssse3) of the i=0:15 precalc : byte-shuffle W_TMP with the XMM_SHUFB_BSWAP mask and store the result in the W slot $0
+       // W_TMP still holds the shuffled value afterwards (it is consumed by W_PRECALC_00_15_2)
+       .macro  W_PRECALC_00_15_1_ssse3                 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
+       pshufb  XMM_SHUFB_BSWAP, W_TMP                  // convert W_TMP from little-endian into big-endian
+       xmov    W_TMP, $0                                               // save W_TMP in the circular buffer
+       .endm
+
+       // step 2 of the i=0:15 precalc : add the 4 dwords stored at offset 0 from K_BASE to W_TMP
+       // takes no arguments; on i386 K_BASE is a stack operand, so it is first copied into T1 (T1 is clobbered there)
+       .macro  W_PRECALC_00_15_2                               // K_BASE points to the current K quadruple.
+#if defined (__x86_64__)                                       // K_BASE is already an address register in x86_64
+       paddd   (K_BASE), W_TMP                                 // W_TMP += {K,K,K,K};
+#else                                                                          // K_BASE is previously set up in the stack memory
+       mov     K_BASE, T1                                              // T1 = K_BASE
+    paddd   (T1), W_TMP                                                // W_TMP += {K,K,K,K};
+#endif
+       .endm
+
+       // step 3 of the i=0:15 precalc : store W_TMP (= W+K quadruple) into the WK stack area
+       // input argument $0 : a round index; its low 2 bits are masked off so the store hits the start of the 4-word group
+       .macro  W_PRECALC_00_15_3
+       xmov    W_TMP, WK($0&~3)                                // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
+       .endm
+
+       /*
+               without ssse3 support, steps 1 and 2 need to be modified
+               1. sequentially load 4 words into T1, bswap T1, and save it to 4-bytes in the stack space
+               2. load the 16-bytes from the aligned stack memory into W_TMP
+       */
+
+       // step 0 (no ssse3) of the i=0:15 precalc : load 4 message words one at a time starting at byte offset $0*4,
+       // bswap each in T1, and store them to the 16-byte stack slot at 14*16(sp) (picked up by W_PRECALC_00_15_1_nossse3)
+       // clobbers T1; on i386 also clobbers T2, which is used to hold BUFFER_PTR
+       .macro  W_PRECALC_00_15_0_nossse3               // input argument $0 : 0/4/8/12
+
+#if    defined (__x86_64__)
+       #define BUFFERP BUFFER_PTR
+#else
+       mov             BUFFER_PTR, T2                                  // copy BUFFER_PTR (from caller 2nd argument) to T2
+       #define BUFFERP T2
+#endif
+
+       // load 1st word, bswap it, save it to stack
+       mov             $0*4(BUFFERP), T1
+       bswap   T1
+       mov             T1, 14*16(sp)
+
+       // load 2nd word, bswap it, save it to stack
+       mov             4+$0*4(BUFFERP), T1
+       bswap   T1
+       mov             T1, 4+14*16(sp)
+
+       // load 3rd word, bswap it, save it to stack
+       mov             8+$0*4(BUFFERP), T1
+       bswap   T1
+       mov             T1, 8+14*16(sp)
+
+       // load 4th word, bswap it, save it to stack
+       mov             12+$0*4(BUFFERP), T1
+       bswap   T1
+       mov             T1, 12+14*16(sp)
+       .endm
+
+       // step 1 (no ssse3) of the i=0:15 precalc : reload the 4 bswapped words written to 14*16(sp) by
+       // W_PRECALC_00_15_0_nossse3 as one 16-byte vector and store it in the W slot $0; W_TMP keeps the value
+       .macro  W_PRECALC_00_15_1_nossse3               // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
+       xmov    14*16(sp), W_TMP                                // load the bswapped 16-bytes from the aligned stack memory
+       xmov    W_TMP, $0                                               // save W = W_TMP in the circular buffer
+       .endm
+
+       // rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
+       /*
+       W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
+    W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
+    W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
+    W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
+
+       W[i+3] ^= W[i] rol 1;   // this W[i] is already rol by 1, if we are taking from the initial W before rol 1, we should rol this by 2
+
+       The operation (updating W and W+K) is scheduled as and divided into 4 steps
+
+       0. W_tmp = W3; W = W14 ^ W8
+       1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0); 
+       2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
+       3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W _TMP+K;  
+
+       */
+
+       // step 0 (ssse3) of the i=16:31 W update
+       // arguments : $0 = W16, $1 = W12, $2 = W8, $3 = W4, $4 = W (destination, overwritten)
+       // on exit : $4 = W8 ^ W14, W_TMP = W3; the other arguments are only read
+       .macro  W_PRECALC_16_31_0_ssse3 // input arguments : W16,W12,W8,W4,W
+       xmov    $1, $4                                  // W = W12
+       palignr $$8, $0, $4                             // W = W14
+       xmov    $3, W_TMP                               // W_TMP = W4
+       psrldq  $$4, W_TMP                              // W_TMP = W3
+       pxor    $2, $4                                  // W = W8 ^ W14 
+       .endm
+
+       // step 1 of the i=16:31 W update; expects W_TMP = W3 and $1 = W8 ^ W14 from step 0
+       // arguments : $0 = W16 (read only), $1 = W (updated)
+       // on exit : $1 = W_TMP = W3 ^ W16 ^ W8 ^ W14, and W_TMP2 = the same value byte-shifted left by 12
+       .macro  W_PRECALC_16_31_1               // input arguments : W16,W
+       pxor    $0, W_TMP                               // W_TMP = W3 ^ W16
+       pxor    W_TMP, $1                               // W = W3 ^ W16 ^ W8 ^ W14
+       xmov    $1, W_TMP2                              // W_TMP2 = W3 ^ W16 ^ W8 ^ W14 
+       xmov    $1, W_TMP                               // W_TMP = W3 ^ W16 ^ W8 ^ W14
+       pslldq  $$12, W_TMP2                    // W_TMP2 = (W[i] 0 0 0)
+       .endm
+
+       // step 2 of the i=16:31 W update; expects $0 = W_TMP = W3 ^ W16 ^ W8 ^ W14 and W_TMP2 = (W[i] 0 0 0) from step 1
+       // builds the rotate-by-1 in W_TMP from a right shift by 31 and a left shift by 1, then splits the
+       // rotate-by-2 of W_TMP2 into its two halves : W_TMP2 >> 30 (kept in W_TMP2) and W_TMP2 << 2 (kept in $0)
+       // the three pieces are combined by W_PRECALC_16_31_3
+       .macro  W_PRECALC_16_31_2               // input argument : W
+       psrld   $$31, $0                                // (W3 ^ W16 ^ W8 ^ W14)>>31
+       pslld   $$1, W_TMP                              // (W3 ^ W16 ^ W8 ^ W14)<<1
+       por             $0, W_TMP                               // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
+       xmov    W_TMP2, $0                              // copy W[i] at location of W[i+3]
+       psrld   $$30, W_TMP2                    // W_TMP2 = W[i] lower 2 bits after rol 2
+       pslld   $$2, $0                                 // W = W[i] higher 30 bits after rol 2
+       .endm
+
+       // step 3 of the i=16:31 W update : combine the pieces left by step 2, store the new W, then store W+K
+       // arguments : $0 = W (written with the final W), $1 = round index (low 2 bits masked off for the WK store),
+       //             $2 = byte offset from K_BASE of the K quadruple to add
+       // on i386 T1 is clobbered (it is loaded with K_BASE); W_TMP ends up holding W+K
+       .macro  W_PRECALC_16_31_3               // input arguments: W, i, K_XMM
+#if defined (__i386__)
+       mov     K_BASE, T1                              // K_BASE is store in the stack memory for i386
+#endif
+       pxor    $0, W_TMP
+       pxor    W_TMP2, W_TMP                   // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
+       xmov    W_TMP, $0                               // save W = W_TMP in the W circular buffer
+#if defined (__x86_64__)
+       paddd   $2(K_BASE), W_TMP               // W+K
+#else
+    paddd   $2(T1), W_TMP                      // W+K
+#endif
+       xmov    W_TMP, WK($1&~3)                // save WK = W+K for later update of the hashes A/B/C/D/E
+       .endm
+
+       // the following is a variant of W_PRECALC_16_31_0_ssse3 to be used for system without ssse3, palignr is replaced with 4 instructions
+
+       // step 0 (no ssse3) of the i=16:31 W update; same contract as W_PRECALC_16_31_0_ssse3
+       // arguments : $0 = W16, $1 = W12, $2 = W8, $3 = W4, $4 = W (destination, overwritten)
+       // on exit : $4 = W8 ^ W14, W_TMP = W3
+       .macro  W_PRECALC_16_31_0_nossse3       // input arguments : W16,W12,W8,W4,W
+       xmov    $1, $4                                          // W = W12 = (w9 w10 w11 w12)
+
+       // the following is a workaround for palignr
+       xmov    $0, W_TMP                                       // W16 = (w13 w14 w15 w16)
+       pslldq  $$8, $4                                         // shift left to make (w11 w12 0 0)
+       psrldq  $$8, W_TMP                                      // shift right to make (0 0 w13 w14)
+       por             W_TMP, $4                                       // W = W14 = (w11 w12 w13 w14)
+
+       xmov    $3, W_TMP                                       // W_TMP = W4 = (w1 w2 w3 w4)
+       psrldq  $$4, W_TMP                                      // W_TMP = W3 = (0 w1 w2 w3)
+       pxor    $2, $4                                          // W = W8 ^ W14 
+       .endm
+
+       /* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
+
+               W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
+
+               where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
+
+
+       0. W_tmp = W6; W = W28 ^ W32;
+       1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
+       2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
+       3. W = W_Tmp; WK = W_tmp + K;
+
+       */
+
+
+       // step 0 (ssse3) of the i=32:79 W update
+       // arguments : $0 = W28, $1 = W8, $2 = W4, $3 = W (holds W32 on entry, updated in place)
+       // on exit : $3 = W28 ^ W32, W_TMP = W6
+       // $3 is the destination of pxor here; the *_i386 variant exists for the case where W is a memory slot
+       .macro  W_PRECALC_32_79_0_ssse3         // input arguments : W28,W8,W4,W
+       xmov    $2, W_TMP                                       // (w1 w2 w3 w4)
+       pxor    $0, $3                                          // W = W28 ^ W32;
+       palignr $$8, $1, W_TMP                          // W_tmp = (w3 w4 w5 w6) = W6;
+       .endm
+
+       // the following is a variant and will be used for system without ssse3 support
+       .macro  W_PRECALC_32_79_0_nossse3       // input arguments : W28,W8,W4,W
+       xmov    $2, W_TMP                                       // (w1 w2 w3 w4)
+       xmov    $1, W_TMP2                                      // (w5 w6 w7 w8)
+       pxor    $0, $3                                          // W = W28 ^ W32
+       pslldq  $$8, W_TMP                                      // (w3 w4 0 0)
+       psrldq  $$8, W_TMP2                                     // (0 0 w5 w6)
+       por             W_TMP2, W_TMP                           // W_tmp = (w3 w4 w5 w6) = W6
+       .endm
+
+       // this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers)
+       // same result as W_PRECALC_32_79_0_ssse3 ($3 = W28 ^ W32, W_TMP = W6), but $3 is only ever
+       // loaded from and stored to with xmov, so the xor itself is done in W_TMP
+       // arguments : $0 = W28, $1 = W8, $2 = W4, $3 = W (holds W32 on entry)
+       .macro  W_PRECALC_32_79_0_i386_ssse3    // input arguments : W28,W8,W4,W
+    xmov    $3, W_TMP                                          // W32
+    pxor    $0, W_TMP                                          // W28 ^ W32
+    xmov    W_TMP, $3                                          // W = W28 ^ W32;
+    xmov    $2, W_TMP                                          // W4
+    palignr $$8, $1, W_TMP                                     // W_tmp = (w3 w4 w5 w6) = W6;
+    .endm
+
+       // this is a variant of W_PRECALC_32_79_0_nossse3 for i386 (as W24/W28 are stored in memory, not in registers)
+       // same result as W_PRECALC_32_79_0_nossse3 ($3 = W28 ^ W32, W_TMP = W6, W_TMP2 clobbered), but $3 is
+       // only ever loaded from and stored to with xmov, so the xor itself is done in W_TMP
+       // arguments : $0 = W28, $1 = W8, $2 = W4, $3 = W (holds W32 on entry)
+       .macro  W_PRECALC_32_79_0_i386_nossse3  // input arguments : W28,W8,W4,W
+    xmov    $3, W_TMP                                          // W32
+    pxor    $0, W_TMP                                          // W28 ^ W32
+    xmov    W_TMP, $3                                          // W = W28 ^ W32
+    xmov    $2, W_TMP                                          // W4 = (w1 w2 w3 w4)
+       xmov    $1, W_TMP2                                              // W8 = (w5 w6 w7 w8)
+       pslldq  $$8, W_TMP                                              // (w3 w4 0 0)
+       psrldq  $$8, W_TMP2                                             // (0 0 w5 w6)
+       por             W_TMP2, W_TMP                                   // W_tmp = (w3 w4 w5 w6) = W6
+    .endm
+
+       // step 1 of the i=32:79 W update; expects W_TMP = W6 and $1 = W28 ^ W32 from step 0
+       // arguments : $0 = W16 (read only), $1 = W (read, then overwritten)
+       // on exit : W_TMP = $1 = W6 ^ W16 ^ W28 ^ W32
+       .macro  W_PRECALC_32_79_1                       // input arguments : W16,W
+       pxor    $1, W_TMP                                       // W_tmp = W6 ^ (W28 ^ W32)
+       pxor    $0, W_TMP                                       // W_tmp = W6 ^ W28 ^ W32 ^ W16
+       xmov    W_TMP, $1                                       // W = W_tmp
+       .endm
+
+       // step 2 of the i=32:79 W update; expects $0 = W_TMP = W6 ^ W16 ^ W28 ^ W32 from step 1
+       // builds the rotate-left-by-2 in W_TMP from ($0 >> 30) | (W_TMP << 2); $0 is left holding the >> 30 part
+       // $0 is the destination of psrld here; W_PRECALC_32_79_2_i386 handles the case where W is a memory slot
+       .macro  W_PRECALC_32_79_2                       // input argument : W
+       psrld   $$30, $0                                        // W >> 30
+       pslld   $$2, W_TMP                                      // W << 2
+       por             $0, W_TMP                                       // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
+       .endm
+
+       // this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers)
+       // this should be used when the input is either W24 or W28 on i386 architecture
+    // same result as W_PRECALC_32_79_2 (W_TMP = rol 2, $0 = W >> 30), but the shift of W is done in
+    // W_TMP2 and written back with xmov, so $0 is never a shift destination; W_TMP2 is clobbered
+    .macro  W_PRECALC_32_79_2_i386     // input argument : W
+    xmov    $0, W_TMP2                                 // W
+    psrld   $$30, W_TMP2                               // W >> 30
+    xmov    W_TMP2, $0                                 // save (W >> 30) at W
+    pslld   $$2, W_TMP                                 // W_tmp << 2
+    por     $0, W_TMP                                  // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
+    .endm
+
+       // step 3 of the i=32:79 W update; expects W_TMP = (W6 ^ W16 ^ W28 ^ W32) rol 2 from step 2
+       // arguments : $0 = W (written with the final W), $1 = round index (low 2 bits masked off for the WK store),
+       //             $2 = byte offset from K_BASE of the K quadruple to add
+       // on i386 T1 is clobbered (it is loaded with K_BASE); W_TMP ends up holding W+K
+       .macro  W_PRECALC_32_79_3                       // input argument W, i, K_XMM
+       xmov    W_TMP, $0                                       // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
+#if defined (__x86_64__)
+       paddd   $2(K_BASE), W_TMP                       // W_tmp = W + K, K_BASE is an address register
+#else
+       mov     K_BASE, T1                                      // T1 = K_BASE (kept in the stack memory on i386)
+       paddd   $2(T1), W_TMP                           // W_tmp = W + K
+#endif
+       xmov    W_TMP, WK($1&~3)                        // write W+K
+       .endm
+
+
+       /* The hash update operation is completed by the following statements.
+
+               A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
+        B[i+1] = A[i];
+        C[i+1] = ROTATE_LEFT( B[i], 30 );
+        D[i+1] = C[i];
+        E[i+1] = D[i];
+
+               Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
+
+               A1 = FN + E0 + rol(A0,5) + WK;
+               B1 = A0;
+               C1 = rol(B0, 30);
+               D1 = C0;
+               E1 = D0;
+
+               to avoid excessive memory movement between registers, 
+                       1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0, 
+                       2. C1 = rol(B0,30) can be temporarily saved in B0. 
+
+               Therefore, ignoring the time index, the update operation is equivalent to
+                       1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
+                       2. B = rol(B,30)
+                       3. the hashes are now stored in the order of E,A,B,C,D
+
+
+               To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
+               1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
+               2. B = rol(B,30)
+               // now the hashes are in the order of E,A,B,C,D
+               3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
+               4. A = rol(A,30)
+               // now the hashes are in the order of D,E,A,B,C
+       
+               These operations are distributed into the following 2 macro definitions RR0 and RR1.    
+
+       */
+
+       // first half of a 2-round hash update; must be followed by RR1 invoked with the same arguments
+       // arguments : $0 = FN macro (F1/F2/F3/F4), $1..$5 = the registers currently playing A,B,C,D,E, $6 = round index i
+       // on exit : T2 = rol(A,5) and is live into RR1, $5 = E + WK(i) + FN(B,C,D), $4 = D + WK(i+1), $2 = rol(B,30)
+       // T1 is clobbered by FN (F3 also uses T2 as scratch, before T2 is loaded here)
+       .macro  RR0                             // input arguments : FN, A, B, C, D, E, i
+       $0              $2, $3, $4              // T1 = FN(B,C,D)
+       add             WK($6), $5              // E + WK(i)
+       rol             $$30, $2                // B = rol(B,30)
+       mov             $1, T2                  // T2 = A
+       add             WK($6+1), $4    // D + WK(i+1)
+       rol             $$5, T2                 // rol(A,5)
+       add             T1, $5                  // E = FN(B,C,D) + E + WK(i)
+       .endm
+
+       // second half of a 2-round hash update; takes the same arguments as the preceding RR0
+       // arguments : $0 = FN macro, $1..$5 = the registers playing A,B,C,D,E, ($6 = round index, not used here)
+       // expects T2 = rol(A,5) as left by RR0
+       // on exit : $5 (E) holds the round-i result, $4 (D) holds the round-(i+1) result, $1 = rol(A,30); T1 and T2 are clobbered
+       .macro  RR1
+       add             $5, T2                  // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
+       mov             T2, $5                  // E = FN(B,C,D) + E + rol(A,5) + WK(i)
+       rol             $$5, T2                 // rol(E,5)
+       add             T2, $4                  // D + WK(i+1) + rol(E,5)
+       $0              $1, $2, $3              // FN(A,B,C)
+       add             T1, $4                  // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
+       rol             $$30, $1                // A = rol(A,30)
+       .endm
+
+
+
+       /*
+
+               The following macro definitions are used to expand code for the per-block sha1 operation.
+
+                       INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
+                       INTERNAL_ssse3 : updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory) 
+                       ENDING : finishing up update the digests A/B/C/D/E (i=64:79)
+
+               For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined
+               into 1 macro definition for software pipelining.
+
+                       SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack, and finishing up update the digests A/B/C/D/E (i=64:79) 
+
+               assume cnt (the number of blocks)  >= 1, the main code body should look like
+
+               INITIAL_W_PRECALC_ssse3                         // W = big_endian_load and pre-compute W+K (i=0:15)
+               do {
+                       INTERNAL_ssse3                                  // update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63)
+                       cnt--;
+                       if (cnt==0) break;
+                       BUFFER_PTR += 64;
+                       SOFTWARE_PIPELINING_ssse3;              // update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15)
+               }
+               ENDING                                                          // update hash digests A/B/C/D/E (i=64:79)
+
+       */
+
+       // bind the generic step-0/step-1 macro names used by the macros that follow to their ssse3 variants
+       #define W_PRECALC_00_15_0       W_PRECALC_00_15_0_ssse3
+       #define W_PRECALC_00_15_1       W_PRECALC_00_15_1_ssse3
+       #define W_PRECALC_16_31_0       W_PRECALC_16_31_0_ssse3
+       #define W_PRECALC_32_79_0       W_PRECALC_32_79_0_ssse3
+       #define W_PRECALC_32_79_0_i386  W_PRECALC_32_79_0_i386_ssse3
+
+
+       // takes no arguments; expands the 4-step 00_15 sequence four times, once per 16-byte quarter of the block at BUFFER_PTR
+       // the quarters are stored, in order, to W0, W28, W24, W20, and the W+K quadruples to WK(0), WK(4), WK(8), WK(12)
+       // every quarter adds the K quadruple at offset 0 from K_BASE (W_PRECALC_00_15_2 takes no offset)
+       // clobbers W_TMP, and T1 on i386 (through the step macros)
+       .macro  INITIAL_W_PRECALC_ssse3                 // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory  
+
+       // i=0  : W28,W24,W20,W16,W12,W8,W4,W0
+       W_PRECALC_00_15_0       0                                       // W_TMP = (BUFFER_PTR)
+       W_PRECALC_00_15_1       W0                                      // convert W_TMP to big-endian, and save W0 = W_TMP     
+       W_PRECALC_00_15_2                                               // W_TMP = W0 + K
+       W_PRECALC_00_15_3       3                                       // (sp) = W_TMP = W0 + K
+
+       // i=4  : W24,W20,W16,W12,W8,W4,W0,W28
+       W_PRECALC_00_15_0       4                                       // W_TMP = 16(BUFFER_PTR)
+       W_PRECALC_00_15_1       W28                                     // convert W_TMP to big-endian, and save W28 = W_TMP    
+       W_PRECALC_00_15_2                                               // W_TMP = W28 + K
+       W_PRECALC_00_15_3       7                                       // 16(sp) = W_TMP = W28 + K
+
+       // i=8  : W20,W16,W12,W8,W4,W0,W28,W24
+       W_PRECALC_00_15_0       8                                       // W_TMP = 32(BUFFER_PTR)
+       W_PRECALC_00_15_1       W24                                     // convert W_TMP to big-endian, and save W24 = W_TMP
+       W_PRECALC_00_15_2                                               // W_TMP = W24 + K
+       W_PRECALC_00_15_3       11                                      // 32(sp) = W_TMP = W24 + K
+
+       // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
+       W_PRECALC_00_15_0       12                                      // W_TMP = 48(BUFFER_PTR)
+       W_PRECALC_00_15_1       W20                                     // convert W_TMP to big-endian, and save W20 = W_TMP    
+       W_PRECALC_00_15_2                                               // W_TMP = W20 + K
+       W_PRECALC_00_15_3       15                                      // 48(sp) = W_TMP = W20 + K
+
+       .endm
+
+
+       .macro  INTERNAL_ssse3                                  // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
+
+       // i=16 : W12,W8,W4,W0,W28,W24,W20,W16
+       W_PRECALC_16_31_0       W0,W28,W24,W20,W16
+       RR0                                     F1,A,B,C,D,E,0
+       W_PRECALC_16_31_1       W0,W16
+       RR1                                     F1,A,B,C,D,E,0
+       W_PRECALC_16_31_2       W16
+       RR0                                     F1,D,E,A,B,C,2
+       W_PRECALC_16_31_3       W16, 2, 0
+       RR1                                     F1,D,E,A,B,C,2
+
+       // i=20 : W8,W4,W0,W28,W24,W20,W16,W12
+       W_PRECALC_16_31_0       W28,W24,W20,W16,W12
+       RR0                                     F1,B,C,D,E,A,4
+       W_PRECALC_16_31_1       W28,W12
+       RR1                                     F1,B,C,D,E,A,4
+       W_PRECALC_16_31_2       W12
+       RR0                                     F1,E,A,B,C,D,6
+       W_PRECALC_16_31_3       W12, 6, 16
+       RR1                                     F1,E,A,B,C,D,6
+
+       // i=24 : W4,W0,W28,W24,W20,W16,W12,W8
+       W_PRECALC_16_31_0       W24,W20,W16,W12,W8
+       RR0                                     F1,C,D,E,A,B,8
+       W_PRECALC_16_31_1       W24,W8
+       RR1                                     F1,C,D,E,A,B,8
+       W_PRECALC_16_31_2       W8
+       RR0                                     F1,A,B,C,D,E,10
+       W_PRECALC_16_31_3       W8,10,16
+       RR1                                     F1,A,B,C,D,E,10
+
+       // i=28 : W0,W28,W24,W20,W16,W12,W8,W4
+       W_PRECALC_16_31_0       W20,W16,W12,W8,W4
+       RR0                                     F1,D,E,A,B,C,12
+       W_PRECALC_16_31_1       W20,W4
+       RR1                                     F1,D,E,A,B,C,12
+       W_PRECALC_16_31_2       W4
+       RR0                                     F1,B,C,D,E,A,14
+       W_PRECALC_16_31_3       W4,14,16
+       RR1                                     F1,B,C,D,E,A,14
+
+       // i=32 : W28,W24,W20,W16,W12,W8,W4,W0
+       W_PRECALC_32_79_0       W28,W8,W4,W0
+       RR0                                     F1,E,A,B,C,D,16
+       W_PRECALC_32_79_1       W16,W0
+       RR1                                     F1,E,A,B,C,D,16
+       W_PRECALC_32_79_2       W0
+       RR0                                     F1,C,D,E,A,B,18
+       W_PRECALC_32_79_3       W0,18,16
+       RR1                                     F1,C,D,E,A,B,18
+
+       // starting using F2    
+
+       // i=36 : W24,W20,W16,W12,W8,W4,W0,W28
+#if defined (__x86_64__)
+       W_PRECALC_32_79_0       W24,W4,W0,W28
+#else
+       W_PRECALC_32_79_0_i386  W24,W4,W0,W28
+#endif
+       RR0                                     F2,A,B,C,D,E,20
+       W_PRECALC_32_79_1       W12,W28
+       RR1                                     F2,A,B,C,D,E,20
+#if defined (__x86_64__)
+       W_PRECALC_32_79_2       W28
+#else
+       W_PRECALC_32_79_2_i386  W28
+#endif
+       RR0                                     F2,D,E,A,B,C,22
+       W_PRECALC_32_79_3       W28,22,16
+       RR1                                     F2,D,E,A,B,C,22
+
+       // i=40 : W20,W16,W12,W8,W4,W0,W28,W24  
+       #undef  K_XMM
+    #define K_XMM   32
+#if defined (__x86_64__)
+       W_PRECALC_32_79_0       W20,W0,W28,W24
+#else
+       W_PRECALC_32_79_0_i386  W20,W0,W28,W24
+#endif
+       RR0                                     F2,B,C,D,E,A,24
+       W_PRECALC_32_79_1       W8,W24
+       RR1                                     F2,B,C,D,E,A,24
+#if defined (__x86_64__)
+       W_PRECALC_32_79_2       W24
+#else
+       W_PRECALC_32_79_2_i386  W24
+#endif
+       RR0                                     F2,E,A,B,C,D,26
+       W_PRECALC_32_79_3       W24,26,K_XMM
+       RR1                                     F2,E,A,B,C,D,26
+
+       // i=44 : W16,W12,W8,W4,W0,W28,W24,W20
+       W_PRECALC_32_79_0       W16,W28,W24,W20
+       RR0                                     F2,C,D,E,A,B,28
+       W_PRECALC_32_79_1       W4,W20
+       RR1                                     F2,C,D,E,A,B,28
+       W_PRECALC_32_79_2       W20
+       RR0                                     F2,A,B,C,D,E,30
+       W_PRECALC_32_79_3       W20,30,K_XMM
+       RR1                                     F2,A,B,C,D,E,30
+
+       // i=48 : W12,W8,W4,W0,W28,W24,W20,W16
+       W_PRECALC_32_79_0       W12,W24,W20,W16
+       RR0                                     F2,D,E,A,B,C,32
+       W_PRECALC_32_79_1       W0,W16
+       RR1                                     F2,D,E,A,B,C,32
+       W_PRECALC_32_79_2       W16
+       RR0                                     F2,B,C,D,E,A,34
+       W_PRECALC_32_79_3       W16,34,K_XMM
+       RR1                                     F2,B,C,D,E,A,34
+
+       // i=52 : W8,W4,W0,W28,W24,W20,W16,W12
+       W_PRECALC_32_79_0       W8,W20,W16,W12
+       RR0                                     F2,E,A,B,C,D,36
+       W_PRECALC_32_79_1       W28,W12
+       RR1                                     F2,E,A,B,C,D,36
+       W_PRECALC_32_79_2       W12
+       RR0                                     F2,C,D,E,A,B,38
+       W_PRECALC_32_79_3       W12,38,K_XMM
+       RR1                                     F2,C,D,E,A,B,38
+
+       // starting using F3    
+
+       // i=56 : W4,W0,W28,W24,W20,W16,W12,W8
+       W_PRECALC_32_79_0       W4,W16,W12,W8
+       RR0                                     F3,A,B,C,D,E,40
+       W_PRECALC_32_79_1       W24,W8
+       RR1                                     F3,A,B,C,D,E,40
+       W_PRECALC_32_79_2       W8
+       RR0                                     F3,D,E,A,B,C,42
+       W_PRECALC_32_79_3       W8,42,K_XMM
+       RR1                                     F3,D,E,A,B,C,42
+
+       // i=60 : W0,W28,W24,W20,W16,W12,W8,W4
+       #undef  K_XMM
+       #define K_XMM   48
+       W_PRECALC_32_79_0       W0,W12,W8,W4
+       RR0                                     F3,B,C,D,E,A,44
+       W_PRECALC_32_79_1       W20,W4
+       RR1                                     F3,B,C,D,E,A,44
+       W_PRECALC_32_79_2       W4
+       RR0                                     F3,E,A,B,C,D,46
+       W_PRECALC_32_79_3       W4,46,K_XMM
+       RR1                                     F3,E,A,B,C,D,46
+
+       // i=64 : W28,W24,W20,W16,W12,W8,W4,W0
+       W_PRECALC_32_79_0       W28,W8,W4,W0
+       RR0                                     F3,C,D,E,A,B,48
+       W_PRECALC_32_79_1       W16,W0
+       RR1                                     F3,C,D,E,A,B,48
+       W_PRECALC_32_79_2       W0
+       RR0                                     F3,A,B,C,D,E,50
+       W_PRECALC_32_79_3       W0,50,K_XMM
+       RR1                                     F3,A,B,C,D,E,50
+
+       // i=68 : W24,W20,W16,W12,W8,W4,W0,W28
+#if defined (__x86_64__)
+       W_PRECALC_32_79_0       W24,W4,W0,W28
+#else
+       W_PRECALC_32_79_0_i386  W24,W4,W0,W28
+#endif
+       RR0                                     F3,D,E,A,B,C,52
+       W_PRECALC_32_79_1       W12,W28
+       RR1                                     F3,D,E,A,B,C,52
+#if defined (__x86_64__)
+       W_PRECALC_32_79_2       W28
+#else
+       W_PRECALC_32_79_2_i386  W28
+#endif
+       RR0                                     F3,B,C,D,E,A,54
+       W_PRECALC_32_79_3       W28,54,K_XMM
+       RR1                                     F3,B,C,D,E,A,54
+
+       // i=72 : W20,W16,W12,W8,W4,W0,W28,W24
+#if defined (__x86_64__)
+       W_PRECALC_32_79_0       W20,W0,W28,W24
+#else
+       W_PRECALC_32_79_0_i386  W20,W0,W28,W24
+#endif
+       RR0                                     F3,E,A,B,C,D,56
+       W_PRECALC_32_79_1       W8,W24
+       RR1                                     F3,E,A,B,C,D,56
+#if defined (__x86_64__)
+       W_PRECALC_32_79_2       W24
+#else
+       W_PRECALC_32_79_2_i386  W24
+#endif
+       RR0                                     F3,C,D,E,A,B,58
+       W_PRECALC_32_79_3       W24,58,K_XMM
+       RR1                                     F3,C,D,E,A,B,58
+
+       // starting using F4    
+
+       // i=76 : W16,W12,W8,W4,W0,W28,W24,W20
+       W_PRECALC_32_79_0       W16,W28,W24,W20
+       RR0                                     F4,A,B,C,D,E,60
+       W_PRECALC_32_79_1       W4,W20
+       RR1                                     F4,A,B,C,D,E,60
+       W_PRECALC_32_79_2       W20
+       RR0                                     F4,D,E,A,B,C,62
+       W_PRECALC_32_79_3       W20,62,K_XMM
+       RR1                                     F4,D,E,A,B,C,62
+
+       .endm
+
+       /*
+               SOFTWARE_PIPELINING_ssse3
+
+               Interleaves two independent streams of work:
+                 - the RR0/RR1 steps that use F4 with indices 64..78, i.e. the tail of the block
+                   currently being hashed, and
+                 - the four W_PRECALC_00_15_* stages (load, byte-swap, add K, store) for the 16 message
+                   words of the block BUFFER_PTR currently points at.
+               The only use visible here is inside the Multiple_Blocks loop of
+               SHA1_PIPELINED_MAIN_BODY_ssse3, after BUFFER_PTR has been advanced by 64, so the words
+               being pre-computed belong to the next block.
+
+               Each W_PRECALC_00_15_0 argument (0,4,8,12) selects the first of four message words;
+               each W_PRECALC_00_15_3 argument (3,7,11,15) names the last word of that group.
+               The RR0/RR1 pair shares one index argument and the index advances by 2 per pair.
+       */
+       .macro  SOFTWARE_PIPELINING_ssse3
+       // i=0  : W28,W24,W20,W16,W12,W8,W4,W0
+       W_PRECALC_00_15_0       0                                       // W_TMP = (BUFFER_PTR)
+       RR0                                     F4,B,C,D,E,A,64
+       W_PRECALC_00_15_1       W0                                      // convert W_TMP to big-endian, and save W0 = W_TMP
+       RR1                                     F4,B,C,D,E,A,64
+       W_PRECALC_00_15_2                                               // W_TMP = W0 + K
+       RR0                                     F4,E,A,B,C,D,66
+       W_PRECALC_00_15_3       3                                       // (sp) = W_TMP = W0 + K
+       RR1                                     F4,E,A,B,C,D,66
+
+       // i=4  : W24,W20,W16,W12,W8,W4,W0,W28
+       W_PRECALC_00_15_0       4                                       // W_TMP = 16(BUFFER_PTR)
+       RR0                                     F4,C,D,E,A,B,68
+       W_PRECALC_00_15_1       W28                                     // convert W_TMP to big-endian, and save W28 = W_TMP
+       RR1                                     F4,C,D,E,A,B,68
+       W_PRECALC_00_15_2                                               // W_TMP = W28 + K
+       RR0                                     F4,A,B,C,D,E,70
+       W_PRECALC_00_15_3       7                                       // 16(sp) = W_TMP = W28 + K
+       RR1                                     F4,A,B,C,D,E,70
+
+       // i=8  : W20,W16,W12,W8,W4,W0,W28,W24
+       W_PRECALC_00_15_0       8                                       // W_TMP = 32(BUFFER_PTR)
+       RR0                                     F4,D,E,A,B,C,72
+       W_PRECALC_00_15_1       W24                                     // convert W_TMP to big-endian, and save W24 = W_TMP
+       RR1                                     F4,D,E,A,B,C,72
+       W_PRECALC_00_15_2                                               // W_TMP = W24 + K
+       RR0                                     F4,B,C,D,E,A,74
+       W_PRECALC_00_15_3       11                                      // 32(sp) = W_TMP = W24 + K
+       RR1                                     F4,B,C,D,E,A,74
+
+       // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
+       W_PRECALC_00_15_0       12                                      // W_TMP = 48(BUFFER_PTR)
+       RR0                                     F4,E,A,B,C,D,76
+       W_PRECALC_00_15_1       W20                                     // convert W_TMP to big-endian, and save W20 = W_TMP
+       RR1                                     F4,E,A,B,C,D,76
+       W_PRECALC_00_15_2                                               // W_TMP = W20 + K
+       RR0                                     F4,C,D,E,A,B,78
+       W_PRECALC_00_15_3       15                                      // 48(sp) = W_TMP = W20 + K
+       RR1                                     F4,C,D,E,A,B,78
+       .endm
+
+
+       // Drop the current preprocessor bindings of the generic W_PRECALC stage names so that they can be
+       // re-pointed at the no-ssse3 implementations below.
+       // NOTE(review): the earlier bindings are outside this view; presumably they name the ssse3 variants.
+       #undef  W_PRECALC_00_15_0
+       #undef  W_PRECALC_00_15_1
+       #undef  W_PRECALC_16_31_0
+       #undef  W_PRECALC_32_79_0
+       #undef  W_PRECALC_32_79_0_i386
+
+
+
+       /* 
+
+               The following three macro definitions are the no-ssse3 variants of the previous three macro definitions:
+
+               INITIAL_W_PRECALC_nossse3
+               INTERNAL_nossse3
+               SOFTWARE_PIPELINING_nossse3
+               
+               They are used by the sha1 main body (SHA1_PIPELINED_MAIN_BODY_nossse3) that is selected on
+               systems without ssse3 support.
+
+               Only the five stage names re-defined below differ between the two flavours; the remaining
+               stage names used by these macros (W_PRECALC_00_15_2/3, W_PRECALC_16_31_1/2/3,
+               W_PRECALC_32_79_1/2/3, W_PRECALC_32_79_2_i386) are not re-bound here.
+
+       */
+
+       // From here on the generic stage names expand to their _nossse3 implementations.
+       #define W_PRECALC_00_15_0       W_PRECALC_00_15_0_nossse3
+       #define W_PRECALC_00_15_1       W_PRECALC_00_15_1_nossse3
+       #define W_PRECALC_16_31_0       W_PRECALC_16_31_0_nossse3
+       #define W_PRECALC_32_79_0       W_PRECALC_32_79_0_nossse3
+       #define W_PRECALC_32_79_0_i386  W_PRECALC_32_79_0_i386_nossse3
+
+
+       /*
+               INITIAL_W_PRECALC_nossse3
+
+               Prepares W+K for message words 0..15 of the first block before any round is executed
+               (there is no round work to interleave with yet). For each group of four words the four
+               stages run in order: _0 loads from BUFFER_PTR, _1 converts to big-endian and keeps the
+               result in the named W register, _2 adds K, _3 stores W+K to the stack.
+               The generic stage names used here are bound to the _nossse3 implementations by the
+               #define lines preceding this macro.
+       */
+       .macro  INITIAL_W_PRECALC_nossse3
+
+       // i=0  : W28,W24,W20,W16,W12,W8,W4,W0
+       W_PRECALC_00_15_0       0                                       // W_TMP = (BUFFER_PTR)
+       W_PRECALC_00_15_1       W0                                      // convert W_TMP to big-endian, and save W0 = W_TMP
+       W_PRECALC_00_15_2                                               // W_TMP = W0 + K
+       W_PRECALC_00_15_3       3                                       // (sp) = W_TMP = W0 + K
+
+       // i=4  : W24,W20,W16,W12,W8,W4,W0,W28
+       W_PRECALC_00_15_0       4                                       // W_TMP = 16(BUFFER_PTR)
+       W_PRECALC_00_15_1       W28                                     // convert W_TMP to big-endian, and save W28 = W_TMP
+       W_PRECALC_00_15_2                                               // W_TMP = W28 + K
+       W_PRECALC_00_15_3       7                                       // 16(sp) = W_TMP = W28 + K
+
+       // i=8  : W20,W16,W12,W8,W4,W0,W28,W24
+       W_PRECALC_00_15_0       8                                       // W_TMP = 32(BUFFER_PTR)
+       W_PRECALC_00_15_1       W24                                     // convert W_TMP to big-endian, and save W24 = W_TMP
+       W_PRECALC_00_15_2                                               // W_TMP = W24 + K
+       W_PRECALC_00_15_3       11                                      // 32(sp) = W_TMP = W24 + K
+
+       // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
+       W_PRECALC_00_15_0       12                                      // W_TMP = 48(BUFFER_PTR)
+       W_PRECALC_00_15_1       W20                                     // convert W_TMP to big-endian, and save W20 = W_TMP
+       W_PRECALC_00_15_2                                               // W_TMP = W20 + K
+       W_PRECALC_00_15_3       15                                      // 48(sp) = W_TMP = W20 + K
+
+       .endm
+
+
+       /*
+               INTERNAL_nossse3
+
+               Core of one block for the no-ssse3 path: computes W (i=16:79) four values at a time while
+               the RR0/RR1 steps with indices 0..62 update A,B,C,D,E. Every group of four W values is
+               produced by four W_PRECALC stages (_0,_1,_2,_3), each followed by one RR0 or RR1 step.
+
+               - i=16..31 use the W_PRECALC_16_31_* stages, i=32..79 use the W_PRECALC_32_79_* stages.
+               - RR indices 0..18 use F1, 20..38 use F2, 40..58 use F3, 60..62 use F4.
+               - The last argument of each *_3 stage is 0 for i=16, 16 for i=20..36, and K_XMM (32, then 48)
+                 afterwards; presumably this is the byte offset of the K row inside K_XMM_AR -- confirm
+                 against the *_3 macro definitions, which are outside this view.
+               - "#undef/#define K_XMM" inside the macro body are C-preprocessor directives: they take effect
+                 textually, for the K_XMM uses that follow them in this file, not at macro-expansion time.
+               - On non-x86_64 builds the groups that produce W28 and W24 use the *_0_i386 / *_2_i386 stage
+                 variants. NOTE(review): likely because i386 has fewer xmm registers -- confirm.
+
+               The "circular buffer" comments list the W register names in rotation order for each group.
+       */
+       .macro  INTERNAL_nossse3
+       // i=16
+       // circular buffer : W12,W8,W4,W0,W28,W24,W20,W16
+       W_PRECALC_16_31_0       W0,W28,W24,W20,W16
+       RR0                                     F1,A,B,C,D,E,0
+       W_PRECALC_16_31_1       W0,W16
+       RR1                                     F1,A,B,C,D,E,0
+       W_PRECALC_16_31_2       W16
+       RR0                                     F1,D,E,A,B,C,2
+       W_PRECALC_16_31_3       W16, 2, 0
+       RR1                                     F1,D,E,A,B,C,2
+
+       // i=20,
+       // W8,W4,W0,W28,W24,W20,W16,W12
+       W_PRECALC_16_31_0       W28,W24,W20,W16,W12
+       RR0                                     F1,B,C,D,E,A,4
+       W_PRECALC_16_31_1       W28,W12
+       RR1                                     F1,B,C,D,E,A,4
+
+       W_PRECALC_16_31_2       W12
+       RR0                                     F1,E,A,B,C,D,6
+       W_PRECALC_16_31_3       W12, 6, 16
+       RR1                                     F1,E,A,B,C,D,6
+
+       // i=24,
+       // W4,W0,W28,W24,W20,W16,W12,W8
+       W_PRECALC_16_31_0       W24,W20,W16,W12,W8
+       RR0                                     F1,C,D,E,A,B,8
+       W_PRECALC_16_31_1       W24,W8
+       RR1                                     F1,C,D,E,A,B,8
+
+       W_PRECALC_16_31_2       W8
+       RR0                                     F1,A,B,C,D,E,10
+       W_PRECALC_16_31_3       W8,10,16
+       RR1                                     F1,A,B,C,D,E,10
+
+       // i=28
+       // W0,W28,W24,W20,W16,W12,W8,W4
+       W_PRECALC_16_31_0       W20,W16,W12,W8,W4
+       RR0                                     F1,D,E,A,B,C,12
+       W_PRECALC_16_31_1       W20,W4
+       RR1                                     F1,D,E,A,B,C,12
+
+       W_PRECALC_16_31_2       W4
+       RR0                                     F1,B,C,D,E,A,14
+       W_PRECALC_16_31_3       W4,14,16
+       RR1                                     F1,B,C,D,E,A,14
+
+       //i=32 (first group computed with the W_PRECALC_32_79_* stages)
+       // W28,W24,W20,W16,W12,W8,W4,W0
+       W_PRECALC_32_79_0       W28,W8,W4,W0
+       RR0                                     F1,E,A,B,C,D,16
+       W_PRECALC_32_79_1       W16,W0
+       RR1                                     F1,E,A,B,C,D,16
+       W_PRECALC_32_79_2       W0
+       RR0                                     F1,C,D,E,A,B,18
+       W_PRECALC_32_79_3       W0,18,16
+       RR1                                     F1,C,D,E,A,B,18
+
+       // starting using F2
+
+       //i=36
+       // W24,W20,W16,W12,W8,W4,W0,W28
+#if defined (__x86_64__)
+       W_PRECALC_32_79_0       W24,W4,W0,W28
+#else
+       W_PRECALC_32_79_0_i386  W24,W4,W0,W28
+#endif
+       RR0                                     F2,A,B,C,D,E,20
+       W_PRECALC_32_79_1       W12,W28
+       RR1                                     F2,A,B,C,D,E,20
+#if defined (__x86_64__)
+       W_PRECALC_32_79_2       W28
+#else
+       W_PRECALC_32_79_2_i386  W28
+#endif
+       RR0                                     F2,D,E,A,B,C,22
+       W_PRECALC_32_79_3       W28,22,16
+       RR1                                     F2,D,E,A,B,C,22
+
+       //i=40
+       // K_XMM becomes 32 for the *_3 stages from here on (preprocessor-time switch)
+       #undef  K_XMM
+    #define K_XMM   32
+       // W20,W16,W12,W8,W4,W0,W28,W24 
+#if defined (__x86_64__)
+       W_PRECALC_32_79_0       W20,W0,W28,W24
+#else
+       W_PRECALC_32_79_0_i386  W20,W0,W28,W24
+#endif
+       RR0                                     F2,B,C,D,E,A,24
+       W_PRECALC_32_79_1       W8,W24
+       RR1                                     F2,B,C,D,E,A,24
+#if defined (__x86_64__)
+       W_PRECALC_32_79_2       W24
+#else
+       W_PRECALC_32_79_2_i386  W24
+#endif
+       RR0                                     F2,E,A,B,C,D,26
+       W_PRECALC_32_79_3       W24,26,K_XMM
+       RR1                                     F2,E,A,B,C,D,26
+
+       //i=44
+       // W16,W12,W8,W4,W0,W28,W24,W20
+       W_PRECALC_32_79_0       W16,W28,W24,W20
+       RR0                                     F2,C,D,E,A,B,28
+       W_PRECALC_32_79_1       W4,W20
+       RR1                                     F2,C,D,E,A,B,28
+       W_PRECALC_32_79_2       W20
+       RR0                                     F2,A,B,C,D,E,30
+       W_PRECALC_32_79_3       W20,30,K_XMM
+       RR1                                     F2,A,B,C,D,E,30
+
+       //i=48
+       // W12,W8,W4,W0,W28,W24,W20,W16
+       W_PRECALC_32_79_0       W12,W24,W20,W16
+       RR0                                     F2,D,E,A,B,C,32
+       W_PRECALC_32_79_1       W0,W16
+       RR1                                     F2,D,E,A,B,C,32
+       W_PRECALC_32_79_2       W16
+       RR0                                     F2,B,C,D,E,A,34
+       W_PRECALC_32_79_3       W16,34,K_XMM
+       RR1                                     F2,B,C,D,E,A,34
+
+       //i=52
+       // W8,W4,W0,W28,W24,W20,W16,W12
+       W_PRECALC_32_79_0       W8,W20,W16,W12
+       RR0                                     F2,E,A,B,C,D,36
+       W_PRECALC_32_79_1       W28,W12
+       RR1                                     F2,E,A,B,C,D,36
+       W_PRECALC_32_79_2       W12
+       RR0                                     F2,C,D,E,A,B,38
+       W_PRECALC_32_79_3       W12,38,K_XMM
+       RR1                                     F2,C,D,E,A,B,38
+
+       // starting using F3
+
+       //i=56
+       // W4,W0,W28,W24,W20,W16,W12,W8
+       W_PRECALC_32_79_0       W4,W16,W12,W8
+       RR0                                     F3,A,B,C,D,E,40
+       W_PRECALC_32_79_1       W24,W8
+       RR1                                     F3,A,B,C,D,E,40
+       W_PRECALC_32_79_2       W8
+       RR0                                     F3,D,E,A,B,C,42
+       W_PRECALC_32_79_3       W8,42,K_XMM
+       RR1                                     F3,D,E,A,B,C,42
+
+       //i=60
+       // K_XMM becomes 48 for the *_3 stages from here on (preprocessor-time switch)
+       #undef  K_XMM
+       #define K_XMM   48
+       // W0,W28,W24,W20,W16,W12,W8,W4
+       W_PRECALC_32_79_0       W0,W12,W8,W4
+       RR0                                     F3,B,C,D,E,A,44
+       W_PRECALC_32_79_1       W20,W4
+       RR1                                     F3,B,C,D,E,A,44
+       W_PRECALC_32_79_2       W4
+       RR0                                     F3,E,A,B,C,D,46
+       W_PRECALC_32_79_3       W4,46,K_XMM
+       RR1                                     F3,E,A,B,C,D,46
+
+       //i=64
+       // W28,W24,W20,W16,W12,W8,W4,W0
+       W_PRECALC_32_79_0       W28,W8,W4,W0
+       RR0                                     F3,C,D,E,A,B,48
+       W_PRECALC_32_79_1       W16,W0
+       RR1                                     F3,C,D,E,A,B,48
+       W_PRECALC_32_79_2       W0
+       RR0                                     F3,A,B,C,D,E,50
+       W_PRECALC_32_79_3       W0,50,K_XMM
+       RR1                                     F3,A,B,C,D,E,50
+
+       //i=68
+       // W24,W20,W16,W12,W8,W4,W0,W28
+#if defined (__x86_64__)
+       W_PRECALC_32_79_0       W24,W4,W0,W28
+#else
+       W_PRECALC_32_79_0_i386  W24,W4,W0,W28
+#endif
+       RR0                                     F3,D,E,A,B,C,52
+       W_PRECALC_32_79_1       W12,W28
+       RR1                                     F3,D,E,A,B,C,52
+#if defined (__x86_64__)
+       W_PRECALC_32_79_2       W28
+#else
+       W_PRECALC_32_79_2_i386  W28
+#endif
+       RR0                                     F3,B,C,D,E,A,54
+       W_PRECALC_32_79_3       W28,54,K_XMM
+       RR1                                     F3,B,C,D,E,A,54
+
+       //i=72
+       // W20,W16,W12,W8,W4,W0,W28,W24
+#if defined (__x86_64__)
+       W_PRECALC_32_79_0       W20,W0,W28,W24
+#else
+       W_PRECALC_32_79_0_i386  W20,W0,W28,W24
+#endif
+       RR0                                     F3,E,A,B,C,D,56
+       W_PRECALC_32_79_1       W8,W24
+       RR1                                     F3,E,A,B,C,D,56
+#if defined (__x86_64__)
+       W_PRECALC_32_79_2       W24
+#else
+       W_PRECALC_32_79_2_i386  W24
+#endif
+       RR0                                     F3,C,D,E,A,B,58
+       W_PRECALC_32_79_3       W24,58,K_XMM
+       RR1                                     F3,C,D,E,A,B,58
+
+       // starting using F4    
+
+       //i=76
+       // W16,W12,W8,W4,W0,W28,W24,W20
+       W_PRECALC_32_79_0       W16,W28,W24,W20
+       RR0                                     F4,A,B,C,D,E,60
+       W_PRECALC_32_79_1       W4,W20
+       RR1                                     F4,A,B,C,D,E,60
+       W_PRECALC_32_79_2       W20
+       RR0                                     F4,D,E,A,B,C,62
+       W_PRECALC_32_79_3       W20,62,K_XMM
+       RR1                                     F4,D,E,A,B,C,62
+
+       .endm
+
+       /*
+               SOFTWARE_PIPELINING_nossse3
+
+               No-ssse3 counterpart of SOFTWARE_PIPELINING_ssse3, with the same statement sequence; the
+               generic W_PRECALC_00_15_0/_1 names used here are bound to the _nossse3 implementations by the
+               #define lines earlier in this file.
+               It interleaves the F4 RR0/RR1 steps with indices 64..78 (tail of the block being hashed)
+               with the load / byte-swap / add-K / store stages for message words 0..15 of the block that
+               BUFFER_PTR currently points at. Its only visible use is in the Multiple_Blocks loop of
+               SHA1_PIPELINED_MAIN_BODY_nossse3, after BUFFER_PTR has been advanced by 64.
+       */
+       .macro  SOFTWARE_PIPELINING_nossse3
+       // i=0  : W28,W24,W20,W16,W12,W8,W4,W0
+       W_PRECALC_00_15_0       0                                       // W_TMP = (BUFFER_PTR)
+       RR0                                     F4,B,C,D,E,A,64
+       W_PRECALC_00_15_1       W0                                      // convert W_TMP to big-endian, and save W0 = W_TMP
+       RR1                                     F4,B,C,D,E,A,64
+       W_PRECALC_00_15_2                                               // W_TMP = W0 + K
+       RR0                                     F4,E,A,B,C,D,66
+       W_PRECALC_00_15_3       3                                       // (sp) = W_TMP = W0 + K
+       RR1                                     F4,E,A,B,C,D,66
+
+       // i=4  : W24,W20,W16,W12,W8,W4,W0,W28
+       W_PRECALC_00_15_0       4                                       // W_TMP = 16(BUFFER_PTR)
+       RR0                                     F4,C,D,E,A,B,68
+       W_PRECALC_00_15_1       W28                                     // convert W_TMP to big-endian, and save W28 = W_TMP
+       RR1                                     F4,C,D,E,A,B,68
+       W_PRECALC_00_15_2                                               // W_TMP = W28 + K
+       RR0                                     F4,A,B,C,D,E,70
+       W_PRECALC_00_15_3       7                                       // 16(sp) = W_TMP = W28 + K
+       RR1                                     F4,A,B,C,D,E,70
+
+       // i=8  : W20,W16,W12,W8,W4,W0,W28,W24
+       W_PRECALC_00_15_0       8                                       // W_TMP = 32(BUFFER_PTR)
+       RR0                                     F4,D,E,A,B,C,72
+       W_PRECALC_00_15_1       W24                                     // convert W_TMP to big-endian, and save W24 = W_TMP
+       RR1                                     F4,D,E,A,B,C,72
+       W_PRECALC_00_15_2                                               // W_TMP = W24 + K
+       RR0                                     F4,B,C,D,E,A,74
+       W_PRECALC_00_15_3       11                                      // 32(sp) = W_TMP = W24 + K
+       RR1                                     F4,B,C,D,E,A,74
+
+       // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
+       W_PRECALC_00_15_0       12                                      // W_TMP = 48(BUFFER_PTR)
+       RR0                                     F4,E,A,B,C,D,76
+       W_PRECALC_00_15_1       W20                                     // convert W_TMP to big-endian, and save W20 = W_TMP
+       RR1                                     F4,E,A,B,C,D,76
+       W_PRECALC_00_15_2                                               // W_TMP = W20 + K
+       RR0                                     F4,C,D,E,A,B,78
+       W_PRECALC_00_15_3       15                                      // 48(sp) = W_TMP = W20 + K
+       RR1                                     F4,C,D,E,A,B,78
+       .endm
+
+       /*
+               ENDING
+
+               Finishes the last block: runs the same F4 RR0/RR1 sequence (indices 64..78, same register
+               rotation) as the SOFTWARE_PIPELINING_* macros, but without any W pre-computation, because
+               no further block follows. Shared by the ssse3 and no-ssse3 main bodies.
+       */
+       .macro  ENDING          // finish up updating hash digests (i=64:79)
+       // RR indices 64 and 66
+       RR0                                     F4,B,C,D,E,A,64
+       RR1                                     F4,B,C,D,E,A,64
+       RR0                                     F4,E,A,B,C,D,66
+       RR1                                     F4,E,A,B,C,D,66
+
+       // RR indices 68 and 70
+       RR0                                     F4,C,D,E,A,B,68
+       RR1                                     F4,C,D,E,A,B,68
+       RR0                                     F4,A,B,C,D,E,70
+       RR1                                     F4,A,B,C,D,E,70
+
+       // RR indices 72 and 74
+       RR0                                     F4,D,E,A,B,C,72
+       RR1                                     F4,D,E,A,B,C,72
+       RR0                                     F4,B,C,D,E,A,74
+       RR1                                     F4,B,C,D,E,A,74
+
+       // RR indices 76 and 78
+       RR0                                     F4,E,A,B,C,D,76
+       RR1                                     F4,E,A,B,C,D,76
+       RR0                                     F4,C,D,E,A,B,78
+       RR1                                     F4,C,D,E,A,B,78
+       .endm
+
+       // LOAD_HASH : load hash digests A,B,C,D,E from memory into registers.
+       // The five digest words are read from byte offsets 0,4,8,12,16 of the hash state.
+       //   x86_64 : HASH_PTR is used directly as the base of the addressing mode.
+       //   i386   : HASH_PTR is first copied into T1 and T1 is used as the base, so T1 is clobbered.
+       //            NOTE(review): presumably HASH_PTR is a memory operand on i386 -- its definition is
+       //            outside this view. Keep the load order as is unless T1 is known not to alias A..E.
+       .macro  LOAD_HASH
+#if defined (__x86_64__)
+       mov                     (HASH_PTR), A
+       mov                     4(HASH_PTR), B
+       mov                     8(HASH_PTR), C
+       mov                     12(HASH_PTR), D
+       mov                     16(HASH_PTR), E
+#else
+    mov         HASH_PTR, T1                   // T1 = pointer to the hash state
+    mov         (T1), A
+    mov         4(T1), B
+    mov         8(T1), C
+    mov         12(T1), D
+    mov         16(T1), E
+#endif
+       .endm
+
+       // UPDATE_HASH  mem, reg
+       //   $0 = memory operand holding one digest word, $1 = register holding the matching working value.
+       //   Computes reg += mem and writes the sum back to mem, so on exit both hold the updated digest word.
+       //   Flags are modified by the add.
+       .macro  UPDATE_HASH
+       add             $0, $1                  // $1 = $1 + old digest word
+       mov             $1, $0                  // store the updated digest word
+       .endm
+
+       // UPDATE_ALL_HASH : fold the working registers A,B,C,D,E into the five digest words in memory
+       // (byte offsets 0,4,8,12,16 of the hash state). For every word: reg += mem, then mem = reg,
+       // so the registers leave this macro holding the updated digest as well.
+       //   x86_64 : HASH_PTR is the base of the addressing mode.
+       //   i386   : HASH_PTR is copied into T1 first and T1 is the base (T1 is clobbered).
+       .macro UPDATE_ALL_HASH  
+#if defined (__x86_64__)
+       add             (HASH_PTR), A           // word 0
+       mov             A, (HASH_PTR)
+       add             4(HASH_PTR), B          // word 1
+       mov             B, 4(HASH_PTR)
+       add             8(HASH_PTR), C          // word 2
+       mov             C, 8(HASH_PTR)
+       add             12(HASH_PTR), D         // word 3
+       mov             D, 12(HASH_PTR)
+       add             16(HASH_PTR), E         // word 4
+       mov             E, 16(HASH_PTR)
+#else
+    mov             HASH_PTR, T1                // T1 = pointer to the hash state
+    add             (T1), A                     // word 0
+    mov             A, (T1)
+    add             4(T1), B                    // word 1
+    mov             B, 4(T1)
+    add             8(T1), C                    // word 2
+    mov             C, 8(T1)
+    add             12(T1), D                   // word 3
+    mov             D, 12(T1)
+    add             16(T1), E                   // word 4
+    mov             E, 16(T1)
+#endif
+       .endm
+
+
+       /*
+                main sha1 code for system without ssse3 support
+
+                Flow:
+                  1. load the digest into A..E and pre-compute W+K (i=0:15) for the first block;
+                  2. label 0: run INTERNAL_nossse3 for the current block;
+                  3. when Multiple_Blocks is set: advance BUFFER_PTR by 64 and decrement cnt;
+                       - if another block remains, finish the current block while pre-computing
+                         W+K for the next one (SOFTWARE_PIPELINING_nossse3), store the digest, loop to 0;
+                       - otherwise fall out to label 1;
+                  4. label 1 (or straight-line when Multiple_Blocks is not set): ENDING + final digest store.
+
+                The first block is always processed: cnt is only examined after INTERNAL_nossse3 has run.
+                "$$" in a macro body stands for a literal "$" (immediate prefix) in the expanded text.
+       */
+
+       .macro  SHA1_PIPELINED_MAIN_BODY_nossse3
+       LOAD_HASH                                               // load initial hashes into A,B,C,D,E (registers)
+       INITIAL_W_PRECALC_nossse3               // big_endian_load(W) and W+K (i=0:15)
+       .align  4,0x90                                  // align the loop head, padding with 0x90 (NOP) bytes
+0:
+       INTERNAL_nossse3                                // update W (i=16:79) and update ABCDE (i=0:63) 
+#if Multiple_Blocks
+       add     $$64, BUFFER_PTR                        // BUFFER_PTR+=64;
+       sub     $$1, cnt                                        // pre-decrement cnt by 1
+       jbe     1f                                                      // unsigned: taken when cnt was 0 or 1 before the sub (no block left)
+       SOFTWARE_PIPELINING_nossse3             // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
+       UPDATE_ALL_HASH                                 // update output hashes
+       jmp     0b                                                      // repeat for next block
+       .align  4,0x90
+1:
+#endif
+       ENDING                                                  // update ABCDE (i=64:79)
+       UPDATE_ALL_HASH                                 // update output hashes
+       .endm
+
+       /*
+                main sha1 code for system with ssse3 support
+
+                Same control flow as SHA1_PIPELINED_MAIN_BODY_nossse3, built from the _ssse3 macros:
+                  1. load the digest into A..E and pre-compute W+K (i=0:15) for the first block;
+                  2. label 0: run INTERNAL_ssse3 for the current block;
+                  3. when Multiple_Blocks is set: advance BUFFER_PTR by 64 and decrement cnt;
+                       - if another block remains, finish the current block while pre-computing
+                         W+K for the next one (SOFTWARE_PIPELINING_ssse3), store the digest, loop to 0;
+                       - otherwise fall out to label 1;
+                  4. label 1 (or straight-line when Multiple_Blocks is not set): ENDING + final digest store.
+
+                The first block is always processed: cnt is only examined after INTERNAL_ssse3 has run.
+                "$$" in a macro body stands for a literal "$" (immediate prefix) in the expanded text.
+       */
+
+       .macro  SHA1_PIPELINED_MAIN_BODY_ssse3
+       LOAD_HASH                                               // load initial hashes into A,B,C,D,E
+       INITIAL_W_PRECALC_ssse3                 // big_endian_load(W) and W+K (i=0:15)
+       .align  4,0x90                                  // align the loop head, padding with 0x90 (NOP) bytes
+0:
+       INTERNAL_ssse3                                  // update W (i=16:79) and update ABCDE (i=0:63)
+#if Multiple_Blocks
+       add     $$64, BUFFER_PTR                        // BUFFER_PTR+=64;
+       sub     $$1, cnt                                        // pre-decrement cnt by 1
+       jbe     1f                                                      // unsigned: taken when cnt was 0 or 1 before the sub (no block left)
+       SOFTWARE_PIPELINING_ssse3               // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
+       UPDATE_ALL_HASH                                 // update output hashes
+       jmp     0b                                                      // repeat for next block
+       .align  4,0x90
+1:
+#endif
+       ENDING                                                  // update ABCDE (i=64:79)
+       UPDATE_ALL_HASH                                 // update output hashes
+       .endm
+
+#include <i386/cpu_capabilities.h>
+
+       .text
+
+       /*
+               _SHA1Transform : entry point; dispatches on the SSSE3 capability bit and, when the bit is set,
+               runs the ssse3 implementation.
+
+               Inputs   : ctx (hash state) and buf (message data); on x86_64 with Multiple_Blocks the block
+                          count is taken from %rdx. ctx/buf/cnt are register/operand aliases defined
+                          outside this view.
+               Dispatch : reads the cpu capability word and tests kHasSupplementalSSE3. If the bit is clear
+                          the code jumps to _SHA1Transform_nossse3 before anything has been pushed, so that
+                          routine sees the caller's original stack and arguments.
+               Saves    : x86_64 - %rbx, %rbp;  i386 - %ebx, %ebp, %esi, %edi.
+               Stack    : stack_size bytes of scratch. The xmm save area starts at 4*16(sp); the W_PRECALC
+                          comments elsewhere in this file show W+K being stored at (sp)..48(sp), below it.
+               KERNEL   : xmm0-xmm7 (plus xmm8-xmm10 on x86_64) are saved before use and restored after.
+               Clobbers : %eax/%rax and flags, plus whatever the main-body macros use.
+               NOTE(review): xmov is defined outside this view; if it is an aligned move, sp must be
+               16-byte aligned after the "sub $stack_size, sp" -- confirm against stack_size.
+       */
+       .globl _SHA1Transform
+       .private_extern _SHA1Transform  
+_SHA1Transform:
+
+       // detect SSSE3 and dispatch appropriate code branch
+       #if defined __x86_64__
+       movq    __cpu_capabilities@GOTPCREL(%rip), %rax         // %rax -> __cpu_capabilities
+       mov     (%rax), %eax                                    // %eax = __cpu_capabilities
+       #else       // i386
+               #if defined KERNEL
+               leal    __cpu_capabilities, %eax                    // %eax -> __cpu_capabilities
+               mov     (%eax), %eax                                // %eax = __cpu_capabilities
+               #else
+               mov    _COMM_PAGE_CPU_CAPABILITIES, %eax            // %eax = capability word read from this address
+               #endif
+       #endif
+    test    $(kHasSupplementalSSE3), %eax
+    je      _SHA1Transform_nossse3                                     // branch to no-ssse3 code
+
+
+       // start the sha1 code with ssse3 support
+
+       // save callee-save registers
+#if defined (__x86_64__)
+       push    %rbx
+       push    %rbp
+#else
+    push    %ebx
+    push    %ebp
+    push    %esi
+    push    %edi
+#endif
+
+       sub             $stack_size, sp                                 // allocate stack memory for use
+
+       // save used xmm register if this is for kernel
+#if    KERNEL
+       xmov    %xmm0, 4*16(sp)
+       xmov    %xmm1, 5*16(sp)
+       xmov    %xmm2, 6*16(sp)
+       xmov    %xmm3, 7*16(sp)
+       xmov    %xmm4, 8*16(sp)
+       xmov    %xmm5, 9*16(sp)
+       xmov    %xmm6, 10*16(sp)
+       xmov    %xmm7, 11*16(sp)
+#if defined (__x86_64__)
+       xmov    %xmm8, 12*16(sp)
+       xmov    %xmm9, 13*16(sp)
+       xmov    %xmm10, 14*16(sp)
+#endif
+#endif
+
+#if defined (__x86_64__)
+
+       // set up registers to free %edx/%edi/%esi for other use (ABCDE)
+       mov             ctx, HASH_PTR
+       mov             buf, BUFFER_PTR
+#if Multiple_Blocks
+       mov             %rdx, cnt                                       // third argument: number of blocks
+#endif
+       lea             K_XMM_AR(%rip), K_BASE                  // K_BASE -> constant table (RIP-relative)
+       xmov    0x40(K_BASE), XMM_SHUFB_BSWAP           // byte-swap shuffle control stored right after the K rows
+
+#else  // __i386__
+
+#if    KERNEL
+    lea     K_XMM_AR, %eax                              // absolute address of the constant table
+#else
+       // Position-independent address of K_XMM_AR: obtain the address of local label 0 via call/pop,
+       // then add the link-time distance from that label to the table.
+           call    0f          // Push program counter onto stack.
+        0: pop     %eax      // Get program counter.
+               lea     K_XMM_AR-0b(%eax), %eax
+#endif
+    mov     %eax, K_BASE
+    xmov    0x40(%eax), %xmm0                           // load the byte-swap shuffle control ...
+    xmov    %xmm0, XMM_SHUFB_BSWAP                      // ... and copy it into XMM_SHUFB_BSWAP via %xmm0
+
+#endif
+
+       SHA1_PIPELINED_MAIN_BODY_ssse3
+
+       // restore used xmm registers if this is for kernel
+#if    KERNEL
+       xmov    4*16(sp), %xmm0
+       xmov    5*16(sp), %xmm1
+       xmov    6*16(sp), %xmm2
+       xmov    7*16(sp), %xmm3
+       xmov    8*16(sp), %xmm4
+       xmov    9*16(sp), %xmm5
+       xmov    10*16(sp), %xmm6
+       xmov    11*16(sp), %xmm7
+#if defined (__x86_64__)
+       xmov    12*16(sp), %xmm8
+       xmov    13*16(sp), %xmm9
+       xmov    14*16(sp), %xmm10
+#endif
+#endif
+
+       add             $stack_size, sp         // deallocate stack memory
+
+       // restore callee-save registers (reverse order of the pushes above)
+#if defined (__x86_64__)
+       pop             %rbp
+       pop             %rbx
+#else
+    pop     %edi
+    pop     %esi
+    pop     %ebp
+    pop     %ebx
+#endif
+
+       ret                                                     // return
+
+       /*
+               _SHA1Transform_nossse3 : this is equivalent to the above function _SHA1Transform, but it does
+               not use ssse3 instructions.
+
+               Reached either directly or by the jump at the top of _SHA1Transform when the
+               kHasSupplementalSSE3 capability bit is clear; that jump happens before any push, so the
+               argument and stack state here are the caller's.
+
+               Inputs   : ctx (hash state) and buf (message data); on x86_64 with Multiple_Blocks the block
+                          count is taken from %rdx.
+               Saves    : x86_64 - %rbx, %rbp;  i386 - %ebx, %ebp, %esi, %edi.
+               Stack    : stack_size bytes of scratch, xmm save area starting at 4*16(sp).
+               KERNEL   : xmm0-xmm7 (plus xmm8-xmm9 on x86_64) are saved and restored. Unlike the ssse3 path,
+                          xmm10 is not saved and XMM_SHUFB_BSWAP is not loaded.
+                          NOTE(review): presumably xmm10 is XMM_SHUFB_BSWAP and is unused here -- confirm.
+       */
+
+       .globl _SHA1Transform_nossse3
+       .private_extern _SHA1Transform_nossse3
+_SHA1Transform_nossse3:
+
+       // push callee-save registers
+#if defined (__x86_64__)
+       push    %rbx
+       push    %rbp
+#else
+    push    %ebx
+    push    %ebp
+    push    %esi
+    push    %edi
+#endif
+
+       sub             $stack_size, sp                 // allocate stack memory for local use
+
+       // save used xmm registers if this is for kernel
+#if    KERNEL
+       xmov    %xmm0, 4*16(sp)
+       xmov    %xmm1, 5*16(sp)
+       xmov    %xmm2, 6*16(sp)
+       xmov    %xmm3, 7*16(sp)
+       xmov    %xmm4, 8*16(sp)
+       xmov    %xmm5, 9*16(sp)
+       xmov    %xmm6, 10*16(sp)
+       xmov    %xmm7, 11*16(sp)
+#if defined (__x86_64__)
+       xmov    %xmm8, 12*16(sp)
+       xmov    %xmm9, 13*16(sp)
+#endif
+#endif
+
+#if defined (__x86_64__)
+
+       // set up registers to free %edx/%edi/%esi for other use (ABCDE)
+       mov             ctx, HASH_PTR
+       mov             buf, BUFFER_PTR
+#if Multiple_Blocks
+       mov             %rdx, cnt                               // third argument: number of blocks
+#endif
+       lea             K_XMM_AR(%rip), K_BASE          // K_BASE -> constant table (RIP-relative)
+
+#else  // __i386__
+
+#if    KERNEL
+    lea     K_XMM_AR, %eax                      // absolute address of the constant table
+#else
+       // Position-independent address of K_XMM_AR: obtain the address of local label 0 via call/pop,
+       // then add the link-time distance from that label to the table.
+           call    0f          // Push program counter onto stack.
+        0: pop     %eax      // Get program counter.
+               lea     K_XMM_AR-0b(%eax), %eax
+#endif
+    mov     %eax, K_BASE
+
+#endif
+
+       SHA1_PIPELINED_MAIN_BODY_nossse3
+
+       // restore used xmm registers if this is for kernel
+#if    KERNEL
+       xmov    4*16(sp), %xmm0
+       xmov    5*16(sp), %xmm1
+       xmov    6*16(sp), %xmm2
+       xmov    7*16(sp), %xmm3
+       xmov    8*16(sp), %xmm4
+       xmov    9*16(sp), %xmm5
+       xmov    10*16(sp), %xmm6
+       xmov    11*16(sp), %xmm7
+#if defined (__x86_64__)
+       xmov    12*16(sp), %xmm8
+       xmov    13*16(sp), %xmm9
+#endif
+#endif
+
+       add             $stack_size, sp         // deallocate stack memory
+
+       // restore callee-save registers (reverse order of the pushes above)
+#if defined (__x86_64__)
+       pop             %rbp
+       pop             %rbx
+#else
+    pop     %edi
+    pop     %esi
+    pop     %ebp
+    pop     %ebx
+#endif
+
+       ret                                                     // return
+
+       .const
+       .align  4, 0x90 
+
+// SHA-1 round constants, one per group of 20 rounds
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+// Constant table addressed through K_BASE. Each row is 16 bytes: a round constant replicated
+// into four 32-bit lanes, followed by the byte-swap shuffle control.
+K_XMM_AR:
+       .long   K1, K1, K1, K1                                          // offset 0x00
+       .long   K2, K2, K2, K2                                          // offset 0x10
+       .long   K3, K3, K3, K3                                          // offset 0x20
+       .long   K4, K4, K4, K4                                          // offset 0x30
+// bswap_shufb_ctl:    invoked thru 0x40(K_XMM_AR)
+// In memory this is the byte sequence 03 02 01 00 07 06 05 04 ..., i.e. a per-dword byte reversal.
+       .long   0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f          // offset 0x40
+
+
+
+#endif // architecture x86_64 or i386