libkern/crypto/intel/sha1edp.s

   1 /*      sha1edp.s : this file provides optimized x86_64 and i386 implementation of the sha1 function
   2         CoreOS - vector and numerics group
   3         cclee   6-21-10
   4
   5         The implementation is based on the principle described in an Intel online article
   6         "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
   7         http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
   8
   9
  10         Update HASH[] by processing a one 64-byte block in MESSAGE[] can be represented by the following C function
  11
  12 void SHA1( int HASH[], int MESSAGE[] )
  13 {
  14     int A[81], B[81], C[81], D[81], E[81];
  15     int W[80];
  16
  17     int i, FN;
  18
  19     A[0] = HASH[0];
  20     B[0] = HASH[1];
  21     C[0] = HASH[2];
  22     D[0] = HASH[3];
  23     E[0] = HASH[4];
  24
  25     for ( i=0; i<80; ++i )
  26     {
  27         if ( i < 16 )
  28             W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
  29         else
  30             W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
  31
  32         FN = F( i, B[i], C[i], D[i] );
  33
  34         A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
  35         B[i+1] = A[i];
  36         C[i+1] = ROTATE_LEFT( B[i], 30 );
  37         D[i+1] = C[i];
  38         E[i+1] = D[i];
  39     }
  40
  41     HASH[0] += A[80];
  42     HASH[1] += B[80];
  43     HASH[2] += C[80];
  44     HASH[3] += D[80];
  45     HASH[4] += E[80];
  46 }
  47
  48         For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
  49
  50         The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
  51
  52         1. done on 4 consequtive W[i] values in a single XMM register
  53     W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
  54     W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
  55     W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
  56     W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
  57
  58     2. this additional calculation unfortunately requires many additional operations
  59     W[i+3] ^= W[i] rol 1
  60
  61     3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
  62     W[i:i+3] += {K,K,K,K}
  63
  64         Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
  65         The Dean Gaudet approach can be expressed as
  66
  67         1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
  68         2. W[i+3] ^= W[i] rol 1
  69         3. W0 += {K,K,K,K}
  70
  71         For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
  72
  73         1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
  74
  75         Note:
  76         1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
  77         2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
  78                 i=0, W28,W24,...,W0
  79                 i=4, W24,W20,...,W28
  80                 i=8, W20,W16,...,W24
  81                 .
  82                 .
  83                 and so forth.
  84         3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
  85                 a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
  86                 b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
  87         4. we probe __cpu_capabilities to detect ssse3 support and dispatch code with ssse3 support when available.
  88            If ssse3 is not supported, a suboptimal code (pshufb and palignr workaround) is dispatched.
  89
  90 */
  91
  92 /* the code can be compiled into single block (64 bytes) per call mode by setting Multiple_blocks to 0 */
  93 #define Multiple_Blocks 1
  94
  95 #if defined (__x86_64__) || defined(__i386__)           // x86_64 or i386 architectures
  96
  97 #if defined(__x86_64__)
  98
  99         // set up for x86_64
 100 #define stack_size      (8+16*11+16*4)                                  // 8 (alignedment) + x0-x10 + 4 128-bits for intermediate WK(t) storage
 101 #define sp                      %rsp                                                    // unifying architectural stack pointer representation
 102 #define ctx                     %rdi                                                    // 1st input argument, will move to HASH_PTR (%r9)
 103 #define buf                     %rsi                                                    // 2nd input argument, will move to BUFFER_PTR (%r10)
 104 #define cnt                     %r11                                                    // will copy from the 3rd input argument (%rdx)
 105 #define K_BASE          %r8                                                             // an aligned pointer to point to shufb reference numbers of table of K values
 106 #define HASH_PTR        %r9                                                             // pointer to Hash values (A,B,C,D,E)
 107 #define BUFFER_PTR      %r10                                                    // pointer to input blocks
 108
 109 #else   // !__x86_64__
 110
 111         // set up for i386
 112 #define stack_size      (12+16*2+16*11+16*4)                    // 12-bytes (alignment) + extra 2 + 3 (W24/W28/XMM_SHUFB_BSWAP) + 8 (xmm0-xmm7) + 4 (WK(t))
 113 #define sp                      %esp                                                    // unifying architectural stack pointer representation
 114 #define HASH_PTR        stack_size+16+4(sp)                             // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp)
 115 #define BUFFER_PTR      stack_size+16+8(sp)                             // use 2nd input argument from caller function
 116 #define cnt                     stack_size+16+12(sp)                    // use 3rd input argument from caller function
 117 #define K_BASE          stack_size-4(sp)                                // use for K_BASE
 118
 119 #endif  // __x86_64__
 120
 121 // symbolizing registers or stack memory with algorithmic variables     W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support
 122
 123 #define W_TMP   %xmm0
 124 #define W_TMP2  %xmm1
 125 #define W0      %xmm2
 126 #define W4      %xmm3
 127 #define W8      %xmm4
 128 #define W12     %xmm5
 129 #define W16     %xmm6
 130 #define W20     %xmm7
 131 #if defined(__x86_64__)
 132 #define W24     %xmm8
 133 #define W28     %xmm9
 134 #define XMM_SHUFB_BSWAP %xmm10                          // used only when ssse3 is supported
 135 #else   // defined (__i386__)
 136 #define W24     12*16(sp)
 137 #define W28     13*16(sp)
 138 #define XMM_SHUFB_BSWAP 14*16(sp)                       // used only when ssse3 is supported
 139 #endif
 140
 141 #define xmov    movaps                                          // aligned 16-byte move
 142 #define xmovu   movups                                          // unaligned 16-byte move
 143
 144 // intermediate hash variables
 145 #define A %ecx
 146 #define B %esi
 147 #define C %edi
 148 #define D %ebp
 149 #define E %edx
 150
 151 // temp variables
 152 #define T1 %eax
 153 #define T2 %ebx
 154
 155 #define WK(t)   (t&15)*4(sp)
 156
 157         // int F1(int B, int C, int D) { return (D ^ ( B & (C ^ D)); }
 158         // result in T1
 159         .macro  F1
 160         mov     $1, T1
 161         xor     $2, T1
 162         and     $0, T1
 163         xor     $2, T1
 164         .endm
 165
 166         // int F2(int B, int C, int D) { return (D ^ B ^ C); }
 167         // result in T1
 168         .macro  F2
 169         mov     $2, T1
 170         xor     $1, T1
 171         xor     $0, T1
 172         .endm
 173
 174         // int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
 175         // result in T1
 176         .macro  F3
 177                 mov $1, T1
 178         mov $0, T2
 179         or  $0, T1
 180         and $1, T2
 181         and $2, T1
 182         or  T2, T1
 183         .endm
 184
 185         // for i=60:79, F4 is identical to F2
 186         #define F4      F2
 187
 188
 189         /*
 190                 i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
 191
 192                 with ssse3 support, this is achived via
 193                 for (i=0;i<16;i+=4) {
 194                         1. W_TMP = new 16 bytes from MESSAGE[]
 195                         2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
 196                         3. WTMP += {K,K,K,K};
 197                         4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
 198                 }
 199
 200                 each step is represented in one of the following 4 macro definitions
 201
 202         */
 203
 204         .macro  W_PRECALC_00_15_0_ssse3                 // input argument $0 : 0/4/8/12
 205 #if defined (__x86_64__)                                        // BUFFER_PTR is already an address register in x86_64
 206         xmovu   $0*4(BUFFER_PTR), W_TMP                 // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
 207 #else                                                                           // BUFFER_PTR is from the argument set up in the caller
 208         mov     BUFFER_PTR, T1                                  // T1 = BUFFER_PTR
 209     xmovu  $0*4(T1), W_TMP                                      // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
 210 #endif
 211         .endm
 212
 213         .macro  W_PRECALC_00_15_1_ssse3                 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
 214         pshufb  XMM_SHUFB_BSWAP, W_TMP                  // convert W_TMP from little-endian into big-endian
 215         xmov    W_TMP, $0                                               // save W_TMP in the circular buffer
 216         .endm
 217
 218         .macro  W_PRECALC_00_15_2                               // K_BASE points to the current K quadruple.
 219 #if defined (__x86_64__)                                        // K_BASE is already an address register in x86_64
 220         paddd   (K_BASE), W_TMP                                 // W_TMP += {K,K,K,K};
 221 #else                                                                           // K_BASE is previously set up in the stack memory
 222         mov     K_BASE, T1                                              // T1 = K_BASE
 223     paddd   (T1), W_TMP                                         // W_TMP += {K,K,K,K};
 224 #endif
 225         .endm
 226
 227         .macro  W_PRECALC_00_15_3
 228         xmov    W_TMP, WK($0&~3)                                // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
 229         .endm
 230
 231         /*
 232                 without ssse3 support, steps 1 and 2 need to be modified
 233                 1. sequentially load 4 words into T1, bswap T1, and save it to 4-bytes in the stack space
 234                 2. load the 16-bytes from the aligned stack memory into W_TMP
 235         */
 236
 237         .macro  W_PRECALC_00_15_0_nossse3               // input argument $0 : 0/4/8/12
 238
 239 #if     defined (__x86_64__)
 240         #define BUFFERP BUFFER_PTR
 241 #else
 242         mov             BUFFER_PTR, T2                                  // copy BUFFER_PTR (from caller 2nd argument) to T2
 243         #define BUFFERP T2
 244 #endif
 245
 246         // load 1st word, bswap it, save it to stack
 247         mov             $0*4(BUFFERP), T1
 248         bswap   T1
 249         mov             T1, 14*16(sp)
 250
 251         // load 2nd word, bswap it, save it to stack
 252         mov             4+$0*4(BUFFERP), T1
 253         bswap   T1
 254         mov             T1, 4+14*16(sp)
 255
 256         // load 3rd word, bswap it, save it to stack
 257         mov             8+$0*4(BUFFERP), T1
 258         bswap   T1
 259         mov             T1, 8+14*16(sp)
 260
 261         // load 4th word, bswap it, save it to stack
 262         mov             12+$0*4(BUFFERP), T1
 263         bswap   T1
 264         mov             T1, 12+14*16(sp)
 265         .endm
 266
 267         .macro  W_PRECALC_00_15_1_nossse3               // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
 268         xmov    14*16(sp), W_TMP                                // load the bswapped 16-bytes from the aligned stack memory
 269         xmov    W_TMP, $0                                               // save W = W_TMP in the circular buffer
 270         .endm
 271
 272         // rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
 273         /*
 274         W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
 275     W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
 276     W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
 277     W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
 278
 279         W[i+3] ^= W[i] rol 1;   // this W[i] is already rol by 1, if we are taking from the intial W before rol 1, we should rol this by 2
 280
 281         The operation (updating W and W+K) is scheduled as and divided into 4 steps
 282
 283         0. W_tmp = W3; W = W14 ^ W8
 284         1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
 285         2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
 286         3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W _TMP+K;
 287
 288         */
 289
 290         .macro  W_PRECALC_16_31_0_ssse3 // input arguments : W16,W12,W8,W4,W
 291         xmov    $1, $4                                  // W = W12
 292         palignr $$8, $0, $4                             // W = W14
 293         xmov    $3, W_TMP                               // W_TMP = W4
 294         psrldq  $$4, W_TMP                              // W_TMP = W3
 295         pxor    $2, $4                                  // W = W8 ^ W14
 296         .endm
 297
 298         .macro  W_PRECALC_16_31_1               // input arguments : W16,W
 299         pxor    $0, W_TMP                               // W_TMP = W3 ^ W16
 300         pxor    W_TMP, $1                               // W = W3 ^ W16 ^ W8 ^ W14
 301         xmov    $1, W_TMP2                              // W_TMP2 = W3 ^ W16 ^ W8 ^ W14
 302         xmov    $1, W_TMP                               // W_TMP = W3 ^ W16 ^ W8 ^ W14
 303         pslldq  $$12, W_TMP2                    // W_TMP2 = (W[i] 0 0 0)
 304         .endm
 305
 306         .macro  W_PRECALC_16_31_2               // input argument : W
 307         psrld   $$31, $0                                // (W3 ^ W16 ^ W8 ^ W14)>>31
 308         pslld   $$1, W_TMP                              // (W3 ^ W16 ^ W8 ^ W14)<<1
 309         por             $0, W_TMP                               // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
 310         xmov    W_TMP2, $0                              // copy W[i] at location of W[i+3]
 311         psrld   $$30, W_TMP2                    // W_TMP2 = W[i] lower 2 bits after rol 2
 312         pslld   $$2, $0                                 // W = W[i] higher 30 bits after rol 2
 313         .endm
 314
 315         .macro  W_PRECALC_16_31_3               // input arguments: W, i, K_XMM
 316 #if defined (__i386__)
 317         mov     K_BASE, T1                              // K_BASE is store in the stack memory for i386
 318 #endif
 319         pxor    $0, W_TMP
 320         pxor    W_TMP2, W_TMP                   // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
 321         xmov    W_TMP, $0                               // save W = W_TMP in the W circular buffer
 322 #if defined (__x86_64__)
 323         paddd   $2(K_BASE), W_TMP               // W+K
 324 #else
 325     paddd   $2(T1), W_TMP                       // W+K
 326 #endif
 327         xmov    W_TMP, WK($1&~3)                // save WK = W+K for later update of the hashes A/B/C/D/E
 328         .endm
 329
 330         // the following is a variant of W_PRECALC_16_31_0_ssse3 to be used for system without ssse3, palignr is replaced with 4 instructions
 331
 332         .macro  W_PRECALC_16_31_0_nossse3       // input arguments : W16,W12,W8,W4,W
 333         xmov    $1, $4                                          // W = W12 = (w9 w10 w11 w12)
 334
 335         // the following is a wrokaround for palignr
 336         xmov    $0, W_TMP                                       // W16 = (w13 w14 w15 w16)
 337         pslldq  $$8, $4                                         // shift left to make (w11 w12 0 0)
 338         psrldq  $$8, W_TMP                                      // shift right to make (0 0 w13 w14)
 339         por             W_TMP, $4                                       // W = W14 = (w11 w12 w13 w14)
 340
 341         xmov    $3, W_TMP                                       // W_TMP = W4 = (w1 w2 w3 w4)
 342         psrldq  $$4, W_TMP                                      // W_TMP = W3 = (0 w1 w2 w3)
 343         pxor    $2, $4                                          // W = W8 ^ W14
 344         .endm
 345
 346         /* rounds 32-79 compute W und W+K iusing the vectorization approach from the Intel article
 347
 348                 W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
 349
 350                 where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
 351
 352
 353         0. W_tmp = W6; W = W28 ^ W32;
 354         1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
 355         2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
 356         3. W = W_Tmp; WK = W_tmp + K;
 357
 358         */
 359
 360
 361         .macro  W_PRECALC_32_79_0_ssse3         // inputr arguments : W28,W8,W4,W
 362         xmov    $2, W_TMP                                       // (w1 w2 w3 w4)
 363         pxor    $0, $3                                          // W = W28 ^ W32;
 364         palignr $$8, $1, W_TMP                          // W_tmp = (w3 w4 w5 w6) = W6;
 365         .endm
 366
 367         // the following is a variant and will be used for system without ssse3 support
 368         .macro  W_PRECALC_32_79_0_nossse3       // input arguments : W28,W8,W4,W
 369         xmov    $2, W_TMP                                       // (w1 w2 w3 w4)
 370         xmov    $1, W_TMP2                                      // (w5 w6 w7 w8)
 371         pxor    $0, $3                                          // W = W28 ^ W32
 372         pslldq  $$8, W_TMP                                      // (w3 w4 0 0)
 373         psrldq  $$8, W_TMP2                                     // (0 0 w5 w6)
 374         por             W_TMP2, W_TMP                           // W_tmp = (w3 w4 w5 w6) = W6
 375         .endm
 376
 377         // this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers)
 378         .macro  W_PRECALC_32_79_0_i386_ssse3    // input arguments : W28,W8,W4,W
 379     xmov    $3, W_TMP                                           // W32
 380     pxor    $0, W_TMP                                           // W28 ^ W32
 381     xmov    W_TMP, $3                                           // W = W28 ^ W32;
 382     xmov    $2, W_TMP                                           // W4
 383     palignr $$8, $1, W_TMP                                      // W_tmp = (w3 w4 w5 w6) = W6;
 384     .endm
 385
 386         // this is a variant of W_PRECALC_32_79_0_nossse3 for i386 (as W24/W28 are stored in memory, not in registers)
 387         .macro  W_PRECALC_32_79_0_i386_nossse3  // input arguments : W28,W8,W4,W
 388     xmov    $3, W_TMP                                           // W32
 389     pxor    $0, W_TMP                                           // W28 ^ W32
 390     xmov    W_TMP, $3                                           // W = W28 ^ W32
 391     xmov    $2, W_TMP                                           // W4 = (w1 w2 w3 w4)
 392         xmov    $1, W_TMP2                                              // W8 = (w5 w6 w7 w8)
 393         pslldq  $$8, W_TMP                                              // (w3 w4 0 0)
 394         psrldq  $$8, W_TMP2                                             // (0 0 w5 w6)
 395         por             W_TMP2, W_TMP                                   // W_tmp = (w3 w4 w5 w6) = W6
 396     .endm
 397
 398         .macro  W_PRECALC_32_79_1                       // input arguments : W16,W
 399         pxor    $0, W_TMP                                       // W_tmp = W6 ^ W16
 400         pxor    $1, W_TMP                                       // W_tmp = W6 ^ W16 ^ W28 ^ W32
 401         xmov    W_TMP, $1                                       // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
 402         .endm
 403
 404         .macro  W_PRECALC_32_79_2                       // input argument : W
 405         psrld   $$30, $0                                        // W >> 30
 406         pslld   $$2, W_TMP                                      // W << 2
 407         por             $0, W_TMP                                       // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
 408         .endm
 409
 410         // this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers)
 411         // this should be used when the input is either W24 or W28 on i386 architecture
 412     .macro  W_PRECALC_32_79_2_i386      // input argument : W
 413     xmov    $0, W_TMP2                                  // W
 414     psrld   $$30, W_TMP2                                // W >> 30
 415     xmov    W_TMP2, $0                                  // save (W >> 30) at W
 416     pslld   $$2, W_TMP                                  // W_tmp << 2
 417     por     $0, W_TMP                                   // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
 418     .endm
 419
 420         .macro  W_PRECALC_32_79_3                       // input argument W, i, K_XMM
 421 #if defined (__x86_64__)
 422         xmov    W_TMP, $0                                       // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
 423         paddd   $2(K_BASE), W_TMP                       // W + K
 424         xmov    W_TMP, WK($1&~3)                        // write W+K
 425 #else
 426     mov     K_BASE, T1                                  // T1 = K_BASE (which is in the caller argument)
 427     xmov    W_TMP, $0                                   // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
 428     paddd   $2(T1), W_TMP                               // W_tmp = W + K
 429     xmov    W_TMP, WK($1&~3)                    // write WK
 430 #endif
 431         .endm
 432
 433
 434         /* The hash update operation is completed by the following statements.
 435
 436                 A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
 437         B[i+1] = A[i];
 438         C[i+1] = ROTATE_LEFT( B[i], 30 );
 439         D[i+1] = C[i];
 440         E[i+1] = D[i];
 441
 442                 Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
 443
 444                 A1 = FN + E0 + rol(A0,5) + WK;
 445                 B1 = A0;
 446                 C1 = rol(B0, 30);
 447                 D1 = C0;
 448                 E1 = D0;
 449
 450                 to avoid excessive memory movement between registers,
 451                         1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
 452                         2. C1 = rol(B0,30) can be temporarily saved in B0.
 453
 454                 Therefore, ignoring the time index, the update operation is equivalent to
 455                         1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
 456                         2. B = rol(B,30)
 457                         3. the hashes are now stored in the order of E,A,B,C,D
 458
 459
 460                 To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
 461                 1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
 462                 2. B = rol(B,30)
 463                 // now the hashes are in the order of E,A,B,C,D
 464                 3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
 465                 4. A = rol(A,30)
 466                 // now the hashes are in the order of D,E,A,B,C
 467
 468                 These operations are distributed into the following 2 macro definitions RR0 and RR1.
 469
 470         */
 471
 472         .macro  RR0                             // input arguments : FN, A, B, C, D, E, i
 473         $0              $2, $3, $4              // T1 = FN(B,C,D)
 474         add             WK($6), $5              // E + WK(i)
 475         rol             $$30, $2                // B = rol(B,30)
 476         mov             $1, T2                  // T2 = A
 477         add             WK($6+1), $4    // D + WK(i+1)
 478         rol             $$5, T2                 // rol(A,5)
 479         add             T1, $5                  // E = FN(B,C,D) + E + WK(i)
 480         .endm
 481
 482         .macro  RR1
 483         add             $5, T2                  // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
 484         mov             T2, $5                  // E = FN(B,C,D) + E + rol(A,5) + WK(i)
 485         rol             $$5, T2                 // rol(E,5)
 486         add             T2, $4                  // D + WK(i+1) + rol(E,5)
 487         $0              $1, $2, $3              // FN(A,B,C)
 488         add             T1, $4                  // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
 489         rol             $$30, $1                // A = rol(A,30)
 490         .endm
 491
 492
 493
 494         /*
 495
 496                 The following macro definitions are used to expand code for the per-block sha1 operation.
 497
 498                         INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
 499                         INTERNAL_ssse3 : updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
 500                         ENDING : finishing up update the digests A/B/C/D/E (i=64:79)
 501
 502                 For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined
 503                 into 1 macro definition for software pipeling.
 504
 505                         SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack, and finishing up update the digests A/B/C/D/E (i=64:79)
 506
 507                 assume cnt (the number of blocks)  >= 1, the main code body should look like
 508
 509                 INITIAL_W_PRECALC_ssse3                         // W = big_endian_load and pre-compute W+K (i=0:15)
 510                 do {
 511                         INTERNAL_ssse3                                  // update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63)
 512                         cnt--;
 513                         if (cnt==0) break;
 514                         BUFFER_PTR += 64;
 515                         SOFTWARE_PIPELINING_ssse3;              // update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15)
 516                 }
 517                 ENDING                                                          // update hash digests A/B/C/D/E (i=64:79)
 518
 519         */
 520
 521         #define W_PRECALC_00_15_0       W_PRECALC_00_15_0_ssse3
 522         #define W_PRECALC_00_15_1       W_PRECALC_00_15_1_ssse3
 523         #define W_PRECALC_16_31_0       W_PRECALC_16_31_0_ssse3
 524         #define W_PRECALC_32_79_0       W_PRECALC_32_79_0_ssse3
 525         #define W_PRECALC_32_79_0_i386  W_PRECALC_32_79_0_i386_ssse3
 526
 527
 528         .macro  INITIAL_W_PRECALC_ssse3                 // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
 529
 530         // i=0  : W28,W24,W20,W16,W12,W8,W4,W0
 531         W_PRECALC_00_15_0       0                                       // W_TMP = (BUFFER_PTR)
 532         W_PRECALC_00_15_1       W0                                      // convert W_TMP to big-endian, and save W0 = W_TMP
 533         W_PRECALC_00_15_2                                               // W_TMP = W0 + K
 534         W_PRECALC_00_15_3       3                                       // (sp) = W_TMP = W0 + K
 535
 536         // i=4  : W24,W20,W16,W12,W8,W4,W0,W28
 537         W_PRECALC_00_15_0       4                                       // W_TMP = 16(BUFFER_PTR)
 538         W_PRECALC_00_15_1       W28                                     // convert W_TMP to big-endian, and save W28 = W_TMP
 539         W_PRECALC_00_15_2                                               // W_TMP = W28 + K
 540         W_PRECALC_00_15_3       7                                       // 16(sp) = W_TMP = W28 + K
 541
 542         // i=8  : W20,W16,W12,W8,W4,W0,W28,W24
 543         W_PRECALC_00_15_0       8                                       // W_TMP = 32(BUFFER_PTR)
 544         W_PRECALC_00_15_1       W24                                     // convert W_TMP to big-endian, and save W24 = W_TMP
 545         W_PRECALC_00_15_2                                               // W_TMP = W24 + K
 546         W_PRECALC_00_15_3       11                                      // 32(sp) = W_TMP = W24 + K
 547
 548         // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
 549         W_PRECALC_00_15_0       12                                      // W_TMP = 48(BUFFER_PTR)
 550         W_PRECALC_00_15_1       W20                                     // convert W_TMP to big-endian, and save W20 = W_TMP
 551         W_PRECALC_00_15_2                                               // W_TMP = W20 + K
 552         W_PRECALC_00_15_3       15                                      // 48(sp) = W_TMP = W20 + K
 553
 554         .endm
 555
 556
 557         .macro  INTERNAL_ssse3                                  // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
 558
 559         // i=16 : W12,W8,W4,W0,W28,W24,W20,W16
 560         W_PRECALC_16_31_0       W0,W28,W24,W20,W16
 561         RR0                                     F1,A,B,C,D,E,0
 562         W_PRECALC_16_31_1       W0,W16
 563         RR1                                     F1,A,B,C,D,E,0
 564         W_PRECALC_16_31_2       W16
 565         RR0                                     F1,D,E,A,B,C,2
 566         W_PRECALC_16_31_3       W16, 2, 0
 567         RR1                                     F1,D,E,A,B,C,2
 568
 569         // i=20 : W8,W4,W0,W28,W24,W20,W16,W12
 570         W_PRECALC_16_31_0       W28,W24,W20,W16,W12
 571         RR0                                     F1,B,C,D,E,A,4
 572         W_PRECALC_16_31_1       W28,W12
 573         RR1                                     F1,B,C,D,E,A,4
 574         W_PRECALC_16_31_2       W12
 575         RR0                                     F1,E,A,B,C,D,6
 576         W_PRECALC_16_31_3       W12, 6, 16
 577         RR1                                     F1,E,A,B,C,D,6
 578
 579         // i=24 : W4,W0,W28,W24,W20,W16,W12,W8
 580         W_PRECALC_16_31_0       W24,W20,W16,W12,W8
 581         RR0                                     F1,C,D,E,A,B,8
 582         W_PRECALC_16_31_1       W24,W8
 583         RR1                                     F1,C,D,E,A,B,8
 584         W_PRECALC_16_31_2       W8
 585         RR0                                     F1,A,B,C,D,E,10
 586         W_PRECALC_16_31_3       W8,10,16
 587         RR1                                     F1,A,B,C,D,E,10
 588
 589         // i=28 : W0,W28,W24,W20,W16,W12,W8,W4
 590         W_PRECALC_16_31_0       W20,W16,W12,W8,W4
 591         RR0                                     F1,D,E,A,B,C,12
 592         W_PRECALC_16_31_1       W20,W4
 593         RR1                                     F1,D,E,A,B,C,12
 594         W_PRECALC_16_31_2       W4
 595         RR0                                     F1,B,C,D,E,A,14
 596         W_PRECALC_16_31_3       W4,14,16
 597         RR1                                     F1,B,C,D,E,A,14
 598
 599         // i=32 : W28,W24,W20,W16,W12,W8,W4,W0
 600         W_PRECALC_32_79_0       W28,W8,W4,W0
 601         RR0                                     F1,E,A,B,C,D,16
 602         W_PRECALC_32_79_1       W16,W0
 603         RR1                                     F1,E,A,B,C,D,16
 604         W_PRECALC_32_79_2       W0
 605         RR0                                     F1,C,D,E,A,B,18
 606         W_PRECALC_32_79_3       W0,18,16
 607         RR1                                     F1,C,D,E,A,B,18
 608
 609         // starting using F2
 610
 611         // i=36 : W24,W20,W16,W12,W8,W4,W0,W28
 612 #if defined (__x86_64__)
 613         W_PRECALC_32_79_0       W24,W4,W0,W28
 614 #else
 615         W_PRECALC_32_79_0_i386  W24,W4,W0,W28
 616 #endif
 617         RR0                                     F2,A,B,C,D,E,20
 618         W_PRECALC_32_79_1       W12,W28
 619         RR1                                     F2,A,B,C,D,E,20
 620 #if defined (__x86_64__)
 621         W_PRECALC_32_79_2       W28
 622 #else
 623         W_PRECALC_32_79_2_i386  W28
 624 #endif
 625         RR0                                     F2,D,E,A,B,C,22
 626         W_PRECALC_32_79_3       W28,22,16
 627         RR1                                     F2,D,E,A,B,C,22
 628
 629         // i=40 : W20,W16,W12,W8,W4,W0,W28,W24
 630         #undef  K_XMM
 631     #define K_XMM   32
 632 #if defined (__x86_64__)
 633         W_PRECALC_32_79_0       W20,W0,W28,W24
 634 #else
 635         W_PRECALC_32_79_0_i386  W20,W0,W28,W24
 636 #endif
 637         RR0                                     F2,B,C,D,E,A,24
 638         W_PRECALC_32_79_1       W8,W24
 639         RR1                                     F2,B,C,D,E,A,24
 640 #if defined (__x86_64__)
 641         W_PRECALC_32_79_2       W24
 642 #else
 643         W_PRECALC_32_79_2_i386  W24
 644 #endif
 645         RR0                                     F2,E,A,B,C,D,26
 646         W_PRECALC_32_79_3       W24,26,K_XMM
 647         RR1                                     F2,E,A,B,C,D,26
 648
 649         // i=44 : W16,W12,W8,W4,W0,W28,W24,W20
 650         W_PRECALC_32_79_0       W16,W28,W24,W20
 651         RR0                                     F2,C,D,E,A,B,28
 652         W_PRECALC_32_79_1       W4,W20
 653         RR1                                     F2,C,D,E,A,B,28
 654         W_PRECALC_32_79_2       W20
 655         RR0                                     F2,A,B,C,D,E,30
 656         W_PRECALC_32_79_3       W20,30,K_XMM
 657         RR1                                     F2,A,B,C,D,E,30
 658
 659         // i=48 : W12,W8,W4,W0,W28,W24,W20,W16
 660         W_PRECALC_32_79_0       W12,W24,W20,W16
 661         RR0                                     F2,D,E,A,B,C,32
 662         W_PRECALC_32_79_1       W0,W16
 663         RR1                                     F2,D,E,A,B,C,32
 664         W_PRECALC_32_79_2       W16
 665         RR0                                     F2,B,C,D,E,A,34
 666         W_PRECALC_32_79_3       W16,34,K_XMM
 667         RR1                                     F2,B,C,D,E,A,34
 668
 669         // i=52 : W8,W4,W0,W28,W24,W20,W16,W12
 670         W_PRECALC_32_79_0       W8,W20,W16,W12
 671         RR0                                     F2,E,A,B,C,D,36
 672         W_PRECALC_32_79_1       W28,W12
 673         RR1                                     F2,E,A,B,C,D,36
 674         W_PRECALC_32_79_2       W12
 675         RR0                                     F2,C,D,E,A,B,38
 676         W_PRECALC_32_79_3       W12,38,K_XMM
 677         RR1                                     F2,C,D,E,A,B,38
 678
 679         // starting using F3
 680
 681         // i=56 : W4,W0,W28,W24,W20,W16,W12,W8
 682         W_PRECALC_32_79_0       W4,W16,W12,W8
 683         RR0                                     F3,A,B,C,D,E,40
 684         W_PRECALC_32_79_1       W24,W8
 685         RR1                                     F3,A,B,C,D,E,40
 686         W_PRECALC_32_79_2       W8
 687         RR0                                     F3,D,E,A,B,C,42
 688         W_PRECALC_32_79_3       W8,42,K_XMM
 689         RR1                                     F3,D,E,A,B,C,42
 690
 691         // i=60 : W0,W28,W24,W20,W16,W12,W8,W4
 692         #undef  K_XMM
 693         #define K_XMM   48
 694         W_PRECALC_32_79_0       W0,W12,W8,W4
 695         RR0                                     F3,B,C,D,E,A,44
 696         W_PRECALC_32_79_1       W20,W4
 697         RR1                                     F3,B,C,D,E,A,44
 698         W_PRECALC_32_79_2       W4
 699         RR0                                     F3,E,A,B,C,D,46
 700         W_PRECALC_32_79_3       W4,46,K_XMM
 701         RR1                                     F3,E,A,B,C,D,46
 702
 703         // i=64 : W28,W24,W20,W16,W12,W8,W4,W0
 704         W_PRECALC_32_79_0       W28,W8,W4,W0
 705         RR0                                     F3,C,D,E,A,B,48
 706         W_PRECALC_32_79_1       W16,W0
 707         RR1                                     F3,C,D,E,A,B,48
 708         W_PRECALC_32_79_2       W0
 709         RR0                                     F3,A,B,C,D,E,50
 710         W_PRECALC_32_79_3       W0,50,K_XMM
 711         RR1                                     F3,A,B,C,D,E,50
 712
 713         // i=68 : W24,W20,W16,W12,W8,W4,W0,W28
 714 #if defined (__x86_64__)
 715         W_PRECALC_32_79_0       W24,W4,W0,W28
 716 #else
 717         W_PRECALC_32_79_0_i386  W24,W4,W0,W28
 718 #endif
 719         RR0                                     F3,D,E,A,B,C,52
 720         W_PRECALC_32_79_1       W12,W28
 721         RR1                                     F3,D,E,A,B,C,52
 722 #if defined (__x86_64__)
 723         W_PRECALC_32_79_2       W28
 724 #else
 725         W_PRECALC_32_79_2_i386  W28
 726 #endif
 727         RR0                                     F3,B,C,D,E,A,54
 728         W_PRECALC_32_79_3       W28,54,K_XMM
 729         RR1                                     F3,B,C,D,E,A,54
 730
 731         // i=72 : W20,W16,W12,W8,W4,W0,W28,W24
 732 #if defined (__x86_64__)
 733         W_PRECALC_32_79_0       W20,W0,W28,W24
 734 #else
 735         W_PRECALC_32_79_0_i386  W20,W0,W28,W24
 736 #endif
 737         RR0                                     F3,E,A,B,C,D,56
 738         W_PRECALC_32_79_1       W8,W24
 739         RR1                                     F3,E,A,B,C,D,56
 740 #if defined (__x86_64__)
 741         W_PRECALC_32_79_2       W24
 742 #else
 743         W_PRECALC_32_79_2_i386  W24
 744 #endif
 745         RR0                                     F3,C,D,E,A,B,58
 746         W_PRECALC_32_79_3       W24,58,K_XMM
 747         RR1                                     F3,C,D,E,A,B,58
 748
 749         // starting using F4
 750
 751         // i=76 : W16,W12,W8,W4,W0,W28,W24,W20
 752         W_PRECALC_32_79_0       W16,W28,W24,W20
 753         RR0                                     F4,A,B,C,D,E,60
 754         W_PRECALC_32_79_1       W4,W20
 755         RR1                                     F4,A,B,C,D,E,60
 756         W_PRECALC_32_79_2       W20
 757         RR0                                     F4,D,E,A,B,C,62
 758         W_PRECALC_32_79_3       W20,62,K_XMM
 759         RR1                                     F4,D,E,A,B,C,62
 760
 761         .endm
 762
 763         .macro  SOFTWARE_PIPELINING_ssse3
 764         // i=0  : W28,W24,W20,W16,W12,W8,W4,W0
 765         W_PRECALC_00_15_0       0                                       // W_TMP = (BUFFER_PTR)
 766         RR0                                     F4,B,C,D,E,A,64
 767         W_PRECALC_00_15_1       W0                                      // convert W_TMP to big-endian, and save W0 = W_TMP
 768         RR1                                     F4,B,C,D,E,A,64
 769         W_PRECALC_00_15_2                                               // W_TMP = W0 + K
 770         RR0                                     F4,E,A,B,C,D,66
 771         W_PRECALC_00_15_3       3                                       // (sp) = W_TMP = W0 + K
 772         RR1                                     F4,E,A,B,C,D,66
 773
 774         // i=4  : W24,W20,W16,W12,W8,W4,W0,W28
 775         W_PRECALC_00_15_0       4                                       // W_TMP = 16(BUFFER_PTR)
 776         RR0                                     F4,C,D,E,A,B,68
 777         W_PRECALC_00_15_1       W28                                     // convert W_TMP to big-endian, and save W28 = W_TMP
 778         RR1                                     F4,C,D,E,A,B,68
 779         W_PRECALC_00_15_2                                               // W_TMP = W28 + K
 780         RR0                                     F4,A,B,C,D,E,70
 781         W_PRECALC_00_15_3       7                                       // 16(sp) = W_TMP = W28 + K[0]
 782         RR1                                     F4,A,B,C,D,E,70
 783
 784         // i=8  : W20,W16,W12,W8,W4,W0,W28,W24
 785         W_PRECALC_00_15_0       8                                       // W_TMP = 32(BUFFER_PTR)
 786         RR0                                     F4,D,E,A,B,C,72
 787         W_PRECALC_00_15_1       W24                                     // convert W_TMP to big-endian, and save W24 = W_TMP
 788         RR1                                     F4,D,E,A,B,C,72
 789         W_PRECALC_00_15_2                                               // W_TMP = W24 + K
 790         RR0                                     F4,B,C,D,E,A,74
 791         W_PRECALC_00_15_3       11                                      // 32(sp) = W_TMP = W24 + K
 792         RR1                                     F4,B,C,D,E,A,74
 793
 794         // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
 795         W_PRECALC_00_15_0       12                                      // W_TMP = 48(BUFFER_PTR)
 796         RR0                                     F4,E,A,B,C,D,76
 797         W_PRECALC_00_15_1       W20                                     // convert W_TMP to big-endian, and save W20 = W_TMP
 798         RR1                                     F4,E,A,B,C,D,76
 799         W_PRECALC_00_15_2                                               // W_TMP = W20 + K
 800         RR0                                     F4,C,D,E,A,B,78
 801         W_PRECALC_00_15_3       15                                      // 48(sp) = W_TMP = W20 + K
 802         RR1                                     F4,C,D,E,A,B,78
 803         .endm
 804
 805
 806         #undef  W_PRECALC_00_15_0
 807         #undef  W_PRECALC_00_15_1
 808         #undef  W_PRECALC_16_31_0
 809         #undef  W_PRECALC_32_79_0
 810         #undef  W_PRECALC_32_79_0_i386
 811
 812
 813
 814         /*
 815
 816                 The following are 3 macro definitions that are no-ssse3 variants of the previous 3 macro definitions.
 817
 818                 INITIAL_W_PRECALC_nossse3
 819                 INTERNAL_nossse3
 820                 SOFTWARE_PIPELINING_nossse3
 821
 822                 They will be used in a sha1 code main body definition that will be used for system without ssse3 support.
 823
 824         */
 825
 826         #define W_PRECALC_00_15_0       W_PRECALC_00_15_0_nossse3
 827         #define W_PRECALC_00_15_1       W_PRECALC_00_15_1_nossse3
 828         #define W_PRECALC_16_31_0       W_PRECALC_16_31_0_nossse3
 829         #define W_PRECALC_32_79_0       W_PRECALC_32_79_0_nossse3
 830         #define W_PRECALC_32_79_0_i386  W_PRECALC_32_79_0_i386_nossse3
 831
 832
 833         .macro  INITIAL_W_PRECALC_nossse3
 834
 835         // i=0  : W28,W24,W20,W16,W12,W8,W4,W0
 836         W_PRECALC_00_15_0       0                                       // W_TMP = (BUFFER_PTR)
 837         W_PRECALC_00_15_1       W0                                      // convert W_TMP to big-endian, and save W0 = W_TMP
 838         W_PRECALC_00_15_2                                               // W_TMP = W0 + K
 839         W_PRECALC_00_15_3       3                                       // (sp) = W_TMP = W0 + K
 840
 841         // i=4  : W24,W20,W16,W12,W8,W4,W0,W28
 842         W_PRECALC_00_15_0       4                                       // W_TMP = 16(BUFFER_PTR)
 843         W_PRECALC_00_15_1       W28                                     // convert W_TMP to big-endian, and save W28 = W_TMP
 844         W_PRECALC_00_15_2                                               // W_TMP = W28 + K
 845         W_PRECALC_00_15_3       7                                       // 16(sp) = W_TMP = W28 + K
 846
 847         // i=8  : W20,W16,W12,W8,W4,W0,W28,W24
 848         W_PRECALC_00_15_0       8                                       // W_TMP = 32(BUFFER_PTR)
 849         W_PRECALC_00_15_1       W24                                     // convert W_TMP to big-endian, and save W24 = W_TMP
 850         W_PRECALC_00_15_2                                               // W_TMP = W24 + K
 851         W_PRECALC_00_15_3       11                                      // 32(sp) = W_TMP = W24 + K
 852
 853         // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
 854         W_PRECALC_00_15_0       12                                      // W_TMP = 48(BUFFER_PTR)
 855         W_PRECALC_00_15_1       W20                                     // convert W_TMP to big-endian, and save W20 = W_TMP
 856         W_PRECALC_00_15_2                                               // W_TMP = W20 + K
 857         W_PRECALC_00_15_3       15                                      // 48(sp) = W_TMP = W20 + K
 858
 859         .endm
 860
 861
 862         .macro  INTERNAL_nossse3
 863         // i=16
 864         // circular buffer : W12,W8,W4,W0,W28,W24,W20,W16
 865         W_PRECALC_16_31_0       W0,W28,W24,W20,W16
 866         RR0                                     F1,A,B,C,D,E,0
 867         W_PRECALC_16_31_1       W0,W16
 868         RR1                                     F1,A,B,C,D,E,0
 869         W_PRECALC_16_31_2       W16
 870         RR0                                     F1,D,E,A,B,C,2
 871         W_PRECALC_16_31_3       W16, 2, 0
 872         RR1                                     F1,D,E,A,B,C,2
 873
 874         // i=20,
 875         // W8,W4,W0,W28,W24,W20,W16,W12
 876         W_PRECALC_16_31_0       W28,W24,W20,W16,W12
 877         RR0                                     F1,B,C,D,E,A,4
 878         W_PRECALC_16_31_1       W28,W12
 879         RR1                                     F1,B,C,D,E,A,4
 880
 881         W_PRECALC_16_31_2       W12
 882         RR0                                     F1,E,A,B,C,D,6
 883         W_PRECALC_16_31_3       W12, 6, 16
 884         RR1                                     F1,E,A,B,C,D,6
 885
 886         // i=24,
 887         // W4,W0,W28,W24,W20,W16,W12,W8
 888         W_PRECALC_16_31_0       W24,W20,W16,W12,W8
 889         RR0                                     F1,C,D,E,A,B,8
 890         W_PRECALC_16_31_1       W24,W8
 891         RR1                                     F1,C,D,E,A,B,8
 892
 893         W_PRECALC_16_31_2       W8
 894         RR0                                     F1,A,B,C,D,E,10
 895         W_PRECALC_16_31_3       W8,10,16
 896         RR1                                     F1,A,B,C,D,E,10
 897
 898         // i=28
 899         // W0,W28,W24,W20,W16,W12,W8,W4
 900         W_PRECALC_16_31_0       W20,W16,W12,W8,W4
 901         RR0                                     F1,D,E,A,B,C,12
 902         W_PRECALC_16_31_1       W20,W4
 903         RR1                                     F1,D,E,A,B,C,12
 904
 905         W_PRECALC_16_31_2       W4
 906         RR0                                     F1,B,C,D,E,A,14
 907         W_PRECALC_16_31_3       W4,14,16
 908         RR1                                     F1,B,C,D,E,A,14
 909
 910         //i=32
 911         // W28,W24,W20,W16,W12,W8,W4,W0
 912         W_PRECALC_32_79_0       W28,W8,W4,W0
 913         RR0                                     F1,E,A,B,C,D,16
 914         W_PRECALC_32_79_1       W16,W0
 915         RR1                                     F1,E,A,B,C,D,16
 916         W_PRECALC_32_79_2       W0
 917         RR0                                     F1,C,D,E,A,B,18
 918         W_PRECALC_32_79_3       W0,18,16
 919         RR1                                     F1,C,D,E,A,B,18
 920
 921         //i=36
 922         // W24,W20,W16,W12,W8,W4,W0,W28
 923 #if defined (__x86_64__)
 924         W_PRECALC_32_79_0       W24,W4,W0,W28
 925 #else
 926         W_PRECALC_32_79_0_i386  W24,W4,W0,W28
 927 #endif
 928         RR0                                     F2,A,B,C,D,E,20
 929         W_PRECALC_32_79_1       W12,W28
 930         RR1                                     F2,A,B,C,D,E,20
 931 #if defined (__x86_64__)
 932         W_PRECALC_32_79_2       W28
 933 #else
 934         W_PRECALC_32_79_2_i386  W28
 935 #endif
 936         RR0                                     F2,D,E,A,B,C,22
 937         W_PRECALC_32_79_3       W28,22,16
 938         RR1                                     F2,D,E,A,B,C,22
 939
 940         //i=40
 941         #undef  K_XMM
 942     #define K_XMM   32
 943         // W20,W16,W12,W8,W4,W0,W28,W24
 944 #if defined (__x86_64__)
 945         W_PRECALC_32_79_0       W20,W0,W28,W24
 946 #else
 947         W_PRECALC_32_79_0_i386  W20,W0,W28,W24
 948 #endif
 949         RR0                                     F2,B,C,D,E,A,24
 950         W_PRECALC_32_79_1       W8,W24
 951         RR1                                     F2,B,C,D,E,A,24
 952 #if defined (__x86_64__)
 953         W_PRECALC_32_79_2       W24
 954 #else
 955         W_PRECALC_32_79_2_i386  W24
 956 #endif
 957         RR0                                     F2,E,A,B,C,D,26
 958         W_PRECALC_32_79_3       W24,26,K_XMM
 959         RR1                                     F2,E,A,B,C,D,26
 960
 961         //i=44
 962         // W16,W12,W8,W4,W0,W28,W24,W20
 963         W_PRECALC_32_79_0       W16,W28,W24,W20
 964         RR0                                     F2,C,D,E,A,B,28
 965         W_PRECALC_32_79_1       W4,W20
 966         RR1                                     F2,C,D,E,A,B,28
 967         W_PRECALC_32_79_2       W20
 968         RR0                                     F2,A,B,C,D,E,30
 969         W_PRECALC_32_79_3       W20,30,K_XMM
 970         RR1                                     F2,A,B,C,D,E,30
 971
 972         //i=48
 973         // W12,W8,W4,W0,W28,W24,W20,W16
 974         W_PRECALC_32_79_0       W12,W24,W20,W16
 975         RR0                                     F2,D,E,A,B,C,32
 976         W_PRECALC_32_79_1       W0,W16
 977         RR1                                     F2,D,E,A,B,C,32
 978         W_PRECALC_32_79_2       W16
 979         RR0                                     F2,B,C,D,E,A,34
 980         W_PRECALC_32_79_3       W16,34,K_XMM
 981         RR1                                     F2,B,C,D,E,A,34
 982
 983         //i=52
 984         // W8,W4,W0,W28,W24,W20,W16,W12
 985         W_PRECALC_32_79_0       W8,W20,W16,W12
 986         RR0                                     F2,E,A,B,C,D,36
 987         W_PRECALC_32_79_1       W28,W12
 988         RR1                                     F2,E,A,B,C,D,36
 989         W_PRECALC_32_79_2       W12
 990         RR0                                     F2,C,D,E,A,B,38
 991         W_PRECALC_32_79_3       W12,38,K_XMM
 992         RR1                                     F2,C,D,E,A,B,38
 993
 994         //i=56
 995         // W4,W0,W28,W24,W20,W16,W12,W8
 996         W_PRECALC_32_79_0       W4,W16,W12,W8
 997         RR0                                     F3,A,B,C,D,E,40
 998         W_PRECALC_32_79_1       W24,W8
 999         RR1                                     F3,A,B,C,D,E,40
1000         W_PRECALC_32_79_2       W8
1001         RR0                                     F3,D,E,A,B,C,42
1002         W_PRECALC_32_79_3       W8,42,K_XMM
1003         RR1                                     F3,D,E,A,B,C,42
1004
1005         //i=60
1006         #undef  K_XMM
1007         #define K_XMM   48
1008         // W0,W28,W24,W20,W16,W12,W8,W4
1009         W_PRECALC_32_79_0       W0,W12,W8,W4
1010         RR0                                     F3,B,C,D,E,A,44
1011         W_PRECALC_32_79_1       W20,W4
1012         RR1                                     F3,B,C,D,E,A,44
1013         W_PRECALC_32_79_2       W4
1014         RR0                                     F3,E,A,B,C,D,46
1015         W_PRECALC_32_79_3       W4,46,K_XMM
1016         RR1                                     F3,E,A,B,C,D,46
1017
1018         //i=64
1019         // W28,W24,W20,W16,W12,W8,W4,W0
1020         W_PRECALC_32_79_0       W28,W8,W4,W0
1021         RR0                                     F3,C,D,E,A,B,48
1022         W_PRECALC_32_79_1       W16,W0
1023         RR1                                     F3,C,D,E,A,B,48
1024         W_PRECALC_32_79_2       W0
1025         RR0                                     F3,A,B,C,D,E,50
1026         W_PRECALC_32_79_3       W0,50,K_XMM
1027         RR1                                     F3,A,B,C,D,E,50
1028
1029         //i=68
1030         // W24,W20,W16,W12,W8,W4,W0,W28
1031 #if defined (__x86_64__)
1032         W_PRECALC_32_79_0       W24,W4,W0,W28
1033 #else
1034         W_PRECALC_32_79_0_i386  W24,W4,W0,W28
1035 #endif
1036         RR0                                     F3,D,E,A,B,C,52
1037         W_PRECALC_32_79_1       W12,W28
1038         RR1                                     F3,D,E,A,B,C,52
1039 #if defined (__x86_64__)
1040         W_PRECALC_32_79_2       W28
1041 #else
1042         W_PRECALC_32_79_2_i386  W28
1043 #endif
1044         RR0                                     F3,B,C,D,E,A,54
1045         W_PRECALC_32_79_3       W28,54,K_XMM
1046         RR1                                     F3,B,C,D,E,A,54
1047
1048         //i=72
1049         // W20,W16,W12,W8,W4,W0,W28,W24
1050 #if defined (__x86_64__)
1051         W_PRECALC_32_79_0       W20,W0,W28,W24
1052 #else
1053         W_PRECALC_32_79_0_i386  W20,W0,W28,W24
1054 #endif
1055         RR0                                     F3,E,A,B,C,D,56
1056         W_PRECALC_32_79_1       W8,W24
1057         RR1                                     F3,E,A,B,C,D,56
1058 #if defined (__x86_64__)
1059         W_PRECALC_32_79_2       W24
1060 #else
1061         W_PRECALC_32_79_2_i386  W24
1062 #endif
1063         RR0                                     F3,C,D,E,A,B,58
1064         W_PRECALC_32_79_3       W24,58,K_XMM
1065         RR1                                     F3,C,D,E,A,B,58
1066
1067         // starting using F4
1068
1069         //i=76
1070         // W16,W12,W8,W4,W0,W28,W24,W20
1071         W_PRECALC_32_79_0       W16,W28,W24,W20
1072         RR0                                     F4,A,B,C,D,E,60
1073         W_PRECALC_32_79_1       W4,W20
1074         RR1                                     F4,A,B,C,D,E,60
1075         W_PRECALC_32_79_2       W20
1076         RR0                                     F4,D,E,A,B,C,62
1077         W_PRECALC_32_79_3       W20,62,K_XMM
1078         RR1                                     F4,D,E,A,B,C,62
1079
1080         .endm
1081
1082         .macro  SOFTWARE_PIPELINING_nossse3
1083         // i=0  : W28,W24,W20,W16,W12,W8,W4,W0
1084         W_PRECALC_00_15_0       0                                       // W_TMP = (BUFFER_PTR)
1085         RR0                                     F4,B,C,D,E,A,64
1086         W_PRECALC_00_15_1       W0                                      // convert W_TMP to big-endian, and save W0 = W_TMP
1087         RR1                                     F4,B,C,D,E,A,64
1088         W_PRECALC_00_15_2                                               // W_TMP = W0 + K
1089         RR0                                     F4,E,A,B,C,D,66
1090         W_PRECALC_00_15_3       3                                       // (sp) = W_TMP = W0 + K
1091         RR1                                     F4,E,A,B,C,D,66
1092
1093         // i=4  : W24,W20,W16,W12,W8,W4,W0,W28
1094         W_PRECALC_00_15_0       4                                       // W_TMP = 16(BUFFER_PTR)
1095         RR0                                     F4,C,D,E,A,B,68
1096         W_PRECALC_00_15_1       W28                                     // convert W_TMP to big-endian, and save W28 = W_TMP
1097         RR1                                     F4,C,D,E,A,B,68
1098         W_PRECALC_00_15_2                                               // W_TMP = W28 + K
1099         RR0                                     F4,A,B,C,D,E,70
1100         W_PRECALC_00_15_3       7                                       // 16(sp) = W_TMP = W28 + K[0]
1101         RR1                                     F4,A,B,C,D,E,70
1102
1103         // i=8  : W20,W16,W12,W8,W4,W0,W28,W24
1104         W_PRECALC_00_15_0       8                                       // W_TMP = 32(BUFFER_PTR)
1105         RR0                                     F4,D,E,A,B,C,72
1106         W_PRECALC_00_15_1       W24                                     // convert W_TMP to big-endian, and save W24 = W_TMP
1107         RR1                                     F4,D,E,A,B,C,72
1108         W_PRECALC_00_15_2                                               // W_TMP = W24 + K
1109         RR0                                     F4,B,C,D,E,A,74
1110         W_PRECALC_00_15_3       11                                      // 32(sp) = W_TMP = W24 + K
1111         RR1                                     F4,B,C,D,E,A,74
1112
1113         // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
1114         W_PRECALC_00_15_0       12                                      // W_TMP = 48(BUFFER_PTR)
1115         RR0                                     F4,E,A,B,C,D,76
1116         W_PRECALC_00_15_1       W20                                     // convert W_TMP to big-endian, and save W20 = W_TMP
1117         RR1                                     F4,E,A,B,C,D,76
1118         W_PRECALC_00_15_2                                               // W_TMP = W20 + K
1119         RR0                                     F4,C,D,E,A,B,78
1120         W_PRECALC_00_15_3       15                                      // 48(sp) = W_TMP = W20 + K
1121         RR1                                     F4,C,D,E,A,B,78
1122         .endm
1123
1124         .macro  ENDING          // finish up updating hash digests (i=64:79)
1125         //i=80
1126         RR0                                     F4,B,C,D,E,A,64
1127         RR1                                     F4,B,C,D,E,A,64
1128         RR0                                     F4,E,A,B,C,D,66
1129         RR1                                     F4,E,A,B,C,D,66
1130
1131         //i=84
1132         RR0                                     F4,C,D,E,A,B,68
1133         RR1                                     F4,C,D,E,A,B,68
1134         RR0                                     F4,A,B,C,D,E,70
1135         RR1                                     F4,A,B,C,D,E,70
1136
1137         //i=88
1138         RR0                                     F4,D,E,A,B,C,72
1139         RR1                                     F4,D,E,A,B,C,72
1140         RR0                                     F4,B,C,D,E,A,74
1141         RR1                                     F4,B,C,D,E,A,74
1142
1143         //i=92
1144         RR0                                     F4,E,A,B,C,D,76
1145         RR1                                     F4,E,A,B,C,D,76
1146         RR0                                     F4,C,D,E,A,B,78
1147         RR1                                     F4,C,D,E,A,B,78
1148         .endm
1149
1150         // load hash digests A,B,C,D,E from memory into registers
1151         .macro  LOAD_HASH
1152 #if defined (__x86_64__)
1153         mov                     (HASH_PTR), A
1154         mov                     4(HASH_PTR), B
1155         mov                     8(HASH_PTR), C
1156         mov                     12(HASH_PTR), D
1157         mov                     16(HASH_PTR), E
1158 #else
1159     mov         HASH_PTR, T1
1160     mov         (T1), A
1161     mov         4(T1), B
1162     mov         8(T1), C
1163     mov         12(T1), D
1164     mov         16(T1), E
1165 #endif
1166         .endm
1167
1168         .macro  UPDATE_HASH
1169         add             $0, $1
1170         mov             $1, $0
1171         .endm
1172
1173         .macro UPDATE_ALL_HASH
1174 #if defined (__x86_64__)
1175         UPDATE_HASH             (HASH_PTR), A
1176         UPDATE_HASH             4(HASH_PTR), B
1177         UPDATE_HASH             8(HASH_PTR), C
1178         UPDATE_HASH             12(HASH_PTR), D
1179         UPDATE_HASH             16(HASH_PTR), E
1180 #else
1181     mov             HASH_PTR, T1
1182     UPDATE_HASH     (T1), A
1183     UPDATE_HASH     4(T1), B
1184     UPDATE_HASH     8(T1), C
1185     UPDATE_HASH     12(T1), D
1186     UPDATE_HASH     16(T1), E
1187 #endif
1188         .endm
1189
1190
1191         /*
1192                  main sha1 code for system without ssse3 support
1193         */
1194
1195         .macro  SHA1_PIPELINED_MAIN_BODY_nossse3
1196         LOAD_HASH                                               // load initial hashes into A,B,C,D,E (registers)
1197         INITIAL_W_PRECALC_nossse3               // big_endian_load(W) and W+K (i=0:15)
1198         .align  4,0x90
1199 0:
1200         INTERNAL_nossse3                                // update W (i=16:79) and update ABCDE (i=0:63)
1201 #if Multiple_Blocks
1202 #if defined(__x86_64__)
1203         add     $$64, BUFFER_PTR                        // BUFFER_PTR+=64;
1204         sub     $$1, cnt                                        // pre-decrement cnt by 1
1205 #else
1206         addl    $$64, BUFFER_PTR                        // BUFFER_PTR+=64;
1207         subl    $$1, cnt                                        // pre-decrement cnt by 1
1208 #endif
1209         jbe     1f                                                      // if cnt <= 0, branch to finish off
1210         SOFTWARE_PIPELINING_nossse3             // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
1211         UPDATE_ALL_HASH                                 // update output hashes
1212         jmp     0b                                                      // repeat for next block
1213         .align  4,0x90
1214 1:
1215 #endif
1216         ENDING                                                  // update ABCDE (i=64:79)
1217         UPDATE_ALL_HASH                                 // update output hashes
1218         .endm
1219
1220         /*
1221                  main sha1 code for system with ssse3 support
1222         */
1223
1224         .macro  SHA1_PIPELINED_MAIN_BODY_ssse3
1225         LOAD_HASH                                               // load initial hashes into A,B,C,D,E
1226         INITIAL_W_PRECALC_ssse3                 // big_endian_load(W) and W+K (i=0:15)
1227         .align  4,0x90
1228 0:
1229         INTERNAL_ssse3                                  // update W (i=16:79) and update ABCDE (i=0:63)
1230 #if Multiple_Blocks
1231 #if defined(__x86_64__)
1232         add     $$64, BUFFER_PTR                        // BUFFER_PTR+=64;
1233         sub     $$1, cnt                                        // pre-decrement cnt by 1
1234 #else
1235         addl    $$64, BUFFER_PTR                        // BUFFER_PTR+=64;
1236         subl    $$1, cnt                                        // pre-decrement cnt by 1
1237 #endif
1238         jbe     1f                                                      // if cnt <= 0, branch to finish off
1239         SOFTWARE_PIPELINING_ssse3               // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
1240         UPDATE_ALL_HASH                                 // update output hashes
1241         jmp     0b                                                      // repeat for next block
1242         .align  4,0x90
1243 1:
1244 #endif
1245         ENDING                                                  // update ABCDE (i=64:79)
1246         UPDATE_ALL_HASH                                 // update output hashes
1247         .endm
1248
1249 #ifdef  KERNEL
1250 #include <i386/cpu_capabilities.h>
1251 #else
1252 #include <System/i386/cpu_capabilities.h>
1253 #endif
1254
1255         .text
1256
1257         .globl _SHA1Transform
1258         //.private_extern       _SHA1Transform
1259 _SHA1Transform:
1260
1261         // detect SSSE3 and dispatch appropriate code branch
1262         #if defined __x86_64__
1263         movq    __cpu_capabilities@GOTPCREL(%rip), %rax         // %rax -> __cpu_capabilities
1264         mov     (%rax), %eax                                    // %eax = __cpu_capabilities
1265         #else       // i386
1266                 #if defined KERNEL
1267                 leal    __cpu_capabilities, %eax                    // %eax -> __cpu_capabilities
1268                 mov     (%eax), %eax                                // %eax = __cpu_capabilities
1269                 #else
1270                 mov    _COMM_PAGE_CPU_CAPABILITIES, %eax
1271                 #endif
1272         #endif
1273     test    $(kHasSupplementalSSE3), %eax
1274     je      _SHA1Transform_nossse3                                      // branch to no-ssse3 code
1275
1276
1277         // start the sha1 code with ssse3 support
1278
1279         // save callee-save registers
1280 #if defined (__x86_64__)
1281         push    %rbx
1282         push    %rbp
1283 #else
1284     push    %ebx
1285     push    %ebp
1286     push    %esi
1287     push    %edi
1288 #endif
1289
1290         sub             $stack_size, sp                                 // allocate stack memory for use
1291
1292         // save used xmm register if this is for kernel
1293 #if     KERNEL
1294         xmov    %xmm0, 4*16(sp)
1295         xmov    %xmm1, 5*16(sp)
1296         xmov    %xmm2, 6*16(sp)
1297         xmov    %xmm3, 7*16(sp)
1298         xmov    %xmm4, 8*16(sp)
1299         xmov    %xmm5, 9*16(sp)
1300         xmov    %xmm6, 10*16(sp)
1301         xmov    %xmm7, 11*16(sp)
1302 #if defined (__x86_64__)
1303         xmov    %xmm8, 12*16(sp)
1304         xmov    %xmm9, 13*16(sp)
1305         xmov    %xmm10, 14*16(sp)
1306 #endif
1307 #endif
1308
1309 #if defined (__x86_64__)
1310
1311         // set up registers to free %edx/%edi/%esi for other use (ABCDE)
1312         mov             ctx, HASH_PTR
1313         mov             buf, BUFFER_PTR
1314 #if Multiple_Blocks
1315         mov             %rdx, cnt
1316 #endif
1317         lea             K_XMM_AR(%rip), K_BASE
1318         xmov    0x40(K_BASE), XMM_SHUFB_BSWAP
1319
1320 #else   // __i386__
1321
1322 #if     KERNEL
1323     lea     K_XMM_AR, %eax
1324 #else
1325         // Get address of 0 in R.
1326            call    0f          // Push program counter onto stack.
1327         0: pop     %eax      // Get program counter.
1328                 lea     K_XMM_AR-0b(%eax), %eax
1329 #endif
1330     mov     %eax, K_BASE
1331     xmov    0x40(%eax), %xmm0
1332     xmov    %xmm0, XMM_SHUFB_BSWAP
1333
1334 #endif
1335
1336         SHA1_PIPELINED_MAIN_BODY_ssse3
1337
1338         // restore used xmm registers if this is for kernel
1339 #if     KERNEL
1340         xmov    4*16(sp), %xmm0
1341         xmov    5*16(sp), %xmm1
1342         xmov    6*16(sp), %xmm2
1343         xmov    7*16(sp), %xmm3
1344         xmov    8*16(sp), %xmm4
1345         xmov    9*16(sp), %xmm5
1346         xmov    10*16(sp), %xmm6
1347         xmov    11*16(sp), %xmm7
1348 #if defined (__x86_64__)
1349         xmov    12*16(sp), %xmm8
1350         xmov    13*16(sp), %xmm9
1351         xmov    14*16(sp), %xmm10
1352 #endif
1353 #endif
1354
1355         add             $stack_size, sp         // deallocate stack memory
1356
1357         // restore callee-save registers
1358 #if defined (__x86_64__)
1359         pop             %rbp
1360         pop             %rbx
1361 #else
1362     pop     %edi
1363     pop     %esi
1364     pop     %ebp
1365     pop     %ebx
1366 #endif
1367
1368         ret                                                     // return
1369
1370         // this is equivalent to the above function _SHA1Transform, but it does not use ssse3 instructions
1371
1372         .globl _SHA1Transform_nossse3
1373         .private_extern _SHA1Transform_nossse3
1374 _SHA1Transform_nossse3:
1375
1376         // push callee-save registers
1377 #if defined (__x86_64__)
1378         push    %rbx
1379         push    %rbp
1380 #else
1381     push    %ebx
1382     push    %ebp
1383     push    %esi
1384     push    %edi
1385 #endif
1386
1387         sub             $stack_size, sp                 // allocate stack memory for local use
1388
1389         // save used xmm registers if this is for kernel
1390 #if     KERNEL
1391         xmov    %xmm0, 4*16(sp)
1392         xmov    %xmm1, 5*16(sp)
1393         xmov    %xmm2, 6*16(sp)
1394         xmov    %xmm3, 7*16(sp)
1395         xmov    %xmm4, 8*16(sp)
1396         xmov    %xmm5, 9*16(sp)
1397         xmov    %xmm6, 10*16(sp)
1398         xmov    %xmm7, 11*16(sp)
1399 #if defined (__x86_64__)
1400         xmov    %xmm8, 12*16(sp)
1401         xmov    %xmm9, 13*16(sp)
1402 #endif
1403 #endif
1404
1405 #if defined (__x86_64__)
1406
1407         // set up registers to free %edx/%edi/%esi for other use (ABCDE)
1408         mov             ctx, HASH_PTR
1409         mov             buf, BUFFER_PTR
1410 #if Multiple_Blocks
1411         mov             %rdx, cnt
1412 #endif
1413         lea             K_XMM_AR(%rip), K_BASE
1414
1415 #else   // __i386__
1416
1417 #if     KERNEL
1418     lea     K_XMM_AR, %eax
1419 #else
1420         // Get address of 0 in R.
1421            call    0f          // Push program counter onto stack.
1422         0: pop     %eax      // Get program counter.
1423                 lea     K_XMM_AR-0b(%eax), %eax
1424 #endif
1425     mov     %eax, K_BASE
1426
1427 #endif
1428
1429         SHA1_PIPELINED_MAIN_BODY_nossse3
1430
1431         // restore used xmm registers if this is for kernel
1432 #if     KERNEL
1433         xmov    4*16(sp), %xmm0
1434         xmov    5*16(sp), %xmm1
1435         xmov    6*16(sp), %xmm2
1436         xmov    7*16(sp), %xmm3
1437         xmov    8*16(sp), %xmm4
1438         xmov    9*16(sp), %xmm5
1439         xmov    10*16(sp), %xmm6
1440         xmov    11*16(sp), %xmm7
1441 #if defined (__x86_64__)
1442         xmov    12*16(sp), %xmm8
1443         xmov    13*16(sp), %xmm9
1444 #endif
1445 #endif
1446
1447         add             $stack_size, sp         // deallocate stack memory
1448
1449         // restore callee-save registers
1450 #if defined (__x86_64__)
1451         pop             %rbp
1452         pop             %rbx
1453 #else
1454     pop     %edi
1455     pop     %esi
1456     pop     %ebp
1457     pop     %ebx
1458 #endif
1459
1460         ret                                                     // return
1461
1462         .const
1463         .align  4, 0x90
1464
1465 #define K1 0x5a827999
1466 #define K2 0x6ed9eba1
1467 #define K3 0x8f1bbcdc
1468 #define K4 0xca62c1d6
1469
1470 K_XMM_AR:
1471     .long       K1
1472         .long   K1
1473         .long   K1
1474         .long   K1
1475     .long       K2
1476         .long   K2
1477         .long   K2
1478         .long   K2
1479     .long       K3
1480         .long   K3
1481         .long   K3
1482         .long   K3
1483     .long       K4
1484         .long   K4
1485         .long   K4
1486         .long   K4
1487 // bswap_shufb_ctl:     invoked thru 0x40(K_XMM_AR)
1488     .long       0x00010203
1489     .long       0x04050607
1490     .long       0x08090a0b
1491     .long       0x0c0d0e0f
1492
1493
1494
1495 #endif  // architecture x86_64 or i386