1 /* Apple Copyright 2009
2 CoreOS - vector & Numerics, cclee 10-22-09
The following source code implements a vectorized version of the adler32 checksum defined in zlib.
5 The target architectures are x86_64 and i386.
Given two unsigned 32-bit integers adler and sum2 (both already reduced modulo BASE=65521) and a
sequence of input bytes x[0],...,x[N-1], the adler-sum2 pair is updated according to

for (i=0;i<N;i++) {
    adler = (adler+x[i])%BASE;
    sum2 = (sum2+adler)%BASE;
}
To save the modulo operations, it can be shown that, if the initial adler and sum2 are less than
BASE (=65521), adler and sum2 (in 32-bit representation) will never overflow within the next
NMAX=5552 bytes. This simplifies the algorithm to

for (i=0;i<N;i+=NMAX) {
    for (k=0;k<NMAX;k++) {
        adler += x[i+k];
        sum2 += adler;
    }
    adler %= BASE; sum2 %= BASE;
}

The hand optimization of this function is now reduced to the inner block

for (k=0;k<NMAX;k++) {
    adler += x[k];
    sum2 += adler;
}
This subtask turns out to be very vectorizable. Suppose we perform the adler/sum2 update once per
K bytes. It can be shown that the sum2-adler pair can be updated according to

sum2 += adler*K;
adler += (x[0] + x[1] + ... + x[K-1]);
sum2 += (x[0]*K + x[1]*(K-1) + ... + x[K-1]*1);

(The old adler is folded into sum2 once per step, hence the adler*K term; x[j] joins adler after
j bytes and is then folded into sum2 once per remaining step, so it contributes K-j copies.)
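As a sanity check, the per-block identity can be verified against the byte-at-a-time recurrence
with a small C sketch (a hypothetical illustration added for clarity, not part of the original):

void update_one_block(unsigned *adler, unsigned *sum2, const unsigned char *x, int K)
{
    unsigned a = *adler, s = *sum2;
    s += a * (unsigned)K;              // sum2 += adler*K
    for (int j = 0; j < K; j++) {
        a += x[j];                     // adler += x[j]
        s += (unsigned)(K - j) * x[j]; // x[j] is folded into sum2 (K-j) times
    }
    *adler = a; *sum2 = s;             // caller reduces modulo BASE as needed
}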
The last two equations show that the adler-sum2 pair update can be sped up with a vector
processor. The input is the byte vector [ x[0] x[1] ... x[K-1] ], and we need two coefficient
vectors:

[ 1 1 1 ... 1 ] for the adler update;
[ K K-1 ... 1 ] for the sum2 update.

The implementation below reads vectors (K=16,32,48,64) into xmm registers and sets up the
coefficient vectors in xmm registers. It then uses SSE instructions to perform the vector
computation described above.
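In C intrinsics terms, one K=16 step looks roughly like the following sketch (a hypothetical
illustration mirroring the DO16 macro defined below; do16_sketch is a made-up name; psadbw,
pmaddubsw and pmaddwd correspond to _mm_sad_epu8, _mm_maddubs_epi16 and _mm_madd_epi16):

#include <tmmintrin.h>   // SSSE3 intrinsics (SSE2 headers pulled in transitively)

static inline void do16_sketch(unsigned *adler, unsigned *sum2, const unsigned char *buf)
{
    const __m128i zero  = _mm_setzero_si128();
    const __m128i coeff = _mm_setr_epi8(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
    const __m128i ones  = _mm_set1_epi16(1);
    __m128i v    = _mm_loadu_si128((const __m128i *)buf);
    *sum2 += 16 * *adler;                           // sum2 += adler*K
    __m128i sad  = _mm_sad_epu8(v, zero);           // psadbw: two partial byte sums
    __m128i prod = _mm_maddubs_epi16(v, coeff);     // pmaddubsw: x[j]*(16-j) pairs, 16-bit
    __m128i s32  = _mm_madd_epi16(prod, ones);      // pmaddwd: four 32-bit partial sums
    *adler += (unsigned)(_mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4));
    s32 = _mm_add_epi32(s32, _mm_srli_si128(s32, 8));
    s32 = _mm_add_epi32(s32, _mm_srli_si128(s32, 4));
    *sum2 += (unsigned)_mm_cvtsi128_si32(s32);
}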
For i386, whenever a full NMAX-byte block is available, the code invokes the macro DO32 (K=32)
NMAX/32 = 173 times, followed by a single DO16 (K=16) (173*32 + 16 = 5552 = NMAX), before
performing a modulo operation on adler and sum2.

For x86_64 (where more xmm registers are available), whenever a full NMAX-byte block is
available, the code invokes the macro DO64 (K=64) NMAX/64 = 86 times, followed by a single
DO48 (K=48) (86*64 + 48 = 5552 = NMAX), before performing a modulo operation on adler and sum2.
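In outline, the x86_64 driver therefore behaves like the following C sketch (added for clarity):

while (len >= NMAX) {
    len -= NMAX;
    for (n = NMAX/64; n; n--) DO64;  // 86 iterations of K=64
    DO48;                            // one K=48 step: 86*64 + 48 = NMAX
    modulo_BASE;                     // reduce adler and sum2 mod BASE
}
while (len >= 64) { DO64; len -= 64; }
if (len >= 32) { DO32; len -= 32; }
if (len >= 16) { DO16; len -= 16; }
while (len--) { adler += *buf++; sum2 += adler; }
modulo_BASE;
return (sum2 << 16) | adler;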
/* cpu_capabilities is now probed for kHasSupplementalSSE3 to branch into code with or without
SupplementalSSE3.

Previously, the ssse3 code was intentionally turned off, because Yonah does not support ssse3.
The code added here probes cpu_capabilities for ssse3 support:
if ssse3 is supported, it branches to the ssse3-based code; otherwise it uses the original code.
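For reference, an equivalent user-space dispatch could be written in C roughly as follows (a
hypothetical sketch using a compiler builtin rather than the __cpu_capabilities/commpage probe
used in this file; both callee names are made up):

unsigned long adler32_dispatch(unsigned adler, unsigned sum2, const unsigned char *buf, int len)
{
    if (__builtin_cpu_supports("ssse3"))                   // GCC/Clang CPU-feature builtin
        return adler32_vec_ssse3(adler, sum2, buf, len);   // hypothetical ssse3 path
    return adler32_vec_nossse3(adler, sum2, buf, len);     // hypothetical fallback path
}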
74 #define BASE 65521 /* largest prime smaller than 65536 */
75 #define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
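// Worked check of the NMAX bound (added for clarity):
//   n = 5552 : 255*5552*5553/2 + 5553*65520 = 3930857640 + 363832560 = 4294690200 <= 2^32-1
//   n = 5553 : 255*5553*5554/2 + 5554*65520 = 3932273655 + 363898080 = 4296171735 >  2^32-1
// so 5552 is indeed the largest such n, i.e. NMAX unreduced bytes can never overflow 32 bits.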
77 // uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len) {
79 // while (len >= NMAX) {
81 // n = NMAX / 16; /* NMAX is divisible by 16 */
83 // DO16(buf); /* 16 sums unrolled */
89 // if (len) { /* avoid modulos if none remaining */
90 // while (len >= 16) {
102 // return adler | (sum2 << 16);
105 #if (defined __i386__ || defined __x86_64__)
107 #include <i386/cpu_capabilities.h>
114 #if (defined __i386__)
123 #ifdef KERNEL // if this is for kernel, need to save xmm registers
124 subl $140, %esp // to save %xmm0-%xmm7 into stack, extra 12 to align %esp to 16-byte boundary
125 movaps %xmm0, 0(%esp) // save xmm0, offset -12 for ebx/edi/esi
126 movaps %xmm1, 16(%esp) // save xmm1
127 movaps %xmm2, 32(%esp) // save xmm2
128 movaps %xmm3, 48(%esp) // save xmm3
129 movaps %xmm4, 64(%esp) // save xmm4
130 movaps %xmm5, 80(%esp) // save xmm5
131 movaps %xmm6, 96(%esp) // save xmm6
132 movaps %xmm7, 112(%esp) // save xmm7, if this is for SSSE3 or above
135 #define adler %edi // 8(%ebp)
136 #define sum2 %esi // 12(%ebp)
137 #define buf %ecx // 16(%ebp)
138 #define len %ebx // 20(%ebp)
144 movl 16(%ebp), buf // use ecx as buf pointer
148 movl $$-2146992015, %eax // 1/BASE in Q47
149 mull adler // edx:eax = adler divided by BASE in Q47
shrl $$15, %edx // edx = floor(adler/BASE) (mull kept the high 32 bits, so the total shift is 47)
151 imull $$BASE, %edx, %edx // edx * BASE
152 subl %edx, adler // adler -= edx*BASE
153 movl $$-2146992015, %eax // 1/BASE in Q47
154 mull sum2 // edx:eax = sum2 divided by BASE in Q47
shrl $$15, %edx // edx = floor(sum2/BASE)
imull $$BASE, %edx, %eax // eax = edx * BASE
subl %eax, sum2 // sum2 -= edx*BASE
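// The two sequences above compute n mod BASE with a Q47 reciprocal multiply instead of a divide.
// A hypothetical C equivalent (sketch; 0x80078071 is -2146992015 reinterpreted as unsigned,
// i.e. ceil(2^47/65521), which makes the quotient exact for the operand ranges seen here):
//
//   static unsigned mod_base(unsigned n)
//   {
//       unsigned q = (unsigned)(((unsigned long long)n * 0x80078071ULL) >> 47); // q = n/BASE
//       return n - q * 65521u;                                                  // n mod BASE
//   }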
160 // update adler/sum2 according to a new 16-byte vector
162 movaps (buf), %xmm1 // 16 bytes vector, in xmm1
163 movaps %xmm1, %xmm3 // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
164 addl $$16, buf // buf -> next vector
165 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
166 pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
167 imull $$16, adler, %edx // edx = 16*adler;
168 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
169 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
170 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
171 addl %edx, sum2 // sum2 += adler*16;
172 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
173 movd %xmm1, %edx // to be added to adler
174 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
175 addl %edx, adler // update adler
176 movd %xmm3, %edx // to be added to sum2
177 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
178 addl %edx, sum2 // sum2 += 1st half of update
179 movd %xmm3, %edx // to be added to sum2
180 addl %edx, sum2 // sum2 += 2nd half of update
183 // update adler/sum2 according to a new 32-byte vector
185 imull $$32, adler, %edx // edx = 32*adler
186 movaps (buf), %xmm1 // 1st 16 bytes vector
187 movaps 16(buf), %xmm7 // 2nd 16 bytes vector
188 movaps %xmm1, %xmm3 // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
189 movaps %xmm7, %xmm2 // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
190 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
191 psadbw zero, %xmm7 // 2 16-bit words to be added for adler in xmm7
192 addl %edx, sum2 // sum2 += adler*32;
193 pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
194 pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2
195 paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1
196 paddd %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
197 addl $$32, buf // buf -> vector for next iteration
198 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
199 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
200 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
201 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
202 movd %xmm1, %edx // to be added to adler
203 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
204 addl %edx, adler // update adler
205 movd %xmm3, %edx // to be added to sum2
206 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
207 addl %edx, sum2 // sum2 += 1st half of update
208 movd %xmm3, %edx // to be added to sum2
209 addl %edx, sum2 // sum2 += 2nd half of update
// this defines the macro DO16_nossse3, used when SSSE3 is not supported
214 movaps (buf), %xmm1 // 16 bytes vector
215 movaps %xmm1, %xmm3 // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
216 movaps %xmm1, %xmm2 // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
217 psrldq $$8, %xmm2 // shift down 8 bytes, to reuse the shuffle vector
218 punpcklbw zero, %xmm3 // convert lower 8 bytes into 8 words
219 punpcklbw zero, %xmm2 // convert higher 8 bytes into 8 words
220 pmullw %xmm6, %xmm3 // lower 8 words * 16:9
221 pmullw %xmm4, %xmm2 // higher 8 words * 8:1
222 addl $$16, buf // buf -> next vector
223 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
224 paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
225 imull $$16, adler, %edx // edx = 16*adler;
226 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
227 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
228 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
229 addl %edx, sum2 // sum2 += adler*16;
230 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
231 movd %xmm1, %edx // to be added to adler
232 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
233 addl %edx, adler // update adler
234 movd %xmm3, %edx // to be added to sum2
235 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
236 addl %edx, sum2 // sum2 += 1st half of update
237 movd %xmm3, %edx // to be added to sum2
238 addl %edx, sum2 // sum2 += 2nd half of update
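// Without pmaddubsw, the macro above widens the 16 bytes to words (punpcklbw with zero),
// multiplies by the word coefficients (pmullw), and only then reduces with pmaddwd.
// A hypothetical intrinsics sketch of that widening step (coeff_hi, coeff_lo and ones are
// assumed preloaded with the words 16..9, 8..1 and all-ones respectively):
//
//   __m128i v    = _mm_loadu_si128((const __m128i *)buf);
//   __m128i zero = _mm_setzero_si128();
//   __m128i lo   = _mm_unpacklo_epi8(v, zero);                    // bytes 0..7  -> words
//   __m128i hi   = _mm_unpacklo_epi8(_mm_srli_si128(v, 8), zero); // bytes 8..15 -> words
//   __m128i prod = _mm_add_epi16(_mm_mullo_epi16(lo, coeff_hi),   // * 16..9
//                                _mm_mullo_epi16(hi, coeff_lo));  // *  8..1
//   __m128i s32  = _mm_madd_epi16(prod, ones);                    // four 32-bit sums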
#ifdef KERNEL
leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
mov (%eax), %eax // %eax = __cpu_capabilities
#else
mov _COMM_PAGE_CPU_CAPABILITIES, %eax // %eax = __cpu_capabilities from the commpage
#endif
test $(kHasSupplementalSSE3), %eax // __cpu_capabilities & kHasSupplementalSSE3
250 // i386 adler32 with ssse3
252 // need to fill up xmm4/xmm5/xmm6 only if len>=16
cmpl $16, len // len vs 16
jl L_skip_loading_tables // if len < 16, skip loading the coefficient tables
256 // set up table starting address to %eax
257 leal sum2_coefficients, %eax
259 // reading coefficients
261 movaps (%eax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17
262 movaps 16(%eax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1
263 movaps 32(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
265 L_skip_loading_tables:
267 cmpl $NMAX, len // len vs NMAX
268 jl len_lessthan_NMAX // if (len < NMAX), skip the following NMAX batches processing
270 len_ge_NMAX_loop: // while (len>=NMAX) {
272 subl $NMAX, len // len -= NMAX
273 movl $(NMAX/32), %eax // n = NMAX/32
276 DO32 // update adler/sum2 for a 32-byte input
278 jg n_loop // } while (n);
279 DO16 // update adler/sum2 for a 16-byte input
280 modulo_BASE // (adler/sum2) modulo BASE;
282 jge len_ge_NMAX_loop // } /* len>=NMAX */
286 subl $32, len // pre-decrement len by 32
287 jl len_lessthan_32 // if len < 32, skip the 32-vector code
288 len32_loop: // while (len>=32) {
289 DO32 // update adler/sum2 for a 32-byte input
290 subl $32, len // len -= 32;
295 addl $(32-16), len // post-increment by 32 + pre-decrement by 16 on len
296 jl L_len_lessthan_16 // if len < 16, skip the 16-vector code
297 DO16 // update adler/sum2 for a 16-byte input
298 subl $16, len // len -= 16;
301 addl $16, len // post-increment len by 16
302 jz len_is_zero // if len==0, branch over scalar processing
305 movzbl (buf), %edx // new input byte
307 addl %edx, adler // adler += *buf
308 addl adler, sum2 // sum2 += adler
309 subl $1, len // len--
314 modulo_BASE // (adler/sum2) modulo BASE;
316 // construct 32-bit (sum2<<16 | adler) to be returned
318 sall $16, sum2 // sum2 <<16
319 movl adler, %eax // adler
320 orl sum2, %eax // sum2<<16 | adler
323 #ifdef KERNEL // if this is for kernel code, need to restore xmm registers
324 movaps (%esp), %xmm0 // restore xmm0, offset -12 for ebx/edi/esi
325 movaps 16(%esp), %xmm1 // restore xmm1
326 movaps 32(%esp), %xmm2 // restore xmm2
327 movaps 48(%esp), %xmm3 // restore xmm3
328 movaps 64(%esp), %xmm4 // restore xmm4
329 movaps 80(%esp), %xmm5 // restore xmm5
330 movaps 96(%esp), %xmm6 // restore xmm6
331 movaps 112(%esp), %xmm7 // restore xmm7, if this is for SSSE3 or above
332 addl $140, %esp // we've already restored %xmm0-%xmm7 from stack
leave // restore %ebp and %esp
344 // i386 adler32 without ssse3
346 // need to fill up xmm4/xmm5/xmm6 only if len>=16
350 // set up table starting address to %eax
351 leal sum2_coefficients, %eax
353 // reading coefficients
355 movaps 48(%eax), %xmm6 // coefficients for computing sum2 : pmaddubsw 16:9
356 movaps 64(%eax), %xmm4 // coefficients for computing sum2 : pmaddubsw 8:1
357 movaps 80(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
361 cmpl $NMAX, len // len vs NMAX
362 jl 3f // if (len < NMAX), skip the following NMAX batches processing
364 0: // while (len>=NMAX) {
366 subl $NMAX, len // len -= NMAX
367 movl $(NMAX/16), %eax // n = NMAX/16
370 DO16_nossse3 // update adler/sum2 for a 16-byte input
372 jg 1b // } while (n);
374 modulo_BASE // (adler/sum2) modulo BASE;
377 jge 0b // } /* len>=NMAX */
381 subl $16, len // pre-decrement len by 16
382 jl L_len_lessthan_16 // if len < 16, skip the 16-vector code
383 DO16_nossse3 // update adler/sum2 for a 16-byte input
384 subl $16, len // len -= 16;
385 jmp L_len_lessthan_16
390 sum2_coefficients: // used for vectorizing adler32 computation
425 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
// data for the code without ssse3
455 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
465 #else // (defined __x86_64__)
467 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
468 mov (%rax), %eax // %eax = __cpu_capabilities
469 test $(kHasSupplementalSSE3), %eax // __cpu_capabilities & kHasSupplementalSSE3
472 // ----------------------------------------------------------------------------------
473 // the following is added for x86_64 without SSSE3 support
// it is essentially a translation of the i386 non-SSSE3 code
475 // ----------------------------------------------------------------------------------
#ifdef KERNEL // if for kernel, save %xmm0-%xmm6
subq $200, %rsp // allocate 200 bytes (same frame as the ssse3 version below) to save %xmm0-%xmm6, keeping %rsp 16-byte aligned
489 movaps %xmm0, -32(%rbp)
490 movaps %xmm1, -48(%rbp)
491 movaps %xmm2, -64(%rbp)
492 movaps %xmm3, -80(%rbp)
493 movaps %xmm4, -96(%rbp)
494 movaps %xmm5, -112(%rbp)
495 movaps %xmm6, -128(%rbp)
#define adler %rdi // 16(%rbp)
#define sum2 %rsi // 24(%rbp)
#define buf %rcx // 32(%rbp)
#define len %rbx // 40(%rbp)
509 movl $$-2146992015, %eax // 1/BASE in Q47
510 mull %edi // edx:eax = adler divided by BASE in Q47
shrl $$15, %edx // edx = floor(adler/BASE)
512 imull $$BASE, %edx, %edx // edx * BASE
513 subq %rdx, adler // adler -= edx*BASE
514 movl $$-2146992015, %eax // 1/BASE in Q47
515 mull %esi // edx:eax = sum2 divided by BASE in Q47
shrl $$15, %edx // edx = floor(sum2/BASE)
imull $$BASE, %edx, %eax // eax = edx * BASE
subq %rax, sum2 // sum2 -= edx*BASE
521 // update adler/sum2 according to a new 16-byte vector, no ssse3
523 movaps (buf), %xmm1 // 16 bytes vector
524 movaps %xmm1, %xmm3 // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
525 movaps %xmm1, %xmm2 // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
526 psrldq $$8, %xmm2 // shift down 8 bytes, to reuse the shuffle vector
527 punpcklbw zero, %xmm3 // convert lower 8 bytes into 8 words
528 punpcklbw zero, %xmm2 // convert higher 8 bytes into 8 words
529 pmullw %xmm6, %xmm3 // lower 8 words * 16:9
530 pmullw %xmm4, %xmm2 // higher 8 words * 8:1
531 add $$16, buf // buf -> next vector
532 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
533 paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
imulq $$16, adler, %rdx // rdx = 16*adler;
535 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
536 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
537 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
538 add %rdx, sum2 // sum2 += adler*16;
539 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
540 movd %xmm1, %edx // to be added to adler
541 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
542 addq %rdx, adler // update adler
543 movd %xmm3, %edx // to be added to sum2
544 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
545 addq %rdx, sum2 // sum2 += 1st half of update
546 movd %xmm3, %edx // to be added to sum2
547 addq %rdx, sum2 // sum2 += 2nd half of update
550 // need to fill up xmm4/xmm5/xmm6 only if len>=16
554 // set up table starting address to %eax
555 leaq sum2_coefficients_nossse3(%rip), %rax
557 // reading coefficients
559 movaps (%rax), %xmm6 // coefficients for computing sum2 : pmaddubsw 16:9
560 movaps 16(%rax), %xmm4 // coefficients for computing sum2 : pmaddubsw 8:1
561 movaps 32(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
564 cmp $NMAX, len // len vs NMAX
565 jl 3f // if (len < NMAX), skip the following NMAX batches processing
567 0: // while (len>=NMAX) {
569 sub $NMAX, len // len -= NMAX
570 mov $(NMAX/16), %eax // n = NMAX/16
573 DO16_nossse3 // update adler/sum2 for a 16-byte input
575 jg 1b // } while (n);
577 modulo_BASE // (adler/sum2) modulo BASE;
580 jge 0b // } /* len>=NMAX */
584 sub $16, len // pre-decrement len by 16
585 jl 2f // if len < 16, skip the 16-vector code
586 DO16_nossse3 // update adler/sum2 for a 16-byte input
587 sub $16, len // len -= 16;
590 add $16, len // post-increment len by 16
591 jz 1f // if len==0, branch over scalar processing
594 movzbq (buf), %rdx // new input byte
596 addq %rdx, adler // adler += *buf
597 addq adler, sum2 // sum2 += adler
603 modulo_BASE // (adler/sum2) modulo BASE;
605 // construct 32-bit (sum2<<16 | adler) to be returned
607 salq $16, sum2 // sum2 <<16
608 movq adler, %rax // adler
609 orq sum2, %rax // sum2<<16 | adler
611 #ifdef KERNEL // if this is for kernel code, need to restore xmm registers
612 movaps -32(%rbp), %xmm0
613 movaps -48(%rbp), %xmm1
614 movaps -64(%rbp), %xmm2
615 movaps -80(%rbp), %xmm3
616 movaps -96(%rbp), %xmm4
617 movaps -112(%rbp), %xmm5
618 movaps -128(%rbp), %xmm6
addq $200, %rsp // we've already restored %xmm0-%xmm6 from stack
630 sum2_coefficients_nossse3: // used for vectorizing adler32 computation
// data for the code without ssse3
651 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
664 // ----------------------------------------------------------------------------------
665 // the following is the original x86_64 adler32_vec code that uses SSSE3 instructions
666 // ----------------------------------------------------------------------------------
680 #ifdef KERNEL // if for kernel, save %xmm0-%xmm11
681 subq $200, %rsp // allocate for %xmm0-%xmm11 (192 bytes), extra 8 to align %rsp to 16-byte boundary
682 movaps %xmm0, -32(%rbp)
683 movaps %xmm1, -48(%rbp)
684 movaps %xmm2, -64(%rbp)
685 movaps %xmm3, -80(%rbp)
686 movaps %xmm4, -96(%rbp)
687 movaps %xmm5, -112(%rbp)
688 movaps %xmm6, -128(%rbp)
689 movaps %xmm7, -144(%rbp)
690 movaps %xmm8, -160(%rbp)
691 movaps %xmm9, -176(%rbp)
692 movaps %xmm10, -192(%rbp)
693 movaps %xmm11, -208(%rbp)
696 #define adler %rdi // 16(%rbp)
#define sum2 %rsi // 24(%rbp)
#define buf %rcx // 32(%rbp)
#define len %rbx // 40(%rbp)
706 // update adler/sum2 according to a new 16-byte vector
708 movaps (buf), %xmm1 // 16 bytes vector
709 movaps %xmm1, %xmm3 // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
710 addq $$16, buf // buf -> next vector
711 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
712 pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
imulq $$16, adler, %rdx // rdx = 16*adler;
714 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
715 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
716 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
717 addq %rdx, sum2 // sum2 += adler*16;
718 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
719 movd %xmm1, %edx // to be added to adler
720 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
721 addq %rdx, adler // update adler
722 movd %xmm3, %edx // to be added to sum2
723 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
724 addq %rdx, sum2 // sum2 += 1st half of update
725 movd %xmm3, %edx // to be added to sum2
726 addq %rdx, sum2 // sum2 += 2nd half of update
729 // update adler/sum2 according to a new 32-byte vector
imulq $$32, adler, %rdx // rdx = 32*adler
732 movaps (buf), %xmm1 // 1st 16 bytes vector
733 movaps 16(buf), %xmm7 // 2nd 16 bytes vector
734 movaps %xmm1, %xmm3 // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
735 movaps %xmm7, %xmm2 // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
736 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
737 psadbw zero, %xmm7 // 2 16-bit words to be added for adler in xmm7
738 addq %rdx, sum2 // sum2 += adler*32;
739 pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
740 pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2
741 paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1
742 paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
743 addq $$32, buf // buf -> vector for next iteration
744 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
745 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
746 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
747 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
748 movd %xmm1, %edx // to be added to adler
749 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
750 addq %rdx, adler // update adler
751 movd %xmm3, %edx // to be added to sum2
752 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
753 addq %rdx, sum2 // sum2 += 1st half of update
754 movd %xmm3, %edx // to be added to sum2
755 addq %rdx, sum2 // sum2 += 2nd half of update
758 // update adler/sum2 according to a new 48-byte vector
imulq $$48, adler, %rdx // rdx = 48*adler
763 movaps (buf), %xmm7 // 1st 16 bytes vector
764 movaps 16(buf), %xmm10 // 2nd 16 bytes vector
765 movaps 32(buf), %xmm11 // 3rd 16 bytes vector
767 movaps %xmm7, %xmm1 // 1st vector
768 movaps %xmm10, %xmm2 // 2nd vector
769 movaps %xmm11, %xmm3 // 3rd vector
771 psadbw zero, %xmm7 // 1st vector for adler
772 psadbw zero, %xmm10 // 2nd vector for adler
773 psadbw zero, %xmm11 // 3rd vector for adler
775 addq %rdx, sum2 // sum2 += adler*48;
777 pmaddubsw %xmm9, %xmm1 // 8 16-bit words to be added for sum2 : 1st vector
778 pmaddubsw %xmm6, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector
779 pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 : 3rd vector
781 pmaddwd ones, %xmm1 // 4 32-bit elements to be added for sum2 in xmm1
782 pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
783 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
785 paddd %xmm10, %xmm7 // 2 16-bit words to be added for adler
786 paddd %xmm11, %xmm7 // 2 16-bit words to be added for adler
788 paddd %xmm1, %xmm3 // 4 32-bit elements to be added for sum2
789 paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2
791 addq $$48, buf // buf -> vector for next iteration
793 movhlps %xmm7, %xmm2 // higher 16-bit word (for adler) in xmm2
794 paddq %xmm2, %xmm7 // xmm7 lower 32-bit to be added to adler
796 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
797 movd %xmm7, %edx // to be added to adler
798 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
799 addq %rdx, adler // update adler
800 movd %xmm3, %edx // to be added to sum2
801 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
802 addq %rdx, sum2 // sum2 += 1st half of update
803 movd %xmm3, %edx // to be added to sum2
804 addq %rdx, sum2 // sum2 += 2nd half of update
807 // update adler/sum2 according to a new 64-byte vector
imulq $$64, adler, %rdx // rdx = 64*adler
811 movaps (buf), %xmm1 // 1st 16 bytes vector
812 movaps 16(buf), %xmm7 // 2nd 16 bytes vector
813 movaps 32(buf), %xmm10 // 3rd 16 bytes vector
814 movaps 48(buf), %xmm11 // 4th 16 bytes vector
816 movaps %xmm1, %xmm3 // 1st vector
817 movaps %xmm11, %xmm2 // 4th vector
818 psadbw zero, %xmm1 // 1st vector for adler
819 psadbw zero, %xmm11 // 4th vector for adler
821 addq %rdx, sum2 // sum2 += adler*64;
823 pmaddubsw %xmm8, %xmm3 // 8 16-bit words to be added for sum2 : 1st vector
824 pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 : 4th vector
825 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
826 pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
828 paddd %xmm11, %xmm1 // 2 16-bit words to be added for adler in xmm1
829 paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
831 movaps %xmm7, %xmm2 // 2nd vector
832 movaps %xmm10, %xmm11 // 3rd vector
834 psadbw zero, %xmm7 // 2nd vector for adler
835 psadbw zero, %xmm10 // 3rd vector for adler
837 pmaddubsw %xmm9, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector
838 pmaddubsw %xmm6, %xmm11 // 8 16-bit words to be added for sum2 : 3rd vector
839 pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
840 pmaddwd ones, %xmm11 // 4 32-bit elements to be added for sum2 in xmm11
842 paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1
843 paddd %xmm10, %xmm1 // 2 16-bit words to be added for adler in xmm1
845 paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
846 paddd %xmm11, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
848 addq $$64, buf // buf -> vector for next iteration
850 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
851 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
852 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
853 movd %xmm1, %edx // to be added to adler
854 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
855 addq %rdx, adler // update adler
856 movd %xmm3, %edx // to be added to sum2
857 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
858 addq %rdx, sum2 // sum2 += 1st half of update
859 movd %xmm3, %edx // to be added to sum2
860 addq %rdx, sum2 // sum2 += 2nd half of update
863 // need to fill up xmm4/xmm5/xmm6 only if len>=16
cmp $16, len // len vs 16
jl skip_loading_tables // if len < 16, skip loading the coefficient tables
867 // set up table starting address to %eax
868 leaq sum2_coefficients(%rip), %rax
870 // reading coefficients
872 movaps (%rax), %xmm8 // coefficients for computing sum2 : pmaddubsw 64:49
873 movaps 16(%rax), %xmm9 // coefficients for computing sum2 : pmaddubsw 48:33
874 movaps 32(%rax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17
875 movaps 48(%rax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1
876 movaps 64(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
881 cmpq $NMAX, len // len vs NMAX
882 jl len_lessthan_NMAX // if (len < NMAX), skip the following NMAX batches processing
884 len_ge_NMAX_loop: // while (len>=NMAX) {
886 subq $NMAX, len // len -= NMAX
887 movq $(NMAX/64), %rax // n = NMAX/64
890 DO64 // update adler/sum2 for a 64-byte input
892 jg n_loop // } while (n);
894 DO48 // update adler/sum2 for a 48-byte input
896 modulo_BASE // (adler/sum2) modulo BASE;
899 jge len_ge_NMAX_loop // } /* len>=NMAX */
903 subq $64, len // pre-decrement len by 64
904 jl len_lessthan_64 // if len < 64, skip the 64-vector code
905 len64_loop: // while (len>=64) {
906 DO64 // update adler/sum2 for a 64-byte input
907 subq $64, len // len -= 64;
911 addq $(64-32), len // post-increment 64 + pre-decrement 32 of len
912 jl len_lessthan_32 // if len < 32, skip the 32-vector code
913 DO32 // update adler/sum2 for a 32-byte input
914 subq $32, len // len -= 32;
918 addq $(32-16), len // post-increment by 32 + pre-decrement by 16 on len
919 jl len_lessthan_16 // if len < 16, skip the 16-vector code
920 DO16 // update adler/sum2 for a 16-byte input
921 subq $16, len // len -= 16;
924 addq $16, len // post-increment len by 16
925 jz len_is_zero // if len==0, branch over scalar processing
927 scalar_loop: // while (len) {
928 movzbq (buf), %rdx // new input byte
930 addq %rdx, adler // adler += *buf
931 addq adler, sum2 // sum2 += adler
937 modulo_BASE // (adler/sum2) modulo BASE;
939 // construct 32-bit (sum2<<16 | adler) to be returned
941 salq $16, sum2 // sum2 <<16
942 movq adler, %rax // adler
943 orq sum2, %rax // sum2<<16 | adler
946 #ifdef KERNEL // if for kernel, restore %xmm0-%xmm11
947 movaps -32(%rbp), %xmm0
948 movaps -48(%rbp), %xmm1
949 movaps -64(%rbp), %xmm2
950 movaps -80(%rbp), %xmm3
951 movaps -96(%rbp), %xmm4
952 movaps -112(%rbp), %xmm5
953 movaps -128(%rbp), %xmm6
954 movaps -144(%rbp), %xmm7
955 movaps -160(%rbp), %xmm8
956 movaps -176(%rbp), %xmm9
957 movaps -192(%rbp), %xmm10
958 movaps -208(%rbp), %xmm11
959 addq $200, %rsp // we've already restored %xmm0-%xmm11 from stack
leave // restore %rbp and %rsp
969 sum2_coefficients: // used for vectorizing adler32 computation
971 // coefficients for pmaddubsw instruction, used to generate 16-bit elements for sum2
1038 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
1048 #endif // (defined __i386__)
1050 #endif // (defined __i386__ || defined __x86_64__)