xnu-1699.22.73.tar.gz

[apple/xnu.git] / libkern / zlib / intel / adler32vec.s
diff --git a/libkern/zlib/intel/adler32vec.s b/libkern/zlib/intel/adler32vec.s

new file mode 100644 (file)

index 0000000..df9dcf3
--- /dev/null
+++ b/libkern/zlib/intel/adler32vec.s
@@ -0,0 +1,1050 @@
+/* Apple Copyright 2009
+   CoreOS - vector & Numerics, cclee 10-22-09
+
+       This following source code implements a vectorized version of adler32 computation that is defined in zlib.
+       The target architectures are x86_64 and i386.
+
+       Given two unsigned 32-bit values adler and sum2 (both already reduced modulo BASE=65521) and a sequence of input bytes
+       x[0],...,x[N-1], the adler-sum2 pair is updated according to
+
+               for (i=0;i<N;i++) {
+                       adler = (adler+x[i])%BASE;
+                       sum2 = (sum2+adler)%BASE;
+               }
+
+       To reduce the number of modulo operations, it can be shown that, if the initial adler and sum2 are less than BASE(=65521),
+       adler and sum2 (in 32-bit representation) will never overflow over the next NMAX=5552 bytes. This simplifies the
+       algorithm to
+
+               for (i=0;i<N;i+=NMAX) {
+                       for (k=0;k<NMAX;k++) {
+                               adler+=x[i+k];
+                               sum2+=adler;
+                       }
+                       adler%=BASE;
+                       sum2%=BASE;
+               }
+
+       The hand optimization of this function is now reduced to 
+
+                       for (k=0;k<NMAX;k++) {
+                adler+=x[k];
+                sum2+=adler;
+            }
+
+       This subtask turns out to be very vectorizable. Suppose we perform the adler/sum2 update once per K bytes,
+
+                       for (k=0;k<K;k++) {
+                adler+=x[k];
+                sum2+=adler;
+            }
+
+       It can be shown that the sum2-adler pair can be updated according to
+
+               sum2 += adler*K;
+               adler += (x[0] + x[1] + ... + x[K-1]); 
+               sum2 += (x[0]*K + x[1]*(K-1) + ... + x[K-1]*1);
+
+       The last two equations show that the adler-sum2 pair update can be sped up with a vector unit.
+       The input vector is [ x[0] x[1] ... x[K-1] ], and we need two coefficient vectors
+               [ 1 1 1 ... 1 ] for adler update.
+               [ K K-1 ... 1 ] for sum2 update.
+
+       The implementation below reads vector (K=16,32,48,64) into xmm registers, and sets up coefficient vectors in xmm
+       registers. It then uses SSE instructions to perform the aforementioned vector computation.
+
+       For i386, NMAX/16 = 347, whenever possible (NMAX-bytes block), it calls 173 times of macro code DO32 (K=32),
+       followed by a single DO16 (K=16), before calling a modulo operation for adler and sum2.
+
+       For x86_64 (where more xmm registers are available), NMAX/64 = 86, whenever possible (NMAX-bytes block), 
+       it calls 86 times of macro code DO64 (K=64), followed by a single DO48 (K=48), 
+       before calling a modulo operation for adler and sum2.
+
+*/
+
+/* Added a cpu_capabilities probe for kHasSupplementalSSE3 to select between the code paths with and without SSSE3.
+
+       Previously, the SSSE3 code was intentionally turned off because Yonah does not support SSSE3.
+       The code now probes cpu_capabilities for SSSE3 support:
+               if SSSE3 is supported, branch to the SSSE3-based code; otherwise use the original code.
+
+       cclee 5-3-10
+*/
+
+#define BASE 65521  /* largest prime smaller than 65536 */
+#define NMAX 5552      /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+
+// uLong       adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len) {
+//    unsigned n;
+//    while (len >= NMAX) {
+//        len -= NMAX;
+//        n = NMAX / 16;          /* NMAX is divisible by 16 */
+//        do {
+//            DO16(buf);          /* 16 sums unrolled */
+//            buf += 16;
+//        } while (--n);
+//        MOD(adler);
+//        MOD(sum2);
+//    }
+//    if (len) {                  /* avoid modulos if none remaining */
+//        while (len >= 16) {
+//            len -= 16;
+//            DO16(buf);
+//            buf += 16;
+//        }
+//        while (len--) {
+//            adler += *buf++;
+//            sum2 += adler;
+//        }
+//        MOD(adler);
+//        MOD(sum2);
+//    }
+//    return adler | (sum2 << 16);
+// }
+
+#if (defined __i386__ || defined __x86_64__)
+
+#include <i386/cpu_capabilities.h>
+
+       .text
+       .align 4,0x90
+.globl _adler32_vec
+_adler32_vec:
+
+#if (defined __i386__)
+
+       // i386 entry: build a frame, save the three registers used below for adler/sum2/len,
+       // then load the four arguments from the caller's stack (8..20(%ebp)).
+       pushl   %ebp
+       movl    %esp, %ebp
+
+       pushl   %ebx
+       pushl   %edi
+       pushl   %esi
+
+#ifdef KERNEL                                          // kernel build: the xmm registers used below must be preserved
+       subl    $140, %esp                              // 8*16 = 128 bytes for %xmm0-%xmm7, plus 12 bytes of padding intended to leave %esp 16-byte aligned for movaps (assumes the caller kept the stack 16-byte aligned at the call)
+       movaps  %xmm0, 0(%esp)          // save xmm0 at the bottom of the save area; ebx/edi/esi sit above the 12 padding bytes
+       movaps  %xmm1, 16(%esp)         // save xmm1
+       movaps  %xmm2, 32(%esp)         // save xmm2
+       movaps  %xmm3, 48(%esp)         // save xmm3
+       movaps  %xmm4, 64(%esp)         // save xmm4
+       movaps  %xmm5, 80(%esp)         // save xmm5
+       movaps  %xmm6, 96(%esp)         // save xmm6
+       movaps  %xmm7, 112(%esp)                // save xmm7 (only the DO32 macro on the SSSE3 path uses it)
+#endif
+
+       // register roles for the whole i386 routine; the trailing comment is the stack slot each argument is loaded from
+       #define adler   %edi                            // 8(%ebp)
+       #define sum2    %esi                            // 12(%ebp)
+       #define buf             %ecx                            // 16(%ebp)
+       #define len             %ebx                            // 20(%ebp)
+       #define zero    %xmm0
+       #define ones    %xmm5
+
+       movl    8(%ebp), adler
+       movl    12(%ebp), sum2
+       movl    16(%ebp), buf                   // use ecx as buf pointer
+       movl    20(%ebp), len
+
+       // modulo_BASE : reduce adler and sum2 modulo BASE without a div instruction.
+       // The quotient is computed by multiplying with a fixed-point reciprocal of BASE and shifting.
+       // Clobbers %eax, %edx and the flags.
+       .macro          modulo_BASE
+       movl            $$-2146992015, %eax             // 1/BASE in Q47 (as unsigned: 0x80078071)
+       mull            adler                                   // edx:eax = adler * (1/BASE in Q47)
+       shrl            $$15, %edx                              // edx = high word >> 15 = product >> 47 = floor(adler/BASE)
+       imull           $$BASE, %edx, %edx              // edx = floor(adler/BASE) * BASE
+       subl            %edx, adler                             // adler -= edx, i.e. adler %= BASE
+       movl            $$-2146992015, %eax             // 1/BASE in Q47 (eax was overwritten by the mull above)
+       mull            sum2                                    // edx:eax = sum2 * (1/BASE in Q47)
+       shrl            $$15, %edx                              // edx = floor(sum2/BASE)
+       imull           $$BASE, %edx, %eax              // eax = floor(sum2/BASE) * BASE
+       subl            %eax, sum2                              // sum2 -= eax, i.e. sum2 %= BASE
+       .endmacro
+
+       // DO16 (SSSE3) : update adler/sum2 according to a new 16-byte vector, then advance buf by 16
+       //   expects : zero = all zeros, %xmm4 = byte weights 16..1, ones = eight 16-bit 1s
+       //   clobbers: %xmm1, %xmm2, %xmm3, %edx, flags
+       //   the load uses movaps, so buf must be 16-byte aligned
+       .macro          DO16
+       movaps          (buf), %xmm1                    // 16 bytes vector, in xmm1
+       movaps          %xmm1, %xmm3                    // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
+       addl            $$16, buf                               // buf -> next vector
+       psadbw          zero, %xmm1                             // 2 partial byte sums (one per 64-bit half) to be added for adler in xmm1
+       pmaddubsw       %xmm4, %xmm3                    // 8 16-bit words to be added for sum2 in xmm3
+       imull           $$16, adler, %edx               // edx = 16*adler (uses adler before this vector is added to it)
+       movhlps         %xmm1, %xmm2                    // upper-half byte sum (for adler) moved to the low half of xmm2
+       pmaddwd         ones, %xmm3                             // 4 32-bit elements to be added for sum2 in xmm3
+       paddq           %xmm2, %xmm1                    // xmm1 lower 32-bit to be added to adler
+       addl            %edx, sum2                              // sum2 += adler*16;
+       movhlps         %xmm3, %xmm2                    // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+       movd            %xmm1, %edx                             // to be added to adler
+       paddd           %xmm2, %xmm3                    // 2 32-bits elements in xmm3 to be added to sum2
+       addl            %edx, adler                             // update adler
+       movd            %xmm3, %edx                             // to be added to sum2
+       psrlq           $$32, %xmm3                             // another 32-bit to be added to sum2
+       addl            %edx, sum2                              // sum2 += 1st half of update
+       movd            %xmm3, %edx                             // to be added to sum2
+       addl            %edx, sum2                              // sum2 += 2nd half of update
+       .endm
+
+       // DO32 (SSSE3) : update adler/sum2 according to a new 32-byte vector, then advance buf by 32
+       //   expects : zero = all zeros, %xmm6 = byte weights 32..17 (applied to the 1st 16 bytes),
+       //             %xmm4 = byte weights 16..1 (applied to the 2nd 16 bytes), ones = eight 16-bit 1s
+       //   clobbers: %xmm1, %xmm2, %xmm3, %xmm7, %edx, flags
+       //   the loads use movaps, so buf must be 16-byte aligned
+       .macro          DO32
+       imull           $$32, adler, %edx               // edx = 32*adler (uses adler before these bytes are added to it)
+       movaps          (buf), %xmm1                    // 1st 16 bytes vector
+       movaps          16(buf), %xmm7                  // 2nd 16 bytes vector
+       movaps          %xmm1, %xmm3                    // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
+       movaps          %xmm7, %xmm2                    // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
+       psadbw          zero, %xmm1                             // 2 partial byte sums of the 1st vector, to be added for adler in xmm1
+       psadbw          zero, %xmm7                             // 2 partial byte sums of the 2nd vector, to be added for adler in xmm7
+       addl            %edx, sum2                              // sum2 += adler*32;
+       pmaddubsw       %xmm6, %xmm3                    // 8 16-bit words to be added for sum2 in xmm3
+       pmaddubsw       %xmm4, %xmm2                    // 8 16-bit words to be added for sum2 in xmm2
+       paddd           %xmm7, %xmm1                    // combine the byte sums of both vectors in xmm1
+       paddd           %xmm2, %xmm3                    // 8 16-bit words to be added for sum2 in xmm3
+       addl            $$32, buf                               // buf -> vector for next iteration
+       movhlps         %xmm1, %xmm2                    // upper-half byte sum (for adler) moved to the low half of xmm2
+       pmaddwd         ones, %xmm3                             // 4 32-bit elements to be added for sum2 in xmm3
+       paddq           %xmm2, %xmm1                    // xmm1 lower 32-bit to be added to adler
+       movhlps         %xmm3, %xmm2                    // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+       movd            %xmm1, %edx                             // to be added to adler
+       paddd           %xmm2, %xmm3                    // 2 32-bits elements in xmm3 to be added to sum2
+       addl            %edx, adler                             // update adler
+       movd            %xmm3, %edx                             // to be added to sum2
+       psrlq           $$32, %xmm3                             // another 32-bit to be added to sum2
+       addl            %edx, sum2                              // sum2 += 1st half of update
+       movd            %xmm3, %edx                             // to be added to sum2
+       addl            %edx, sum2                              // sum2 += 2nd half of update
+       .endm
+
+       // DO16_nossse3 : the 16-byte adler/sum2 update for CPUs without SSSE3 (no pmaddubsw);
+       // the bytes are widened to 16-bit words with punpcklbw and weighted with pmullw instead.
+       //   expects : zero = all zeros, %xmm6 = word weights 16..9, %xmm4 = word weights 8..1, ones = eight 16-bit 1s
+       //   clobbers: %xmm1, %xmm2, %xmm3, %edx, flags; advances buf by 16
+       //   the load uses movaps, so buf must be 16-byte aligned
+    .macro      DO16_nossse3
+    movaps      (buf), %xmm1            // 16 bytes vector
+    movaps      %xmm1, %xmm3            // a copy of the vector, the lower 8 bytes to be widened into 8 words
+    movaps      %xmm1, %xmm2            // a copy of the vector, the higher 8 bytes to be widened into 8 words
+    psrldq      $$8, %xmm2              // shift down 8 bytes, so the higher 8 bytes can be widened the same way
+    punpcklbw   zero, %xmm3             // convert lower 8 bytes into 8 words
+    punpcklbw   zero, %xmm2             // convert higher 8 bytes into 8 words
+    pmullw      %xmm6, %xmm3            // lower 8 words * 16:9
+    pmullw      %xmm4, %xmm2            // higher 8 words * 8:1
+    addl        $$16, buf               // buf -> next vector
+    psadbw      zero, %xmm1             // 2 partial byte sums (one per 64-bit half) to be added for adler in xmm1
+    paddw       %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
+    imull       $$16, adler, %edx       // edx = 16*adler (uses adler before this vector is added to it)
+    movhlps     %xmm1, %xmm2            // upper-half byte sum (for adler) moved to the low half of xmm2
+    pmaddwd     ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
+    paddq       %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
+    addl        %edx, sum2              // sum2 += adler*16;
+    movhlps     %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+    movd        %xmm1, %edx             // to be added to adler
+    paddd       %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
+    addl        %edx, adler             // update adler
+    movd        %xmm3, %edx             // to be added to sum2
+    psrlq       $$32, %xmm3             // another 32-bit to be added to sum2
+    addl        %edx, sum2              // sum2 += 1st half of update
+    movd        %xmm3, %edx             // to be added to sum2
+    addl        %edx, sum2              // sum2 += 2nd half of update
+    .endm
+
+       // probe the cpu capability bits and pick the SSSE3 or the non-SSSE3 implementation
+#ifdef  KERNEL
+    leal    __cpu_capabilities, %eax                        // %eax -> __cpu_capabilities
+    mov     (%eax), %eax                                    // %eax = __cpu_capabilities
+#else
+    mov    _COMM_PAGE_CPU_CAPABILITIES, %eax                // presumably the capability word published at a fixed address by <i386/cpu_capabilities.h> -- confirm
+#endif
+    test    $(kHasSupplementalSSE3), %eax                                      // __cpu_capabilities & kHasSupplementalSSE3
+       je              L_no_ssse3                              // bit clear : no SSSE3, use the pmullw-based code
+
+       // i386 adler32 with ssse3
+       //
+       // structure: (1) whole NMAX-byte blocks, each followed by a modulo,
+       //            (2) remaining 32-byte vectors, (3) at most one 16-byte vector,
+       //            (4) remaining bytes one at a time.
+       // the labels L_len_lessthan_16 and len_is_zero are also the targets of the non-SSSE3 path.
+
+       // need to fill up xmm4/xmm5/xmm6 only if len>=16
+       cmpl    $16, len
+       jl              L_skip_loading_tables
+
+       // set up table starting address to %eax
+       leal    sum2_coefficients, %eax
+
+       // reading coefficients
+       pxor    zero, zero                              // zero = 0, used as the psadbw operand in DO16/DO32
+       movaps  (%eax), %xmm6                   // coefficients for computing sum2 : pmaddubsw 32:17
+       movaps  16(%eax), %xmm4                 // coefficients for computing sum2 : pmaddubsw 16:1
+       movaps  32(%eax), ones                  // coefficients for computing sum2 : pmaddwd 1,1,...,1
+
+L_skip_loading_tables:
+
+       cmpl    $NMAX, len                              // len vs NMAX
+       jl              len_lessthan_NMAX               // if (len < NMAX), skip the following NMAX batches processing
+
+len_ge_NMAX_loop:                                      // while (len>=NMAX) {
+
+       subl    $NMAX, len                              //              len -= NMAX
+       movl    $(NMAX/32), %eax                //              n = NMAX/32 (= 173; the remaining 16 bytes are done by the DO16 below)
+
+n_loop:                                                                //              do {
+       DO32                                                    //                      update adler/sum2 for a 32-byte input
+       decl    %eax                                    //                      n--;
+       jg              n_loop                                  //      } while (n);
+       DO16                                                    //      update adler/sum2 for a 16-byte input
+       modulo_BASE                                             //              (adler/sum2) modulo BASE;
+       cmpl    $NMAX, len                              //  
+       jge             len_ge_NMAX_loop                // }    /* len>=NMAX */
+
+len_lessthan_NMAX:
+
+       subl    $32, len                                // pre-decrement len by 32
+       jl              len_lessthan_32                 // if len < 32, skip the 32-vector code
+len32_loop:                                                    // while (len>=32) {
+       DO32                                                    //   update adler/sum2 for a 32-byte input
+       subl    $32, len                                //   len -= 32;
+       jge             len32_loop                              // } 
+
+len_lessthan_32:
+
+       addl    $(32-16), len                   // post-increment by 32 + pre-decrement by 16 on len
+       jl              L_len_lessthan_16                       // if len < 16, skip the 16-vector code
+       DO16                                                    // update adler/sum2 for a 16-byte input
+       subl    $16, len                                // len -= 16;
+
+L_len_lessthan_16:                                     // entered with len pre-decremented by 16
+       addl    $16, len                                // post-increment len by 16
+       jz              len_is_zero                             // if len==0, branch over scalar processing
+
+0:                                                                     // while (len) {
+       movzbl  (buf), %edx                             //      new input byte
+       incl    buf                                             //      buf++
+       addl    %edx, adler                             //      adler += the byte just read
+       addl    adler, sum2                             //      sum2 += adler
+       subl    $1, len                                 //      len--
+       jg              0b                                              // }
+
+len_is_zero:                                           // common exit of the i386 SSSE3 and non-SSSE3 paths
+
+       modulo_BASE                                             // (adler/sum2) modulo BASE;
+
+       // construct 32-bit (sum2<<16 | adler) to be returned
+
+       sall    $16, sum2                               // sum2 <<16
+       movl    adler, %eax                             // adler                
+       orl             sum2, %eax                              // sum2<<16 | adler
+
+
+#ifdef KERNEL                                  // if this is for kernel code, need to restore xmm registers
+       movaps  (%esp), %xmm0           // restore xmm0 from the bottom of the save area
+       movaps  16(%esp), %xmm1         // restore xmm1
+       movaps  32(%esp), %xmm2         // restore xmm2
+       movaps  48(%esp), %xmm3         // restore xmm3
+       movaps  64(%esp), %xmm4         // restore xmm4
+       movaps  80(%esp), %xmm5         // restore xmm5
+       movaps  96(%esp), %xmm6         // restore xmm6
+       movaps  112(%esp), %xmm7        // restore xmm7 (only the DO32 macro on the SSSE3 path uses it)
+       addl    $140, %esp                      // release the 128-byte save area and the 12 padding bytes
+#endif
+
+       // pop in the reverse order of the pushes at entry
+    popl   %esi
+    popl   %edi
+       popl   %ebx
+       leave                                           // pop ebp out from stack
+       ret
+
+
+L_no_ssse3:
+
+       // i386 adler32 without ssse3
+       //
+       // structure: (1) whole NMAX-byte blocks, each followed by a modulo,
+       //            (2) all remaining 16-byte vectors,
+       //            (3) remaining (<16) bytes one at a time, through the shared
+       //                L_len_lessthan_16 tail, which also does the final modulo and the return.
+
+       // zero/xmm4/xmm5/xmm6 are used only by DO16_nossse3, i.e. only if len>=16
+       cmpl    $16, len
+       jl              2f
+
+       // set up table starting address to %eax
+       leal    sum2_coefficients, %eax
+
+       // reading coefficients (the non-SSSE3 tables start 48 bytes into sum2_coefficients)
+       pxor    zero, zero                      // zero = 0, used by punpcklbw/psadbw in DO16_nossse3
+       movaps  48(%eax), %xmm6         // coefficients for computing sum2 : pmullw 16:9
+    movaps  64(%eax), %xmm4         // coefficients for computing sum2 : pmullw 8:1
+    movaps  80(%eax), ones          // coefficients for computing sum2 : pmaddwd 1,1,...,1
+
+2:
+
+       cmpl    $NMAX, len                              // len vs NMAX
+       jl              3f                                              // if (len < NMAX), skip the following NMAX batches processing
+
+0:                                                                     // while (len>=NMAX) {
+
+       subl    $NMAX, len                              //              len -= NMAX
+       movl    $(NMAX/16), %eax                //              n = NMAX/16
+
+1:                                                                     //              do {
+       DO16_nossse3                                    //                      update adler/sum2 for a 16-byte input
+       decl    %eax                                    //                      n--;
+       jg              1b                                              //      } while (n);
+
+       modulo_BASE                                             //              (adler/sum2) modulo BASE;
+
+       cmpl    $NMAX, len                              //  
+       jge             0b                                              // }    /* len>=NMAX */
+
+3:
+
+       // Vectorize ALL remaining 16-byte vectors. Doing only a single DO16_nossse3 here
+       // would leave up to NMAX-17 bytes to the byte-at-a-time loop; the result is the
+       // same either way, the loop is only faster.
+       subl    $16, len                                // pre-decrement len by 16
+       jl              L_len_lessthan_16               // if len < 16, skip the 16-vector code
+4:                                                                     // while (len>=16) {
+       DO16_nossse3                                    //      update adler/sum2 for a 16-byte input
+       subl    $16, len                                //      len -= 16;
+       jge             4b                                              // }
+       jmp             L_len_lessthan_16               // len is pre-decremented by 16 here, as the shared tail expects
+
+
+       .const
+       .align  4
+sum2_coefficients:     // used for vectorizing adler32 computation
+
+       // --- tables for the SSSE3 code ---
+       // offset  0 : byte weights 32..17 (pmaddubsw, 1st 16 bytes of a 32-byte block)
+       .byte   32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
+       // offset 16 : byte weights 16..1 (pmaddubsw, 2nd 16 bytes, or a lone 16-byte block)
+       .byte   16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+
+       // offset 32 : coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
+       .word   1, 1, 1, 1, 1, 1, 1, 1
+
+       // --- tables for the code without ssse3 ---
+       // offset 48 : word weights 16..9 (pmullw, lower 8 bytes of a 16-byte block)
+       .word   16, 15, 14, 13, 12, 11, 10, 9
+       // offset 64 : word weights 8..1 (pmullw, higher 8 bytes of a 16-byte block)
+       .word   8, 7, 6, 5, 4, 3, 2, 1
+
+       // offset 80 : coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
+       .word   1, 1, 1, 1, 1, 1, 1, 1
+
+#else  // (defined __x86_64__)
+
+       // probe the cpu capability bits; CPUs with SSSE3 take the L_has_ssse3 implementation
+       movq    __cpu_capabilities@GOTPCREL(%rip), %rax         // %rax -> __cpu_capabilities
+       mov     (%rax), %eax                                    // %eax = __cpu_capabilities
+       test    $(kHasSupplementalSSE3), %eax                   // __cpu_capabilities & kHasSupplementalSSE3
+    jne      L_has_ssse3
+
+       // ----------------------------------------------------------------------------------
+       // the following is added for x86_64 without SSSE3 support
+       // it is essentially a translated copy of the i386 code without SSSE3 code
+       // ----------------------------------------------------------------------------------
+
+       // input (all arguments arrive in registers) :
+       //               adler : edi   (unsigned int)
+       //               sum2  : esi   (unsigned int)
+       //               buf   : rdx
+       //               len   : ecx   (int)
+
+       pushq   %rbp
+       movq    %rsp, %rbp
+       pushq   %rbx                            // rbx is callee-saved and is used for len below
+
+#ifdef KERNEL                  // if for kernel, save the xmm registers used by this path (%xmm0-%xmm6)
+       subq    $200, %rsp      // same frame size as the SSSE3 path (room for 12 xmm registers = 192 bytes, plus 8 so that, with the pushed rbx, %rsp stays 16-byte aligned)
+       movaps  %xmm0, -32(%rbp)        // save slots start at rbp-32: rbx occupies rbp-8, and rbp-16 keeps the slots 16-byte aligned
+       movaps  %xmm1, -48(%rbp)
+       movaps  %xmm2, -64(%rbp)
+       movaps  %xmm3, -80(%rbp)
+       movaps  %xmm4, -96(%rbp)
+       movaps  %xmm5, -112(%rbp)
+       movaps  %xmm6, -128(%rbp)
+#endif
+
+       // register roles for this path
+       #define adler   %rdi                            // arrives in %edi
+       #define sum2    %rsi                            // arrives in %esi
+       #define buf             %rcx                            // arrives in %rdx
+       #define len             %rbx                            // arrives in %ecx
+       #define zero    %xmm0
+       #define ones    %xmm5
+
+       // adler/sum2/len are 32-bit C arguments, and the SysV AMD64 ABI leaves the upper
+       // halves of their 64-bit registers unspecified. This path compares len and
+       // accumulates adler/sum2 with 64-bit instructions, so widen them explicitly.
+       movslq  %ecx, len                       // len = (int64_t)(int)len; must read %ecx before buf overwrites %rcx
+       movq    %rdx, buf
+       movl    %edi, %edi                      // zero-extend adler into %rdi
+       movl    %esi, %esi                      // zero-extend sum2 into %rsi
+
+       // modulo_BASE : reduce adler and sum2 modulo BASE without a div instruction.
+       // Only the low 32 bits of adler/sum2 (%edi/%esi) feed the quotient; the 32-bit
+       // writes to %edx/%eax leave the upper halves of %rdx/%rax zero, so the 64-bit
+       // subtractions below subtract exactly the 32-bit product.
+       // Clobbers %rax, %rdx and the flags.
+       .macro          modulo_BASE
+       movl            $$-2146992015, %eax             // 1/BASE in Q47 (as unsigned: 0x80078071)
+       mull            %edi                                    // edx:eax = adler * (1/BASE in Q47)
+       shrl            $$15, %edx                              // edx = high word >> 15 = product >> 47 = floor(adler/BASE)
+       imull           $$BASE, %edx, %edx              // edx = floor(adler/BASE) * BASE
+       subq            %rdx, adler                             // adler -= edx, i.e. adler %= BASE
+       movl            $$-2146992015, %eax             // 1/BASE in Q47 (eax was overwritten by the mull above)
+       mull            %esi                                    // edx:eax = sum2 * (1/BASE in Q47)
+       shrl            $$15, %edx                              // edx = floor(sum2/BASE)
+       imull           $$BASE, %edx, %eax              // eax = floor(sum2/BASE) * BASE
+       subq            %rax, sum2                              // sum2 -= eax, i.e. sum2 %= BASE
+       .endmacro
+
+       // update adler/sum2 according to a new 16-byte vector, no ssse3
+       // (bytes are widened to 16-bit words with punpcklbw and weighted with pmullw)
+       //   expects : zero = all zeros, %xmm6 = word weights 16..9, %xmm4 = word weights 8..1, ones = eight 16-bit 1s
+       //   clobbers: %xmm1, %xmm2, %xmm3, %rdx, flags; advances buf by 16
+       //   the load uses movaps, so buf must be 16-byte aligned
+       .macro          DO16_nossse3
+    movaps      (buf), %xmm1            // 16 bytes vector
+    movaps      %xmm1, %xmm3            // a copy of the vector, the lower 8 bytes to be widened into 8 words
+    movaps      %xmm1, %xmm2            // a copy of the vector, the higher 8 bytes to be widened into 8 words
+    psrldq      $$8, %xmm2              // shift down 8 bytes, so the higher 8 bytes can be widened the same way
+    punpcklbw   zero, %xmm3             // convert lower 8 bytes into 8 words
+    punpcklbw   zero, %xmm2             // convert higher 8 bytes into 8 words
+    pmullw      %xmm6, %xmm3            // lower 8 words * 16:9
+    pmullw      %xmm4, %xmm2            // higher 8 words * 8:1
+    add                $$16, buf               // buf -> next vector
+    psadbw      zero, %xmm1             // 2 partial byte sums (one per 64-bit half) to be added for adler in xmm1
+    paddw       %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
+    imulq       $$16, adler, %rdx       // rdx = 16*adler (uses adler before this vector is added to it)
+    movhlps     %xmm1, %xmm2            // upper-half byte sum (for adler) moved to the low half of xmm2
+    pmaddwd     ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
+    paddq       %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
+    add         %rdx, sum2              // sum2 += adler*16;
+    movhlps     %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+    movd        %xmm1, %edx             // to be added to adler (the 32-bit write zero-extends into rdx)
+    paddd       %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
+    addq        %rdx, adler             // update adler
+    movd        %xmm3, %edx             // to be added to sum2
+    psrlq       $$32, %xmm3             // another 32-bit to be added to sum2
+    addq        %rdx, sum2              // sum2 += 1st half of update
+    movd        %xmm3, %edx             // to be added to sum2
+    addq        %rdx, sum2              // sum2 += 2nd half of update
+       .endm
+
+       // x86_64 adler32 without ssse3
+       //
+       // structure: (1) whole NMAX-byte blocks, each followed by a modulo,
+       //            (2) all remaining 16-byte vectors,
+       //            (3) remaining (<16) bytes one at a time,
+       //            (4) final modulo, pack (sum2<<16 | adler) into rax, restore and return.
+
+       // zero/xmm4/xmm5/xmm6 are used only by DO16_nossse3, i.e. only if len>=16
+       cmpq    $16, len
+       jl              0f
+
+       // set up table starting address to %rax
+       leaq    sum2_coefficients_nossse3(%rip), %rax
+
+       // reading coefficients
+       pxor    zero, zero                      // zero = 0, used by punpcklbw/psadbw in DO16_nossse3
+       movaps  (%rax), %xmm6           // coefficients for computing sum2 : pmullw 16:9
+    movaps  16(%rax), %xmm4         // coefficients for computing sum2 : pmullw 8:1
+    movaps  32(%rax), ones          // coefficients for computing sum2 : pmaddwd 1,1,...,1
+0:
+
+       cmp             $NMAX, len                              // len vs NMAX
+       jl              3f                                              // if (len < NMAX), skip the following NMAX batches processing
+
+0:                                                                     // while (len>=NMAX) {
+
+       sub             $NMAX, len                              //              len -= NMAX
+       mov             $(NMAX/16), %eax                //              n = NMAX/16
+
+1:                                                                     //              do {
+       DO16_nossse3                                    //                      update adler/sum2 for a 16-byte input
+       decl    %eax                                    //                      n--;
+       jg              1b                                              //      } while (n);
+
+       modulo_BASE                                             //              (adler/sum2) modulo BASE;
+
+       cmp             $NMAX, len                              //  
+       jge             0b                                              // }    /* len>=NMAX */
+
+3:
+
+       // Vectorize ALL remaining 16-byte vectors. Doing only a single DO16_nossse3 here
+       // would leave up to NMAX-17 bytes to the byte-at-a-time loop; the result is the
+       // same either way, the loop is only faster.
+       sub             $16, len                                // pre-decrement len by 16
+       jl              2f                                              // if len < 16, skip the 16-vector code
+4:                                                                     // while (len>=16) {
+       DO16_nossse3                                    //      update adler/sum2 for a 16-byte input
+       sub             $16, len                                //      len -= 16;
+       jge             4b                                              // }
+
+2:
+       add             $16, len                                // post-increment len by 16
+       jz              1f                                              // if len==0, branch over scalar processing
+
+0:                                                                     // while (len) {
+       movzbq  (buf), %rdx                             //      new input byte
+       incq    buf                                             //      buf++
+       addq    %rdx, adler                             //      adler += the byte just read
+       addq    adler, sum2                             //      sum2 += adler
+       decq    len                                             //      len--
+       jg              0b                                              // }
+
+1:
+
+       modulo_BASE                                             // (adler/sum2) modulo BASE;
+
+       // construct 32-bit (sum2<<16 | adler) to be returned
+
+       salq    $16, sum2                               // sum2 <<16
+       movq    adler, %rax                             // adler                
+       orq             sum2, %rax                              // sum2<<16 | adler
+
+#ifdef KERNEL                                  // if this is for kernel code, need to restore xmm registers
+       movaps  -32(%rbp), %xmm0
+       movaps  -48(%rbp), %xmm1
+       movaps  -64(%rbp), %xmm2
+       movaps  -80(%rbp), %xmm3
+       movaps  -96(%rbp), %xmm4
+       movaps  -112(%rbp), %xmm5
+       movaps  -128(%rbp), %xmm6
+       addq    $200, %rsp      // release the save area; %xmm0-%xmm6 (all this path uses) are restored
+#endif
+
+       popq   %rbx
+       leave
+       ret
+
+
+
+       .const
+       .align  4
+sum2_coefficients_nossse3:     // used for vectorizing adler32 computation
+
+       // data for without ssse3
+
+       // offset  0 : word weights 16..9 (pmullw, lower 8 bytes of a 16-byte block)
+       .word   16, 15, 14, 13, 12, 11, 10, 9
+       // offset 16 : word weights 8..1 (pmullw, higher 8 bytes of a 16-byte block)
+       .word   8, 7, 6, 5, 4, 3, 2, 1
+
+       // offset 32 : coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
+       .word   1, 1, 1, 1, 1, 1, 1, 1
+
+       .text
+
+       // ----------------------------------------------------------------------------------
+       // the following is the original x86_64 adler32_vec code that uses SSSE3 instructions
+       // ----------------------------------------------------------------------------------
+       // entry/prologue : builds the frame, saves %rbx (plus %xmm0-%xmm11 for KERNEL) and names the working registers
+L_has_ssse3:
+
+       // input (all in registers, nothing is read from the stack) :
+       //               adler : rdi   (stays in rdi)
+       //               sum2  : rsi   (stays in rsi)
+       //               buf   : rdx   (moved to rcx below)
+       //               len   : rcx   (moved to rbx below)
+
+       pushq   %rbp
+       movq    %rsp, %rbp
+       pushq   %rbx                                    // %rbx is used for len; popped again right before ret
+
+#ifdef KERNEL                  // if for kernel, save %xmm0-%xmm11
+       subq    $200, %rsp      // 12*16 = 192 bytes for %xmm0-%xmm11 plus 8 bytes of padding; with %rbx at -8(%rbp) this puts %rsp at -208(%rbp)
+       movaps  %xmm0, -32(%rbp)                        // movaps needs 16-byte aligned slots : assumes %rbp is 16-byte aligned here
+       movaps  %xmm1, -48(%rbp)
+       movaps  %xmm2, -64(%rbp)
+       movaps  %xmm3, -80(%rbp)
+       movaps  %xmm4, -96(%rbp)
+       movaps  %xmm5, -112(%rbp)
+       movaps  %xmm6, -128(%rbp)
+       movaps  %xmm7, -144(%rbp)
+       movaps  %xmm8, -160(%rbp)
+       movaps  %xmm9, -176(%rbp)
+       movaps  %xmm10, -192(%rbp)
+       movaps  %xmm11, -208(%rbp)                      // lowest slot, coincides with the new %rsp
+#endif
+
+       #define adler   %rdi                            // running adler, arrives in %rdi
+       #define sum2    %rsi                            // running sum2, arrives in %rsi
+       #define buf             %rcx                            // input pointer, copied from %rdx below
+       #define len             %rbx                            // remaining byte count, copied from %rcx below
+       #define zero    %xmm0                           // cleared by pxor before the vector macros are used
+       #define ones    %xmm5                           // loaded with eight 16-bit 1s before the vector macros are used
+
+       movq    %rcx, len                               // must come first : buf is %rcx, so the next move overwrites the incoming len
+       movq    %rdx, buf                               // leaves %rdx free as the scratch register of the DOxx macros
+       // DO16 : fold the 16 bytes at (buf) into adler/sum2 and advance buf by 16; uses %rdx, %xmm1-%xmm3 as scratch and needs zero, ones and %xmm4 already loaded
+       .macro          DO16
+       movaps          (buf), %xmm1                    // 16 bytes vector (movaps : buf must be 16-byte aligned here)
+       movaps          %xmm1, %xmm3                    // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
+       addq            $$16, buf                               // buf -> next vector
+       psadbw          zero, %xmm1                             // two byte-sums (bytes 0-7 and bytes 8-15), one in each 64-bit half of xmm1
+       pmaddubsw       %xmm4, %xmm3                    // 8 16-bit words : each is a weighted sum of two adjacent bytes (weights in xmm4)
+       imulq           $$16, adler, %rdx               // rdx = 16*adler (adler is not updated yet)
+       movhlps         %xmm1, %xmm2                    // upper 64-bit half of xmm1 (2nd byte-sum) -> lower half of xmm2
+       pmaddwd         ones, %xmm3                             // 4 32-bit elements to be added for sum2 in xmm3
+       paddq           %xmm2, %xmm1                    // xmm1 lower 32-bit = sum of all 16 bytes, to be added to adler
+       addq            %rdx, sum2                              // sum2 += adler*16;
+       movhlps         %xmm3, %xmm2                    // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+       movd            %xmm1, %edx                             // to be added to adler (32-bit write, upper half of rdx is cleared)
+       paddd           %xmm2, %xmm3                    // 2 32-bits elements in xmm3 to be added to sum2
+       addq            %rdx, adler                             // update adler
+       movd            %xmm3, %edx                             // to be added to sum2
+       psrlq           $$32, %xmm3                             // another 32-bit to be added to sum2
+       addq            %rdx, sum2                              // sum2 += 1st half of update
+       movd            %xmm3, %edx                             // to be added to sum2
+       addq            %rdx, sum2                              // sum2 += 2nd half of update
+       .endm
+
+       // DO32 : fold the 32 bytes at (buf) into adler/sum2 and advance buf by 32; uses %rdx, %xmm1-%xmm3, %xmm7 as scratch and needs zero, ones, %xmm6 and %xmm4 already loaded
+       .macro          DO32
+       imulq           $$32, adler, %rdx               // rdx = 32*adler (adler is not updated yet)
+       movaps          (buf), %xmm1                    // 1st 16 bytes vector (movaps : buf must be 16-byte aligned here)
+       movaps          16(buf), %xmm7                  // 2nd 16 bytes vector
+       movaps          %xmm1, %xmm3                    // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
+       movaps          %xmm7, %xmm2                    // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
+       psadbw          zero, %xmm1                             // two byte-sums of the 1st vector, one in each 64-bit half of xmm1
+       psadbw          zero, %xmm7                             // two byte-sums of the 2nd vector, one in each 64-bit half of xmm7
+       addq            %rdx, sum2                              // sum2 += adler*32;
+       pmaddubsw       %xmm6, %xmm3                    // 8 16-bit words to be added for sum2 in xmm3 (1st vector, weights in xmm6)
+       pmaddubsw       %xmm4, %xmm2                    // 8 16-bit words to be added for sum2 in xmm2 (2nd vector, weights in xmm4)
+       paddd           %xmm7, %xmm1                    // byte-sums of both vectors combined in xmm1
+       paddw           %xmm2, %xmm3                    // 16-bit add cannot overflow : at most 255*(32+31) + 255*(16+15) = 23970 per word
+       addq            $$32, buf                               // buf -> vector for next iteration
+       movhlps         %xmm1, %xmm2                    // upper 64-bit half of xmm1 (2nd byte-sum) -> lower half of xmm2
+       pmaddwd         ones, %xmm3                             // 4 32-bit elements to be added for sum2 in xmm3
+       paddq           %xmm2, %xmm1                    // xmm1 lower 32-bit = sum of all 32 bytes, to be added to adler
+       movhlps         %xmm3, %xmm2                    // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+       movd            %xmm1, %edx                             // to be added to adler (32-bit write, upper half of rdx is cleared)
+       paddd           %xmm2, %xmm3                    // 2 32-bits elements in xmm3 to be added to sum2
+       addq            %rdx, adler                             // update adler
+       movd            %xmm3, %edx                             // to be added to sum2
+       psrlq           $$32, %xmm3                             // another 32-bit to be added to sum2
+       addq            %rdx, sum2                              // sum2 += 1st half of update
+       movd            %xmm3, %edx                             // to be added to sum2
+       addq            %rdx, sum2                              // sum2 += 2nd half of update
+       .endm
+
+       // DO48 : fold the 48 bytes at (buf) into adler/sum2 and advance buf by 48
+       // uses %rdx, %xmm1-%xmm3, %xmm7, %xmm10, %xmm11 as scratch; needs zero, ones, %xmm9, %xmm6 and %xmm4 already loaded
+       .macro          DO48
+       imulq           $$48, adler, %rdx               // rdx = 48*adler (adler is not updated yet)
+
+       movaps          (buf), %xmm7                    // 1st 16 bytes vector (movaps : buf must be 16-byte aligned here)
+       movaps          16(buf), %xmm10                 // 2nd 16 bytes vector
+       movaps          32(buf), %xmm11                 // 3rd 16 bytes vector
+
+       movaps          %xmm7, %xmm1                    // copy of 1st vector, for the sum2 (pmaddubsw) side
+       movaps          %xmm10, %xmm2                   // copy of 2nd vector, for the sum2 (pmaddubsw) side
+       movaps          %xmm11, %xmm3                   // copy of 3rd vector, for the sum2 (pmaddubsw) side
+
+       psadbw          zero, %xmm7                             // 1st vector for adler : two byte-sums, one per 64-bit half
+       psadbw          zero, %xmm10                    // 2nd vector for adler : two byte-sums, one per 64-bit half
+       psadbw          zero, %xmm11                    // 3rd vector for adler : two byte-sums, one per 64-bit half
+
+       addq            %rdx, sum2                              // sum2 += adler*48;
+
+       pmaddubsw       %xmm9, %xmm1                    // 8 16-bit words to be added for sum2 : 1st vector (weights in xmm9)
+       pmaddubsw       %xmm6, %xmm2                    // 8 16-bit words to be added for sum2 : 2nd vector (weights in xmm6)
+       pmaddubsw       %xmm4, %xmm3                    // 8 16-bit words to be added for sum2 : 3rd vector (weights in xmm4)
+
+       pmaddwd         ones, %xmm1                             // 4 32-bit elements to be added for sum2 in xmm1
+       pmaddwd         ones, %xmm2                             // 4 32-bit elements to be added for sum2 in xmm2
+       pmaddwd         ones, %xmm3                             // 4 32-bit elements to be added for sum2 in xmm3
+
+       paddd           %xmm10, %xmm7                   // add byte-sums of the 2nd vector into xmm7
+       paddd           %xmm11, %xmm7                   // add byte-sums of the 3rd vector into xmm7
+
+       paddd           %xmm1, %xmm3                    // 4 32-bit elements to be added for sum2
+       paddd           %xmm2, %xmm3                    // 4 32-bit elements to be added for sum2
+
+       addq            $$48, buf                               // buf -> vector for next iteration
+
+       movhlps         %xmm7, %xmm2                    // upper 64-bit half of xmm7 (2nd byte-sum) -> lower half of xmm2
+       paddq           %xmm2, %xmm7                    // xmm7 lower 32-bit = sum of all 48 bytes, to be added to adler
+
+       movhlps         %xmm3, %xmm2                    // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+       movd            %xmm7, %edx                             // to be added to adler (32-bit write, upper half of rdx is cleared)
+       paddd           %xmm2, %xmm3                    // 2 32-bits elements in xmm3 to be added to sum2
+       addq            %rdx, adler                             // update adler
+       movd            %xmm3, %edx                             // to be added to sum2
+       psrlq           $$32, %xmm3                             // another 32-bit to be added to sum2
+       addq            %rdx, sum2                              // sum2 += 1st half of update
+       movd            %xmm3, %edx                             // to be added to sum2
+       addq            %rdx, sum2                              // sum2 += 2nd half of update
+       .endm
+
+       // DO64 : fold the 64 bytes at (buf) into adler/sum2 and advance buf by 64; uses %rdx, %xmm1-%xmm3, %xmm7, %xmm10, %xmm11 as scratch and needs zero, ones, %xmm8, %xmm9, %xmm6, %xmm4 already loaded
+       .macro          DO64
+       imulq           $$64, adler, %rdx               // rdx = 64*adler (adler is not updated yet)
+
+       movaps          (buf), %xmm1                    // 1st 16 bytes vector (movaps : buf must be 16-byte aligned here)
+       movaps          16(buf), %xmm7                  // 2nd 16 bytes vector
+       movaps          32(buf), %xmm10                 // 3rd 16 bytes vector
+       movaps          48(buf), %xmm11                 // 4th 16 bytes vector
+
+       movaps          %xmm1, %xmm3                    // copy of 1st vector, for the sum2 (pmaddubsw) side
+       movaps          %xmm11, %xmm2                   // copy of 4th vector, for the sum2 (pmaddubsw) side
+       psadbw          zero, %xmm1                             // 1st vector for adler : two byte-sums, one per 64-bit half
+       psadbw          zero, %xmm11                    // 4th vector for adler : two byte-sums, one per 64-bit half
+
+       addq            %rdx, sum2                              // sum2 += adler*64;
+
+       pmaddubsw       %xmm8, %xmm3                    // 8 16-bit words for sum2 : 1st vector; no saturation, at most 255*(64+63) = 32385 per word
+       pmaddubsw       %xmm4, %xmm2                    // 8 16-bit words to be added for sum2 : 4th vector (weights in xmm4)
+       pmaddwd         ones, %xmm3                             // 4 32-bit elements to be added for sum2 in xmm3
+       pmaddwd         ones, %xmm2                             // 4 32-bit elements to be added for sum2 in xmm2
+
+       paddd           %xmm11, %xmm1                   // byte-sums of the 1st and 4th vectors combined in xmm1
+       paddd           %xmm2, %xmm3                    // 4 32-bit elements to be added for sum2 in xmm3 
+
+       movaps          %xmm7, %xmm2                    // copy of 2nd vector, for the sum2 (pmaddubsw) side
+       movaps          %xmm10, %xmm11                  // copy of 3rd vector (xmm11 is free again), for the sum2 side
+
+       psadbw          zero, %xmm7                             // 2nd vector for adler : two byte-sums, one per 64-bit half
+       psadbw          zero, %xmm10                    // 3rd vector for adler : two byte-sums, one per 64-bit half
+
+       pmaddubsw       %xmm9, %xmm2                    // 8 16-bit words to be added for sum2 : 2nd vector (weights in xmm9)
+       pmaddubsw       %xmm6, %xmm11                   // 8 16-bit words to be added for sum2 : 3rd vector (weights in xmm6)
+       pmaddwd         ones, %xmm2                             // 4 32-bit elements to be added for sum2 in xmm2
+       pmaddwd         ones, %xmm11                    // 4 32-bit elements to be added for sum2 in xmm11
+
+       paddd           %xmm7, %xmm1                    // add byte-sums of the 2nd vector into xmm1
+       paddd           %xmm10, %xmm1                   // add byte-sums of the 3rd vector into xmm1
+
+       paddd           %xmm2, %xmm3                    // 4 32-bit elements to be added for sum2 in xmm3
+       paddd           %xmm11, %xmm3                   // 4 32-bit elements to be added for sum2 in xmm3
+
+       addq            $$64, buf                               // buf -> vector for next iteration
+
+       movhlps         %xmm1, %xmm2                    // upper 64-bit half of xmm1 (2nd byte-sum) -> lower half of xmm2
+       paddq           %xmm2, %xmm1                    // xmm1 lower 32-bit = sum of all 64 bytes, to be added to adler
+       movhlps         %xmm3, %xmm2                    // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+       movd            %xmm1, %edx                             // to be added to adler (32-bit write, upper half of rdx is cleared)
+       paddd           %xmm2, %xmm3                    // 2 32-bits elements in xmm3 to be added to sum2
+       addq            %rdx, adler                             // update adler
+       movd            %xmm3, %edx                             // to be added to sum2
+       psrlq           $$32, %xmm3                             // another 32-bit to be added to sum2
+       addq            %rdx, sum2                              // sum2 += 1st half of update
+       movd            %xmm3, %edx                             // to be added to sum2
+       addq            %rdx, sum2                              // sum2 += 2nd half of update
+       .endm
+
+       // main body : zero, ones and the weight registers (xmm8/xmm9/xmm6/xmm4) are only used by the DOxx macros, so load them only if len>=16
+       cmpq    $16, len
+       jl              skip_loading_tables             // signed compare : NOTE(review) assumes len is non-negative as a 64-bit value - confirm with caller
+
+       // set up table starting address to %rax
+       leaq    sum2_coefficients(%rip), %rax
+
+       // reading coefficients (five 16-byte rows of sum2_coefficients)
+       pxor    zero, zero                              // zero = 0, the second operand of psadbw in the DOxx macros
+       movaps  (%rax), %xmm8                   // coefficients for computing sum2 : pmaddubsw 64:49
+       movaps  16(%rax), %xmm9                 // coefficients for computing sum2 : pmaddubsw 48:33
+       movaps  32(%rax), %xmm6                 // coefficients for computing sum2 : pmaddubsw 32:17
+       movaps  48(%rax), %xmm4                 // coefficients for computing sum2 : pmaddubsw 16:1
+       movaps  64(%rax), ones                  // coefficients for computing sum2 : pmaddwd 1,1,...,1
+
+skip_loading_tables:
+
+
+       cmpq    $NMAX, len                              // len vs NMAX
+       jl              len_lessthan_NMAX               // if (len < NMAX), skip the following NMAX batches processing
+
+len_ge_NMAX_loop:                                      // while (len>=NMAX) {
+
+       subq    $NMAX, len                              //              len -= NMAX
+       movq    $(NMAX/64), %rax                //              n = NMAX/64
+
+n_loop:                                                                //              do {
+       DO64                                                    //                      update adler/sum2 for a 64-byte input
+       decq    %rax                                    //                      n--;
+       jg              n_loop                                  //      } while (n > 0);
+
+       DO48                                                    //              remaining 48 bytes of the batch (the file header gives NMAX=5552 = 86*64 + 48)
+
+       modulo_BASE                                             //              (adler/sum2) modulo BASE; macro defined outside this section
+
+       cmpq    $NMAX, len                              //              another full NMAX batch left ?
+       jge             len_ge_NMAX_loop                // }    (repeat while len>=NMAX)
+
+len_lessthan_NMAX:
+
+       subq    $64, len                                // pre-decrement len by 64
+       jl              len_lessthan_64                 // if len < 64, skip the 64-vector code
+len64_loop:                                                    // while (len>=64) {
+       DO64                                                    //   update adler/sum2 for a 64-byte input
+       subq    $64, len                                //   len -= 64;
+       jge             len64_loop                              // } 
+
+len_lessthan_64:
+       addq    $(64-32), len                   // post-increment 64 + pre-decrement 32 of len
+       jl              len_lessthan_32                 // if len < 32, skip the 32-vector code
+       DO32                                                    //   update adler/sum2 for a 32-byte input
+       subq    $32, len                                //   len -= 32;
+
+len_lessthan_32:
+
+       addq    $(32-16), len                   // post-increment by 32 + pre-decrement by 16 on len
+       jl              len_lessthan_16                 // if len < 16, skip the 16-vector code
+       DO16                                                    // update adler/sum2 for a 16-byte input
+       subq    $16, len                                // len -= 16;
+
+len_lessthan_16:
+       addq    $16, len                                // post-increment len by 16 : len is now the true remaining count
+       jz              len_is_zero                             // if len==0, branch over scalar processing
+
+scalar_loop:                                           // while (len) {
+       movzbq  (buf), %rdx                             //      new input byte, zero-extended
+       incq    buf                                             //      buf++
+       addq    %rdx, adler                             //      adler += the byte just read
+       addq    adler, sum2                             //      sum2 += adler
+       decq    len                                             //      len--
+       jg              scalar_loop                             // }
+
+len_is_zero:
+
+       modulo_BASE                                             // (adler/sum2) modulo BASE;
+
+       // construct 32-bit (sum2<<16 | adler) to be returned in %rax
+
+       salq    $16, sum2                               // sum2 <<16
+       movq    adler, %rax                             // adler                
+       orq             sum2, %rax                              // sum2<<16 | adler
+
+
+#ifdef KERNEL                  // if for kernel, restore %xmm0-%xmm11
+       movaps  -32(%rbp), %xmm0
+       movaps  -48(%rbp), %xmm1
+       movaps  -64(%rbp), %xmm2
+       movaps  -80(%rbp), %xmm3
+       movaps  -96(%rbp), %xmm4
+       movaps  -112(%rbp), %xmm5
+       movaps  -128(%rbp), %xmm6
+       movaps  -144(%rbp), %xmm7
+       movaps  -160(%rbp), %xmm8
+       movaps  -176(%rbp), %xmm9
+       movaps  -192(%rbp), %xmm10
+       movaps  -208(%rbp), %xmm11
+       addq    $200, %rsp      // release the 200-byte xmm save area; %rsp is back at the saved %rbx
+#endif
+
+       popq   %rbx                                    // restore the %rbx pushed in the prologue
+       leave                                                   // %rsp = %rbp, then pop the saved %rbp
+       ret
+
+
+       .const
+       .align  4                       // power-of-two exponent in Apple's assembler : 2^4 = 16-byte alignment, needed by the movaps loads of this table
+sum2_coefficients:
+       // constant table for the SSSE3 adler32 vector code, read as five 16-byte rows
+
+       // pmaddubsw byte weights for sum2, descending from 64 down to 1, one 16-byte row per directive
+       .byte   64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49
+       .byte   48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33
+       .byte   32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
+       .byte   16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+
+       // eight 16-bit ones : pmaddwd multiplier that folds word pairs into 4 32-bit elements for sum2
+       .word   1, 1, 1, 1, 1, 1, 1, 1
+
+#endif // (defined __i386__)
+
+#endif // (defined __i386__ || defined __x86_64__)