--- /dev/null
+/* Apple Copyright 2009
+ CoreOS - vector & Numerics, cclee 10-22-09
+
+ This following source code implements a vectorized version of adler32 computation that is defined in zlib.
+ The target architectures are x86_64 and i386.
+
+ Given two unsigned 32-bit values adler and sum2 (both already reduced modulo BASE=65521) and a sequence of input bytes x[0],...,x[N-1].
+ The adler-sum2 pair is updated according to
+
+ for (i=0;i<N;i++) {
+ adler = (adler+x[i])%BASE;
+ sum2 = (sum2+adler)%BASE;
+ }
+
+ To reduce/save the modulo operations, it can be shown that, if the initial adler and sum2 are less than BASE(=65521),
+ adler and sum2 (in 32-bit representation), will never overflow for the next NMAX=5552 bytes. This simplifies the
+ algorithm to
+
+ for (i=0;i<N;i+=NMAX) {
+ for (k=0;k<NMAX;k++) {
+ adler+=x[i+k];
+ sum2+=adler;
+ }
+ adler%=BASE;
+ sum2%=BASE;
+ }
+
+ The hand optimization of this function is now reduced to
+
+ for (k=0;k<NMAX;k++) {
+ adler+=x[k];
+ sum2+=adler;
+ }
+
+ This subtask turns out to be very vectorizable. Suppose we perform the adler/sum2 update once per K bytes,
+
+ for (k=0;k<K;k++) {
+ adler+=x[k];
+ sum2+=adler;
+ }
+
+ It can be shown that the sum2-adler pair can be updated according to
+
+ sum2 += adler*K;
+ adler += (x[0] + x[1] + ... + x[K-1]);
+ sum2 += (x[0]*K + x[1]*(K-1) + ... + x[K-1]*1);
+
+ The last 2 equations obviously show that the adler-sum2 pair update can be speeded up using vector processor.
+ The input vector [ x[0] x[1] ... x[K-1] ]. And we need two coefficient vectors
+ [ 1 1 1 ... 1 ] for adler update.
+ [ K K-1 ... 1 ] for sum2 update.
+
+ The implementation below reads vector (K=16,32,48,64) into xmm registers, and sets up coefficient vectors in xmm
+ registers. It then uses SSE instructions to perform the aforementioned vector computation.
+
+ For i386, NMAX/16 = 347. For every full NMAX-byte block, the code invokes the DO32 macro (K=32) 173 times,
+ followed by a single DO16 (K=16), and then reduces adler and sum2 modulo BASE.
+
+ For x86_64 (where more xmm registers are available), NMAX/64 = 86. For every full NMAX-byte block,
+ the code invokes the DO64 macro (K=64) 86 times, followed by a single DO48 (K=48),
+ and then reduces adler and sum2 modulo BASE.
+
+*/
+
+/* added cpu_capability to detect kHasSupplementalSSE3 to branch into code w or wo SupplementalSSE3
+
+ Previously, ssse3 code was intentionally turned off, because Yonah does not support ssse3
+ add code here to probe cpu_capabilities for ssse3 support
+ if ssse3 is supported, branch to ssse3-based code, otherwise use the original code
+
+ cclee 5-3-10
+*/
+
+#define BASE 65521 /* largest prime smaller than 65536 */
+#define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+
+// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len) {
+// unsigned n;
+// while (len >= NMAX) {
+// len -= NMAX;
+// n = NMAX / 16; /* NMAX is divisible by 16 */
+// do {
+// DO16(buf); /* 16 sums unrolled */
+// buf += 16;
+// } while (--n);
+// MOD(adler);
+// MOD(sum2);
+// }
+// if (len) { /* avoid modulos if none remaining */
+// while (len >= 16) {
+// len -= 16;
+// DO16(buf);
+// buf += 16;
+// }
+// while (len--) {
+// adler += *buf++;
+// sum2 += adler;
+// }
+// MOD(adler);
+// MOD(sum2);
+// }
+// return adler | (sum2 << 16);
+// }
+
+#if (defined __i386__ || defined __x86_64__)
+
+#include <i386/cpu_capabilities.h>
+
+ .text
+ .align 4,0x90
+// ----------------------------------------------------------------------------------
+// _adler32_vec : single entry point; the i386 and x86_64 bodies follow under #if.
+// C prototype (as given by the reference code in the comment above):
+//     uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len);
+// Returns (sum2 << 16) | adler, with both halves reduced modulo BASE.
+// The vector code loads the input with movaps, so buf has to be 16-byte aligned
+// whenever len >= 16.
+// NOTE(review): ".align 4" is presumably a power-of-two alignment (16 bytes, padded with
+// 0x90) under the Mach-O assembler -- confirm for the toolchain in use.
+// ----------------------------------------------------------------------------------
+.globl _adler32_vec
+_adler32_vec:
+
+#if (defined __i386__)
+
+ // i386 prologue: set up a frame, save the callee-saved registers used as
+ // len/adler/sum2, and (kernel only) preserve the xmm registers.
+ pushl %ebp
+ movl %esp, %ebp
+
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+
+#ifdef KERNEL // if this is for kernel, need to save xmm registers
+ // return address + 4 pushes = 20 bytes; 20 + 140 = 160, a multiple of 16, so %esp is
+ // 16-byte aligned for movaps provided the caller had %esp 16-byte aligned at the call.
+ subl $140, %esp // 128 bytes for %xmm0-%xmm7, plus 12 bytes of padding to align %esp
+ movaps %xmm0, 0(%esp) // save xmm0
+ movaps %xmm1, 16(%esp) // save xmm1
+ movaps %xmm2, 32(%esp) // save xmm2
+ movaps %xmm3, 48(%esp) // save xmm3
+ movaps %xmm4, 64(%esp) // save xmm4
+ movaps %xmm5, 80(%esp) // save xmm5
+ movaps %xmm6, 96(%esp) // save xmm6
+ movaps %xmm7, 112(%esp) // save xmm7 (only the SSSE3 DO32 macro writes it)
+#endif
+
+ // register roles for the whole i386 body (stack slot of the incoming argument in the comment)
+ #define adler %edi // 8(%ebp)
+ #define sum2 %esi // 12(%ebp)
+ #define buf %ecx // 16(%ebp)
+ #define len %ebx // 20(%ebp)
+ #define zero %xmm0
+ #define ones %xmm5
+
+ // load the four stack arguments into their working registers
+ movl 8(%ebp), adler
+ movl 12(%ebp), sum2
+ movl 16(%ebp), buf // use ecx as buf pointer
+ movl 20(%ebp), len
+
+ // modulo_BASE : reduce adler and sum2 modulo BASE without a div instruction.
+ // 0x80078071 (= 2147975281) is 2^47/BASE rounded up, so the upper half of the
+ // 32x32 product shifted right by 15 is floor(x/BASE) for a 32-bit x.
+ // Clobbers %eax, %edx and the flags.
+ .macro modulo_BASE
+ // ---- adler %= BASE ----
+ movl $$0x80078071, %eax // Q47 reciprocal of BASE
+ mull adler // edx:eax = adler * reciprocal
+ shrl $$15, %edx // edx = floor(adler / BASE)
+ imull $$BASE, %edx, %edx // edx = BASE * floor(adler / BASE)
+ subl %edx, adler // adler = adler mod BASE
+ // ---- sum2 %= BASE ----
+ movl $$0x80078071, %eax // Q47 reciprocal of BASE
+ mull sum2 // edx:eax = sum2 * reciprocal
+ shrl $$15, %edx // edx = floor(sum2 / BASE)
+ imull $$BASE, %edx, %eax // eax = BASE * floor(sum2 / BASE)
+ subl %eax, sum2 // sum2 = sum2 mod BASE
+ .endmacro
+
+ // update adler/sum2 according to a new 16-byte vector (SSSE3 version)
+ // expects : zero = all zeros, %xmm4 = byte weights 16..1, ones = eight words of 1
+ // effect  : sum2 += 16*adler + sum(x[i]*(16-i)), adler += sum(x[i]), buf += 16
+ // clobbers: %xmm1, %xmm2, %xmm3, %edx and the flags
+ // the integer and SSE instructions are interleaved by hand; keep the order as is
+ .macro DO16
+ movaps (buf), %xmm1 // 16 bytes vector, in xmm1
+ movaps %xmm1, %xmm3 // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
+ addl $$16, buf // buf -> next vector
+ psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
+ imull $$16, adler, %edx // edx = 16*adler;
+ movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
+ pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+ paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
+ addl %edx, sum2 // sum2 += adler*16;
+ movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+ movd %xmm1, %edx // to be added to adler
+ paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
+ addl %edx, adler // update adler
+ movd %xmm3, %edx // to be added to sum2
+ psrlq $$32, %xmm3 // another 32-bit to be added to sum2
+ addl %edx, sum2 // sum2 += 1st half of update
+ movd %xmm3, %edx // to be added to sum2
+ addl %edx, sum2 // sum2 += 2nd half of update
+ .endm
+
+ // update adler/sum2 according to a new 32-byte vector (SSSE3 version)
+ // expects : zero = all zeros, %xmm6 = byte weights 32..17, %xmm4 = byte weights 16..1,
+ //           ones = eight words of 1
+ // effect  : sum2 += 32*adler + sum(x[i]*(32-i)), adler += sum(x[i]), buf += 32
+ // clobbers: %xmm1, %xmm2, %xmm3, %xmm7, %edx and the flags
+ .macro DO32
+ imull $$32, adler, %edx // edx = 32*adler
+ movaps (buf), %xmm1 // 1st 16 bytes vector
+ movaps 16(buf), %xmm7 // 2nd 16 bytes vector
+ movaps %xmm1, %xmm3 // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
+ movaps %xmm7, %xmm2 // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
+ psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ psadbw zero, %xmm7 // 2 16-bit words to be added for adler in xmm7
+ addl %edx, sum2 // sum2 += adler*32;
+ pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
+ pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2
+ paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ // The lanes being combined here are 16-bit words, so add them as words (this matches
+ // the x86_64 DO32). Each lane is at most 255*(32+31) + 255*(16+15) = 23970, so the
+ // word sum cannot wrap and stays positive for the signed pmaddwd below.
+ paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
+ addl $$32, buf // buf -> vector for next iteration
+ movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
+ pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+ paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
+ movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+ movd %xmm1, %edx // to be added to adler
+ paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
+ addl %edx, adler // update adler
+ movd %xmm3, %edx // to be added to sum2
+ psrlq $$32, %xmm3 // another 32-bit to be added to sum2
+ addl %edx, sum2 // sum2 += 1st half of update
+ movd %xmm3, %edx // to be added to sum2
+ addl %edx, sum2 // sum2 += 2nd half of update
+ .endm
+
+ // this defines the macro DO16 for SSSE3 not supported
+ // expects : zero = all zeros, %xmm6 = word weights 16..9, %xmm4 = word weights 8..1,
+ //           ones = eight words of 1
+ // effect  : sum2 += 16*adler + sum(x[i]*(16-i)), adler += sum(x[i]), buf += 16
+ // clobbers: %xmm1, %xmm2, %xmm3, %edx and the flags
+ // pmaddubsw is SSSE3, so the bytes are widened to words and multiplied with pmullw instead
+ .macro DO16_nossse3
+ movaps (buf), %xmm1 // 16 bytes vector
+ movaps %xmm1, %xmm3 // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
+ movaps %xmm1, %xmm2 // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
+ psrldq $$8, %xmm2 // shift down 8 bytes, so punpcklbw can widen the higher half too
+ punpcklbw zero, %xmm3 // convert lower 8 bytes into 8 words
+ punpcklbw zero, %xmm2 // convert higher 8 bytes into 8 words
+ pmullw %xmm6, %xmm3 // lower 8 words * 16:9
+ pmullw %xmm4, %xmm2 // higher 8 words * 8:1
+ addl $$16, buf // buf -> next vector
+ psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
+ imull $$16, adler, %edx // edx = 16*adler;
+ movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
+ pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+ paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
+ addl %edx, sum2 // sum2 += adler*16;
+ movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+ movd %xmm1, %edx // to be added to adler
+ paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
+ addl %edx, adler // update adler
+ movd %xmm3, %edx // to be added to sum2
+ psrlq $$32, %xmm3 // another 32-bit to be added to sum2
+ addl %edx, sum2 // sum2 += 1st half of update
+ movd %xmm3, %edx // to be added to sum2
+ addl %edx, sum2 // sum2 += 2nd half of update
+ .endm
+
+#ifdef KERNEL
+ leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
+ mov (%eax), %eax // %eax = __cpu_capabilities
+#else
+ mov _COMM_PAGE_CPU_CAPABILITIES, %eax // %eax = capability word read from _COMM_PAGE_CPU_CAPABILITIES
+#endif
+ test $(kHasSupplementalSSE3), %eax // capabilities & kHasSupplementalSSE3
+ je L_no_ssse3 // bit clear : take the path that avoids pmaddubsw
+
+ // i386 adler32 with ssse3
+
+ // need to fill up xmm0/xmm4/xmm5/xmm6 only if len>=16 (no vector macro runs otherwise)
+ cmpl $16, len
+ jl L_skip_loading_tables
+
+ // set up table starting address to %eax
+ leal sum2_coefficients, %eax
+
+ // reading coefficients
+ pxor zero, zero // zero = 0, second operand of psadbw in DO16/DO32
+ movaps (%eax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17
+ movaps 16(%eax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1
+ movaps 32(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
+
+L_skip_loading_tables:
+
+ // full NMAX-byte batches: NMAX/32 = 173 DO32 steps plus one DO16 (173*32 + 16 = 5552),
+ // then one modulo reduction per batch
+ cmpl $NMAX, len // len vs NMAX
+ jl len_lessthan_NMAX // if (len < NMAX), skip the following NMAX batches processing
+
+len_ge_NMAX_loop: // while (len>=NMAX) {
+
+ subl $NMAX, len // len -= NMAX
+ movl $(NMAX/32), %eax // n = NMAX/32
+
+n_loop: // do {
+ DO32 // update adler/sum2 for a 32-byte input
+ decl %eax // n--;
+ jg n_loop // } while (n);
+ DO16 // update adler/sum2 for a 16-byte input
+ modulo_BASE // (adler/sum2) modulo BASE; clobbers %eax/%edx
+ cmpl $NMAX, len //
+ jge len_ge_NMAX_loop // } /* len>=NMAX */
+
+ // fewer than NMAX bytes remain: 32-byte steps, at most one 16-byte step, then single bytes.
+ // len is kept pre-decremented by the current step size so that the sign flag of the
+ // sub/add directly answers "is another step possible".
+len_lessthan_NMAX:
+
+ subl $32, len // pre-decrement len by 32
+ jl len_lessthan_32 // if len < 32, skip the 32-vector code
+len32_loop: // while (len>=32) {
+ DO32 // update adler/sum2 for a 32-byte input
+ subl $32, len // len -= 32;
+ jge len32_loop // }
+
+len_lessthan_32:
+
+ addl $(32-16), len // post-increment by 32 + pre-decrement by 16 on len
+ jl L_len_lessthan_16 // if len < 16, skip the 16-vector code
+ DO16 // update adler/sum2 for a 16-byte input
+ subl $16, len // len -= 16;
+
+L_len_lessthan_16: // the no-SSSE3 path also jumps here, with len pre-decremented by 16
+ addl $16, len // post-increment len by 16
+ jz len_is_zero // if len==0, branch over scalar processing
+
+0: // while (len) {
+ movzbl (buf), %edx // new input byte
+ incl buf // buf++
+ addl %edx, adler // adler += *buf
+ addl adler, sum2 // sum2 += adler
+ subl $1, len // len--
+ jg 0b // }
+
+ // final reduction, pack the result into %eax, restore registers and return
+len_is_zero:
+
+ modulo_BASE // (adler/sum2) modulo BASE;
+
+ // construct 32-bit (sum2<<16 | adler) to be returned
+
+ sall $16, sum2 // sum2 <<16
+ movl adler, %eax // adler
+ orl sum2, %eax // sum2<<16 | adler
+
+
+#ifdef KERNEL // if this is for kernel code, need to restore xmm registers
+ movaps (%esp), %xmm0 // restore xmm0
+ movaps 16(%esp), %xmm1 // restore xmm1
+ movaps 32(%esp), %xmm2 // restore xmm2
+ movaps 48(%esp), %xmm3 // restore xmm3
+ movaps 64(%esp), %xmm4 // restore xmm4
+ movaps 80(%esp), %xmm5 // restore xmm5
+ movaps 96(%esp), %xmm6 // restore xmm6
+ movaps 112(%esp), %xmm7 // restore xmm7
+ addl $140, %esp // release the xmm save area (mirrors the subl $140 in the prologue)
+#endif
+
+ popl %esi
+ popl %edi
+ popl %ebx
+ leave // pop ebp out from stack
+ ret
+
+
+L_no_ssse3:
+
+ // i386 adler32 without ssse3
+
+ // need to fill up xmm0/xmm4/xmm5/xmm6 only if len>=16
+ cmpl $16, len
+ jl 2f
+
+ // set up table starting address to %eax
+ leal sum2_coefficients, %eax
+
+ // reading coefficients (the word-sized tables start 48 bytes into sum2_coefficients)
+ pxor zero, zero // zero = 0, used by punpcklbw/psadbw in DO16_nossse3
+ movaps 48(%eax), %xmm6 // coefficients for computing sum2 : pmullw 16:9
+ movaps 64(%eax), %xmm4 // coefficients for computing sum2 : pmullw 8:1
+ movaps 80(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
+
+2:
+
+ // full NMAX-byte batches: NMAX/16 = 347 DO16_nossse3 steps, then one modulo reduction
+ cmpl $NMAX, len // len vs NMAX
+ jl 3f // if (len < NMAX), skip the following NMAX batches processing
+
+0: // while (len>=NMAX) {
+
+ subl $NMAX, len // len -= NMAX
+ movl $(NMAX/16), %eax // n = NMAX/16
+
+1: // do {
+ DO16_nossse3 // update adler/sum2 for a 16-byte input
+ decl %eax // n--;
+ jg 1b // } while (n);
+
+ modulo_BASE // (adler/sum2) modulo BASE; clobbers %eax/%edx
+
+ cmpl $NMAX, len //
+ jge 0b // } /* len>=NMAX */
+
+ // fewer than NMAX bytes remain: consume every remaining 16-byte block with the vector
+ // macro, matching the "while (len >= 16)" loop of the reference C code. Previously only
+ // one block was vectorized and the rest (possibly thousands of bytes) went through the
+ // byte-at-a-time loop. The result is the same; fewer than NMAX bytes are accumulated
+ // before the final modulo_BASE, so the 32-bit sums cannot overflow.
+3:
+
+ subl $16, len // pre-decrement len by 16
+ jl L_len_lessthan_16 // if len < 16, skip the 16-vector code
+4: // do {
+ DO16_nossse3 // update adler/sum2 for a 16-byte input
+ subl $16, len // len -= 16;
+ jge 4b // } while (len >= 0), i.e. another full block remains
+ jmp L_len_lessthan_16 // len is in [-16,-1] here; the shared tail adds the 16 back
+
+
+ .const
+ .align 4
+sum2_coefficients: // used for vectorizing adler32 computation
+
+ // offset 0 : byte weights 32..17 for pmaddubsw (1st half of a 32-byte step)
+ .byte 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
+ // offset 16 : byte weights 16..1 for pmaddubsw
+ .byte 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+
+ // offset 32 : coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
+ .word 1, 1, 1, 1, 1, 1, 1, 1
+
+
+ // data for without ssse3
+
+ // offset 48 : word weights 16..9 for pmullw (lower 8 bytes of the vector)
+ .word 16, 15, 14, 13, 12, 11, 10, 9
+ // offset 64 : word weights 8..1 for pmullw (higher 8 bytes of the vector)
+ .word 8, 7, 6, 5, 4, 3, 2, 1
+
+ // offset 80 : coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
+ .word 1, 1, 1, 1, 1, 1, 1, 1
+
+#else // (defined __x86_64__)
+
+ // choose the implementation at run time from the SSSE3 bit of __cpu_capabilities
+ // NOTE(review): unlike the i386 branch there is no non-KERNEL variant of this probe -- confirm
+ movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
+ mov (%rax), %eax // %eax = __cpu_capabilities
+ test $(kHasSupplementalSSE3), %eax // __cpu_capabilities & kHasSupplementalSSE3
+ jne L_has_ssse3 // bit set : use the SSSE3 code, otherwise fall through
+
+ // ----------------------------------------------------------------------------------
+ // the following is added for x86_64 without SSSE3 support
+ // it is essentially a translated copy of the i386 code without SSSE3 code
+ // ----------------------------------------------------------------------------------
+
+ // input :
+ // adler : rdi
+ // sum2 : rsi
+ // buf : rdx
+ // len : rcx
+
+ pushq %rbp
+ movq %rsp, %rbp
+ pushq %rbx
+
+#ifdef KERNEL // if for kernel, save the xmm registers this path uses (%xmm0-%xmm6)
+ // %rbp is 16-byte aligned here as long as the caller kept %rsp 16-byte aligned at the
+ // call, so every -16n(%rbp) slot is valid for movaps. 200 = 192-byte save area + 8 for
+ // alignment; the size is shared with the SSSE3 path, which saves twelve registers.
+ subq $200, %rsp
+ movaps %xmm0, -32(%rbp)
+ movaps %xmm1, -48(%rbp)
+ movaps %xmm2, -64(%rbp)
+ movaps %xmm3, -80(%rbp)
+ movaps %xmm4, -96(%rbp)
+ movaps %xmm5, -112(%rbp)
+ movaps %xmm6, -128(%rbp)
+#endif
+
+ // register roles for the x86_64 body
+ #define adler %rdi // arrives in %edi
+ #define sum2 %rsi // arrives in %esi
+ #define buf %rcx // arrives in %rdx, moved below
+ #define len %rbx // arrives in %rcx, moved below
+ #define zero %xmm0
+ #define ones %xmm5
+
+ movq %rcx, len // must come first: %rcx is about to be reused for buf
+ movq %rdx, buf
+
+ // adler and sum2 are 32-bit arguments but are used as 64-bit registers from here on,
+ // and the 64-bit return value is assembled from them. Clear whatever the caller left
+ // in bits 63:32 so it cannot leak into the result.
+ movl %edi, %edi // adler = (uint32_t) adler
+ movl %esi, %esi // sum2 = (uint32_t) sum2
+ // NOTE(review): len is likewise consumed as a 64-bit value; left untouched because the
+ // intended extension (signed or unsigned) depends on the C prototype -- confirm.
+
+ // modulo_BASE : reduce adler (%rdi) and sum2 (%rsi) modulo BASE without a div instruction.
+ // 0x80078071 (= 2147975281) is 2^47/BASE rounded up, so the upper half of the
+ // 32x32 product shifted right by 15 is floor(x/BASE) for a 32-bit x.
+ // Only the low 32 bits of adler/sum2 take part in the multiply.
+ // Clobbers %rax, %rdx and the flags.
+ .macro modulo_BASE
+ // ---- adler %= BASE ----
+ movl $$0x80078071, %eax // Q47 reciprocal of BASE
+ mull %edi // edx:eax = (low 32 bits of adler) * reciprocal
+ shrl $$15, %edx // edx = floor(adler / BASE)
+ imull $$BASE, %edx, %edx // edx = BASE * floor(adler / BASE), upper half of rdx is zero
+ subq %rdx, adler // adler = adler mod BASE
+ // ---- sum2 %= BASE ----
+ movl $$0x80078071, %eax // Q47 reciprocal of BASE
+ mull %esi // edx:eax = (low 32 bits of sum2) * reciprocal
+ shrl $$15, %edx // edx = floor(sum2 / BASE)
+ imull $$BASE, %edx, %eax // eax = BASE * floor(sum2 / BASE), upper half of rax is zero
+ subq %rax, sum2 // sum2 = sum2 mod BASE
+ .endmacro
+
+ // update adler/sum2 according to a new 16-byte vector, no ssse3
+ // expects : zero = all zeros, %xmm6 = word weights 16..9, %xmm4 = word weights 8..1,
+ //           ones = eight words of 1
+ // effect  : sum2 += 16*adler + sum(x[i]*(16-i)), adler += sum(x[i]), buf += 16
+ // clobbers: %xmm1, %xmm2, %xmm3, %rdx and the flags
+ .macro DO16_nossse3
+ movaps (buf), %xmm1 // 16 bytes vector
+ movaps %xmm1, %xmm3 // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
+ movaps %xmm1, %xmm2 // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
+ psrldq $$8, %xmm2 // shift down 8 bytes, so punpcklbw can widen the higher half too
+ punpcklbw zero, %xmm3 // convert lower 8 bytes into 8 words
+ punpcklbw zero, %xmm2 // convert higher 8 bytes into 8 words
+ pmullw %xmm6, %xmm3 // lower 8 words * 16:9
+ pmullw %xmm4, %xmm2 // higher 8 words * 8:1
+ add $$16, buf // buf -> next vector
+ psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
+ imulq $$16, adler, %rdx // rdx = 16*adler;
+ movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
+ pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+ paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
+ add %rdx, sum2 // sum2 += adler*16;
+ movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+ movd %xmm1, %edx // to be added to adler (32-bit write also clears the upper half of rdx)
+ paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
+ addq %rdx, adler // update adler
+ movd %xmm3, %edx // to be added to sum2
+ psrlq $$32, %xmm3 // another 32-bit to be added to sum2
+ addq %rdx, sum2 // sum2 += 1st half of update
+ movd %xmm3, %edx // to be added to sum2
+ addq %rdx, sum2 // sum2 += 2nd half of update
+ .endm
+
+ // need to fill up xmm0/xmm4/xmm5/xmm6 only if len>=16
+ cmpq $16, len
+ jl 0f
+
+ // set up table starting address to %rax
+ leaq sum2_coefficients_nossse3(%rip), %rax
+
+ // reading coefficients
+ pxor zero, zero // zero = 0, used by punpcklbw/psadbw in DO16_nossse3
+ movaps (%rax), %xmm6 // coefficients for computing sum2 : pmullw 16:9
+ movaps 16(%rax), %xmm4 // coefficients for computing sum2 : pmullw 8:1
+ movaps 32(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
+0:
+
+ // full NMAX-byte batches: NMAX/16 = 347 DO16_nossse3 steps, then one modulo reduction
+ cmp $NMAX, len // len vs NMAX
+ jl 3f // if (len < NMAX), skip the following NMAX batches processing
+
+0: // while (len>=NMAX) {
+
+ sub $NMAX, len // len -= NMAX
+ mov $(NMAX/16), %eax // n = NMAX/16
+
+1: // do {
+ DO16_nossse3 // update adler/sum2 for a 16-byte input
+ decl %eax // n--;
+ jg 1b // } while (n);
+
+ modulo_BASE // (adler/sum2) modulo BASE; clobbers %rax/%rdx
+
+ cmp $NMAX, len //
+ jge 0b // } /* len>=NMAX */
+
+ // fewer than NMAX bytes remain: consume every remaining 16-byte block with the vector
+ // macro, matching the "while (len >= 16)" loop of the reference C code. Previously only
+ // one block was vectorized and the rest (possibly thousands of bytes) went through the
+ // byte-at-a-time loop. The result is the same; fewer than NMAX bytes are accumulated
+ // before the final modulo_BASE.
+3:
+
+ sub $16, len // pre-decrement len by 16
+ jl 2f // if len < 16, skip the 16-vector code
+4: // do {
+ DO16_nossse3 // update adler/sum2 for a 16-byte input
+ sub $16, len // len -= 16;
+ jge 4b // } while (len >= 0), i.e. another full block remains
+
+2:
+ add $16, len // post-increment len by 16
+ jz 1f // if len==0, branch over scalar processing
+
+0: // while (len) {
+ movzbq (buf), %rdx // new input byte
+ incq buf // buf++
+ addq %rdx, adler // adler += *buf
+ addq adler, sum2 // sum2 += adler
+ decq len // len--
+ jg 0b // }
+
+ // final reduction, pack the result into %rax, restore registers and return
+1:
+
+ modulo_BASE // (adler/sum2) modulo BASE;
+
+ // construct 32-bit (sum2<<16 | adler) to be returned
+
+ salq $16, sum2 // sum2 <<16
+ movq adler, %rax // adler
+ orq sum2, %rax // sum2<<16 | adler
+
+#ifdef KERNEL // if this is for kernel code, need to restore xmm registers
+ movaps -32(%rbp), %xmm0
+ movaps -48(%rbp), %xmm1
+ movaps -64(%rbp), %xmm2
+ movaps -80(%rbp), %xmm3
+ movaps -96(%rbp), %xmm4
+ movaps -112(%rbp), %xmm5
+ movaps -128(%rbp), %xmm6
+ addq $200, %rsp // release the save area; only %xmm0-%xmm6 were saved on this path
+#endif
+
+ popq %rbx
+ leave
+ ret
+
+
+
+ .const
+ .align 4
+sum2_coefficients_nossse3: // used for vectorizing adler32 computation
+
+ // data for without ssse3
+
+ // offset 0 : word weights 16..9 for pmullw (lower 8 bytes of the vector)
+ .word 16, 15, 14, 13, 12, 11, 10, 9
+ // offset 16 : word weights 8..1 for pmullw (higher 8 bytes of the vector)
+ .word 8, 7, 6, 5, 4, 3, 2, 1
+
+ // offset 32 : coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
+ .word 1, 1, 1, 1, 1, 1, 1, 1
+
+
+ .text
+
+ // ----------------------------------------------------------------------------------
+ // the following is the original x86_64 adler32_vec code that uses SSSE3 instructions
+ // ----------------------------------------------------------------------------------
+
+L_has_ssse3:
+
+ // input :
+ // adler : rdi
+ // sum2 : rsi
+ // buf : rdx
+ // len : rcx
+
+ pushq %rbp
+ movq %rsp, %rbp
+ pushq %rbx
+
+#ifdef KERNEL // if for kernel, save %xmm0-%xmm11
+ // %rbp is 16-byte aligned here as long as the caller kept %rsp 16-byte aligned at the
+ // call, so every -16n(%rbp) slot is valid for movaps; the lowest slot, -208(%rbp),
+ // coincides with the new %rsp.
+ subq $200, %rsp // allocate for %xmm0-%xmm11 (192 bytes), extra 8 to align %rsp to 16-byte boundary
+ movaps %xmm0, -32(%rbp)
+ movaps %xmm1, -48(%rbp)
+ movaps %xmm2, -64(%rbp)
+ movaps %xmm3, -80(%rbp)
+ movaps %xmm4, -96(%rbp)
+ movaps %xmm5, -112(%rbp)
+ movaps %xmm6, -128(%rbp)
+ movaps %xmm7, -144(%rbp)
+ movaps %xmm8, -160(%rbp)
+ movaps %xmm9, -176(%rbp)
+ movaps %xmm10, -192(%rbp)
+ movaps %xmm11, -208(%rbp)
+#endif
+
+ // register roles for the x86_64 body
+ #define adler %rdi // arrives in %edi
+ #define sum2 %rsi // arrives in %esi
+ #define buf %rcx // arrives in %rdx, moved below
+ #define len %rbx // arrives in %rcx, moved below
+ #define zero %xmm0
+ #define ones %xmm5
+
+ movq %rcx, len // must come first: %rcx is about to be reused for buf
+ movq %rdx, buf
+
+ // adler and sum2 are 32-bit arguments but are used as 64-bit registers from here on,
+ // and the 64-bit return value is assembled from them. Clear whatever the caller left
+ // in bits 63:32 so it cannot leak into the result.
+ movl %edi, %edi // adler = (uint32_t) adler
+ movl %esi, %esi // sum2 = (uint32_t) sum2
+ // NOTE(review): len is likewise consumed as a 64-bit value; left untouched because the
+ // intended extension (signed or unsigned) depends on the C prototype -- confirm.
+
+ // update adler/sum2 according to a new 16-byte vector
+ // expects : zero = all zeros, %xmm4 = byte weights 16..1, ones = eight words of 1
+ // effect  : sum2 += 16*adler + sum(x[i]*(16-i)), adler += sum(x[i]), buf += 16
+ // clobbers: %xmm1, %xmm2, %xmm3, %rdx and the flags
+ .macro DO16
+ movaps (buf), %xmm1 // 16 bytes vector
+ movaps %xmm1, %xmm3 // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
+ addq $$16, buf // buf -> next vector
+ psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
+ imulq $$16, adler, %rdx // rdx = 16*adler;
+ movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
+ pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+ paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
+ addq %rdx, sum2 // sum2 += adler*16;
+ movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+ movd %xmm1, %edx // to be added to adler (32-bit write also clears the upper half of rdx)
+ paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
+ addq %rdx, adler // update adler
+ movd %xmm3, %edx // to be added to sum2
+ psrlq $$32, %xmm3 // another 32-bit to be added to sum2
+ addq %rdx, sum2 // sum2 += 1st half of update
+ movd %xmm3, %edx // to be added to sum2
+ addq %rdx, sum2 // sum2 += 2nd half of update
+ .endm
+
+ // update adler/sum2 according to a new 32-byte vector
+ // expects : zero = all zeros, %xmm6 = byte weights 32..17, %xmm4 = byte weights 16..1,
+ //           ones = eight words of 1
+ // effect  : sum2 += 32*adler + sum(x[i]*(32-i)), adler += sum(x[i]), buf += 32
+ // clobbers: %xmm1, %xmm2, %xmm3, %xmm7, %rdx and the flags
+ .macro DO32
+ imulq $$32, adler, %rdx // rdx = 32*adler
+ movaps (buf), %xmm1 // 1st 16 bytes vector
+ movaps 16(buf), %xmm7 // 2nd 16 bytes vector
+ movaps %xmm1, %xmm3 // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
+ movaps %xmm7, %xmm2 // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
+ psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ psadbw zero, %xmm7 // 2 16-bit words to be added for adler in xmm7
+ addq %rdx, sum2 // sum2 += adler*32;
+ pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
+ pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2
+ paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3 (each lane <= 23970, no wrap)
+ addq $$32, buf // buf -> vector for next iteration
+ movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
+ pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+ paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
+ movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+ movd %xmm1, %edx // to be added to adler
+ paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
+ addq %rdx, adler // update adler
+ movd %xmm3, %edx // to be added to sum2
+ psrlq $$32, %xmm3 // another 32-bit to be added to sum2
+ addq %rdx, sum2 // sum2 += 1st half of update
+ movd %xmm3, %edx // to be added to sum2
+ addq %rdx, sum2 // sum2 += 2nd half of update
+ .endm
+
+ // update adler/sum2 according to a new 48-byte vector
+ // expects : zero = all zeros, %xmm9 = byte weights 48..33, %xmm6 = 32..17, %xmm4 = 16..1,
+ //           ones = eight words of 1
+ // effect  : sum2 += 48*adler + sum(x[i]*(48-i)), adler += sum(x[i]), buf += 48
+ // clobbers: %xmm1, %xmm2, %xmm3, %xmm7, %xmm10, %xmm11, %rdx and the flags
+ // each pmaddubsw result is widened with pmaddwd before the three vectors are added,
+ // so the partial sums are combined as 32-bit lanes
+
+ .macro DO48
+ imulq $$48, adler, %rdx // rdx = 48*adler
+
+ movaps (buf), %xmm7 // 1st 16 bytes vector
+ movaps 16(buf), %xmm10 // 2nd 16 bytes vector
+ movaps 32(buf), %xmm11 // 3rd 16 bytes vector
+
+ movaps %xmm7, %xmm1 // 1st vector
+ movaps %xmm10, %xmm2 // 2nd vector
+ movaps %xmm11, %xmm3 // 3rd vector
+
+ psadbw zero, %xmm7 // 1st vector for adler
+ psadbw zero, %xmm10 // 2nd vector for adler
+ psadbw zero, %xmm11 // 3rd vector for adler
+
+ addq %rdx, sum2 // sum2 += adler*48;
+
+ pmaddubsw %xmm9, %xmm1 // 8 16-bit words to be added for sum2 : 1st vector
+ pmaddubsw %xmm6, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector
+ pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 : 3rd vector
+
+ pmaddwd ones, %xmm1 // 4 32-bit elements to be added for sum2 in xmm1
+ pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
+ pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+
+ paddd %xmm10, %xmm7 // 2 16-bit words to be added for adler
+ paddd %xmm11, %xmm7 // 2 16-bit words to be added for adler
+
+ paddd %xmm1, %xmm3 // 4 32-bit elements to be added for sum2
+ paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2
+
+ addq $$48, buf // buf -> vector for next iteration
+
+ movhlps %xmm7, %xmm2 // higher 16-bit word (for adler) in xmm2
+ paddq %xmm2, %xmm7 // xmm7 lower 32-bit to be added to adler
+
+ movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+ movd %xmm7, %edx // to be added to adler
+ paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
+ addq %rdx, adler // update adler
+ movd %xmm3, %edx // to be added to sum2
+ psrlq $$32, %xmm3 // another 32-bit to be added to sum2
+ addq %rdx, sum2 // sum2 += 1st half of update
+ movd %xmm3, %edx // to be added to sum2
+ addq %rdx, sum2 // sum2 += 2nd half of update
+ .endm
+
+ // update adler/sum2 according to a new 64-byte vector
+ // expects : zero = all zeros, %xmm8 = byte weights 64..49, %xmm9 = 48..33, %xmm6 = 32..17,
+ //           %xmm4 = 16..1, ones = eight words of 1
+ // effect  : sum2 += 64*adler + sum(x[i]*(64-i)), adler += sum(x[i]), buf += 64
+ // clobbers: %xmm1, %xmm2, %xmm3, %xmm7, %xmm10, %xmm11, %rdx and the flags
+ // the 1st/4th vectors are processed first, then the 2nd/3rd, reusing xmm2 and xmm11 as scratch
+ .macro DO64
+ imulq $$64, adler, %rdx // rdx = 64*adler
+
+ movaps (buf), %xmm1 // 1st 16 bytes vector
+ movaps 16(buf), %xmm7 // 2nd 16 bytes vector
+ movaps 32(buf), %xmm10 // 3rd 16 bytes vector
+ movaps 48(buf), %xmm11 // 4th 16 bytes vector
+
+ movaps %xmm1, %xmm3 // 1st vector
+ movaps %xmm11, %xmm2 // 4th vector
+ psadbw zero, %xmm1 // 1st vector for adler
+ psadbw zero, %xmm11 // 4th vector for adler
+
+ addq %rdx, sum2 // sum2 += adler*64;
+
+ pmaddubsw %xmm8, %xmm3 // 8 16-bit words to be added for sum2 : 1st vector
+ pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 : 4th vector
+ pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+ pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
+
+ paddd %xmm11, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+
+ movaps %xmm7, %xmm2 // 2nd vector
+ movaps %xmm10, %xmm11 // 3rd vector (xmm11 is free again after the paddd above)
+
+ psadbw zero, %xmm7 // 2nd vector for adler
+ psadbw zero, %xmm10 // 3rd vector for adler
+
+ pmaddubsw %xmm9, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector
+ pmaddubsw %xmm6, %xmm11 // 8 16-bit words to be added for sum2 : 3rd vector
+ pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
+ pmaddwd ones, %xmm11 // 4 32-bit elements to be added for sum2 in xmm11
+
+ paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1
+ paddd %xmm10, %xmm1 // 2 16-bit words to be added for adler in xmm1
+
+ paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+ paddd %xmm11, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
+
+ addq $$64, buf // buf -> vector for next iteration
+
+ movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
+ paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
+ movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
+ movd %xmm1, %edx // to be added to adler
+ paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
+ addq %rdx, adler // update adler
+ movd %xmm3, %edx // to be added to sum2
+ psrlq $$32, %xmm3 // another 32-bit to be added to sum2
+ addq %rdx, sum2 // sum2 += 1st half of update
+ movd %xmm3, %edx // to be added to sum2
+ addq %rdx, sum2 // sum2 += 2nd half of update
+ .endm
+
+ // need to fill up xmm0/xmm4/xmm5/xmm6/xmm8/xmm9 only if len>=16
+ cmpq $16, len
+ jl skip_loading_tables
+
+ // set up table starting address to %rax
+ leaq sum2_coefficients(%rip), %rax
+
+ // reading coefficients
+ pxor zero, zero // zero = 0, second operand of psadbw in the DOxx macros
+ movaps (%rax), %xmm8 // coefficients for computing sum2 : pmaddubsw 64:49
+ movaps 16(%rax), %xmm9 // coefficients for computing sum2 : pmaddubsw 48:33
+ movaps 32(%rax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17
+ movaps 48(%rax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1
+ movaps 64(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
+
+skip_loading_tables:
+
+
+ // full NMAX-byte batches: NMAX/64 = 86 DO64 steps plus one DO48 (86*64 + 48 = 5552),
+ // then one modulo reduction per batch
+ cmpq $NMAX, len // len vs NMAX
+ jl len_lessthan_NMAX // if (len < NMAX), skip the following NMAX batches processing
+
+len_ge_NMAX_loop: // while (len>=NMAX) {
+
+ subq $NMAX, len // len -= NMAX
+ movq $(NMAX/64), %rax // n = NMAX/64
+
+n_loop: // do {
+ DO64 // update adler/sum2 for a 64-byte input
+ decq %rax // n--;
+ jg n_loop // } while (n);
+
+ DO48 // update adler/sum2 for a 48-byte input
+
+ modulo_BASE // (adler/sum2) modulo BASE; clobbers %rax/%rdx
+
+ cmpq $NMAX, len //
+ jge len_ge_NMAX_loop // } /* len>=NMAX */
+
+ // fewer than NMAX bytes remain: 64-byte steps, then at most one 32-byte and one 16-byte
+ // step, then single bytes. len is kept pre-decremented by the current step size so that
+ // the sign flag of the sub/add directly answers "is another step possible".
+len_lessthan_NMAX:
+
+ subq $64, len // pre-decrement len by 64
+ jl len_lessthan_64 // if len < 64, skip the 64-vector code
+len64_loop: // while (len>=64) {
+ DO64 // update adler/sum2 for a 64-byte input
+ subq $64, len // len -= 64;
+ jge len64_loop // }
+
+len_lessthan_64:
+ addq $(64-32), len // post-increment 64 + pre-decrement 32 of len
+ jl len_lessthan_32 // if len < 32, skip the 32-vector code
+ DO32 // update adler/sum2 for a 32-byte input
+ subq $32, len // len -= 32;
+
+len_lessthan_32:
+
+ addq $(32-16), len // post-increment by 32 + pre-decrement by 16 on len
+ jl len_lessthan_16 // if len < 16, skip the 16-vector code
+ DO16 // update adler/sum2 for a 16-byte input
+ subq $16, len // len -= 16;
+
+len_lessthan_16:
+ addq $16, len // post-increment len by 16
+ jz len_is_zero // if len==0, branch over scalar processing
+
+scalar_loop: // while (len) {
+ movzbq (buf), %rdx // new input byte
+ incq buf // buf++
+ addq %rdx, adler // adler += *buf
+ addq adler, sum2 // sum2 += adler
+ decq len // len--
+ jg scalar_loop // }
+
+ // final reduction, pack the result into %rax, restore registers and return
+len_is_zero:
+
+ modulo_BASE // (adler/sum2) modulo BASE;
+
+ // construct 32-bit (sum2<<16 | adler) to be returned
+
+ salq $16, sum2 // sum2 <<16
+ movq adler, %rax // adler
+ orq sum2, %rax // sum2<<16 | adler
+
+
+#ifdef KERNEL // if for kernel, restore %xmm0-%xmm11
+ movaps -32(%rbp), %xmm0
+ movaps -48(%rbp), %xmm1
+ movaps -64(%rbp), %xmm2
+ movaps -80(%rbp), %xmm3
+ movaps -96(%rbp), %xmm4
+ movaps -112(%rbp), %xmm5
+ movaps -128(%rbp), %xmm6
+ movaps -144(%rbp), %xmm7
+ movaps -160(%rbp), %xmm8
+ movaps -176(%rbp), %xmm9
+ movaps -192(%rbp), %xmm10
+ movaps -208(%rbp), %xmm11
+ addq $200, %rsp // release the save area (mirrors the subq $200 in the prologue)
+#endif
+
+ popq %rbx
+ leave // pop rbp out from stack
+ ret
+
+
+ .const
+ .align 4
+sum2_coefficients: // used for vectorizing adler32 computation
+
+ // coefficients for pmaddubsw instruction, used to generate 16-bit elements for sum2
+
+ // offset 0 : byte weights 64..49
+ .byte 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49
+ // offset 16 : byte weights 48..33
+ .byte 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33
+ // offset 32 : byte weights 32..17
+ .byte 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
+ // offset 48 : byte weights 16..1
+ .byte 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+
+ // offset 64 : coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
+ .word 1, 1, 1, 1, 1, 1, 1, 1
+
+#endif // (defined __i386__)
+
+#endif // (defined __i386__ || defined __x86_64__)