1 /* Apple Copyright 2009
2 CoreOS - vector & Numerics, cclee 10-22-09
The following source code implements a vectorized version of the adler32 checksum defined in zlib.
5 The target architectures are x86_64 and i386.
Given two unsigned 32-bit integers adler and sum2 (both already reduced modulo BASE=65521) and a
sequence of input bytes x[0],...,x[N-1], the adler-sum2 pair is updated according to

for (i=0;i<N;i++) {
    adler = (adler+x[i])%BASE;
    sum2 = (sum2+adler)%BASE;
}
To save the modulo operations, it can be shown that, if the initial adler and sum2 are less than
BASE (=65521), adler and sum2 (in 32-bit representation) will never overflow within the next
NMAX=5552 bytes. This simplifies the algorithm to

for (i=0;i<N;i+=NMAX) {
    for (k=0;k<NMAX;k++) {
        adler += x[i+k];
        sum2 += adler;
    }
    adler %= BASE; sum2 %= BASE;
}

The hand optimization of this function is now reduced to the inner block

for (k=0;k<NMAX;k++) {
    adler += x[k];
    sum2 += adler;
}
This subtask turns out to be very vectorizable. Suppose we perform the adler/sum2 update once per
K bytes. It can be shown that the sum2-adler pair can be updated according to

sum2 += adler*K;
adler += (x[0] + x[1] + ... + x[K-1]);
sum2 += (x[0]*K + x[1]*(K-1) + ... + x[K-1]*1);

(The old adler is folded into sum2 once per step, hence the adler*K term; x[j] joins adler after
j bytes and is then folded into sum2 once per remaining step, so it contributes K-j copies.)
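As a sanity check, the per-block identity can be verified against the byte-at-a-time recurrence
with a small C sketch (a hypothetical illustration added for clarity, not part of the original):

void update_one_block(unsigned *adler, unsigned *sum2, const unsigned char *x, int K)
{
    unsigned a = *adler, s = *sum2;
    s += a * (unsigned)K;              // sum2 += adler*K
    for (int j = 0; j < K; j++) {
        a += x[j];                     // adler += x[j]
        s += (unsigned)(K - j) * x[j]; // x[j] is folded into sum2 (K-j) times
    }
    *adler = a; *sum2 = s;             // caller reduces modulo BASE as needed
}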
The last two equations show that the adler-sum2 pair update can be sped up with a vector
processor. The input is the byte vector [ x[0] x[1] ... x[K-1] ], and we need two coefficient
vectors:

[ 1 1 1 ... 1 ] for the adler update;
[ K K-1 ... 1 ] for the sum2 update.

The implementation below reads vectors (K=16,32,48,64) into xmm registers and sets up the
coefficient vectors in xmm registers. It then uses SSE instructions to perform the vector
computation described above.
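In C intrinsics terms, one K=16 step looks roughly like the following sketch (a hypothetical
illustration mirroring the DO16 macro defined below; do16_sketch is a made-up name; psadbw,
pmaddubsw and pmaddwd correspond to _mm_sad_epu8, _mm_maddubs_epi16 and _mm_madd_epi16):

#include <tmmintrin.h>   // SSSE3 intrinsics (SSE2 headers pulled in transitively)

static inline void do16_sketch(unsigned *adler, unsigned *sum2, const unsigned char *buf)
{
    const __m128i zero  = _mm_setzero_si128();
    const __m128i coeff = _mm_setr_epi8(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
    const __m128i ones  = _mm_set1_epi16(1);
    __m128i v    = _mm_loadu_si128((const __m128i *)buf);
    *sum2 += 16 * *adler;                           // sum2 += adler*K
    __m128i sad  = _mm_sad_epu8(v, zero);           // psadbw: two partial byte sums
    __m128i prod = _mm_maddubs_epi16(v, coeff);     // pmaddubsw: x[j]*(16-j) pairs, 16-bit
    __m128i s32  = _mm_madd_epi16(prod, ones);      // pmaddwd: four 32-bit partial sums
    *adler += (unsigned)(_mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4));
    s32 = _mm_add_epi32(s32, _mm_srli_si128(s32, 8));
    s32 = _mm_add_epi32(s32, _mm_srli_si128(s32, 4));
    *sum2 += (unsigned)_mm_cvtsi128_si32(s32);
}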
For i386, whenever a full NMAX-byte block is available, the code invokes the macro DO32 (K=32)
NMAX/32 = 173 times, followed by a single DO16 (K=16) (173*32 + 16 = 5552 = NMAX), before
performing a modulo operation on adler and sum2.

For x86_64 (where more xmm registers are available), whenever a full NMAX-byte block is
available, the code invokes the macro DO64 (K=64) NMAX/64 = 86 times, followed by a single
DO48 (K=48) (86*64 + 48 = 5552 = NMAX), before performing a modulo operation on adler and sum2.
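In outline, the x86_64 driver therefore behaves like the following C sketch (added for clarity):

while (len >= NMAX) {
    len -= NMAX;
    for (n = NMAX/64; n; n--) DO64;  // 86 iterations of K=64
    DO48;                            // one K=48 step: 86*64 + 48 = NMAX
    modulo_BASE;                     // reduce adler and sum2 mod BASE
}
while (len >= 64) { DO64; len -= 64; }
if (len >= 32) { DO32; len -= 32; }
if (len >= 16) { DO16; len -= 16; }
while (len--) { adler += *buf++; sum2 += adler; }
modulo_BASE;
return (sum2 << 16) | adler;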
/* cpu_capabilities is now probed for kHasSupplementalSSE3 to branch into code with or without
SupplementalSSE3.

Previously, the ssse3 code was intentionally turned off, because Yonah does not support ssse3.
The code added here probes cpu_capabilities for ssse3 support:
if ssse3 is supported, it branches to the ssse3-based code; otherwise it uses the original code.
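For reference, an equivalent user-space dispatch could be written in C roughly as follows (a
hypothetical sketch using a compiler builtin rather than the __cpu_capabilities/commpage probe
used in this file; both callee names are made up):

unsigned long adler32_dispatch(unsigned adler, unsigned sum2, const unsigned char *buf, int len)
{
    if (__builtin_cpu_supports("ssse3"))                   // GCC/Clang CPU-feature builtin
        return adler32_vec_ssse3(adler, sum2, buf, len);   // hypothetical ssse3 path
    return adler32_vec_nossse3(adler, sum2, buf, len);     // hypothetical fallback path
}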
74 #define BASE 65521 /* largest prime smaller than 65536 */
75 #define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
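// Worked check of the NMAX bound (added for clarity):
//   n = 5552 : 255*5552*5553/2 + 5553*65520 = 3930857640 + 363832560 = 4294690200 <= 2^32-1
//   n = 5553 : 255*5553*5554/2 + 5554*65520 = 3932273655 + 363898080 = 4296171735 >  2^32-1
// so 5552 is indeed the largest such n, i.e. NMAX unreduced bytes can never overflow 32 bits.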
77 // uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len) {
79 // while (len >= NMAX) {
81 // n = NMAX / 16; /* NMAX is divisible by 16 */
83 // DO16(buf); /* 16 sums unrolled */
89 // if (len) { /* avoid modulos if none remaining */
90 // while (len >= 16) {
102 // return adler | (sum2 << 16);
105 #if (defined __i386__ || defined __x86_64__)
107 #include <i386/cpu_capabilities.h>
114 #if (defined __i386__)
123 #ifdef KERNEL // if this is for kernel, need to save xmm registers
124 subl $140, %esp // to save %xmm0-%xmm7 into stack, extra 12 to align %esp to 16-byte boundary
125 movaps %xmm0, 0(%esp) // save xmm0, offset -12 for ebx/edi/esi
126 movaps %xmm1, 16(%esp) // save xmm1
127 movaps %xmm2, 32(%esp) // save xmm2
128 movaps %xmm3, 48(%esp) // save xmm3
129 movaps %xmm4, 64(%esp) // save xmm4
130 movaps %xmm5, 80(%esp) // save xmm5
131 movaps %xmm6, 96(%esp) // save xmm6
132 movaps %xmm7, 112(%esp) // save xmm7, if this is for SSSE3 or above
135 #define adler %edi // 8(%ebp)
136 #define sum2 %esi // 12(%ebp)
137 #define buf %ecx // 16(%ebp)
138 #define len %ebx // 20(%ebp)
144 movl 16(%ebp), buf // use ecx as buf pointer
148 movl $$-2146992015, %eax // 1/BASE in Q47
149 mull adler // edx:eax = adler divided by BASE in Q47
shrl $$15, %edx // edx = floor(adler/BASE) (mull kept the high 32 bits, so the total shift is 47)
151 imull $$BASE, %edx, %edx // edx * BASE
152 subl %edx, adler // adler -= edx*BASE
153 movl $$-2146992015, %eax // 1/BASE in Q47
154 mull sum2 // edx:eax = sum2 divided by BASE in Q47
shrl $$15, %edx // edx = floor(sum2/BASE)
imull $$BASE, %edx, %eax // eax = edx * BASE
subl %eax, sum2 // sum2 -= edx*BASE
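// The two sequences above compute n mod BASE with a Q47 reciprocal multiply instead of a divide.
// A hypothetical C equivalent (sketch; 0x80078071 is -2146992015 reinterpreted as unsigned,
// i.e. ceil(2^47/65521), which makes the quotient exact for the operand ranges seen here):
//
//   static unsigned mod_base(unsigned n)
//   {
//       unsigned q = (unsigned)(((unsigned long long)n * 0x80078071ULL) >> 47); // q = n/BASE
//       return n - q * 65521u;                                                  // n mod BASE
//   }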
160 // update adler/sum2 according to a new 16-byte vector
162 movaps (buf), %xmm1 // 16 bytes vector, in xmm1
163 movaps %xmm1, %xmm3 // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
164 addl $$16, buf // buf -> next vector
165 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
166 pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
167 imull $$16, adler, %edx // edx = 16*adler;
168 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
169 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
170 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
171 addl %edx, sum2 // sum2 += adler*16;
172 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
173 movd %xmm1, %edx // to be added to adler
174 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
175 addl %edx, adler // update adler
176 movd %xmm3, %edx // to be added to sum2
177 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
178 addl %edx, sum2 // sum2 += 1st half of update
179 movd %xmm3, %edx // to be added to sum2
180 addl %edx, sum2 // sum2 += 2nd half of update
183 // update adler/sum2 according to a new 32-byte vector
185 imull $$32, adler, %edx // edx = 32*adler
186 movaps (buf), %xmm1 // 1st 16 bytes vector
187 movaps 16(buf), %xmm7 // 2nd 16 bytes vector
188 movaps %xmm1, %xmm3 // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
189 movaps %xmm7, %xmm2 // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
190 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
191 psadbw zero, %xmm7 // 2 16-bit words to be added for adler in xmm7
192 addl %edx, sum2 // sum2 += adler*32;
193 pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
194 pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2
195 paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1
196 paddd %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
197 addl $$32, buf // buf -> vector for next iteration
198 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
199 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
200 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
201 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
202 movd %xmm1, %edx // to be added to adler
203 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
204 addl %edx, adler // update adler
205 movd %xmm3, %edx // to be added to sum2
206 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
207 addl %edx, sum2 // sum2 += 1st half of update
208 movd %xmm3, %edx // to be added to sum2
209 addl %edx, sum2 // sum2 += 2nd half of update
// this defines the macro DO16_nossse3, used when SSSE3 is not supported
214 movaps (buf), %xmm1 // 16 bytes vector
215 movaps %xmm1, %xmm3 // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
216 movaps %xmm1, %xmm2 // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
217 psrldq $$8, %xmm2 // shift down 8 bytes, to reuse the shuffle vector
218 punpcklbw zero, %xmm3 // convert lower 8 bytes into 8 words
219 punpcklbw zero, %xmm2 // convert higher 8 bytes into 8 words
220 pmullw %xmm6, %xmm3 // lower 8 words * 16:9
221 pmullw %xmm4, %xmm2 // higher 8 words * 8:1
222 addl $$16, buf // buf -> next vector
223 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
224 paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
225 imull $$16, adler, %edx // edx = 16*adler;
226 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
227 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
228 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
229 addl %edx, sum2 // sum2 += adler*16;
230 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
231 movd %xmm1, %edx // to be added to adler
232 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
233 addl %edx, adler // update adler
234 movd %xmm3, %edx // to be added to sum2
235 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
236 addl %edx, sum2 // sum2 += 1st half of update
237 movd %xmm3, %edx // to be added to sum2
238 addl %edx, sum2 // sum2 += 2nd half of update
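// Without pmaddubsw, the macro above widens the 16 bytes to words (punpcklbw with zero),
// multiplies by the word coefficients (pmullw), and only then reduces with pmaddwd.
// A hypothetical intrinsics sketch of that widening step (coeff_hi, coeff_lo and ones are
// assumed preloaded with the words 16..9, 8..1 and all-ones respectively):
//
//   __m128i v    = _mm_loadu_si128((const __m128i *)buf);
//   __m128i zero = _mm_setzero_si128();
//   __m128i lo   = _mm_unpacklo_epi8(v, zero);                    // bytes 0..7  -> words
//   __m128i hi   = _mm_unpacklo_epi8(_mm_srli_si128(v, 8), zero); // bytes 8..15 -> words
//   __m128i prod = _mm_add_epi16(_mm_mullo_epi16(lo, coeff_hi),   // * 16..9
//                                _mm_mullo_epi16(hi, coeff_lo));  // *  8..1
//   __m128i s32  = _mm_madd_epi16(prod, ones);                    // four 32-bit sums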
#ifdef KERNEL
leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
mov (%eax), %eax // %eax = __cpu_capabilities
#else
mov _COMM_PAGE_CPU_CAPABILITIES, %eax // %eax = __cpu_capabilities from the commpage
#endif
test $(kHasSupplementalSSE3), %eax // __cpu_capabilities & kHasSupplementalSSE3
250 // i386 adler32 with ssse3
252 // need to fill up xmm4/xmm5/xmm6 only if len>=16
cmpl $16, len // len vs 16
jl L_skip_loading_tables // if len < 16, skip loading the coefficient tables
256 // set up table starting address to %eax
257 leal sum2_coefficients, %eax
259 // reading coefficients
261 movaps (%eax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17
262 movaps 16(%eax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1
263 movaps 32(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
265 L_skip_loading_tables:
267 cmpl $NMAX, len // len vs NMAX
268 jl len_lessthan_NMAX // if (len < NMAX), skip the following NMAX batches processing
270 len_ge_NMAX_loop: // while (len>=NMAX) {
272 subl $NMAX, len // len -= NMAX
273 movl $(NMAX/32), %eax // n = NMAX/32
276 DO32 // update adler/sum2 for a 32-byte input
278 jg n_loop // } while (n);
279 DO16 // update adler/sum2 for a 16-byte input
280 modulo_BASE // (adler/sum2) modulo BASE;
282 jge len_ge_NMAX_loop // } /* len>=NMAX */
286 subl $32, len // pre-decrement len by 32
287 jl len_lessthan_32 // if len < 32, skip the 32-vector code
288 len32_loop: // while (len>=32) {
289 DO32 // update adler/sum2 for a 32-byte input
290 subl $32, len // len -= 32;
295 addl $(32-16), len // post-increment by 32 + pre-decrement by 16 on len
296 jl L_len_lessthan_16 // if len < 16, skip the 16-vector code
297 DO16 // update adler/sum2 for a 16-byte input
298 subl $16, len // len -= 16;
301 addl $16, len // post-increment len by 16
302 jz len_is_zero // if len==0, branch over scalar processing
305 movzbl (buf), %edx // new input byte
307 addl %edx, adler // adler += *buf
308 addl adler, sum2 // sum2 += adler
309 subl $1, len // len--
314 modulo_BASE // (adler/sum2) modulo BASE;
316 // construct 32-bit (sum2<<16 | adler) to be returned
318 sall $16, sum2 // sum2 <<16
319 movl adler, %eax // adler
320 orl sum2, %eax // sum2<<16 | adler
323 #ifdef KERNEL // if this is for kernel code, need to restore xmm registers
324 movaps (%esp), %xmm0 // restore xmm0, offset -12 for ebx/edi/esi
325 movaps 16(%esp), %xmm1 // restore xmm1
326 movaps 32(%esp), %xmm2 // restore xmm2
327 movaps 48(%esp), %xmm3 // restore xmm3
328 movaps 64(%esp), %xmm4 // restore xmm4
329 movaps 80(%esp), %xmm5 // restore xmm5
330 movaps 96(%esp), %xmm6 // restore xmm6
331 movaps 112(%esp), %xmm7 // restore xmm7, if this is for SSSE3 or above
332 addl $140, %esp // we've already restored %xmm0-%xmm7 from stack
leave // restore %ebp and %esp
344 // i386 adler32 without ssse3
346 // need to fill up xmm4/xmm5/xmm6 only if len>=16
350 // set up table starting address to %eax
351 leal sum2_coefficients, %eax
353 // reading coefficients
355 movaps 48(%eax), %xmm6 // coefficients for computing sum2 : pmaddubsw 16:9
356 movaps 64(%eax), %xmm4 // coefficients for computing sum2 : pmaddubsw 8:1
357 movaps 80(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
361 cmpl $NMAX, len // len vs NMAX
362 jl 3f // if (len < NMAX), skip the following NMAX batches processing
364 0: // while (len>=NMAX) {
366 subl $NMAX, len // len -= NMAX
367 movl $(NMAX/16), %eax // n = NMAX/16
370 DO16_nossse3 // update adler/sum2 for a 16-byte input
372 jg 1b // } while (n);
374 modulo_BASE // (adler/sum2) modulo BASE;
377 jge 0b // } /* len>=NMAX */
381 subl $16, len // pre-decrement len by 16
382 jl L_len_lessthan_16 // if len < 16, skip the 16-vector code
383 DO16_nossse3 // update adler/sum2 for a 16-byte input
384 subl $16, len // len -= 16;
385 jmp L_len_lessthan_16
390 sum2_coefficients: // used for vectorizing adler32 computation
425 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
// data for the code without ssse3
455 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
465 #else // (defined __x86_64__)
467 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
468 mov (%rax), %eax // %eax = __cpu_capabilities
469 test $(kHasSupplementalSSE3), %eax // __cpu_capabilities & kHasSupplementalSSE3
472 // ----------------------------------------------------------------------------------
473 // the following is added for x86_64 without SSSE3 support
// it is essentially a translation of the i386 non-SSSE3 code
475 // ----------------------------------------------------------------------------------
#ifdef KERNEL // if for kernel, save %xmm0-%xmm6
subq $200, %rsp // allocate 200 bytes (same frame as the ssse3 version below) to save %xmm0-%xmm6, keeping %rsp 16-byte aligned
489 movaps %xmm0, -32(%rbp)
490 movaps %xmm1, -48(%rbp)
491 movaps %xmm2, -64(%rbp)
492 movaps %xmm3, -80(%rbp)
493 movaps %xmm4, -96(%rbp)
494 movaps %xmm5, -112(%rbp)
495 movaps %xmm6, -128(%rbp)
#define adler %rdi // 16(%rbp)
#define sum2 %rsi // 24(%rbp)
#define buf %rcx // 32(%rbp)
#define len %rbx // 40(%rbp)
509 movl $$-2146992015, %eax // 1/BASE in Q47
510 mull %edi // edx:eax = adler divided by BASE in Q47
shrl $$15, %edx // edx = floor(adler/BASE)
512 imull $$BASE, %edx, %edx // edx * BASE
513 subq %rdx, adler // adler -= edx*BASE
514 movl $$-2146992015, %eax // 1/BASE in Q47
515 mull %esi // edx:eax = sum2 divided by BASE in Q47
shrl $$15, %edx // edx = floor(sum2/BASE)
imull $$BASE, %edx, %eax // eax = edx * BASE
subq %rax, sum2 // sum2 -= edx*BASE
521 // update adler/sum2 according to a new 16-byte vector, no ssse3
523 movaps (buf), %xmm1 // 16 bytes vector
524 movaps %xmm1, %xmm3 // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
525 movaps %xmm1, %xmm2 // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
526 psrldq $$8, %xmm2 // shift down 8 bytes, to reuse the shuffle vector
527 punpcklbw zero, %xmm3 // convert lower 8 bytes into 8 words
528 punpcklbw zero, %xmm2 // convert higher 8 bytes into 8 words
529 pmullw %xmm6, %xmm3 // lower 8 words * 16:9
530 pmullw %xmm4, %xmm2 // higher 8 words * 8:1
531 add $$16, buf // buf -> next vector
532 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
533 paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
imulq $$16, adler, %rdx // rdx = 16*adler;
535 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
536 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
537 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
538 add %rdx, sum2 // sum2 += adler*16;
539 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
540 movd %xmm1, %edx // to be added to adler
541 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
542 addq %rdx, adler // update adler
543 movd %xmm3, %edx // to be added to sum2
544 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
545 addq %rdx, sum2 // sum2 += 1st half of update
546 movd %xmm3, %edx // to be added to sum2
547 addq %rdx, sum2 // sum2 += 2nd half of update
550 // need to fill up xmm4/xmm5/xmm6 only if len>=16
554 // set up table starting address to %eax
555 leaq sum2_coefficients_nossse3(%rip), %rax
557 // reading coefficients
559 movaps (%rax), %xmm6 // coefficients for computing sum2 : pmaddubsw 16:9
560 movaps 16(%rax), %xmm4 // coefficients for computing sum2 : pmaddubsw 8:1
561 movaps 32(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
564 cmp $NMAX, len // len vs NMAX
565 jl 3f // if (len < NMAX), skip the following NMAX batches processing
567 0: // while (len>=NMAX) {
569 sub $NMAX, len // len -= NMAX
570 mov $(NMAX/16), %eax // n = NMAX/16
573 DO16_nossse3 // update adler/sum2 for a 16-byte input
575 jg 1b // } while (n);
577 modulo_BASE // (adler/sum2) modulo BASE;
580 jge 0b // } /* len>=NMAX */
584 sub $16, len // pre-decrement len by 16
585 jl 2f // if len < 16, skip the 16-vector code
586 DO16_nossse3 // update adler/sum2 for a 16-byte input
587 sub $16, len // len -= 16;
590 add $16, len // post-increment len by 16
591 jz 1f // if len==0, branch over scalar processing
594 movzbq (buf), %rdx // new input byte
596 addq %rdx, adler // adler += *buf
597 addq adler, sum2 // sum2 += adler
603 modulo_BASE // (adler/sum2) modulo BASE;
605 // construct 32-bit (sum2<<16 | adler) to be returned
607 salq $16, sum2 // sum2 <<16
608 movq adler, %rax // adler
609 orq sum2, %rax // sum2<<16 | adler
611 #ifdef KERNEL // if this is for kernel code, need to restore xmm registers
612 movaps -32(%rbp), %xmm0
613 movaps -48(%rbp), %xmm1
614 movaps -64(%rbp), %xmm2
615 movaps -80(%rbp), %xmm3
616 movaps -96(%rbp), %xmm4
617 movaps -112(%rbp), %xmm5
618 movaps -128(%rbp), %xmm6
addq $200, %rsp // we've already restored %xmm0-%xmm6 from stack
630 sum2_coefficients_nossse3: // used for vectorizing adler32 computation
// data for the code without ssse3
651 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
664 // ----------------------------------------------------------------------------------
665 // the following is the original x86_64 adler32_vec code that uses SSSE3 instructions
666 // ----------------------------------------------------------------------------------
680 #ifdef KERNEL // if for kernel, save %xmm0-%xmm11
681 subq $200, %rsp // allocate for %xmm0-%xmm11 (192 bytes), extra 8 to align %rsp to 16-byte boundary
682 movaps %xmm0, -32(%rbp)
683 movaps %xmm1, -48(%rbp)
684 movaps %xmm2, -64(%rbp)
685 movaps %xmm3, -80(%rbp)
686 movaps %xmm4, -96(%rbp)
687 movaps %xmm5, -112(%rbp)
688 movaps %xmm6, -128(%rbp)
689 movaps %xmm7, -144(%rbp)
690 movaps %xmm8, -160(%rbp)
691 movaps %xmm9, -176(%rbp)
692 movaps %xmm10, -192(%rbp)
693 movaps %xmm11, -208(%rbp)
696 #define adler %rdi // 16(%rbp)
#define sum2 %rsi // 24(%rbp)
#define buf %rcx // 32(%rbp)
#define len %rbx // 40(%rbp)
706 // update adler/sum2 according to a new 16-byte vector
708 movaps (buf), %xmm1 // 16 bytes vector
709 movaps %xmm1, %xmm3 // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
710 addq $$16, buf // buf -> next vector
711 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
712 pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
imulq $$16, adler, %rdx // rdx = 16*adler;
714 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
715 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
716 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
717 addq %rdx, sum2 // sum2 += adler*16;
718 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
719 movd %xmm1, %edx // to be added to adler
720 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
721 addq %rdx, adler // update adler
722 movd %xmm3, %edx // to be added to sum2
723 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
724 addq %rdx, sum2 // sum2 += 1st half of update
725 movd %xmm3, %edx // to be added to sum2
726 addq %rdx, sum2 // sum2 += 2nd half of update
729 // update adler/sum2 according to a new 32-byte vector
imulq $$32, adler, %rdx // rdx = 32*adler
732 movaps (buf), %xmm1 // 1st 16 bytes vector
733 movaps 16(buf), %xmm7 // 2nd 16 bytes vector
734 movaps %xmm1, %xmm3 // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
735 movaps %xmm7, %xmm2 // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
736 psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1
737 psadbw zero, %xmm7 // 2 16-bit words to be added for adler in xmm7
738 addq %rdx, sum2 // sum2 += adler*32;
739 pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
740 pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2
741 paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1
742 paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
743 addq $$32, buf // buf -> vector for next iteration
744 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
745 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
746 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
747 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
748 movd %xmm1, %edx // to be added to adler
749 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
750 addq %rdx, adler // update adler
751 movd %xmm3, %edx // to be added to sum2
752 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
753 addq %rdx, sum2 // sum2 += 1st half of update
754 movd %xmm3, %edx // to be added to sum2
755 addq %rdx, sum2 // sum2 += 2nd half of update
758 // update adler/sum2 according to a new 48-byte vector
imulq $$48, adler, %rdx // rdx = 48*adler
763 movaps (buf), %xmm7 // 1st 16 bytes vector
764 movaps 16(buf), %xmm10 // 2nd 16 bytes vector
765 movaps 32(buf), %xmm11 // 3rd 16 bytes vector
767 movaps %xmm7, %xmm1 // 1st vector
768 movaps %xmm10, %xmm2 // 2nd vector
769 movaps %xmm11, %xmm3 // 3rd vector
771 psadbw zero, %xmm7 // 1st vector for adler
772 psadbw zero, %xmm10 // 2nd vector for adler
773 psadbw zero, %xmm11 // 3rd vector for adler
775 addq %rdx, sum2 // sum2 += adler*48;
777 pmaddubsw %xmm9, %xmm1 // 8 16-bit words to be added for sum2 : 1st vector
778 pmaddubsw %xmm6, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector
779 pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 : 3rd vector
781 pmaddwd ones, %xmm1 // 4 32-bit elements to be added for sum2 in xmm1
782 pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
783 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
785 paddd %xmm10, %xmm7 // 2 16-bit words to be added for adler
786 paddd %xmm11, %xmm7 // 2 16-bit words to be added for adler
788 paddd %xmm1, %xmm3 // 4 32-bit elements to be added for sum2
789 paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2
791 addq $$48, buf // buf -> vector for next iteration
793 movhlps %xmm7, %xmm2 // higher 16-bit word (for adler) in xmm2
794 paddq %xmm2, %xmm7 // xmm7 lower 32-bit to be added to adler
796 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
797 movd %xmm7, %edx // to be added to adler
798 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
799 addq %rdx, adler // update adler
800 movd %xmm3, %edx // to be added to sum2
801 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
802 addq %rdx, sum2 // sum2 += 1st half of update
803 movd %xmm3, %edx // to be added to sum2
804 addq %rdx, sum2 // sum2 += 2nd half of update
807 // update adler/sum2 according to a new 64-byte vector
imulq $$64, adler, %rdx // rdx = 64*adler
811 movaps (buf), %xmm1 // 1st 16 bytes vector
812 movaps 16(buf), %xmm7 // 2nd 16 bytes vector
813 movaps 32(buf), %xmm10 // 3rd 16 bytes vector
814 movaps 48(buf), %xmm11 // 4th 16 bytes vector
816 movaps %xmm1, %xmm3 // 1st vector
817 movaps %xmm11, %xmm2 // 4th vector
818 psadbw zero, %xmm1 // 1st vector for adler
819 psadbw zero, %xmm11 // 4th vector for adler
821 addq %rdx, sum2 // sum2 += adler*64;
823 pmaddubsw %xmm8, %xmm3 // 8 16-bit words to be added for sum2 : 1st vector
824 pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 : 4th vector
825 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
826 pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
828 paddd %xmm11, %xmm1 // 2 16-bit words to be added for adler in xmm1
829 paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
831 movaps %xmm7, %xmm2 // 2nd vector
832 movaps %xmm10, %xmm11 // 3rd vector
834 psadbw zero, %xmm7 // 2nd vector for adler
835 psadbw zero, %xmm10 // 3rd vector for adler
837 pmaddubsw %xmm9, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector
838 pmaddubsw %xmm6, %xmm11 // 8 16-bit words to be added for sum2 : 3rd vector
839 pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
840 pmaddwd ones, %xmm11 // 4 32-bit elements to be added for sum2 in xmm11
842 paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1
843 paddd %xmm10, %xmm1 // 2 16-bit words to be added for adler in xmm1
845 paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
846 paddd %xmm11, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
848 addq $$64, buf // buf -> vector for next iteration
850 movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2
851 paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler
852 movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
853 movd %xmm1, %edx // to be added to adler
854 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
855 addq %rdx, adler // update adler
856 movd %xmm3, %edx // to be added to sum2
857 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
858 addq %rdx, sum2 // sum2 += 1st half of update
859 movd %xmm3, %edx // to be added to sum2
860 addq %rdx, sum2 // sum2 += 2nd half of update
863 // need to fill up xmm4/xmm5/xmm6 only if len>=16
cmp $16, len // len vs 16
jl skip_loading_tables // if len < 16, skip loading the coefficient tables
867 // set up table starting address to %eax
868 leaq sum2_coefficients(%rip), %rax
870 // reading coefficients
872 movaps (%rax), %xmm8 // coefficients for computing sum2 : pmaddubsw 64:49
873 movaps 16(%rax), %xmm9 // coefficients for computing sum2 : pmaddubsw 48:33
874 movaps 32(%rax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17
875 movaps 48(%rax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1
876 movaps 64(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
881 cmpq $NMAX, len // len vs NMAX
882 jl len_lessthan_NMAX // if (len < NMAX), skip the following NMAX batches processing
884 len_ge_NMAX_loop: // while (len>=NMAX) {
886 subq $NMAX, len // len -= NMAX
887 movq $(NMAX/64), %rax // n = NMAX/64
890 DO64 // update adler/sum2 for a 64-byte input
892 jg n_loop // } while (n);
894 DO48 // update adler/sum2 for a 48-byte input
896 modulo_BASE // (adler/sum2) modulo BASE;
899 jge len_ge_NMAX_loop // } /* len>=NMAX */
903 subq $64, len // pre-decrement len by 64
904 jl len_lessthan_64 // if len < 64, skip the 64-vector code
905 len64_loop: // while (len>=64) {
906 DO64 // update adler/sum2 for a 64-byte input
907 subq $64, len // len -= 64;
911 addq $(64-32), len // post-increment 64 + pre-decrement 32 of len
912 jl len_lessthan_32 // if len < 32, skip the 32-vector code
913 DO32 // update adler/sum2 for a 32-byte input
914 subq $32, len // len -= 32;
918 addq $(32-16), len // post-increment by 32 + pre-decrement by 16 on len
919 jl len_lessthan_16 // if len < 16, skip the 16-vector code
920 DO16 // update adler/sum2 for a 16-byte input
921 subq $16, len // len -= 16;
924 addq $16, len // post-increment len by 16
925 jz len_is_zero // if len==0, branch over scalar processing
927 scalar_loop: // while (len) {
928 movzbq (buf), %rdx // new input byte
930 addq %rdx, adler // adler += *buf
931 addq adler, sum2 // sum2 += adler
937 modulo_BASE // (adler/sum2) modulo BASE;
939 // construct 32-bit (sum2<<16 | adler) to be returned
941 salq $16, sum2 // sum2 <<16
942 movq adler, %rax // adler
943 orq sum2, %rax // sum2<<16 | adler
946 #ifdef KERNEL // if for kernel, restore %xmm0-%xmm11
947 movaps -32(%rbp), %xmm0
948 movaps -48(%rbp), %xmm1
949 movaps -64(%rbp), %xmm2
950 movaps -80(%rbp), %xmm3
951 movaps -96(%rbp), %xmm4
952 movaps -112(%rbp), %xmm5
953 movaps -128(%rbp), %xmm6
954 movaps -144(%rbp), %xmm7
955 movaps -160(%rbp), %xmm8
956 movaps -176(%rbp), %xmm9
957 movaps -192(%rbp), %xmm10
958 movaps -208(%rbp), %xmm11
959 addq $200, %rsp // we've already restored %xmm0-%xmm11 from stack
leave // restore %rbp and %rsp
969 sum2_coefficients: // used for vectorizing adler32 computation
971 // coefficients for pmaddubsw instruction, used to generate 16-bit elements for sum2
1038 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
1048 #endif // (defined __i386__)
1050 #endif // (defined __i386__ || defined __x86_64__)