]> git.saurik.com Git - apple/xnu.git/blob - libkern/zlib/intel/adler32vec.s
df9dcf3281cc424207ee6d8efee50b06296e4daa
[apple/xnu.git] / libkern / zlib / intel / adler32vec.s
1 /* Apple Copyright 2009
2 CoreOS - vector & Numerics, cclee 10-22-09
3
4 This following source code implements a vectorized version of adler32 computation that is defined in zlib.
5 The target architectures are x86_64 and i386.
6
Given two unsigned 32-bit values adler and sum2 (both already reduced modulo BASE=65521) and a sequence of input bytes x[0],...,x[N-1],
the adler-sum2 pair is updated according to
9
10 for (i=0;i<N;i++) {
11 adler = (adler+x[i])%BASE;
12 sum2 = (sum2+adler)%BASE;
13 }
14
To reduce the number of modulo operations, it can be shown that if the initial adler and sum2 are less than BASE (=65521),
then adler and sum2 (in 32-bit representation) will never overflow during the next NMAX=5552 bytes. This simplifies the
algorithm to
18
19 for (i=0;i<N;i+=NMAX) {
20 for (k=0;k<NMAX;k++) {
21 adler+=x[i+k];
22 sum2+=adler;
23 }
24 adler%=BASE;
25 sum2%=BASE;
26 }
27
28 The hand optimization of this function is now reduced to
29
30 for (k=0;k<NMAX;k++) {
31 adler+=x[k];
32 sum2+=adler;
33 }
34
This subtask turns out to be very vectorizable. Suppose we perform the adler/sum2 update once per K bytes,
36
37 for (k=0;k<K;k++) {
38 adler+=x[k];
39 sum2+=adler;
40 }
41
42 It can be shown that the sum2-adler pair can be updated according to
43
44 sum2 += adler*K;
45 adler += (x[0] + x[1] + ... + x[K-1]);
46 sum2 += (x[0]*K + x[1]*(K-1) + ... + x[K-1]*1);
47
48 The last 2 equations obviously show that the adler-sum2 pair update can be speeded up using vector processor.
49 The input vector [ x[0] x[1] ... x[K-1] ]. And we need two coefficient vectors
50 [ 1 1 1 ... 1 ] for adler update.
51 [ K K-1 ... 1 ] for sum2 update.
52
53 The implementation below reads vector (K=16,32,48,64) into xmm registers, and sets up coefficient vectors in xmm
54 registers. It then uses SSE instructions to perform the aforementioned vector computation.
55
For i386, NMAX/16 = 347. For each full NMAX-byte block the code runs the DO32 macro (K=32) 173 times,
followed by a single DO16 (K=16), and then reduces adler and sum2 modulo BASE.

For x86_64 (where more xmm registers are available), NMAX/64 = 86. For each full NMAX-byte block
the code runs the DO64 macro (K=64) 86 times, followed by a single DO48 (K=48),
and then reduces adler and sum2 modulo BASE.
62
63 */
64
65 /* added cpu_capability to detect kHasSupplementalSSE3 to branch into code w or wo SupplementalSSE3
66
67 Previously, ssse3 code was intentionally turned off, because Yonah does not support ssse3
68 add code here to probe cpu_capabilities for ssse3 support
69 if ssse3 is supported, branch to ssse3-based code, otherwise use the original code
70
71 cclee 5-3-10
72 */
73
74 #define BASE 65521 /* largest prime smaller than 65536 */
75 #define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
76
77 // uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len) {
78 // unsigned n;
79 // while (len >= NMAX) {
80 // len -= NMAX;
81 // n = NMAX / 16; /* NMAX is divisible by 16 */
82 // do {
83 // DO16(buf); /* 16 sums unrolled */
84 // buf += 16;
85 // } while (--n);
86 // MOD(adler);
87 // MOD(sum2);
88 // }
89 // if (len) { /* avoid modulos if none remaining */
90 // while (len >= 16) {
91 // len -= 16;
92 // DO16(buf);
93 // buf += 16;
94 // }
95 // while (len--) {
96 // adler += *buf++;
97 // sum2 += adler;
98 // }
99 // MOD(adler);
100 // MOD(sum2);
101 // }
102 // return adler | (sum2 << 16);
103 // }
104
105 #if (defined __i386__ || defined __x86_64__)
106
107 #include <i386/cpu_capabilities.h>
108
109 .text
110 .align 4,0x90
111 .globl _adler32_vec
112 _adler32_vec:
113
114 #if (defined __i386__)
115
// i386 prologue: build the frame, preserve the callee-saved registers this
// routine uses, and fetch the four stack arguments.
    pushl   %ebp
    movl    %esp, %ebp

    pushl   %ebx                        // reused below as len
    pushl   %edi                        // reused below as adler
    pushl   %esi                        // reused below as sum2

#ifdef KERNEL
    // Kernel build: xmm0-xmm7 are preserved by this routine itself.
    // 140 bytes = 8 x 16-byte save slots + 12 bytes of padding, so that %esp
    // is 16-byte aligned as required by the movaps stores below.
    subl    $140, %esp
    movaps  %xmm0,   0(%esp)
    movaps  %xmm1,  16(%esp)
    movaps  %xmm2,  32(%esp)
    movaps  %xmm3,  48(%esp)
    movaps  %xmm4,  64(%esp)
    movaps  %xmm5,  80(%esp)
    movaps  %xmm6,  96(%esp)
    movaps  %xmm7, 112(%esp)            // only the SSSE3 path (DO32) touches xmm7
#endif

// Register roles shared by every i386 code path and macro below.
#define adler   %edi    // running adler,   argument at  8(%ebp)
#define sum2    %esi    // running sum2,    argument at 12(%ebp)
#define buf     %ecx    // input pointer,   argument at 16(%ebp)
#define len     %ebx    // bytes remaining, argument at 20(%ebp)
#define zero    %xmm0   // all-zero vector (set by pxor before first use)
#define ones    %xmm5   // eight 16-bit ones, multiplier for pmaddwd

    movl    8(%ebp), adler
    movl    12(%ebp), sum2
    movl    16(%ebp), buf
    movl    20(%ebp), len
146
// modulo_BASE (i386): reduce adler and sum2 modulo BASE in place, without a divide.
//   q = (x * M) >> 47, where M = 0x80078071 is 2^47/BASE rounded up (1/BASE in Q47)
//   x = x - q * BASE
// Clobbers %eax and %edx.
.macro modulo_BASE
    // adler %= BASE
    movl    $$0x80078071, %eax          // M (same bit pattern as -2146992015)
    mull    adler                       // %edx:%eax = adler * M
    shrl    $$15, %edx                  // %edx = (adler * M) >> 47 = floor(adler / BASE)
    imull   $$BASE, %edx, %edx          // %edx = q * BASE
    subl    %edx, adler                 // adler -= q * BASE

    // sum2 %= BASE
    movl    $$0x80078071, %eax          // M again: mull overwrote %eax
    mull    sum2                        // %edx:%eax = sum2 * M
    shrl    $$15, %edx                  // %edx = floor(sum2 / BASE)
    imull   $$BASE, %edx, %eax          // %eax = q * BASE
    subl    %eax, sum2                  // sum2 -= q * BASE
.endmacro
159
160 // update adler/sum2 according to a new 16-byte vector
// DO16 (i386, SSSE3): fold the 16 bytes at (buf) into adler/sum2 and advance buf by 16.
//   sum2  += 16 * adler + sum(x[i] * (16 - i))
//   adler += sum(x[i])
// Expects: zero = 0, %xmm4 = byte weights 16..1, ones = eight 16-bit ones.
// Uses movaps, so buf must be 16-byte aligned.  Clobbers %edx, %xmm1-%xmm3.
.macro DO16
    movaps  (buf), %xmm1                // the 16 input bytes
    movaps  %xmm1, %xmm3                // second copy for the weighted sum
    addl    $$16, buf                   // buf -> next vector

    // sum2 += 16 * adler, using the adler value from before this block
    imull   $$16, adler, %edx
    addl    %edx, sum2

    // adler += x[0] + ... + x[15]
    psadbw  zero, %xmm1                 // one byte-sum per 8-byte half
    movhlps %xmm1, %xmm2                // bring the upper half's sum down
    paddq   %xmm2, %xmm1                // low lane = total byte sum
    movd    %xmm1, %edx
    addl    %edx, adler

    // sum2 += 16*x[0] + 15*x[1] + ... + 1*x[15]
    pmaddubsw %xmm4, %xmm3              // 8 words: weighted byte pairs
    pmaddwd ones, %xmm3                 // 4 dwords: adjacent word pairs summed
    movhlps %xmm3, %xmm2                // upper 2 dwords ...
    paddd   %xmm2, %xmm3                // ... added onto the lower 2
    movd    %xmm3, %edx                 // first remaining dword
    psrlq   $$32, %xmm3                 // expose the second remaining dword
    addl    %edx, sum2
    movd    %xmm3, %edx
    addl    %edx, sum2
.endm
182
183 // update adler/sum2 according to a new 32-byte vector
// DO32 (i386, SSSE3): fold the 32 bytes at (buf) into adler/sum2 and advance buf by 32.
//   sum2  += 32 * adler + sum(x[i] * (32 - i))
//   adler += sum(x[i])
// Expects: zero = 0, %xmm6 = byte weights 32..17, %xmm4 = byte weights 16..1,
//          ones = eight 16-bit ones.
// Uses movaps, so buf must be 16-byte aligned.  Clobbers %edx, %xmm1-%xmm3, %xmm7.
.macro DO32
    imull   $$32, adler, %edx           // edx = 32*adler (adler as it was before this block)
    movaps  (buf), %xmm1                // 1st 16-byte vector
    movaps  16(buf), %xmm7              // 2nd 16-byte vector
    movaps  %xmm1, %xmm3                // copy of 1st vector: unsigned-byte operand of pmaddubsw
    movaps  %xmm7, %xmm2                // copy of 2nd vector: unsigned-byte operand of pmaddubsw
    psadbw  zero, %xmm1                 // byte sums of the 1st vector (one per 8-byte half)
    psadbw  zero, %xmm7                 // byte sums of the 2nd vector
    addl    %edx, sum2                  // sum2 += adler*32
    pmaddubsw %xmm6, %xmm3              // 8 words: 1st vector weighted 32..17
    pmaddubsw %xmm4, %xmm2              // 8 words: 2nd vector weighted 16..1
    paddd   %xmm7, %xmm1                // combined byte sums for adler
    // The operands are eight 16-bit lanes, so add them as words; this matches the
    // x86_64 DO32.  Each lane is at most 255*(32+31) + 255*(16+15) = 23970, so the
    // word add cannot wrap and stays positive for the signed pmaddwd below.
    paddw   %xmm2, %xmm3                // 8 words to be added for sum2
    addl    $$32, buf                   // buf -> vector for the next iteration
    movhlps %xmm1, %xmm2                // upper half's byte sum (for adler)
    pmaddwd ones, %xmm3                 // 4 dwords to be added for sum2
    paddq   %xmm2, %xmm1                // low lane = total byte sum
    movhlps %xmm3, %xmm2                // upper 2 dwords of the sum2 partials
    movd    %xmm1, %edx                 // total byte sum
    paddd   %xmm2, %xmm3                // 2 dwords left to be added to sum2
    addl    %edx, adler                 // update adler
    movd    %xmm3, %edx                 // first remaining dword
    psrlq   $$32, %xmm3                 // expose the second remaining dword
    addl    %edx, sum2                  // sum2 += 1st half of the update
    movd    %xmm3, %edx
    addl    %edx, sum2                  // sum2 += 2nd half of the update
.endm
211
212 // this defines the macro DO16 for SSSE3 not supported
// DO16_nossse3 (i386, SSE2 only): same update as DO16, but the weighted sum is
// built by widening the bytes to words and using pmullw instead of pmaddubsw.
// Expects: zero = 0, %xmm6 = word weights 16..9, %xmm4 = word weights 8..1,
//          ones = eight 16-bit ones.
// Uses movaps, so buf must be 16-byte aligned.  Clobbers %edx, %xmm1-%xmm3.
.macro DO16_nossse3
    movaps  (buf), %xmm1                // the 16 input bytes
    addl    $$16, buf                   // buf -> next vector

    // sum2 += 16 * adler, using the adler value from before this block
    imull   $$16, adler, %edx
    addl    %edx, sum2

    // weighted words: x[0..7] * 16..9  +  x[8..15] * 8..1
    movaps  %xmm1, %xmm3                // lower 8 bytes are taken from this copy
    movaps  %xmm1, %xmm2                // upper 8 bytes are taken from this copy
    psrldq  $$8, %xmm2                  // move the upper 8 bytes down
    punpcklbw zero, %xmm3               // lower 8 bytes -> 8 words
    punpcklbw zero, %xmm2               // upper 8 bytes -> 8 words
    pmullw  %xmm6, %xmm3                // * 16..9
    pmullw  %xmm4, %xmm2                // * 8..1
    paddw   %xmm2, %xmm3                // 8 words to be added for sum2
    pmaddwd ones, %xmm3                 // 4 dwords: adjacent word pairs summed

    // adler += x[0] + ... + x[15]
    psadbw  zero, %xmm1                 // one byte-sum per 8-byte half
    movhlps %xmm1, %xmm2                // bring the upper half's sum down
    paddq   %xmm2, %xmm1                // low lane = total byte sum
    movd    %xmm1, %edx
    addl    %edx, adler

    // sum2 += horizontal sum of the 4 dwords
    movhlps %xmm3, %xmm2                // upper 2 dwords ...
    paddd   %xmm2, %xmm3                // ... added onto the lower 2
    movd    %xmm3, %edx                 // first remaining dword
    psrlq   $$32, %xmm3                 // expose the second remaining dword
    addl    %edx, sum2
    movd    %xmm3, %edx
    addl    %edx, sum2
.endm
240
// Runtime dispatch (i386): read the cpu capability word and use the SSSE3
// implementation only when kHasSupplementalSSE3 is set.
#ifdef KERNEL
    movl    __cpu_capabilities, %eax    // %eax = __cpu_capabilities
#else
    mov _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
    testl   $(kHasSupplementalSSE3), %eax   // __cpu_capabilities & kHasSupplementalSSE3
    je      L_no_ssse3

    // ---------------- i386 adler32 with SSSE3 ----------------

    // The coefficient registers are only needed if a 16-byte block will be processed.
    cmpl    $16, len
    jl      L_skip_loading_tables

    pxor    zero, zero
    movaps  sum2_coefficients, %xmm6        // pmaddubsw byte weights 32..17
    movaps  sum2_coefficients+16, %xmm4     // pmaddubsw byte weights 16..1
    movaps  sum2_coefficients+32, ones      // pmaddwd multiplier 1,1,...,1

L_skip_loading_tables:

    cmpl    $NMAX, len
    jl      L_len_lessthan_NMAX             // fewer than NMAX bytes: no full batch

L_len_ge_NMAX_loop:                         // while (len >= NMAX) {
    subl    $NMAX, len                      //   len -= NMAX
    movl    $(NMAX/32), %eax                //   n = NMAX/32 (173)
L_n_loop:                                   //   do {
    DO32                                    //     consume 32 bytes
    decl    %eax
    jg      L_n_loop                        //   } while (--n > 0)
    DO16                                    //   the remaining 16 bytes of the NMAX block
    modulo_BASE                             //   reduce before the 32-bit sums can overflow
    cmpl    $NMAX, len
    jge     L_len_ge_NMAX_loop              // }

L_len_lessthan_NMAX:

    subl    $32, len                        // len is kept biased by -32 inside the loop
    jl      L_len_lessthan_32
L_len32_loop:                               // while (len >= 32) {
    DO32
    subl    $32, len
    jge     L_len32_loop                    // }

L_len_lessthan_32:

    addl    $(32-16), len                   // undo the -32 bias, apply a -16 bias
    jl      L_len_lessthan_16
    DO16                                    // at most one 16-byte block is left
    subl    $16, len

    // Shared tail: the no-SSSE3 path also jumps here with len biased by -16.
L_len_lessthan_16:
    addl    $16, len                        // undo the -16 bias
    jz      L_len_is_zero                   // nothing left for the byte loop

0:                                          // do {
    movzbl  (buf), %edx                     //   next input byte
    incl    buf
    addl    %edx, adler                     //   adler += byte
    addl    adler, sum2                     //   sum2 += adler
    subl    $1, len
    jg      0b                              // } while (--len > 0)

L_len_is_zero:

    modulo_BASE                             // final reduction of adler and sum2

    // return value: (sum2 << 16) | adler
    sall    $16, sum2
    movl    adler, %eax
    orl     sum2, %eax

#ifdef KERNEL
    // Kernel build: restore xmm0-xmm7 from the save area set up in the prologue.
    movaps    0(%esp), %xmm0
    movaps   16(%esp), %xmm1
    movaps   32(%esp), %xmm2
    movaps   48(%esp), %xmm3
    movaps   64(%esp), %xmm4
    movaps   80(%esp), %xmm5
    movaps   96(%esp), %xmm6
    movaps  112(%esp), %xmm7
    addl    $140, %esp                      // release the save area
#endif

    popl    %esi
    popl    %edi
    popl    %ebx
    leave                                   // restore %esp and %ebp
    ret
340
341
L_no_ssse3:

    // ---------------- i386 adler32 without SSSE3 ----------------
    // Entered from the capability check with adler/sum2/buf/len already loaded.
    // Leaves through L_len_lessthan_16 (byte loop, final modulo and epilogue),
    // which expects len biased by -16.

    // The coefficient registers are only needed if a 16-byte block will be processed.
    cmpl    $16, len
    jl      2f

    leal    sum2_coefficients, %eax         // %eax -> coefficient table

    pxor    zero, zero
    movaps  48(%eax), %xmm6                 // pmullw word weights 16..9
    movaps  64(%eax), %xmm4                 // pmullw word weights 8..1
    movaps  80(%eax), ones                  // pmaddwd multiplier 1,1,...,1

2:

    cmpl    $NMAX, len
    jl      3f                              // fewer than NMAX bytes: no full batch

0:                                          // while (len >= NMAX) {
    subl    $NMAX, len                      //   len -= NMAX
    movl    $(NMAX/16), %eax                //   n = NMAX/16

1:                                          //   do {
    DO16_nossse3                            //     consume 16 bytes
    decl    %eax
    jg      1b                              //   } while (--n > 0)

    modulo_BASE                             //   reduce before the 32-bit sums can overflow

    cmpl    $NMAX, len
    jge     0b                              // }

3:

    // Fewer than NMAX bytes remain.  Vectorise every remaining 16-byte block;
    // the previous code handled only one block here and pushed the rest of the
    // data (up to several KB) through the byte-at-a-time loop.
    subl    $16, len                        // len is kept biased by -16 inside the loop
    jl      L_len_lessthan_16
4:                                          // while (len >= 16) {
    DO16_nossse3                            //   consume 16 bytes
    subl    $16, len
    jge     4b                              // }
    jmp     L_len_lessthan_16               // 0..15 bytes left (len still biased by -16)
386
387
// Coefficient tables for the i386 code paths.  They are loaded with movaps,
// so the table has to start on a 16-byte boundary.
    .const
    .align 4
sum2_coefficients:

    // offset 0: pmaddubsw byte weights for the SSSE3 path (32..17, then 16..1)
    .byte   32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
    .byte   16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1

    // offset 32: pmaddwd multiplier for the SSSE3 path
    .word   1, 1, 1, 1, 1, 1, 1, 1

    // offset 48: pmullw word weights for the no-SSSE3 path (16..9, then 8..1)
    .word   16, 15, 14, 13, 12, 11, 10, 9
    .word    8,  7,  6,  5,  4,  3,  2, 1

    // offset 80: pmaddwd multiplier for the no-SSSE3 path
    .word   1, 1, 1, 1, 1, 1, 1, 1
464
465 #else // (defined __x86_64__)
466
// x86_64 entry: read __cpu_capabilities and branch to the SSSE3 implementation
// when kHasSupplementalSSE3 is set; otherwise fall through to the SSE2-only code.
    movq    __cpu_capabilities@GOTPCREL(%rip), %rax     // %rax -> __cpu_capabilities
    movl    (%rax), %eax                                // %eax = __cpu_capabilities
    testl   $(kHasSupplementalSSE3), %eax
    jne     L_has_ssse3

    // ----------------------------------------------------------------------------------
    // x86_64 without SSSE3: a translation of the i386 no-SSSE3 code.
    //
    // Arguments on entry (passed in registers):
    //      adler : rdi
    //      sum2  : rsi
    //      buf   : rdx
    //      len   : rcx
    // NOTE(review): the code below operates on the full 64-bit registers; this presumes
    // the caller passes adler/sum2/len already extended to 64 bits -- confirm against
    // the C prototype.
    // ----------------------------------------------------------------------------------

    pushq   %rbp
    movq    %rsp, %rbp
    pushq   %rbx                        // callee-saved; reused below as len

#ifdef KERNEL
    // Kernel build: preserve the xmm registers this path uses (xmm0-xmm6).
    // The reservation is 200 bytes (room for xmm0-xmm11 plus 8 bytes of alignment
    // padding), the same size the SSSE3 path uses.
    subq    $200, %rsp
    movaps  %xmm0,  -32(%rbp)
    movaps  %xmm1,  -48(%rbp)
    movaps  %xmm2,  -64(%rbp)
    movaps  %xmm3,  -80(%rbp)
    movaps  %xmm4,  -96(%rbp)
    movaps  %xmm5, -112(%rbp)
    movaps  %xmm6, -128(%rbp)
#endif

// Register roles for the x86_64 code paths and macros.
#define adler   %rdi    // running adler (arrives in %rdi)
#define sum2    %rsi    // running sum2  (arrives in %rsi)
#define buf     %rcx    // input pointer (arrives in %rdx, moved below)
#define len     %rbx    // bytes remaining (arrives in %rcx, moved below)
#define zero    %xmm0   // all-zero vector (set by pxor before first use)
#define ones    %xmm5   // eight 16-bit ones, multiplier for pmaddwd

    movq    %rcx, len                   // must come first: %rcx is about to become buf
    movq    %rdx, buf
507
// modulo_BASE (x86_64): reduce adler and sum2 modulo BASE in place, without a divide.
//   q = (x * M) >> 47, where M = 0x80078071 is 2^47/BASE rounded up (1/BASE in Q47)
//   x = x - q * BASE
// Only the low 32 bits of adler (%edi) and sum2 (%esi) enter the multiplication.
// Clobbers %rax and %rdx.
.macro modulo_BASE
    // adler %= BASE
    movl    $$0x80078071, %eax          // M (same bit pattern as -2146992015)
    mull    %edi                        // %edx:%eax = (low 32 bits of adler) * M
    shrl    $$15, %edx                  // %edx = product >> 47 = floor(adler / BASE)
    imull   $$BASE, %edx, %edx          // %edx = q * BASE (upper half of %rdx cleared)
    subq    %rdx, adler                 // adler -= q * BASE

    // sum2 %= BASE
    movl    $$0x80078071, %eax          // M again: mull overwrote %eax
    mull    %esi                        // %edx:%eax = (low 32 bits of sum2) * M
    shrl    $$15, %edx                  // %edx = floor(sum2 / BASE)
    imull   $$BASE, %edx, %eax          // %eax = q * BASE (upper half of %rax cleared)
    subq    %rax, sum2                  // sum2 -= q * BASE
.endmacro
520
521 // update adler/sum2 according to a new 16-byte vector, no ssse3
// DO16_nossse3 (x86_64, SSE2 only): fold the 16 bytes at (buf) into adler/sum2
// and advance buf by 16.
//   sum2  += 16 * adler + sum(x[i] * (16 - i))
//   adler += sum(x[i])
// Expects: zero = 0, %xmm6 = word weights 16..9, %xmm4 = word weights 8..1,
//          ones = eight 16-bit ones.
// Uses movaps, so buf must be 16-byte aligned.  Clobbers %rdx, %xmm1-%xmm3.
.macro DO16_nossse3
    movaps  (buf), %xmm1                // the 16 input bytes
    addq    $$16, buf                   // buf -> next vector

    // sum2 += 16 * adler, using the adler value from before this block
    imulq   $$16, adler, %rdx
    addq    %rdx, sum2

    // weighted words: x[0..7] * 16..9  +  x[8..15] * 8..1
    movaps  %xmm1, %xmm3                // lower 8 bytes are taken from this copy
    movaps  %xmm1, %xmm2                // upper 8 bytes are taken from this copy
    psrldq  $$8, %xmm2                  // move the upper 8 bytes down
    punpcklbw zero, %xmm3               // lower 8 bytes -> 8 words
    punpcklbw zero, %xmm2               // upper 8 bytes -> 8 words
    pmullw  %xmm6, %xmm3                // * 16..9
    pmullw  %xmm4, %xmm2                // * 8..1
    paddw   %xmm2, %xmm3                // 8 words to be added for sum2
    pmaddwd ones, %xmm3                 // 4 dwords: adjacent word pairs summed

    // adler += x[0] + ... + x[15]
    psadbw  zero, %xmm1                 // one byte-sum per 8-byte half
    movhlps %xmm1, %xmm2                // bring the upper half's sum down
    paddq   %xmm2, %xmm1                // low lane = total byte sum
    movd    %xmm1, %edx                 // writing %edx clears the upper half of %rdx
    addq    %rdx, adler

    // sum2 += horizontal sum of the 4 dwords
    movhlps %xmm3, %xmm2                // upper 2 dwords ...
    paddd   %xmm2, %xmm3                // ... added onto the lower 2
    movd    %xmm3, %edx                 // first remaining dword
    psrlq   $$32, %xmm3                 // expose the second remaining dword
    addq    %rdx, sum2
    movd    %xmm3, %edx
    addq    %rdx, sum2
.endm
549
550 // need to fill up xmm4/xmm5/xmm6 only if len>=16
// x86_64 no-SSSE3 main body: NMAX-sized batches, then every remaining 16-byte
// block, then the last 0..15 bytes one at a time, then the final modulo, the
// return value and the epilogue.

    // The coefficient registers are only needed if a 16-byte block will be processed.
    cmpq    $16, len
    jl      L_nossse3_tables_done

    leaq    sum2_coefficients_nossse3(%rip), %rax   // %rax -> coefficient table

    pxor    zero, zero
    movaps  (%rax), %xmm6                   // pmullw word weights 16..9
    movaps  16(%rax), %xmm4                 // pmullw word weights 8..1
    movaps  32(%rax), ones                  // pmaddwd multiplier 1,1,...,1

L_nossse3_tables_done:

    cmpq    $NMAX, len
    jl      L_nossse3_tail                  // fewer than NMAX bytes: no full batch

L_nossse3_nmax_loop:                        // while (len >= NMAX) {
    subq    $NMAX, len                      //   len -= NMAX
    movl    $(NMAX/16), %eax                //   n = NMAX/16

L_nossse3_block_loop:                       //   do {
    DO16_nossse3                            //     consume 16 bytes
    decl    %eax
    jg      L_nossse3_block_loop            //   } while (--n > 0)

    modulo_BASE                             //   reduce before the 32-bit sums can overflow

    cmpq    $NMAX, len
    jge     L_nossse3_nmax_loop             // }

L_nossse3_tail:

    // Fewer than NMAX bytes remain.  Vectorise every remaining 16-byte block;
    // the previous code handled only one block here and pushed the rest of the
    // data (up to several KB) through the byte-at-a-time loop.
    subq    $16, len                        // len is kept biased by -16 inside the loop
    jl      L_nossse3_lt16
L_nossse3_loop16:                           // while (len >= 16) {
    DO16_nossse3                            //   consume 16 bytes
    subq    $16, len
    jge     L_nossse3_loop16                // }

L_nossse3_lt16:
    addq    $16, len                        // undo the -16 bias: 0..15 bytes left
    jz      L_nossse3_done                  // nothing left for the byte loop

L_nossse3_scalar:                           // do {
    movzbq  (buf), %rdx                     //   next input byte
    incq    buf
    addq    %rdx, adler                     //   adler += byte
    addq    adler, sum2                     //   sum2 += adler
    decq    len
    jg      L_nossse3_scalar                // } while (--len > 0)

L_nossse3_done:

    modulo_BASE                             // final reduction of adler and sum2

    // return value: (sum2 << 16) | adler
    salq    $16, sum2
    movq    adler, %rax
    orq     sum2, %rax

#ifdef KERNEL
    // Kernel build: restore the xmm registers saved in the prologue (xmm0-xmm6).
    movaps   -32(%rbp), %xmm0
    movaps   -48(%rbp), %xmm1
    movaps   -64(%rbp), %xmm2
    movaps   -80(%rbp), %xmm3
    movaps   -96(%rbp), %xmm4
    movaps  -112(%rbp), %xmm5
    movaps  -128(%rbp), %xmm6
    addq    $200, %rsp                      // release the save area
#endif

    popq    %rbx
    leave
    ret
625
626
627
// Coefficient table for the x86_64 no-SSSE3 path.  It is loaded with movaps,
// so the table has to start on a 16-byte boundary.
    .const
    .align 4
sum2_coefficients_nossse3:

    // offset 0: pmullw word weights (16..9, then 8..1)
    .word   16, 15, 14, 13, 12, 11, 10, 9
    .word    8,  7,  6,  5,  4,  3,  2, 1

    // offset 32: pmaddwd multiplier, folds word pairs into 4 dwords for sum2
    .word   1, 1, 1, 1, 1, 1, 1, 1

    // back to code for the SSSE3 implementation that follows
    .text
663
664 // ----------------------------------------------------------------------------------
665 // the following is the original x86_64 adler32_vec code that uses SSSE3 instructions
666 // ----------------------------------------------------------------------------------
667
L_has_ssse3:

    // x86_64 SSSE3 implementation.
    // Arguments on entry (passed in registers):
    //      adler : rdi
    //      sum2  : rsi
    //      buf   : rdx
    //      len   : rcx

    pushq   %rbp
    movq    %rsp, %rbp
    pushq   %rbx                        // callee-saved; reused below as len

#ifdef KERNEL
    // Kernel build: preserve xmm0-xmm11.
    // 200 bytes = 12 x 16-byte save slots (192) + 8 bytes so %rsp stays 16-byte aligned.
    subq    $200, %rsp
    movaps  %xmm0,   -32(%rbp)
    movaps  %xmm1,   -48(%rbp)
    movaps  %xmm2,   -64(%rbp)
    movaps  %xmm3,   -80(%rbp)
    movaps  %xmm4,   -96(%rbp)
    movaps  %xmm5,  -112(%rbp)
    movaps  %xmm6,  -128(%rbp)
    movaps  %xmm7,  -144(%rbp)
    movaps  %xmm8,  -160(%rbp)
    movaps  %xmm9,  -176(%rbp)
    movaps  %xmm10, -192(%rbp)
    movaps  %xmm11, -208(%rbp)
#endif

// Register roles for the x86_64 code paths and macros.
#define adler   %rdi    // running adler (arrives in %rdi)
#define sum2    %rsi    // running sum2  (arrives in %rsi)
#define buf     %rcx    // input pointer (arrives in %rdx, moved below)
#define len     %rbx    // bytes remaining (arrives in %rcx, moved below)
#define zero    %xmm0   // all-zero vector (set by pxor before first use)
#define ones    %xmm5   // eight 16-bit ones, multiplier for pmaddwd

    movq    %rcx, len                   // must come first: %rcx is about to become buf
    movq    %rdx, buf
705
706 // update adler/sum2 according to a new 16-byte vector
// DO16 (x86_64, SSSE3): fold the 16 bytes at (buf) into adler/sum2 and advance buf by 16.
//   sum2  += 16 * adler + sum(x[i] * (16 - i))
//   adler += sum(x[i])
// Expects: zero = 0, %xmm4 = byte weights 16..1, ones = eight 16-bit ones.
// Uses movaps, so buf must be 16-byte aligned.  Clobbers %rdx, %xmm1-%xmm3.
.macro DO16
    movaps  (buf), %xmm1                // the 16 input bytes
    movaps  %xmm1, %xmm3                // second copy for the weighted sum
    addq    $$16, buf                   // buf -> next vector

    // sum2 += 16 * adler, using the adler value from before this block
    imulq   $$16, adler, %rdx
    addq    %rdx, sum2

    // adler += x[0] + ... + x[15]
    psadbw  zero, %xmm1                 // one byte-sum per 8-byte half
    movhlps %xmm1, %xmm2                // bring the upper half's sum down
    paddq   %xmm2, %xmm1                // low lane = total byte sum
    movd    %xmm1, %edx                 // writing %edx clears the upper half of %rdx
    addq    %rdx, adler

    // sum2 += 16*x[0] + 15*x[1] + ... + 1*x[15]
    pmaddubsw %xmm4, %xmm3              // 8 words: weighted byte pairs
    pmaddwd ones, %xmm3                 // 4 dwords: adjacent word pairs summed
    movhlps %xmm3, %xmm2                // upper 2 dwords ...
    paddd   %xmm2, %xmm3                // ... added onto the lower 2
    movd    %xmm3, %edx                 // first remaining dword
    psrlq   $$32, %xmm3                 // expose the second remaining dword
    addq    %rdx, sum2
    movd    %xmm3, %edx
    addq    %rdx, sum2
.endm
728
729 // update adler/sum2 according to a new 32-byte vector
// DO32 (x86_64, SSSE3): fold the 32 bytes at (buf) into adler/sum2 and advance buf by 32.
//   sum2  += 32 * adler + sum(x[i] * (32 - i))
//   adler += sum(x[i])
// Expects: zero = 0, %xmm6 = byte weights 32..17, %xmm4 = byte weights 16..1,
//          ones = eight 16-bit ones.
// Uses movaps, so buf must be 16-byte aligned.  Clobbers %rdx, %xmm1-%xmm3, %xmm7.
.macro DO32
    movaps  (buf), %xmm1                // 1st 16-byte vector
    movaps  16(buf), %xmm7              // 2nd 16-byte vector
    addq    $$32, buf                   // buf -> vector for the next iteration

    // sum2 += 32 * adler, using the adler value from before this block
    imulq   $$32, adler, %rdx
    addq    %rdx, sum2

    // weighted words for sum2
    movaps  %xmm1, %xmm3                // copy of 1st vector (unsigned-byte operand)
    movaps  %xmm7, %xmm2                // copy of 2nd vector (unsigned-byte operand)
    pmaddubsw %xmm6, %xmm3              // 1st vector weighted 32..17
    pmaddubsw %xmm4, %xmm2              // 2nd vector weighted 16..1
    paddw   %xmm2, %xmm3                // 8 words to be added for sum2
    pmaddwd ones, %xmm3                 // 4 dwords: adjacent word pairs summed

    // adler += sum of all 32 bytes
    psadbw  zero, %xmm1                 // byte sums of the 1st vector
    psadbw  zero, %xmm7                 // byte sums of the 2nd vector
    paddd   %xmm7, %xmm1
    movhlps %xmm1, %xmm2                // bring the upper half's sum down
    paddq   %xmm2, %xmm1                // low lane = total byte sum
    movd    %xmm1, %edx                 // writing %edx clears the upper half of %rdx
    addq    %rdx, adler

    // sum2 += horizontal sum of the 4 dwords
    movhlps %xmm3, %xmm2                // upper 2 dwords ...
    paddd   %xmm2, %xmm3                // ... added onto the lower 2
    movd    %xmm3, %edx                 // first remaining dword
    psrlq   $$32, %xmm3                 // expose the second remaining dword
    addq    %rdx, sum2
    movd    %xmm3, %edx
    addq    %rdx, sum2
.endm
757
758 // update adler/sum2 according to a new 48-byte vector
759
// DO48 (x86_64, SSSE3): fold the 48 bytes at (buf) into adler/sum2 and advance buf by 48.
//   sum2  += 48 * adler + sum(x[i] * (48 - i))
//   adler += sum(x[i])
// Expects: zero = 0, %xmm9 = byte weights 48..33, %xmm6 = 32..17, %xmm4 = 16..1,
//          ones = eight 16-bit ones.
// Uses movaps, so buf must be 16-byte aligned.
// Clobbers %rdx, %xmm1-%xmm3, %xmm7, %xmm10, %xmm11.
.macro DO48
    movaps  (buf), %xmm7                // 1st 16-byte vector
    movaps  16(buf), %xmm10             // 2nd 16-byte vector
    movaps  32(buf), %xmm11             // 3rd 16-byte vector
    addq    $$48, buf                   // buf -> vector for the next iteration

    // sum2 += 48 * adler, using the adler value from before this block
    imulq   $$48, adler, %rdx
    addq    %rdx, sum2

    // weighted partial sums for sum2, accumulated as 4 dwords in %xmm3
    movaps  %xmm7, %xmm1                // copy of 1st vector
    movaps  %xmm10, %xmm2               // copy of 2nd vector
    movaps  %xmm11, %xmm3               // copy of 3rd vector
    pmaddubsw %xmm9, %xmm1              // 1st vector weighted 48..33
    pmaddubsw %xmm6, %xmm2              // 2nd vector weighted 32..17
    pmaddubsw %xmm4, %xmm3              // 3rd vector weighted 16..1
    pmaddwd ones, %xmm1                 // widen each to 4 dwords
    pmaddwd ones, %xmm2
    pmaddwd ones, %xmm3
    paddd   %xmm1, %xmm3
    paddd   %xmm2, %xmm3

    // adler += sum of all 48 bytes
    psadbw  zero, %xmm7                 // byte sums of the 1st vector
    psadbw  zero, %xmm10                // byte sums of the 2nd vector
    psadbw  zero, %xmm11                // byte sums of the 3rd vector
    paddd   %xmm10, %xmm7
    paddd   %xmm11, %xmm7
    movhlps %xmm7, %xmm2                // bring the upper half's sum down
    paddq   %xmm2, %xmm7                // low lane = total byte sum
    movd    %xmm7, %edx                 // writing %edx clears the upper half of %rdx
    addq    %rdx, adler

    // sum2 += horizontal sum of the 4 dwords
    movhlps %xmm3, %xmm2                // upper 2 dwords ...
    paddd   %xmm2, %xmm3                // ... added onto the lower 2
    movd    %xmm3, %edx                 // first remaining dword
    psrlq   $$32, %xmm3                 // expose the second remaining dword
    addq    %rdx, sum2
    movd    %xmm3, %edx
    addq    %rdx, sum2
.endm
806
807 // update adler/sum2 according to a new 64-byte vector
// DO64 (x86_64, SSSE3): fold the 64 bytes at (buf) into adler/sum2 and advance buf by 64.
//   sum2  += 64 * adler + sum(x[i] * (64 - i))
//   adler += sum(x[i])
// Expects: zero = 0, %xmm8 = byte weights 64..49, %xmm9 = 48..33, %xmm6 = 32..17,
//          %xmm4 = 16..1, ones = eight 16-bit ones.
// Uses movaps, so buf must be 16-byte aligned.
// Clobbers %rdx, %xmm1-%xmm3, %xmm7, %xmm10, %xmm11.
// %xmm2 and %xmm11 are reused as temporaries, so each vector is copied before
// psadbw overwrites it.
.macro DO64
    movaps  (buf), %xmm1                // 1st 16-byte vector
    movaps  16(buf), %xmm7              // 2nd 16-byte vector
    movaps  32(buf), %xmm10             // 3rd 16-byte vector
    movaps  48(buf), %xmm11             // 4th 16-byte vector
    addq    $$64, buf                   // buf -> vector for the next iteration

    // sum2 += 64 * adler, using the adler value from before this block
    imulq   $$64, adler, %rdx
    addq    %rdx, sum2

    // 1st and 4th vectors: weighted partial sums into %xmm3
    movaps  %xmm1, %xmm3                // copy of 1st vector
    movaps  %xmm11, %xmm2               // copy of 4th vector
    pmaddubsw %xmm8, %xmm3              // 1st vector weighted 64..49
    pmaddubsw %xmm4, %xmm2              // 4th vector weighted 16..1
    pmaddwd ones, %xmm3                 // widen each to 4 dwords
    pmaddwd ones, %xmm2
    paddd   %xmm2, %xmm3

    // 1st and 4th vectors: byte sums into %xmm1
    psadbw  zero, %xmm1
    psadbw  zero, %xmm11
    paddd   %xmm11, %xmm1

    // 2nd and 3rd vectors: weighted partial sums (%xmm11 is free again)
    movaps  %xmm7, %xmm2                // copy of 2nd vector
    movaps  %xmm10, %xmm11              // copy of 3rd vector
    pmaddubsw %xmm9, %xmm2              // 2nd vector weighted 48..33
    pmaddubsw %xmm6, %xmm11             // 3rd vector weighted 32..17
    pmaddwd ones, %xmm2                 // widen each to 4 dwords
    pmaddwd ones, %xmm11
    paddd   %xmm2, %xmm3
    paddd   %xmm11, %xmm3

    // 2nd and 3rd vectors: byte sums
    psadbw  zero, %xmm7
    psadbw  zero, %xmm10
    paddd   %xmm7, %xmm1
    paddd   %xmm10, %xmm1

    // adler += sum of all 64 bytes
    movhlps %xmm1, %xmm2                // bring the upper half's sum down
    paddq   %xmm2, %xmm1                // low lane = total byte sum
    movd    %xmm1, %edx                 // writing %edx clears the upper half of %rdx
    addq    %rdx, adler

    // sum2 += horizontal sum of the 4 dwords
    movhlps %xmm3, %xmm2                // upper 2 dwords ...
    paddd   %xmm2, %xmm3                // ... added onto the lower 2
    movd    %xmm3, %edx                 // first remaining dword
    psrlq   $$32, %xmm3                 // expose the second remaining dword
    addq    %rdx, sum2
    movd    %xmm3, %edx
    addq    %rdx, sum2
.endm
862
863 // need to fill up xmm4/xmm5/xmm6 only if len>=16
// x86_64 SSSE3 main body: NMAX-sized batches (86 x DO64 + DO48), then 64/32/16-byte
// blocks, then the last 0..15 bytes one at a time, then the final modulo, the
// return value and the epilogue.

    // The coefficient registers are only needed if a 16-byte block will be processed.
    cmpq    $16, len
    jl      L_skip_loading_tables

    pxor    zero, zero
    movaps  sum2_coefficients(%rip), %xmm8          // pmaddubsw byte weights 64..49
    movaps  sum2_coefficients+16(%rip), %xmm9       // pmaddubsw byte weights 48..33
    movaps  sum2_coefficients+32(%rip), %xmm6       // pmaddubsw byte weights 32..17
    movaps  sum2_coefficients+48(%rip), %xmm4       // pmaddubsw byte weights 16..1
    movaps  sum2_coefficients+64(%rip), ones        // pmaddwd multiplier 1,1,...,1

L_skip_loading_tables:

    cmpq    $NMAX, len
    jl      L_len_lessthan_NMAX             // fewer than NMAX bytes: no full batch

L_len_ge_NMAX_loop:                         // while (len >= NMAX) {
    subq    $NMAX, len                      //   len -= NMAX
    movq    $(NMAX/64), %rax                //   n = NMAX/64 (86)

L_n_loop:                                   //   do {
    DO64                                    //     consume 64 bytes
    decq    %rax
    jg      L_n_loop                        //   } while (--n > 0)

    DO48                                    //   the remaining 48 bytes of the NMAX block

    modulo_BASE                             //   reduce before the 32-bit sums can overflow

    cmpq    $NMAX, len
    jge     L_len_ge_NMAX_loop              // }

L_len_lessthan_NMAX:

    subq    $64, len                        // len is kept biased by -64 inside the loop
    jl      L_len_lessthan_64
L_len64_loop:                               // while (len >= 64) {
    DO64
    subq    $64, len
    jge     L_len64_loop                    // }

L_len_lessthan_64:
    addq    $(64-32), len                   // undo the -64 bias, apply a -32 bias
    jl      L_len_lessthan_32
    DO32                                    // at most one 32-byte block is left
    subq    $32, len

L_len_lessthan_32:
    addq    $(32-16), len                   // undo the -32 bias, apply a -16 bias
    jl      L_len_lessthan_16
    DO16                                    // at most one 16-byte block is left
    subq    $16, len

L_len_lessthan_16:
    addq    $16, len                        // undo the -16 bias: 0..15 bytes left
    jz      L_len_is_zero                   // nothing left for the byte loop

L_scalar_loop:                              // do {
    movzbq  (buf), %rdx                     //   next input byte
    incq    buf
    addq    %rdx, adler                     //   adler += byte
    addq    adler, sum2                     //   sum2 += adler
    decq    len
    jg      L_scalar_loop                   // } while (--len > 0)

L_len_is_zero:

    modulo_BASE                             // final reduction of adler and sum2

    // return value: (sum2 << 16) | adler
    salq    $16, sum2
    movq    adler, %rax
    orq     sum2, %rax

#ifdef KERNEL
    // Kernel build: restore xmm0-xmm11 from the save area set up in the prologue.
    movaps   -32(%rbp), %xmm0
    movaps   -48(%rbp), %xmm1
    movaps   -64(%rbp), %xmm2
    movaps   -80(%rbp), %xmm3
    movaps   -96(%rbp), %xmm4
    movaps  -112(%rbp), %xmm5
    movaps  -128(%rbp), %xmm6
    movaps  -144(%rbp), %xmm7
    movaps  -160(%rbp), %xmm8
    movaps  -176(%rbp), %xmm9
    movaps  -192(%rbp), %xmm10
    movaps  -208(%rbp), %xmm11
    addq    $200, %rsp                      // release the save area
#endif

    popq    %rbx
    leave                                   // restore %rsp and %rbp
    ret
965
966
// Coefficient table for the x86_64 SSSE3 path.  It is loaded with movaps,
// so the table has to start on a 16-byte boundary.
    .const
    .align 4
sum2_coefficients:

    // offsets 0/16/32/48: pmaddubsw byte weights 64..49, 48..33, 32..17, 16..1
    .byte   64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49
    .byte   48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33
    .byte   32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
    .byte   16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1

    // offset 64: pmaddwd multiplier, folds word pairs into 4 dwords for sum2
    .word   1, 1, 1, 1, 1, 1, 1, 1
1047
1048 #endif // (defined __i386__)
1049
1050 #endif // (defined __i386__ || defined __x86_64__)