#include <arm/arch.h>

#define BASE 65521      /* largest prime smaller than 65536 */
#define NMAX 5552       /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
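// A quick check of that bound: with n = NMAX = 5552,
//   255*5552*5553/2 + 5553*(BASE-1) = 3,930,857,640 + 363,832,560 = 4,294,690,200 <= 2^32-1 = 4,294,967,295,
// while n = 5553 already exceeds 2^32-1, so 5552 is indeed the largest such n (and it is divisible by 16,
// as the unrolled loops below require).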

// Note: buf should be 16-byte aligned by the caller.

// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef* buf, int len) {
//     unsigned n;
//     while (len >= NMAX) {
//         len -= NMAX;
//         n = NMAX / 16;              /* NMAX is divisible by 16 */
//         do {
//             DO16(buf);              /* 16 sums unrolled */
//             buf += 16;
//         } while (--n);
//         MOD(adler);
//         MOD(sum2);
//     }
//     if (len) {                      /* avoid modulos if none remaining */
//         while (len >= 16) {
//             len -= 16;
//             DO16(buf);
//             buf += 16;
//         }
//         while (len--) {
//             adler += *buf++;
//             sum2 += adler;
//         }
//         MOD(adler);
//         MOD(sum2);
//     }
//     return adler | (sum2 << 16);    /* return recombined sums */
// }


/*
    DO16 vectorization:
    given initial unsigned int sum2 and adler, and a new set of 16 input bytes (x[0:15]), it can be shown that
        sum2 += (16*adler + 16*x[0] + 15*x[1] + ... + 1*x[15]);
        adler += (x[0] + x[1] + ... + x[15]);

    therefore, this is what can be done to vectorize the above computation
        1. 16-byte aligned vector load into q2 (x[0:15])
        2. sum2 += (adler<<4);
        3. vmull.u8 (q9,q8),q2,d2 where d2 = (1,1,1,...,1), (q9,q8) : 16 16-bit elements x[0:15]
        4. vmull.u8 (q11,q10),q2,q0 where q0 = (16,15,...,1), (q11,q10) : 16 16-bit elements (16:1)*x[0:15]
        5. parallel add (with one expansion to 32-bit) (q9,q8) and (q11,q10) all the way to accumulate into adler and sum2

    In this revision, whenever possible, 2 DO16 loops are combined into a DO32 loop.
        1. 32-byte aligned vector load into q2,q14 (x[0:31])
        2. sum2 += (adler<<5);
        3. vmull.u8 (4 q registers),(q2,q14),d2 where d2 = (1,1,1,...,1), (4 q registers) : 32 16-bit elements x[0:31]
        4. vmull.u8 (4 q registers),(q2,q14),(q15,q0) where (q15,q0) = (32,31,...,1), (4 q regs) : 32 16-bit elements (32:1)*x[0:31]
        5. parallel add (with one expansion to 32-bit) the pair of (4 q regs) all the way to accumulate into adler and sum2

    This change improves the performance by ~0.55 cycles per uncompressed byte on ARM Cortex-A8.

*/
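
// For reference, a scalar C sketch of the DO16 identity above (illustrative only, not part of this file's
// build; the function names are made up for the example). Both forms leave adler and sum2 with the same values:
//
//     #include <stdint.h>
//
//     static void do16_bytewise(uint32_t *adler, uint32_t *sum2, const uint8_t x[16])
//     {
//         for (int i = 0; i < 16; i++) {
//             *adler += x[i];                      /* adler accumulates the raw bytes      */
//             *sum2  += *adler;                    /* sum2 accumulates the running adler   */
//         }
//     }
//
//     static void do16_vector_form(uint32_t *adler, uint32_t *sum2, const uint8_t x[16])
//     {
//         uint32_t byte_sum = 0, weighted = 0;
//         for (int i = 0; i < 16; i++) {
//             byte_sum += x[i];
//             weighted += (16 - i) * x[i];         /* 16*x[0] + 15*x[1] + ... + 1*x[15]    */
//         }
//         *sum2  += 16 * (*adler) + weighted;      /* sum2 += 16*adler + weighted sum      */
//         *adler += byte_sum;
//     }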

/*
    MOD implementation:
    adler%BASE = adler - floor(adler*(1/BASE))*BASE; where (1/BASE) = 0x80078071 in Q47
        1. vmull.u32 q2,(adler,sum2),(1/BASE)   // *(1/BASE) in Q47
        2. vshr.u64 q2,q2,#47                   // floor function
        3. vpadd.u32 d4,d4,d5                   // merge into a double word in d4
        4. vmls.u32 (adler,sum2),d4,d3[0]       // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE

*/
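
// For reference, a C sketch of the same reciprocal-multiply reduction (illustrative only, not part of this
// file's build; the function name is made up). 0x80078071 = ceil(2^47/BASE), and for any 32-bit input the
// multiply-and-shift yields the exact quotient, so the subtraction gives the remainder:
//
//     #include <stdint.h>
//
//     static uint32_t mod_base_q47(uint32_t v)
//     {
//         uint32_t q = (uint32_t)(((uint64_t)v * 0x80078071ULL) >> 47);   /* floor(v / BASE)        */
//         return v - q * 65521u;                                          /* v - floor(v/BASE)*BASE */
//     }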

#if defined _ARM_ARCH_6         // this file would be used only for armv6 or above


        .text
        .align 2
        .globl _adler32_vec
_adler32_vec:

#if (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7)   // for armv6 or armv7 without neon support


#define adler           r0
#define sum2            r1
#define buf             r2
#define len             r3
#define one_by_base     r4
#define base            r5
#define nmax            r6
#define t               r12
#define vecs            lr
#define x0              r8
#define x1              r10
#define x2              r11
#define x3              r12
#define zero            r9

// this macro performs adler/sum2 update for 4 input bytes

.macro DO4
        add     sum2, adler, lsl #2     // sum2 += 4*adler;
        ldr     x0, [buf]               // 4 bytes in 1 32-bit word
        usada8  adler, x0, zero, adler  // adler += sum(x0:x3)
        ldrb    x0, [buf], #4           // x0
        ldrb    x2, [buf, #-2]          // x2
        ldrb    x1, [buf, #-3]          // x1
        ldrb    x3, [buf, #-1]          // x3
        add     sum2, x0, lsl #2        // sum2 += 4*x0
        add     x3, x3, x1, lsl #1      // x3+2*x1
        add     sum2, x2, lsl #1        // sum2 += 2*x2
        add     x3, x1                  // x3+3*x1
        add     sum2, x3                // sum2 += x3+3*x1
.endm
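
// For reference, a C sketch of what one DO4 expansion computes (illustrative only, not part of this file's
// build; the function name is made up). sum2 picks up 4x the adler value from before this group of bytes:
//
//     #include <stdint.h>
//
//     static void do4_equiv(uint32_t *adler, uint32_t *sum2, const uint8_t x[4])
//     {
//         *sum2  += 4 * (*adler) + 4*x[0] + 3*x[1] + 2*x[2] + 1*x[3];
//         *adler += x[0] + x[1] + x[2] + x[3];
//     }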

// the following macro cascades 4 DO4s into an adler/sum2 update for 16 bytes
.macro DO16
        DO4                             // adler/sum2 update for 4 input bytes
        DO4                             // adler/sum2 update for 4 input bytes
        DO4                             // adler/sum2 update for 4 input bytes
        DO4                             // adler/sum2 update for 4 input bytes
.endm

// the following macro reduces adler and sum2 modulo BASE
.macro modulo_base
        umull   x0, x1, adler, one_by_base      // adler/BASE in Q47
        umull   x2, x3, sum2, one_by_base       // sum2/BASE in Q47
        lsr     x1, #15                         // x1 >> 15 = floor(adler/BASE)
        lsr     x3, #15                         // x3 >> 15 = floor(sum2/BASE)
        mla     adler, x1, base, adler          // adler -= floor(adler/BASE)*BASE (base holds -BASE)
        mla     sum2, x3, base, sum2            // sum2 -= floor(sum2/BASE)*BASE (base holds -BASE)
.endm

        adr     t, coeffs
        push    {r4-r6, r8-r11, lr}
        ldmia   t, {one_by_base, base, nmax}    // load up coefficients

        subs    len, nmax                       // pre-subtract len by NMAX
        eor     zero, zero                      // a dummy zero register for the usada8 instruction
        blt     len_lessthan_NMAX               // if (len < NMAX) skip the while loop

while_lengenmax_loop:                           // do {
        lsr     vecs, nmax, #4                  // vecs = NMAX/16;

len16_loop:                                     // do {

        DO16

        subs    vecs, #1                        // vecs--;
        bgt     len16_loop                      // } while (vecs > 0);

        modulo_base                             // reduce adler and sum2 modulo BASE

        subs    len, nmax                       // len -= NMAX
        bge     while_lengenmax_loop            // } while (len >= NMAX);

len_lessthan_NMAX:
        adds    len, nmax                       // add NMAX back to len (undo the pre-subtract)

        subs    len, #16                        // pre-decrement len by 16
        blt     len_lessthan_16

len16_loop2:

        DO16

        subs    len, #16
        bge     len16_loop2

len_lessthan_16:
        adds    len, #16                        // post-increment len by 16
        beq     len_is_zero

remaining_buf:
        ldrb    x0, [buf], #1
        subs    len, #1
        add     adler, x0
        add     sum2, adler
        bgt     remaining_buf

len_is_zero:

        modulo_base                             // reduce adler and sum2 modulo BASE

        add     r0, adler, sum2, lsl #16        // return sum2<<16 | adler

        pop     {r4-r6, r8-r11, pc}

        .align 2
coeffs:
        .long   -2146992015                     // 1/BASE in Q47 (0x80078071)
        .long   -BASE                           // -BASE, used by mla in modulo_base
        .long   NMAX                            // NMAX

#else   // KERNEL_SUPPORT_NEON



#define adler           r0
#define sum2            r1
#define buf             r2
#define len             r3
#define nmax            r4
#define vecs            lr      // vecs = NMAX/16
#define n               r5

#define t               r12

#define sum2_coeff      q0
#define sum2_coeff0     d0
#define sum2_coeff1     d1
#define adler_coeff     q1
#define ones            d2
#define x0_x15          q2
#define x0_x7           d4
#define x8_x15          d5
#define adlersum2       d6
#define adler16         d25

#if defined _ARM_ARCH_7

        adr         t, vec_table                // address of vec_table[]
        stmfd       sp!, {r4, r5, lr}

        vld1.32     {q0-q1}, [t,:128]!          // load coefficients for adler/sum2 computation
        vld1.32     {q15}, [t,:128]!            // more coefficients for sum2 computation
        ldr         nmax, [t]                   // NMAX

        vmov        adlersum2, sum2, adler      // pack adler/sum2 into a double register

        cmp         len, nmax                   // len vs NMAX
        lsr         vecs, nmax, #4              // vecs = NMAX/16;
        blt         len_lessthan_NMAX           // if (len < NMAX) skip the while loop

        sub         len, nmax                   // pre-decrement len by NMAX

while_len_ge_NMAX_loop:                         // while (len>=NMAX) {

        mov         n, vecs, lsr #1             // n = vecs/2 = NMAX/32 (32 bytes per iteration);

do_loop:                                        // do {

        vshll.u32   q12, adlersum2, #5          // d25 = (0,32*adler) to be added into (adler,sum2)
        vld1.32     {x0_x15}, [buf,:128]!       // 16-byte input x0:x15
        vmull.u8    q8, x0_x7, ones             // 16-bit x0-x7
        vld1.32     {q14}, [buf,:128]!          // x16:x31
        vmull.u8    q9, x8_x15, ones            // 16-bit x8-x15
        vadd.u32    adlersum2, adler16          // sum2 += old adler*32;
        vmull.u8    q12, d28, ones              // 16-bit x16-x23
        vmull.u8    q13, d29, ones              // 16-bit x24-x31
        vmull.u8    q10, d28, sum2_coeff0       // 16-bit x16*16, x17*15, ..., x23*9
        vmull.u8    q11, d29, sum2_coeff1       // 16-bit x24*8, x25*7, ..., x31*1
        vadd.u16    q8, q8, q9                  // q8 = (x0+x8):(x7+x15), 8 16-bit elements for adler
        vmull.u8    q9, x0_x7, d30              // 16-bit x0*32,...,x7*25
        vmull.u8    q14, x8_x15, d31            // 16-bit x8*24,...,x15*17
        vadd.u16    q12, q12, q13               // q12 = (x16+x24):(x23+x31), 8 16-bit elements for adler
        vadd.u16    q10, q11                    // 8 16-bit elements for sum2
        vadd.u16    q8, q12                     // 8 16-bit elements for adler
        vadd.u16    q9, q14                     // 8 16-bit elements for sum2
        vadd.u16    q10, q9                     // 8 16-bit elements for sum2
        vpaddl.u16  q8, q8                      // 4 32-bit elements for adler
        vpaddl.u16  q10, q10                    // 4 32-bit elements for sum2
        vpadd.u32   d16, d16, d17               // 2 32-bit elements for adler
        vpadd.u32   d17, d20, d21               // 2 32-bit elements for sum2
        subs        n, #1                       // --n
        vpadd.u32   d4, d17, d16                // s8 : 32-bit element for sum2, s9 : 32-bit element for adler
        vadd.u32    adlersum2, d4               // update adler/sum2 with the new 32 bytes of input

        bgt         do_loop                     // } while (--n);

        vshll.u32   q12, adlersum2, #4          // d25 = (0,16*adler) to be added into (adler,sum2)

        vld1.32     {x0_x15}, [buf,:128]!       // 16-byte input

        vmull.u8    q8, x0_x7, ones             // 16-bit x0-x7
        vmull.u8    q9, x8_x15, ones            // 16-bit x8-x15
        vmull.u8    q10, x0_x7, sum2_coeff0     // 16-bit x0*16, x1*15, ..., x7*9
        vmull.u8    q11, x8_x15, sum2_coeff1    // 16-bit x8*8, x9*7, ..., x15*1

        vadd.u16    q8, q8, q9                  // 8 16-bit elements for adler
        vadd.u16    q10, q10, q11               // 8 16-bit elements for sum2
        vpaddl.u16  q8, q8                      // 4 32-bit elements for adler
        vpaddl.u16  q10, q10                    // 4 32-bit elements for sum2
        vpadd.u32   d16, d16, d17               // 2 32-bit elements for adler
        vpadd.u32   d17, d20, d21               // 2 32-bit elements for sum2
        vadd.u32    adlersum2, adler16          // sum2 += old adler*16;
        vpadd.u32   d4, d17, d16                // s8 : 32-bit element for sum2, s9 : 32-bit element for adler
        vadd.u32    adlersum2, d4               // update adler/sum2 with the new 16 bytes of input

        // mod(adler,BASE); mod(sum2,BASE);
        vmull.u32   q2, adlersum2, d3[1]        // adler/BASE, sum2/BASE in Q47
        vshr.u64    q2, q2, #47                 // take the integer part
        vpadd.u32   d4, d4, d5                  // merge into a double word in d4
        vmls.u32    adlersum2, d4, d3[0]        // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE

        subs        len, nmax                   // len -= NMAX;
        bge         while_len_ge_NMAX_loop      // repeat while len >= NMAX

        add         len, nmax                   // add NMAX back to len (undo the pre-subtract)

len_lessthan_NMAX:

        cmp         len, #0
        beq         len_is_zero                 // if len==0, skip the following


        subs        len, #32                    // pre-decrement len by 32
        blt         len_lessthan_32             // if len < 32, branch to len_lessthan_32

len32_loop:

        vshll.u32   q12, adlersum2, #5          // d25 = (0,32*adler) to be added into (adler,sum2)
        vld1.32     {x0_x15}, [buf,:128]!       // 16-byte input x0:x15
        vmull.u8    q8, x0_x7, ones             // 16-bit x0-x7
        vld1.32     {q14}, [buf,:128]!          // x16:x31
        vmull.u8    q9, x8_x15, ones            // 16-bit x8-x15
        vadd.u32    adlersum2, adler16          // sum2 += old adler*32;
        vmull.u8    q12, d28, ones              // 16-bit x16-x23
        vmull.u8    q13, d29, ones              // 16-bit x24-x31
        vmull.u8    q10, d28, sum2_coeff0       // 16-bit x16*16, x17*15, ..., x23*9
        vmull.u8    q11, d29, sum2_coeff1       // 16-bit x24*8, x25*7, ..., x31*1
        vadd.u16    q8, q8, q9                  // q8 = (x0+x8):(x7+x15), 8 16-bit elements for adler
        vmull.u8    q9, x0_x7, d30              // 16-bit x0*32,...,x7*25
        vmull.u8    q14, x8_x15, d31            // 16-bit x8*24,...,x15*17
        vadd.u16    q12, q12, q13               // q12 = (x16+x24):(x23+x31), 8 16-bit elements for adler
        vadd.u16    q10, q11                    // 8 16-bit elements for sum2
        vadd.u16    q8, q12                     // 8 16-bit elements for adler
        vadd.u16    q9, q14                     // 8 16-bit elements for sum2
        vadd.u16    q10, q9                     // 8 16-bit elements for sum2
        vpaddl.u16  q8, q8                      // 4 32-bit elements for adler
        vpaddl.u16  q10, q10                    // 4 32-bit elements for sum2
        vpadd.u32   d16, d16, d17               // 2 32-bit elements for adler
        vpadd.u32   d17, d20, d21               // 2 32-bit elements for sum2
        subs        len, #32                    // len -= 32;
        vpadd.u32   d4, d17, d16                // s8 : 32-bit element for sum2, s9 : 32-bit element for adler
        vadd.u32    adlersum2, d4               // update adler/sum2 with the new 32 bytes of input

        bge         len32_loop

len_lessthan_32:

        adds        len, #(32-16)               // add 32 back to len, then pre-decrement by 16
        blt         len_lessthan_16             // if len < 16, branch to len_lessthan_16

        vshll.u32   q12, adlersum2, #4          // d25 = (0,16*adler) to be added into (adler,sum2)

        vld1.32     {x0_x15}, [buf,:128]!       // 16-byte input


        vmull.u8    q8, x0_x7, ones             // 16-bit x0-x7
        vmull.u8    q9, x8_x15, ones            // 16-bit x8-x15
        vmull.u8    q10, x0_x7, sum2_coeff0     // 16-bit x0*16, x1*15, ..., x7*9
        vmull.u8    q11, x8_x15, sum2_coeff1    // 16-bit x8*8, x9*7, ..., x15*1

        vadd.u16    q8, q8, q9                  // 8 16-bit elements for adler
        vadd.u16    q10, q10, q11               // 8 16-bit elements for sum2
        vpaddl.u16  q8, q8                      // 4 32-bit elements for adler
        vpaddl.u16  q10, q10                    // 4 32-bit elements for sum2
        vpadd.u32   d16, d16, d17               // 2 32-bit elements for adler
        vpadd.u32   d17, d20, d21               // 2 32-bit elements for sum2
        subs        len, #16                    // decrement len by 16
        vadd.u32    adlersum2, adler16          // sum2 += old adler*16;
        vpadd.u32   d4, d17, d16                // s8 : 32-bit element for sum2, s9 : 32-bit element for adler
        vadd.u32    adlersum2, d4               // update adler/sum2 with the new 16 bytes of input

len_lessthan_16:
        adds        len, #16                    // post-increment len by 16
        beq         len_is_zero_internal        // if len==0, branch to len_is_zero_internal

        // restore adler/sum2 into general registers for the remaining (<16) bytes

        vmov        sum2, adler, adlersum2
remaining_len_loop:
        ldrb        t, [buf], #1                // *buf++;
        subs        len, #1                     // len--;
        add         adler, t                    // adler += *buf
        add         sum2, adler                 // sum2 += adler
        bgt         remaining_len_loop          // loop while len > 0

        vmov        adlersum2, sum2, adler      // move back to the double register for the modulo operation

len_is_zero_internal:

        // mod(adler,BASE); mod(sum2,BASE);

        vmull.u32   q2, adlersum2, d3[1]        // adler/BASE, sum2/BASE in Q47
        vshr.u64    q2, q2, #47                 // take the integer part
        vpadd.u32   d4, d4, d5                  // merge into a double word in d4
        vmls.u32    adlersum2, d4, d3[0]        // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE

len_is_zero:

        vmov        sum2, adler, adlersum2      // restore adler/sum2 from (s12=sum2, s13=adler)
        add         r0, adler, sum2, lsl #16    // return adler | (sum2 << 16);
        ldmfd       sp!, {r4, r5, pc}           // restore registers and return


        // constants to be loaded into q registers
        .align 4                        // 16 byte aligned

vec_table:

        // coefficients for computing sum2 (q0 = d0,d1)
        .long   0x0d0e0f10              // s0 : bytes (16,15,14,13)
        .long   0x090a0b0c              // s1 : bytes (12,11,10,9)
        .long   0x05060708              // s2 : bytes (8,7,6,5)
        .long   0x01020304              // s3 : bytes (4,3,2,1)

        // coefficients for computing adler
        .long   0x01010101              // s4/d2
        .long   0x01010101              // s5

        .long   BASE                    // s6 : BASE
        .long   0x80078071              // s7 : 1/BASE in Q47

        // coefficients for the DO32 sum2 terms (q15 = d30,d31)
        .long   0x1d1e1f20              // d30 : bytes (32,31,30,29)
        .long   0x191a1b1c              //       bytes (28,27,26,25)
        .long   0x15161718              // d31 : bytes (24,23,22,21)
        .long   0x11121314              //       bytes (20,19,18,17)

NMAX_loc:
        .long   NMAX                    // NMAX
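
// The coefficient words above rely on a little-endian byte order. A small C check of how the first word of
// vec_table decodes into byte lanes (illustrative only, not part of this file's build; the function name is
// made up):
//
//     #include <stdint.h>
//     #include <string.h>
//
//     static void decode_first_word(uint8_t lanes[4])
//     {
//         uint32_t w = 0x0d0e0f10u;
//         memcpy(lanes, &w, sizeof w);    /* lanes = {0x10,0x0f,0x0e,0x0d} = {16,15,14,13} on little-endian ARM */
//     }
//
// so after the vld1.32 loads above, d0/d1 hold the sum2 weights 16..1 and d30/d31 hold 32..17.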

#endif  // _ARM_ARCH_7

#endif  // (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7)

#endif  // _ARM_ARCH_6