#include <arm/arch.h>

#define BASE 65521			/* largest prime smaller than 65536 */
#define NMAX 5552			/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
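/*
   The NMAX bound can be double-checked with a standalone C sketch (not part of
   this file's build): starting from the worst case adler = BASE-1, n bytes of
   0xff grow sum2 by at most 255*n*(n+1)/2 + (n+1)*(BASE-1), and that total must
   fit in 32 bits for the deferred modulo to stay correct.

       #include <stdint.h>
       #include <stdio.h>

       static uint64_t worst_sum2(uint64_t n) {
           return 255 * n * (n + 1) / 2 + (n + 1) * (65521 - 1);
       }

       int main(void) {
           uint64_t n = 0;
           // find the largest n whose worst-case sum2 still fits in 32 bits
           while (worst_sum2(n + 1) <= 0xffffffffULL)
               n++;
           printf("NMAX = %llu\n", (unsigned long long)n);   // prints 5552
           return 0;
       }
*/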

// Note: buf must be 16-byte aligned; alignment is the caller's responsibility.

// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef* buf, int len) {
//     unsigned n;
//     while (len >= NMAX) {
//         len -= NMAX;
//         n = NMAX / 16;              /* NMAX is divisible by 16 */
//         do {
//             DO16(buf);              /* 16 sums unrolled */
//             buf += 16;
//         } while (--n);
//         MOD(adler);
//         MOD(sum2);
//     }
//     if (len) {                      /* avoid modulos if none remaining */
//         while (len >= 16) {
//             len -= 16;
//             DO16(buf);
//             buf += 16;
//         }
//         while (len--) {
//             adler += *buf++;
//             sum2 += adler;
//         }
//         MOD(adler);
//         MOD(sum2);
//     }
//     return adler | (sum2 << 16);    /* return recombined sums */
// }
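/*
   For context: zlib carries the checksum as two 16-bit halves packed into one
   32-bit value, so a caller in the style of zlib's adler32() is expected to
   unpack the sums before calling this routine and gets the packed result back
   (a sketch of the calling convention, not code from this file):

       unsigned sum2 = (adler >> 16) & 0xffff;       // split out the sum-of-sums half
       adler &= 0xffff;                              // keep the byte-sum half
       adler = adler32_vec(adler, sum2, buf, len);   // returns adler | (sum2 << 16)
*/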


/*
	DO16 vectorization:
	given initial unsigned int sum2 and adler, and a new set of 16 input bytes (x[0:15]), it can be shown that
		sum2  += (16*adler + 16*x[0] + 15*x[1] + ... + 1*x[15]);
		adler += (x[0] + x[1] + ... + x[15]);

	therefore, the computation can be vectorized as follows:
		1. 16-byte aligned vector load into q2 (x[0:15])
		2. sum2 += (adler<<4);
		3. vmull.u8 (q9,q8),q2,d2 where d2 = (1,1,1,...,1), (q9,q8) : 16 16-bit elements x[0:15]
		4. vmull.u8 (q11,q10),q2,q0 where q0 = (16,15,...,1), (q11,q10) : 16 16-bit elements (16:1)*x[0:15]
		5. parallel add (with one expansion to 32-bit) (q9,q8) and (q11,q10) all the way down to accumulate into adler and sum2

	In this revision, whenever possible, 2 DO16 loops are combined into a DO32 loop:
		1. 32-byte aligned vector load into q2,q14 (x[0:31])
		2. sum2 += (adler<<5);
		3. vmull.u8 (4 q registers),(q2,q14),d2 where d2 = (1,1,1,...,1), (4 q registers) : 32 16-bit elements x[0:31]
		4. vmull.u8 (4 q registers),(q2,q14),(q15,q0) where (q15,q0) = (32,31,...,1), (4 q regs) : 32 16-bit elements (32:1)*x[0:31]
		5. parallel add (with one expansion to 32-bit) the pair of (4 q regs) all the way down to accumulate into adler and sum2

	This change improves performance by ~ 0.55 cycles/uncompressed byte on ARM Cortex-A8.

*/
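/*
   The block identity behind DO16 can be verified against the byte-at-a-time
   definition with a standalone C sketch (test values are arbitrary):

       #include <stdint.h>
       #include <assert.h>

       static void check_do16(uint32_t adler, uint32_t sum2, const uint8_t x[16]) {
           // reference: one byte at a time
           uint32_t a = adler, s = sum2;
           for (int i = 0; i < 16; i++) { a += x[i]; s += a; }

           // block form: sum2 absorbs 16*adler plus the bytes weighted 16,15,...,1
           uint32_t a2 = adler, s2 = sum2 + 16 * adler;
           for (int i = 0; i < 16; i++) {
               a2 += x[i];
               s2 += (16 - i) * x[i];
           }
           assert(a == a2 && s == s2);
       }
*/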

/*
	MOD implementation:
	adler%BASE = adler - floor(adler*(1/BASE))*BASE; where (1/BASE) = 0x80078071 in Q47
		1. vmull.u32 q2,(adler,sum2),(1/BASE)	// multiply by (1/BASE) in Q47
		2. vshr.u64 q2,q2,#47			// floor function
		3. vpadd.u32 d4,d4,d5			// merge into a double word in d4
		4. vmls.u32 (adler,sum2),d4,d3[0]	// (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE
*/
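/*
   The Q47 reciprocal above can be exercised in plain C (a standalone sketch).
   0x80078071 = ceil(2^47/BASE), and the rounding error is small enough that the
   quotient is exact for every 32-bit input, so no correction step is needed:

       #include <stdint.h>
       #include <assert.h>

       static uint32_t mod_base(uint32_t v) {
           uint32_t q = (uint32_t)(((uint64_t)v * 0x80078071ULL) >> 47);  // floor(v/BASE)
           return v - q * 65521u;
       }

       // e.g. assert(mod_base(0xffffffffu) == 0xffffffffu % 65521u);
*/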

#if defined _ARM_ARCH_6			// this file is used only for armv6 or above


	.text
	.align 2
	.globl _adler32_vec
_adler32_vec:

#if (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7)	// for armv6, or armv7 without NEON support


#define adler		r0
#define sum2		r1
#define buf		r2
#define len		r3
#define one_by_base	r4
#define base		r5
#define nmax		r6
#define t		r12
#define vecs		lr
#define x0		r8
#define x1		r10
#define x2		r11
#define x3		r12	// x3 shares r12 with t; t is dead once the coefficients are loaded
#define zero		r9

// this macro performs an adler/sum2 update for 4 input bytes

.macro DO4
	add	sum2, adler, lsl #2	// sum2 += 4*adler;
	ldr	x0, [buf]		// 4 bytes in 1 32-bit word
	usada8	adler, x0, zero, adler	// adler += sum(x0:x3)
	ldrb	x0, [buf], #4		// x0
	ldrb	x2, [buf, #-2]		// x2
	ldrb	x1, [buf, #-3]		// x1
	ldrb	x3, [buf, #-1]		// x3
	add	sum2, x0, lsl #2	// sum2 += 4*x0
	add	x3, x3, x1, lsl #1	// x3 + 2*x1
	add	sum2, x2, lsl #1	// sum2 += 2*x2
	add	x3, x1			// x3 + 3*x1
	add	sum2, x3		// sum2 += x3 + 3*x1
.endm
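/*
   In C terms, one DO4 expansion computes the 4-byte version of the block
   identity (a sketch of what the macro does, not code that is assembled):

       sum2  += 4 * adler;                      // add sum2, adler, lsl #2
       adler += x0 + x1 + x2 + x3;              // usada8 against the zero register
       sum2  += 4*x0 + 3*x1 + 2*x2 + 1*x3;      // built as 4*x0 + 2*x2 + (x3 + 3*x1)
       // buf has advanced by 4 (post-indexed ldrb)
*/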

// the following macro cascades 4 DO4s into an adler/sum2 update for 16 bytes
.macro DO16
	DO4		// adler/sum2 update for 4 input bytes
	DO4		// adler/sum2 update for 4 input bytes
	DO4		// adler/sum2 update for 4 input bytes
	DO4		// adler/sum2 update for 4 input bytes
.endm

// the following macro reduces adler and sum2 modulo BASE
.macro modulo_base
	umull	x0, x1, adler, one_by_base	// adler * (1/BASE) in Q47
	umull	x2, x3, sum2, one_by_base	// sum2 * (1/BASE) in Q47
	lsr	x1, #15				// high word >> 15 = product >> 47 = floor(adler/BASE)
	lsr	x3, #15				// high word >> 15 = product >> 47 = floor(sum2/BASE)
	mla	adler, x1, base, adler		// adler -= floor(adler/BASE)*BASE (base holds -BASE)
	mla	sum2, x3, base, sum2		// sum2 -= floor(sum2/BASE)*BASE (base holds -BASE)
.endm
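/*
   One lane of modulo_base in C (a sketch; note that the base register holds
   -BASE, so the trailing mla is effectively a subtract):

       uint32_t hi = (uint32_t)(((uint64_t)adler * 0x80078071ULL) >> 32);  // umull high word
       uint32_t q  = hi >> 15;              // (adler * 1/BASE) >> 47 = floor(adler/BASE)
       adler += q * (uint32_t)-65521;       // mla: adler -= floor(adler/BASE) * BASE
*/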

	adr	t, coeffs			// address of the coefficients table
	push	{r4-r6, r8-r11, lr}
	ldmia	t, {one_by_base, base, nmax}	// load up coefficients

	subs	len, nmax			// pre-subtract len by NMAX
	eor	zero, zero			// a dummy zero register for the usada8 instruction
	blt	len_lessthan_NMAX		// if (len < NMAX) skip the while loop

while_lengenmax_loop:				// do {
	lsr	vecs, nmax, #4			// vecs = NMAX/16;

len16_loop:					// do {

	DO16

	subs	vecs, #1			// vecs--;
	bgt	len16_loop			// } while (vecs > 0);

	modulo_base				// reduce adler/sum2 modulo BASE

	subs	len, nmax			// len -= NMAX
	bge	while_lengenmax_loop		// } while (len >= NMAX);

len_lessthan_NMAX:
	adds	len, nmax			// add NMAX back to len

	subs	len, #16			// pre-decrement len by 16
	blt	len_lessthan_16

len16_loop2:

	DO16

	subs	len, #16
	bge	len16_loop2

len_lessthan_16:
	adds	len, #16			// post-increment len by 16
	beq	len_is_zero

remaining_buf:
	ldrb	x0, [buf], #1
	subs	len, #1
	add	adler, x0
	add	sum2, adler
	bgt	remaining_buf

len_is_zero:

	modulo_base				// reduce adler/sum2 modulo BASE

	add	r0, adler, sum2, lsl #16	// return sum2<<16 | adler

	pop	{r4-r6, r8-r11, pc}

.align 2
coeffs:
	.long	-2146992015		// 0x80078071 : 1/BASE in Q47
	.long	-BASE			// negated so that mla subtracts floor(x/BASE)*BASE
	.long	NMAX

#else	// KERNEL_SUPPORT_NEON



#define adler		r0
#define sum2		r1
#define buf		r2
#define len		r3
#define nmax		r4
#define vecs		lr	// vecs = NMAX/16
#define n		r5

#define t		r12

#define sum2_coeff	q0
#define sum2_coeff0	d0
#define sum2_coeff1	d1
#define adler_coeff	q1
#define ones		d2
#define x0_x15		q2
#define x0_x7		d4
#define x8_x15		d5
#define adlersum2	d6
#define adler16		d25

#if defined _ARM_ARCH_7

	adr	t, vec_table		// address of vec_table[]
	stmfd	sp!, {r4, r5, lr}

	vld1.32	{q0-q1}, [t,:128]!	// load coefficients for adler/sum2 computation
	vld1.32	{q15}, [t,:128]!	// sum2 coefficients (32..17) for the DO32 loop
	ldr	nmax, [t]		// NMAX

	vmov	adlersum2, sum2, adler	// pack adler/sum2 into a double register

	cmp	len, nmax		// len vs NMAX
	lsr	vecs, nmax, #4		// vecs = NMAX/16;
	blt	len_lessthan_NMAX	// if (len < NMAX) skip the while loop

	sub	len, nmax		// pre-decrement len by NMAX

while_len_ge_NMAX_loop:			// while (len >= NMAX) {

	mov	n, vecs, lsr #1		// n = NMAX/32; each do_loop pass consumes 32 bytes

do_loop:				// do {

	vshll.u32	q12, adlersum2, #5	// d25 = (0, 32*adler) to be added into (adler,sum2)
	vld1.32		{x0_x15}, [buf,:128]!	// 16-byte input x0:x15
	vmull.u8	q8, x0_x7, ones		// 16-bit x0-x7
	vld1.32		{q14}, [buf,:128]!	// x16:x31
	vmull.u8	q9, x8_x15, ones	// 16-bit x8-x15
	vadd.u32	adlersum2, adler16	// sum2 += old adler*32;
	vmull.u8	q12, d28, ones		// 16-bit x16-x23
	vmull.u8	q13, d29, ones		// 16-bit x24-x31
	vmull.u8	q10, d28, sum2_coeff0	// 16-bit x16*16, x17*15, ..., x23*9
	vmull.u8	q11, d29, sum2_coeff1	// 16-bit x24*8, x25*7, ..., x31*1
	vadd.u16	q8, q8, q9		// q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler
	vmull.u8	q9, x0_x7, d30		// 16-bit x0*32,...,x7*25
	vmull.u8	q14, x8_x15, d31	// 16-bit x8*24,...,x15*17
	vadd.u16	q12, q12, q13		// q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler
	vadd.u16	q10, q11		// 8 16-bit elements for sum2
	vadd.u16	q8, q12			// 8 16-bit elements for adler
	vadd.u16	q9, q14			// 8 16-bit elements for sum2
	vadd.u16	q10, q9			// 8 16-bit elements for sum2
	vpaddl.u16	q8, q8			// 4 32-bit elements for adler
	vpaddl.u16	q10, q10		// 4 32-bit elements for sum2
	vpadd.u32	d16, d16, d17		// 2 32-bit elements for adler
	vpadd.u32	d17, d20, d21		// 2 32-bit elements for sum2
	subs		n, #1			// --n
	vpadd.u32	d4, d17, d16		// s8 : 32-bit element for sum2, s9 : 32-bit element for adler
	vadd.u32	adlersum2, d4		// update adler/sum2 with the 32 new input bytes

	bgt		do_loop			// } while (--n);

	// NMAX/16 = 347 is odd, so one 16-byte group remains after the 32-byte passes
	vshll.u32	q12, adlersum2, #4	// d25 = (0, 16*adler) to be added into (adler,sum2)

	vld1.32		{x0_x15}, [buf,:128]!	// 16-byte input

	vmull.u8	q8, x0_x7, ones		// 16-bit x0-x7
	vmull.u8	q9, x8_x15, ones	// 16-bit x8-x15
	vmull.u8	q10, x0_x7, sum2_coeff0	// 16-bit x0*16, x1*15, ..., x7*9
	vmull.u8	q11, x8_x15, sum2_coeff1	// 16-bit x8*8, x9*7, ..., x15*1

	vadd.u16	q8, q8, q9		// 8 16-bit elements for adler
	vadd.u16	q10, q10, q11		// 8 16-bit elements for sum2
	vpaddl.u16	q8, q8			// 4 32-bit elements for adler
	vpaddl.u16	q10, q10		// 4 32-bit elements for sum2
	vpadd.u32	d16, d16, d17		// 2 32-bit elements for adler
	vpadd.u32	d17, d20, d21		// 2 32-bit elements for sum2
	vadd.u32	adlersum2, adler16	// sum2 += old adler*16;
	vpadd.u32	d4, d17, d16		// s8 : 32-bit element for sum2, s9 : 32-bit element for adler
	vadd.u32	adlersum2, d4		// update adler/sum2 with the 16 new input bytes

	// mod(adler,BASE); mod(sum2,BASE);
	vmull.u32	q2, adlersum2, d3[1]	// adler/BASE, sum2/BASE in Q47
	vshr.u64	q2, q2, #47		// take the integer part
	vpadd.u32	d4, d4, d5		// merge into a double word in d4
	vmls.u32	adlersum2, d4, d3[0]	// (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE

	subs	len, nmax			// len -= NMAX;
	bge	while_len_ge_NMAX_loop		// repeat while len >= NMAX

	add	len, nmax		// add NMAX back to len

len_lessthan_NMAX:

	cmp	len, #0
	beq	len_is_zero		// if len==0, skip the remaining processing


	subs	len, #32		// pre-decrement len by 32
	blt	len_lessthan_32		// if len < 32, skip the 32-byte loop

len32_loop:

	vshll.u32	q12, adlersum2, #5	// d25 = (0, 32*adler) to be added into (adler,sum2)
	vld1.32		{x0_x15}, [buf,:128]!	// 16-byte input x0:x15
	vmull.u8	q8, x0_x7, ones		// 16-bit x0-x7
	vld1.32		{q14}, [buf,:128]!	// x16:x31
	vmull.u8	q9, x8_x15, ones	// 16-bit x8-x15
	vadd.u32	adlersum2, adler16	// sum2 += old adler*32;
	vmull.u8	q12, d28, ones		// 16-bit x16-x23
	vmull.u8	q13, d29, ones		// 16-bit x24-x31
	vmull.u8	q10, d28, sum2_coeff0	// 16-bit x16*16, x17*15, ..., x23*9
	vmull.u8	q11, d29, sum2_coeff1	// 16-bit x24*8, x25*7, ..., x31*1
	vadd.u16	q8, q8, q9		// q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler
	vmull.u8	q9, x0_x7, d30		// 16-bit x0*32,...,x7*25
	vmull.u8	q14, x8_x15, d31	// 16-bit x8*24,...,x15*17
	vadd.u16	q12, q12, q13		// q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler
	vadd.u16	q10, q11		// 8 16-bit elements for sum2
	vadd.u16	q8, q12			// 8 16-bit elements for adler
	vadd.u16	q9, q14			// 8 16-bit elements for sum2
	vadd.u16	q10, q9			// 8 16-bit elements for sum2
	vpaddl.u16	q8, q8			// 4 32-bit elements for adler
	vpaddl.u16	q10, q10		// 4 32-bit elements for sum2
	vpadd.u32	d16, d16, d17		// 2 32-bit elements for adler
	vpadd.u32	d17, d20, d21		// 2 32-bit elements for sum2
	subs		len, #32		// len -= 32;
	vpadd.u32	d4, d17, d16		// s8 : 32-bit element for sum2, s9 : 32-bit element for adler
	vadd.u32	adlersum2, d4		// update adler/sum2 with the 32 new input bytes

	bge		len32_loop

len_lessthan_32:

	adds	len, #(32-16)			// post-increment len by 32, then pre-decrement by 16
	blt	len_lessthan_16			// if len < 16, branch to len_lessthan_16

	vshll.u32	q12, adlersum2, #4	// d25 = (0, 16*adler) to be added into (adler,sum2)

	vld1.32		{x0_x15}, [buf,:128]!	// 16-byte input

	vmull.u8	q8, x0_x7, ones		// 16-bit x0-x7
	vmull.u8	q9, x8_x15, ones	// 16-bit x8-x15
	vmull.u8	q10, x0_x7, sum2_coeff0	// 16-bit x0*16, x1*15, ..., x7*9
	vmull.u8	q11, x8_x15, sum2_coeff1	// 16-bit x8*8, x9*7, ..., x15*1

	vadd.u16	q8, q8, q9		// 8 16-bit elements for adler
	vadd.u16	q10, q10, q11		// 8 16-bit elements for sum2
	vpaddl.u16	q8, q8			// 4 32-bit elements for adler
	vpaddl.u16	q10, q10		// 4 32-bit elements for sum2
	vpadd.u32	d16, d16, d17		// 2 32-bit elements for adler
	vpadd.u32	d17, d20, d21		// 2 32-bit elements for sum2
	subs	len, #16			// decrement len by 16
	vadd.u32	adlersum2, adler16	// sum2 += old adler*16;
	vpadd.u32	d4, d17, d16		// s8 : 32-bit element for sum2, s9 : 32-bit element for adler
	vadd.u32	adlersum2, d4		// update adler/sum2 with the 16 new input bytes

len_lessthan_16:
	adds	len, #16		// post-increment len by 16
	beq	len_is_zero_internal	// if len==0, branch to len_is_zero_internal

	// restore adler/sum2 into general registers for the remaining (<16) bytes

	vmov	sum2, adler, adlersum2
remaining_len_loop:
	ldrb	t, [buf], #1		// *buf++;
	subs	len, #1			// len--;
	add	adler, t		// adler += *buf
	add	sum2, adler		// sum2 += adler
	bgt	remaining_len_loop	// loop while len > 0

	vmov	adlersum2, sum2, adler	// move to the double register for the modulo operation

len_is_zero_internal:

	// mod(adler,BASE); mod(sum2,BASE);

	vmull.u32	q2, adlersum2, d3[1]	// adler/BASE, sum2/BASE in Q47
	vshr.u64	q2, q2, #47		// take the integer part
	vpadd.u32	d4, d4, d5		// merge into a double word in d4
	vmls.u32	adlersum2, d4, d3[0]	// (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE

len_is_zero:

	vmov	sum2, adler, adlersum2		// restore adler/sum2 from (s12=sum2, s13=adler)
	add	r0, adler, sum2, lsl #16	// return adler | (sum2 << 16);
	ldmfd	sp!, {r4, r5, pc}		// restore registers and return


// constants to be loaded into q registers
.align 4				// 16-byte aligned

vec_table:

	// coefficients for computing sum2 : q0 = (16,15,...,1)
	.long	0x0d0e0f10		// d0 : 16,15,14,13
	.long	0x090a0b0c		//      12,11,10, 9
	.long	0x05060708		// d1 :  8, 7, 6, 5
	.long	0x01020304		//       4, 3, 2, 1

	// coefficients for computing adler
	.long	0x01010101		// d2 : (1,1,...,1)
	.long	0x01010101

	.long	BASE			// d3[0] : BASE
	.long	0x80078071		// d3[1] : 1/BASE in Q47

	// sum2 coefficients for the first 16 bytes of a DO32 pass : q15 = (32,31,...,17)
	.long	0x1d1e1f20		// d30 : 32,31,30,29
	.long	0x191a1b1c		//       28,27,26,25
	.long	0x15161718		// d31 : 24,23,22,21
	.long	0x11121314		//       20,19,18,17

NMAX_loc:
	.long	NMAX			// NMAX
#endif	// _ARM_ARCH_7

#endif	// (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7)

#endif	// _ARM_ARCH_6