#define BASE 65521 /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
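/*
   Why NMAX = 5552: after a modulo, adler and sum2 are each at most BASE-1, and over the next
   n bytes sum2 grows by at most 255*n*(n+1)/2 + (n+1)*(BASE-1). A standalone C sketch of this
   bound check (illustrative only, not part of this file):

       #include <assert.h>
       #include <stdint.h>
       int main(void)
       {
           uint64_t n = 5552;                                                // NMAX
           assert(255*n*(n+1)/2 + (n+1)*(uint64_t)65520 <= 0xFFFFFFFFu);     // still fits in 32 bits
           n = 5553;                                                         // NMAX + 1
           assert(255*n*(n+1)/2 + (n+1)*(uint64_t)65520 >  0xFFFFFFFFu);     // would overflow
           return 0;
       }
*/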
// Note: buf should have been 16-byte aligned in the caller function,

// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef* buf, int len) {
//     while (len >= NMAX) {
//         n = NMAX / 16;               /* NMAX is divisible by 16 */
//         DO16(buf);                   /* 16 sums unrolled */
//     if (len) {                       /* avoid modulos if none remaining */
//         while (len >= 16) {
//     return adler | (sum2 << 16);     /* return recombined sums */
Given the initial unsigned int sum2 and adler, and a new set of 16 input bytes x[0:15], it can be shown that

    sum2  += (16*adler + 16*x[0] + 15*x[1] + ... + 1*x[15]);
    adler += (x[0] + x[1] + ... + x[15]);

Therefore, the computation can be vectorized as follows (a scalar C sketch of this identity is given after step 5):

    1. 16-byte aligned vector load into q2 (x[0:15])
    2. sum2 += (adler<<4);
    3. vmull.u8 (q9,q8),q2,d2 where d2 = (1,1,1,...,1), (q9,q8) : 16 16-bit elements x[0:15]
    4. vmull.u8 (q11,q10),q2,q0 where q0 = (16,15,...,1), (q11,q10) : 16 16-bit elements (16:1)*x[0:15]
    5. parallel add (widening once to 32-bit) (q9,q8) and (q11,q10) all the way down to accumulate into adler and sum2
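For reference, a scalar C sketch of the same DO16 update (an illustrative helper, not part of this file):

    // one 16-byte step; equivalent to 16 iterations of "adler += *buf++; sum2 += adler;"
    static void do16_ref(unsigned int *adler, unsigned int *sum2, const unsigned char *x)
    {
        unsigned int a = *adler, s = *sum2, i;
        s += a << 4;                      // sum2 += 16*adler
        for (i = 0; i < 16; i++) {
            s += (16u - i) * x[i];        // sum2 += 16*x[0] + 15*x[1] + ... + 1*x[15]
            a += x[i];                    // adler += x[0] + x[1] + ... + x[15]
        }
        *adler = a;
        *sum2 = s;
    }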
In this revision, whenever possible, two DO16 steps are combined into a single DO32 loop iteration (a scalar sketch is given after step 5):

    1. 32-byte aligned vector load into q2,q14 (x[0:31])
    2. sum2 += (adler<<5);
    3. vmull.u8 (4 q registers),(q2,q14),d2 where d2 = (1,1,1,...,1), (4 q registers) : 32 16-bit elements x[0:31]
    4. vmull.u8 (4 q registers),(q2,q14),(q15,q0) where (q15,q0) = (32,31,...,1), (4 q regs) : 32 16-bit elements (32:1)*x[0:31]
    5. parallel add (widening once to 32-bit) the pair of (4 q regs) all the way down to accumulate into adler and sum2
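A scalar C sketch of the combined DO32 step (illustrative only; it is equivalent to applying do16_ref above twice):

    static void do32_ref(unsigned int *adler, unsigned int *sum2, const unsigned char *x)
    {
        unsigned int a = *adler, s = *sum2, i;
        s += a << 5;                      // sum2 += 32*adler
        for (i = 0; i < 32; i++) {
            s += (32u - i) * x[i];        // weights 32,31,...,1 come from (q15,q0)
            a += x[i];
        }
        *adler = a;
        *sum2 = s;
    }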
This change improves performance by roughly 0.55 cycles per uncompressed byte on the ARM Cortex-A8.

The modulo by BASE is done without a division, using the reciprocal of BASE in Q47 fixed point
(a scalar C sketch of this trick is given after step 4):

    adler%BASE = adler - floor(adler*(1/BASE))*BASE, where (1/BASE) = 0x80078071 in Q47

    1. vmull.u32 q2,(adler,sum2),(1/BASE)   // multiply by (1/BASE) in Q47
    2. vshr.u64 q2,q2,#47                   // floor function
    3. vpadd.u32 d4,d4,d5                   // merge into a double word in d4
    4. vmls.u32 (adler,sum2),d4,d3[0]       // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE
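A scalar C sketch of the same reciprocal-multiply reduction (illustrative only; the helper name is hypothetical):

    static unsigned int mod_base(unsigned int x)   // x % 65521 without a divide
    {
        // 0x80078071 = ceil(2^47 / 65521); the rounding error is small enough
        // that the quotient below is exact for every 32-bit x
        unsigned int q = (unsigned int)(((unsigned long long)x * 0x80078071u) >> 47);
        return x - q * 65521u;
    }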
#if defined _ARM_ARCH_6 // this file is used only for armv6 or above
#if (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7) // for armv6 or armv7 without neon support
#define one_by_base r4
// this macro performs adler/sum2 update for 4 input bytes
add sum2, adler, lsl #2 // sum2 += 4*adler;
ldr x0,[buf] // 4 bytes in 1 32-bit word
usada8 adler, x0, zero, adler // adler += sum(x0:x3)
ldrb x0,[buf], #4 // x0
ldrb x2,[buf,#-2] // x2
ldrb x1,[buf,#-3] // x1
ldrb x3,[buf,#-1] // x3
add sum2, x0, lsl #2 // sum2 += 4*x0
add x3, x3, x1, lsl #1 // x3+2*x1
add sum2, x2, lsl #1 // sum2 += 2*x2
add x3, x1 // x3+3*x1
add sum2, x3 // sum2 += x3+3*x1
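// For reference, an illustrative C sketch of what one DO4 step computes
// (a hypothetical helper, not part of this file):
//
//     static void do4_ref(unsigned int *adler, unsigned int *sum2, const unsigned char *x)
//     {
//         *sum2  += 4*(*adler) + 4*x[0] + 3*x[1] + 2*x[2] + 1*x[3];   // uses the old adler
//         *adler += x[0] + x[1] + x[2] + x[3];                        // the usada8 above computes this byte sum
//     }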
// the following macro cascades 4 DO4s into an adler/sum2 update for 16 bytes
DO4 // adler/sum2 update for 4 input bytes
DO4 // adler/sum2 update for 4 input bytes
DO4 // adler/sum2 update for 4 input bytes
DO4 // adler/sum2 update for 4 input bytes
// the following macro reduces adler and sum2 modulo BASE
umull x0,x1,adler,one_by_base // adler/BASE in Q47
umull x2,x3,sum2,one_by_base // sum2/BASE in Q47
lsr x1, #15 // x1 >> 15 = floor(adler/BASE)
lsr x3, #15 // x3 >> 15 = floor(sum2/BASE)
mla adler, x1, base, adler // adler %= base;
mla sum2, x3, base, sum2 // sum2 %= base;
push {r4-r6, r8-r11, lr}
ldmia t, {one_by_base, base, nmax} // load up coefficients
subs len, nmax // pre-subtract len by NMAX
eor zero, zero // set up a zero register for the usada8 instruction
blt len_lessthan_NMAX // if (len < NMAX) skip the while loop
while_lengenmax_loop: // do {
lsr vecs, nmax, #4 // vecs = NMAX/16;
subs vecs, #1 // vecs--;
bgt len16_loop // } while (vecs>0);
modulo_base // adler/sum2 modulo BASE
subs len, nmax // len -= NMAX
bge while_lengenmax_loop // } while (len >= NMAX);
adds len, nmax // post-increment len by NMAX (undo the pre-subtract)
subs len, #16 // pre-decrement len by 16
adds len, #16 // post-increment len by 16
modulo_base // adler/sum2 modulo BASE
add r0, adler, sum2, lsl #16 // to return sum2<<16 | adler
pop {r4-r6, r8-r11, pc}
#else // KERNEL_SUPPORT_NEON
#define vecs lr // vecs = NMAX/16
#define sum2_coeff q0
#define sum2_coeff0 d0
#define sum2_coeff1 d1
#define alder_coeff q1
#if defined _ARM_ARCH_7
adr t, vec_table // address of vec_table[]
stmfd sp!, {r4, r5, lr}
vld1.32 {q0-q1},[t,:128]! // loading up coefficients for adler/sum2 computation
vld1.32 {q15},[t,:128]! // for sum2 computation
ldr nmax, [t] // NMAX
vmov adlersum2, sum2, adler // pack up adler/sum2 into a double register
cmp len, nmax // len vs NMAX
lsr vecs, nmax, #4 // vecs = NMAX/16;
blt len_lessthan_NMAX // if (len < NMAX) skip the while loop
sub len, nmax // pre-decrement len by NMAX
while_len_ge_NMAX_loop: // while (len>=NMAX) {
mov n, vecs, lsr #1 // n = NMAX/32; the odd 16 bytes are handled by the DO16 step after this loop
vshll.u32 q12, adlersum2, #5 // d25 = (0,32*adler) to be added into (adler,sum2)
vld1.32 {x0_x15},[buf,:128]! // 16-byte input x0:x15
vmull.u8 q8, x0_x7, ones // 16-bit x0-x7
vld1.32 {q14}, [buf,:128]! // x16:x31
vmull.u8 q9, x8_x15, ones // 16-bit x8-x15
vadd.u32 adlersum2,adler16 // sum2 += 32*old adler;
vmull.u8 q12, d28, ones // 16-bit x16-x23
vmull.u8 q13, d29, ones // 16-bit x24-x31
vmull.u8 q10, d28, sum2_coeff0 // 16-bit x16*16, x17*15, ..., x23*9
vmull.u8 q11, d29, sum2_coeff1 // 16-bit x24*8, x25*7, ..., x31*1
vadd.u16 q8, q8, q9 // q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler
vmull.u8 q9, x0_x7, d30 // 16-bit x0*32,...,x7*25
vmull.u8 q14, x8_x15, d31 // 16-bit x8*24,...,x15*17
vadd.u16 q12, q12, q13 // q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler
vadd.u16 q10, q11 // 8 16-bit elements for sum2
vadd.u16 q8, q12 // 8 16-bit elements for adler
vadd.u16 q9, q14 // 8 16-bit elements for sum2
vadd.u16 q10, q9 // 8 16-bit elements for sum2
vpaddl.u16 q8, q8 // 4 32-bit elements for adler
vpaddl.u16 q10, q10 // 4 32-bit elements for sum2
vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler
vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2
vpadd.u32 d4,d17,d16 // s8 : 32-bit element for sum2, s9 : 32-bit element for adler
vadd.u32 adlersum2,d4 // update adler/sum2 with the new 32 bytes of input
bgt do_loop // } while (--n);
vshll.u32 q12, adlersum2, #4 // d25 = (0,16*adler) to be added into (adler,sum2)
vld1.32 {x0_x15},[buf,:128]! // 16-byte input
vmull.u8 q8, x0_x7, ones // 16-bit x0-x7
vmull.u8 q9, x8_x15, ones // 16-bit x8-x15
vmull.u8 q10, x0_x7, sum2_coeff0 // 16-bit x0*16, x1*15, ..., x7*9
vmull.u8 q11, x8_x15, sum2_coeff1 // 16-bit x8*8, x9*7, ..., x15*1
vadd.u16 q8, q8, q9 // 8 16-bit elements for adler
vadd.u16 q10, q10, q11 // 8 16-bit elements for sum2
vpaddl.u16 q8, q8 // 4 32-bit elements for adler
vpaddl.u16 q10, q10 // 4 32-bit elements for sum2
vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler
vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2
vadd.u32 adlersum2,adler16 // sum2 += 16*old adler;
vpadd.u32 d4,d17,d16 // s8 : 32-bit element for sum2, s9 : 32-bit element for adler
vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes of input
// mod(adler,BASE); mod(sum2,BASE);
vmull.u32 q2,adlersum2,d3[1] // adler/BASE, sum2/BASE in Q47
vshr.u64 q2,q2,#47 // take the integer part
vpadd.u32 d4,d4,d5 // merge into a double word in d4
vmls.u32 adlersum2,d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE
subs len, nmax // len -= NMAX;
bge while_len_ge_NMAX_loop // repeat while len >= NMAX
add len, nmax // post-increment len by NMAX (undo the pre-decrement)
beq len_is_zero // if len==0, branch to skip the following
subs len, #32 // pre-decrement len by 32
blt len_lessthan_32 // if len < 32, skip the 32-byte loop
vshll.u32 q12, adlersum2, #5 // d25 = (0,32*adler) to be added into (adler,sum2)
vld1.32 {x0_x15},[buf,:128]! // 16-byte input x0:x15
vmull.u8 q8, x0_x7, ones // 16-bit x0-x7
vld1.32 {q14}, [buf,:128]! // x16:x31
vmull.u8 q9, x8_x15, ones // 16-bit x8-x15
vadd.u32 adlersum2,adler16 // sum2 += 32*old adler;
vmull.u8 q12, d28, ones // 16-bit x16-x23
vmull.u8 q13, d29, ones // 16-bit x24-x31
vmull.u8 q10, d28, sum2_coeff0 // 16-bit x16*16, x17*15, ..., x23*9
vmull.u8 q11, d29, sum2_coeff1 // 16-bit x24*8, x25*7, ..., x31*1
vadd.u16 q8, q8, q9 // q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler
vmull.u8 q9, x0_x7, d30 // 16-bit x0*32,...,x7*25
vmull.u8 q14, x8_x15, d31 // 16-bit x8*24,...,x15*17
vadd.u16 q12, q12, q13 // q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler
vadd.u16 q10, q11 // 8 16-bit elements for sum2
vadd.u16 q8, q12 // 8 16-bit elements for adler
vadd.u16 q9, q14 // 8 16-bit elements for sum2
vadd.u16 q10, q9 // 8 16-bit elements for sum2
vpaddl.u16 q8, q8 // 4 32-bit elements for adler
vpaddl.u16 q10, q10 // 4 32-bit elements for sum2
vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler
vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2
subs len, #32 // len -= 32;
vpadd.u32 d4,d17,d16 // s8 : 32-bit element for sum2, s9 : 32-bit element for adler
vadd.u32 adlersum2,d4 // update adler/sum2 with the new 32 bytes of input
adds len, #(32-16) // post-increment len by 32, then pre-decrement by 16
blt len_lessthan_16 // if len < 16, branch to len_lessthan_16
vshll.u32 q12, adlersum2, #4 // d25 = (0,16*adler) to be added into (adler,sum2)
vld1.32 {x0_x15},[buf,:128]! // 16-byte input
vmull.u8 q8, x0_x7, ones // 16-bit x0-x7
vmull.u8 q9, x8_x15, ones // 16-bit x8-x15
vmull.u8 q10, x0_x7, sum2_coeff0 // 16-bit x0*16, x1*15, ..., x7*9
vmull.u8 q11, x8_x15, sum2_coeff1 // 16-bit x8*8, x9*7, ..., x15*1
vadd.u16 q8, q8, q9 // 8 16-bit elements for adler
vadd.u16 q10, q10, q11 // 8 16-bit elements for sum2
vpaddl.u16 q8, q8 // 4 32-bit elements for adler
vpaddl.u16 q10, q10 // 4 32-bit elements for sum2
vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler
vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2
subs len, #16 // decrement len by 16
vadd.u32 adlersum2,adler16 // sum2 += 16*old adler;
vpadd.u32 d4,d17,d16 // s8 : 32-bit element for sum2, s9 : 32-bit element for adler
vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes of input
adds len, #16 // post-increment len by 16
beq len_is_zero_internal // if len==0, branch to len_is_zero_internal
// restore adler/sum2 into general registers for remaining (<16) bytes
vmov sum2, adler, adlersum2
ldrb t, [buf], #1 // *buf++;
subs len, #1 // len--;
add adler,t // adler += *buf
add sum2,adler // sum2 += adler
bgt remaining_len_loop // repeat while len > 0
vmov adlersum2, sum2, adler // move to double register for modulo operation
len_is_zero_internal:
// mod(adler,BASE); mod(sum2,BASE);
vmull.u32 q2,adlersum2,d3[1] // adler/BASE, sum2/BASE in Q47
vshr.u64 q2,q2,#47 // take the integer part
vpadd.u32 d4,d4,d5 // merge into a double word in d4
vmls.u32 adlersum2,d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE
vmov sum2, adler, adlersum2 // restore adler/sum2 from (s12=sum2, s13=adler)
add r0, adler, sum2, lsl #16 // to return adler | (sum2 << 16);
ldmfd sp!, {r4, r5, pc} // restore registers and return
// constants to be loaded into q registers
.align 4 // 16 byte aligned

// coefficients for computing sum2
.long 0x0d0e0f10 // s0
.long 0x090a0b0c // s1
.long 0x05060708 // s2
.long 0x01020304 // s3

// coefficients for computing adler
.long 0x01010101 // s4/d2
.long 0x01010101 // s5

.long BASE // s6 : BASE
.long 0x80078071 // s7 : 1/BASE in Q47

.long 0x1d1e1f20 // s0
.long 0x191a1b1c // s1
.long 0x15161718 // s2
.long 0x11121314 // s3

#endif // _ARM_ARCH_7

#endif // (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7)

#endif // _ARM_ARCH_6