/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__),
 * with __arm64__ parts tagged ARM64_TODO. This revision is optimized based
 * on the 64-bit code in netinet/cpu_in_cksum.c.
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */

#ifdef KERNEL
#define	CKSUM_ERR	_kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define	CKSUM_ERR	_fprintf_stderr
#endif /* !KERNEL */

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk. Note also that this
 * routine expects an "mbuf-like" argument, and it does not require the mbuf
 * to be authentic; it only cares about three fields.
 */
#if defined(__LP64__)
#define	M_NEXT	0
#define	M_DATA	16	// 8-byte pointer, aligned to an 8-byte boundary
#define	M_LEN	24
#else
#define	M_NEXT	0
#define	M_DATA	8
#define	M_LEN	12
#endif
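
/*
 * For illustration only: a minimal sketch of the "mbuf-like" layout the
 * LP64 offsets above assume. This is not the authentic struct mbuf; only
 * the three fields this routine reads are shown, plus a pointer-sized
 * field (m_nextpkt in the real header) that pushes m_data to offset 16.
 *
 *	struct mbuf_like {
 *		struct mbuf_like	*m_next;	// offset 0  (M_NEXT)
 *		struct mbuf_like	*m_nextpkt;	// offset 8  (unused here)
 *		uint8_t			*m_data;	// offset 16 (M_DATA)
 *		int32_t			m_len;		// offset 24 (M_LEN)
 *	};
 */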

	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:

/*
 * 64-bit version.
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); the caller is responsible
 * for folding the 32-bit sum into 16 bits and performing the 1's
 * complement if applicable.
 */
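
/*
 * A minimal sketch (not part of the original source) of how a caller might
 * finish the job; the folds are harmless even if the value is already
 * 16 bits wide, and the names here are illustrative:
 *
 *	uint32_t sum = os_cpu_in_cksum_mbuf(m, len, off, initial_sum);
 *	sum = (sum >> 16) + (sum & 0xffff);	// fold carries into low 16 bits
 *	sum += (sum >> 16);			// fold any remaining carry
 *	uint16_t cksum = ~sum & 0xffff;		// 1's complement, if applicable
 */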

/*
 * uint32_t
 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
 * {
 *	int mlen;
 *	uint64_t sum, partial;
 *	unsigned int final_acc;
 *	uint8_t *data;
 *	boolean_t needs_swap, started_on_odd;
 *
 *	VERIFY(len >= 0);
 *	VERIFY(off >= 0);
 *
 *	needs_swap = FALSE;
 *	started_on_odd = FALSE;
 *	sum = initial_sum;
 */

#define	m		x0
#define	len		x1
#define	off		x2
#define	sum		x3
#define	needs_swap	x4
#define	started_on_odd	x5
#define	mlen		x6
#define	Wmlen		w6
#define	t		x7
#define	data		x8
#if defined(__LP64__)
#define	ptr_m		x0
#define	ptr_data	x8
#else
#define	ptr_m		w0
#define	ptr_data	w8
#endif


	mov	needs_swap, #0		// needs_swap = FALSE;
	mov	started_on_odd, #0	// started_on_odd = FALSE;
	mov	w3, w3			// zero-extend: clear upper 32 bits of sum

/*
 * for (;;) {
 *	if (PREDICT_FALSE(m == NULL)) {
 *		CKSUM_ERR("%s: out of data\n", __func__);
 *		return (-1);
 *	}
 *	mlen = m->m_len;
 *	if (mlen > off) {
 *		mlen -= off;
 *		data = mtod(m, uint8_t *) + off;
 *		goto post_initial_offset;
 *	}
 *	off -= mlen;
 *	if (len == 0)
 *		break;
 *	m = m->m_next;
 * }
 */

0:
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	cmp	mlen, off		// if (mlen > off)
	b.le	1f
	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)
	sub	mlen, mlen, off		// mlen -= off;
	add	data, data, off		// data = mtod(m, uint8_t *) + off;
	b	L_post_initial_offset
1:
	sub	off, off, mlen		// off -= mlen;
	cbnz	len, 2f			// if (len != 0) keep walking
	mov	x0, x3			// len == 0: return sum
	ret	lr
2:
	ldr	ptr_m, [m, #M_NEXT]	// m = m->m_next;
	b	0b

L_loop:	// for (; len > 0; m = m->m_next) {
/*
 *	if (PREDICT_FALSE(m == NULL)) {
 *		CKSUM_ERR("%s: out of data\n", __func__);
 *		return (-1);
 *	}
 *	mlen = m->m_len;
 *	data = mtod(m, uint8_t *);
 */
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)

L_post_initial_offset:
/*
 *	if (mlen == 0) continue;
 *	if (mlen > len) mlen = len;
 *	len -= mlen;
 */
	cbz	mlen, L_continue	// if (mlen == 0) continue;
	cmp	mlen, len
	csel	mlen, mlen, len, le	// mlen = min(mlen, len);
	sub	len, len, mlen		// len -= mlen;

/*
 *	partial = 0;
 *	if ((uintptr_t)data & 1) {
 *		started_on_odd = !started_on_odd;
 *		partial = *data << 8;
 *		++data;
 *		--mlen;
 *	}
 *	needs_swap = started_on_odd;
 */
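/*
 * Background note (not from the original source): 1's complement addition
 * commutes with byte swapping,
 *
 *	swap16(a) +' swap16(b) == swap16(a +' b)
 *
 * e.g. with a = 0x0102 and b = 0x0304: 0x0201 + 0x0403 = 0x0604, and
 * swap16(0x0102 + 0x0304) = swap16(0x0406) = 0x0604. So each chunk may be
 * summed with naturally aligned little-endian loads, and the per-chunk
 * partial byte-rotated once at the end (see L0_bytes); started_on_odd
 * tracks whether the chunk begins at an odd byte offset of the packet,
 * i.e. whether that rotation is required.
 */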

	tst	data, #1		// if data is odd-addressed...
	mov	x7, #0			// partial = 0;
	mov	x10, #0			// x10 = 0 (kept as a zero for adc below)
	b.eq	1f
	ldrb	w7, [data], #1		// consume one byte: partial = *data++;
	eor	started_on_odd, started_on_odd, #1
	sub	mlen, mlen, #1		// --mlen;
	lsl	w7, w7, #8		// partial <<= 8;
1:


/*
 *	if ((uintptr_t)data & 2) {
 *		if (mlen < 2)
 *			goto trailing_bytes;
 *		partial += *(uint16_t *)(void *)data;
 *		data += 2;
 *		mlen -= 2;
 *	}
 */
	tst	data, #2
	mov	needs_swap, started_on_odd
	b.eq	1f
	cmp	mlen, #2
	b.lt	L_trailing_bytes
	ldrh	w9, [data], #2
	sub	mlen, mlen, #2
	add	w7, w7, w9
1:

/*
 *	if ((uintptr_t)data & 4) {
 *		if (mlen < 4)
 *			goto L2_bytes;
 *		partial += *(uint32_t *)(void *)data;
 *		data += 4;
 *		mlen -= 4;
 *	}
 */
	// align to an 8-byte boundary if applicable
	tst	data, #4
	b.eq	1f
	cmp	mlen, #4
	b.lt	L2_bytes
	ldr	w9, [data], #4
	sub	mlen, mlen, #4
	adds	w7, w7, w9
	adc	x7, x7, x10		// x10 is still #0, as set above
1:

/*
 *	while (mlen >= 64) {
 *		__builtin_prefetch(data + 32);
 *		__builtin_prefetch(data + 64);
 *		partial += *(uint32_t *)(void *)data;
 *		partial += *(uint32_t *)(void *)(data + 4);
 *		partial += *(uint32_t *)(void *)(data + 8);
 *		partial += *(uint32_t *)(void *)(data + 12);
 *		partial += *(uint32_t *)(void *)(data + 16);
 *		partial += *(uint32_t *)(void *)(data + 20);
 *		partial += *(uint32_t *)(void *)(data + 24);
 *		partial += *(uint32_t *)(void *)(data + 28);
 *		partial += *(uint32_t *)(void *)(data + 32);
 *		partial += *(uint32_t *)(void *)(data + 36);
 *		partial += *(uint32_t *)(void *)(data + 40);
 *		partial += *(uint32_t *)(void *)(data + 44);
 *		partial += *(uint32_t *)(void *)(data + 48);
 *		partial += *(uint32_t *)(void *)(data + 52);
 *		partial += *(uint32_t *)(void *)(data + 56);
 *		partial += *(uint32_t *)(void *)(data + 60);
 *		data += 64;
 *		mlen -= 64;
 *		// if (PREDICT_FALSE(partial & (3ULL << 62))) {
 *		//	if (needs_swap)
 *		//		partial = (partial << 8) +
 *		//		    (partial >> 56);
 *		//	sum += (partial >> 32);
 *		//	sum += (partial & 0xffffffff);
 *		//	partial = 0;
 *		// }
 *	}
 */
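
/*
 * For readers more comfortable with intrinsics, a rough C sketch of the
 * vector loop below (not from the original source; it assumes nblocks * 64
 * bytes are readable, and uadalp.2d corresponds to vpadalq_u32). The asm
 * seeds partial into v3 up front; adding it at the end is equivalent:
 *
 *	#include <arm_neon.h>
 *	#include <stddef.h>
 *
 *	static uint64_t
 *	sum_64byte_blocks(const uint8_t *data, size_t nblocks, uint64_t partial)
 *	{
 *		uint64x2_t a0 = vdupq_n_u64(0), a1 = a0, a2 = a0, a3 = a0;
 *		while (nblocks-- != 0) {
 *			// accumulate 16 32-bit words into 8 64-bit lanes
 *			a0 = vpadalq_u32(a0, vld1q_u32((const uint32_t *)(const void *)(data +  0)));
 *			a1 = vpadalq_u32(a1, vld1q_u32((const uint32_t *)(const void *)(data + 16)));
 *			a2 = vpadalq_u32(a2, vld1q_u32((const uint32_t *)(const void *)(data + 32)));
 *			a3 = vpadalq_u32(a3, vld1q_u32((const uint32_t *)(const void *)(data + 48)));
 *			data += 64;
 *		}
 *		uint64x2_t s = vaddq_u64(vaddq_u64(a0, a1), vaddq_u64(a2, a3));
 *		return partial + vaddvq_u64(s);	// horizontal add of both lanes
 *	}
 */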

	// pre-decrement mlen by 64; if fewer than 64 bytes remain, try 32 next
	subs	mlen, mlen, #64
	b.lt	L32_bytes

	// save used vector registers
	sub	sp, sp, #8*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16

	// seed the eight 8-byte accumulator lanes in v0-v3: partial goes into
	// lane 0 of v3 (fmov zeroes the rest of v3), and v0-v2 are cleared
	fmov	s3, w7
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2

	// load the first 64 bytes (16 32-bit words)
	ld1.4s	{v4, v5, v6, v7}, [data], #64

	// branch to finish up if fewer than 64 bytes remain
	subs	mlen, mlen, #64
	b.lt	L64_finishup

/*
 * loop for loading and accumulating 16 32-bit words into
 * 8 8-byte accumulators per iteration.
 */
L64_loop:
	subs	mlen, mlen, #64		// mlen -= 64

	uadalp.2d	v0, v4
	ld1.4s	{v4}, [data], #16

	uadalp.2d	v1, v5
	ld1.4s	{v5}, [data], #16

	uadalp.2d	v2, v6
	ld1.4s	{v6}, [data], #16

	uadalp.2d	v3, v7
	ld1.4s	{v7}, [data], #16

	b.ge	L64_loop

L64_finishup:
	// fold in the final 64 bytes already loaded
	uadalp.2d	v0, v4
	uadalp.2d	v1, v5
	uadalp.2d	v2, v6
	uadalp.2d	v3, v7

	// reduce the eight 8-byte accumulators to a single 64-bit scalar
	add.2d	v0, v0, v1
	add.2d	v2, v2, v3
	addp.2d	d0, v0
	addp.2d	d2, v2
	add.2d	v0, v0, v2
	fmov	x7, d0			// partial in x7 now

	// restore used vector registers
	ld1.4s	{v0, v1, v2, v3}, [sp], #4*16
	ld1.4s	{v4, v5, v6, v7}, [sp], #4*16

L32_bytes:
	tst	mlen, #32
	b.eq	L16_bytes
	ldp	x9, x10, [data], #16
	ldp	x11, x12, [data], #16
	adds	x7, x7, x9
	mov	x9, #0
	adcs	x7, x7, x10
	adcs	x7, x7, x11
	adcs	x7, x7, x12
	adc	x7, x7, x9

L16_bytes:
	tst	mlen, #16
	b.eq	L8_bytes
	ldp	x9, x10, [data], #16
	adds	x7, x7, x9
	mov	x9, #0
	adcs	x7, x7, x10
	adc	x7, x7, x9

L8_bytes:
	tst	mlen, #8
	mov	x10, #0
	b.eq	L4_bytes
	ldr	x9, [data], #8
	adds	x7, x7, x9
	adc	x7, x7, x10

L4_bytes:
	tst	mlen, #4
	b.eq	L2_bytes
	ldr	w9, [data], #4
	adds	x7, x7, x9
	adc	x7, x7, x10

L2_bytes:
	tst	mlen, #2
	b.eq	L_trailing_bytes
	ldrh	w9, [data], #2
	adds	x7, x7, x9
	adc	x7, x7, x10

L_trailing_bytes:
	tst	mlen, #1
	b.eq	L0_bytes
	ldrb	w9, [data], #1
	adds	x7, x7, x9
	adc	x7, x7, x10
	eor	started_on_odd, started_on_odd, #1

L0_bytes:
/*
 *	if (needs_swap)
 *		partial = (partial << 8) + (partial >> 56);
 */
	cbz	needs_swap, 1f
	ror	x7, x7, #56		// rotate left by 8: (partial << 8) + (partial >> 56)
1:
/*
 *	sum += (partial >> 32) + (partial & 0xffffffff);
 *	sum = (sum >> 32) + (sum & 0xffffffff);
 * }
 */

	add	x3, x3, x7, lsr #32	// sum += partial >> 32;
	mov	w7, w7			// zero-extend: partial &= 0xffffffff;
	add	x3, x3, x7		// sum += partial;
	mov	w7, w3			// w7 = sum & 0xffffffff;
	add	x3, x7, x3, lsr #32	// sum = (sum >> 32) + (sum & 0xffffffff);

L_continue:
	cmp	len, #0
	ldr	ptr_m, [m, #M_NEXT]	// m = m->m_next
	b.gt	L_loop

/*
 *	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
 *	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	return (final_acc & 0xffff);
 * }
 */
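
/*
 * A quick worked example of the fold above (illustrative numbers, not from
 * the source): with sum = 0x0001_0002_0003_fffe, the four 16-bit fields
 * add to 0x1 + 0x2 + 0x3 + 0xfffe = 0x10004; the first fold gives
 * 0x1 + 0x0004 = 0x0005, the second leaves 0x0005, which is returned.
 */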

	mov	w4, #0x00ffff
	and	x0, x4, x3, lsr #48	// (sum >> 48) & 0xffff
	and	x1, x4, x3, lsr #32	// (sum >> 32) & 0xffff
	and	x2, x4, x3, lsr #16	// (sum >> 16) & 0xffff
	and	x3, x4, x3		// sum & 0xffff
	add	w0, w0, w1
	add	w2, w2, w3
	add	w0, w0, w2		// final_acc: sum of the four fields
	and	w1, w4, w0, lsr #16
	and	w0, w4, w0
	add	w0, w0, w1		// first fold
	and	w1, w4, w0, lsr #16
	and	w0, w4, w0
	add	w0, w0, w1		// second fold
/*
 * If we were to 1's complement it (XOR with 0xffff):
 *
 *	eor	w0, w0, w4
 */
	and	w0, w0, w4		// return (final_acc & 0xffff);

	ret	lr

Lin_cksum_whoops:
	adrp	x0, Lin_cksum_whoops_str@page
	add	x0, x0, Lin_cksum_whoops_str@pageoff
	bl	#CKSUM_ERR
	mov	x0, #-1
	ret	lr

Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5