/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO. This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */
36 | ||
/*
 * Error-reporting function: kprintf in kernel builds, fprintf(stderr, ...)
 * when built into Libsyscall for userland.
 */
#ifdef KERNEL
#define CKSUM_ERR _kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define CKSUM_ERR _fprintf_stderr
#endif /* !KERNEL */

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
 * authentic; it only cares about 3 fields (m_next, m_data, m_len), whose
 * byte offsets within struct mbuf are hard-coded below.
 */
#if defined(__LP64__)
#define M_NEXT 0
#define M_DATA 16       // 8-byte address, would be aligned to 8-byte boundary
#define M_LEN 24
#else
#define M_NEXT 0
#define M_DATA 8
#define M_LEN 12
#endif
63 | ||
64 | .globl _os_cpu_in_cksum_mbuf | |
65 | .text | |
66 | .align 4 | |
67 | _os_cpu_in_cksum_mbuf: | |
68 | ||
69 | ||
70 | /* | |
71 | * 64-bit version. | |
72 | * | |
73 | * This function returns the partial 16-bit checksum accumulated in | |
74 | * a 32-bit variable (withouth 1's complement); caller is responsible | |
75 | * for folding the 32-bit sum into 16-bit and performinng the 1's | |
76 | * complement if applicable | |
77 | */ | |
78 | ||
79 | /* | |
80 | * uint32_t | |
81 | * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum) | |
82 | * { | |
83 | * int mlen; | |
84 | * uint64_t sum, partial; | |
85 | * unsigned int final_acc; | |
86 | * uint8_t *data; | |
87 | * boolean_t needs_swap, started_on_odd; | |
88 | * | |
89 | * VERIFY(len >= 0); | |
90 | * VERIFY(off >= 0); | |
91 | * | |
92 | * needs_swap = FALSE; | |
93 | * started_on_odd = FALSE; | |
94 | * sum = initial_sum; | |
95 | */ | |
96 | ||
97 | #define m x0 | |
98 | #define len x1 | |
99 | #define off x2 | |
100 | #define sum x3 | |
101 | #define needs_swap x4 | |
102 | #define started_on_odd x5 | |
103 | #define mlen x6 | |
104 | #define Wmlen w6 | |
105 | #define t x7 | |
106 | #define data x8 | |
107 | #if defined(__LP64__) | |
108 | #define ptr_m x0 | |
109 | #define ptr_data x8 | |
110 | #else | |
111 | #define ptr_m w0 | |
112 | #define ptr_data w8 | |
113 | #endif | |
114 | ||
115 | ||
116 | mov needs_swap, #0 // needs_swap = FALSE; | |
117 | mov started_on_odd, #0 // started_on_odd = FALSE; | |
118 | mov w3, w3 // clear higher half | |
119 | ||
120 | ||
121 | /* | |
122 | * for (;;) { | |
123 | * if (PREDICT_FALSE(m == NULL)) { | |
124 | * CKSUM_ERR("%s: out of data\n", __func__); | |
125 | * return (-1); | |
126 | * } | |
127 | * mlen = m->m_len; | |
128 | * if (mlen > off) { | |
129 | * mlen -= off; | |
130 | * data = mtod(m, uint8_t *) + off; | |
131 | * goto post_initial_offset; | |
132 | * } | |
133 | * off -= mlen; | |
134 | * if (len == 0) | |
135 | * break; | |
136 | * m = m->m_next; | |
137 | * } | |
138 | */ | |
139 | ||
140 | 0: | |
141 | cbz m, Lin_cksum_whoops // if (m == NULL) return -1; | |
142 | ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; | |
143 | cmp mlen, off | |
144 | b.le 1f | |
145 | ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) | |
146 | sub mlen, mlen, off // mlen -= off; | |
147 | add data, data, off // data = mtod(m, uint8_t *) + off; | |
148 | b L_post_initial_offset | |
149 | 1: | |
150 | sub off, off, mlen | |
151 | cbnz len, 2f | |
152 | mov x0, x3 | |
153 | ret lr | |
154 | 2: | |
155 | ldr ptr_m, [m, #M_NEXT] | |
156 | b 0b | |
157 | ||
158 | L_loop: // for (; len > 0; m = m->m_next) { | |
159 | /* | |
160 | * if (PREDICT_FALSE(m == NULL)) { | |
161 | * CKSUM_ERR("%s: out of data\n", __func__); | |
162 | * return (-1); | |
163 | * } | |
164 | * mlen = m->m_len; | |
165 | * data = mtod(m, uint8_t *); | |
166 | */ | |
167 | cbz m, Lin_cksum_whoops // if (m == NULL) return -1; | |
168 | ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; | |
169 | ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) | |
170 | ||
171 | L_post_initial_offset: | |
172 | /* | |
173 | * if (mlen == 0) continue; | |
174 | * if (mlen > len) mlen = len; | |
175 | * len -= mlen; | |
176 | */ | |
177 | ||
178 | cbz mlen, L_continue | |
179 | cmp mlen, len | |
180 | csel mlen, mlen, len, le | |
181 | sub len, len, mlen | |
182 | ||
183 | /* | |
184 | * partial = 0; | |
185 | * if ((uintptr_t)data & 1) { | |
186 | * started_on_odd = !started_on_odd; | |
187 | * partial = *data << 8; | |
188 | * ++data; | |
189 | * --mlen; | |
190 | * } | |
191 | * needs_swap = started_on_odd; | |
192 | */ | |
193 | ||
194 | tst data, #1 | |
195 | mov x7, #0 | |
196 | mov x10, #0 | |
197 | b.eq 1f | |
198 | ldrb w7, [data], #1 | |
199 | eor started_on_odd, started_on_odd, #1 | |
200 | sub mlen, mlen, #1 | |
201 | lsl w7, w7, #8 | |
202 | 1: | |
203 | ||
204 | ||
205 | /* | |
206 | * if ((uintptr_t)data & 2) { | |
207 | * if (mlen < 2) | |
208 | * goto trailing_bytes; | |
209 | * partial += *(uint16_t *)(void *)data; | |
210 | * data += 2; | |
211 | * mlen -= 2; | |
212 | * } | |
213 | */ | |
214 | tst data, #2 | |
215 | mov needs_swap, started_on_odd | |
216 | b.eq 1f | |
217 | cmp mlen, #2 | |
218 | b.lt L_trailing_bytes | |
219 | ldrh w9, [data], #2 | |
220 | sub mlen, mlen, #2 | |
221 | add w7, w7, w9 | |
222 | 1: | |
223 | ||
224 | /* | |
225 | * while (mlen >= 64) { | |
226 | * __builtin_prefetch(data + 32); | |
227 | * __builtin_prefetch(data + 64); | |
228 | * partial += *(uint32_t *)(void *)data; | |
229 | * partial += *(uint32_t *)(void *)(data + 4); | |
230 | * partial += *(uint32_t *)(void *)(data + 8); | |
231 | * partial += *(uint32_t *)(void *)(data + 12); | |
232 | * partial += *(uint32_t *)(void *)(data + 16); | |
233 | * partial += *(uint32_t *)(void *)(data + 20); | |
234 | * partial += *(uint32_t *)(void *)(data + 24); | |
235 | * partial += *(uint32_t *)(void *)(data + 28); | |
236 | * partial += *(uint32_t *)(void *)(data + 32); | |
237 | * partial += *(uint32_t *)(void *)(data + 36); | |
238 | * partial += *(uint32_t *)(void *)(data + 40); | |
239 | * partial += *(uint32_t *)(void *)(data + 44); | |
240 | * partial += *(uint32_t *)(void *)(data + 48); | |
241 | * partial += *(uint32_t *)(void *)(data + 52); | |
242 | * partial += *(uint32_t *)(void *)(data + 56); | |
243 | * partial += *(uint32_t *)(void *)(data + 60); | |
244 | * data += 64; | |
245 | * mlen -= 64; | |
246 | * // if (PREDICT_FALSE(partial & (3ULL << 62))) { | |
247 | * // if (needs_swap) | |
248 | * // partial = (partial << 8) + | |
249 | * // (partial >> 56); | |
250 | * // sum += (partial >> 32); | |
251 | * // sum += (partial & 0xffffffff); | |
252 | * // partial = 0; | |
253 | * // } | |
254 | * } | |
255 | */ | |
256 | ||
257 | // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next | |
258 | subs mlen, mlen, #64 | |
259 | b.lt L32_bytes | |
260 | ||
261 | // save used vector registers | |
262 | sub sp, sp, #8*16 | |
263 | mov x11, sp | |
264 | st1.4s {v0, v1, v2, v3}, [x11], #4*16 | |
265 | st1.4s {v4, v5, v6, v7}, [x11], #4*16 | |
266 | ||
267 | // spread partial into 8 8-byte registers in v0-v3 | |
268 | fmov s3, w7 | |
269 | eor.16b v0, v0, v0 | |
270 | eor.16b v1, v1, v1 | |
271 | eor.16b v2, v2, v2 | |
272 | ||
273 | // load the 1st 64 bytes (16 32-bit words) | |
274 | ld1.4s {v4,v5,v6,v7},[data],#64 | |
275 | ||
276 | // branch to finish off if mlen<64 | |
277 | subs mlen, mlen, #64 | |
278 | b.lt L64_finishup | |
279 | ||
280 | /* | |
281 | * loop for loading and accumulating 16 32-bit words into | |
282 | * 8 8-byte accumulators per iteration. | |
283 | */ | |
284 | L64_loop: | |
285 | subs mlen, mlen, #64 // mlen -= 64 | |
286 | ||
287 | uadalp.2d v0, v4 | |
288 | ld1.4s {v4},[data], #16 | |
289 | ||
290 | uadalp.2d v1, v5 | |
291 | ld1.4s {v5},[data], #16 | |
292 | ||
293 | uadalp.2d v2, v6 | |
294 | ld1.4s {v6},[data], #16 | |
295 | ||
296 | uadalp.2d v3, v7 | |
297 | ld1.4s {v7},[data], #16 | |
298 | ||
299 | b.ge L64_loop | |
300 | ||
301 | L64_finishup: | |
302 | uadalp.2d v0, v4 | |
303 | uadalp.2d v1, v5 | |
304 | uadalp.2d v2, v6 | |
305 | uadalp.2d v3, v7 | |
306 | ||
307 | add.2d v0, v0, v1 | |
308 | add.2d v2, v2, v3 | |
309 | addp.2d d0, v0 | |
310 | addp.2d d2, v2 | |
311 | add.2d v0, v0, v2 | |
312 | fmov x7, d0 // partial in x7 now | |
313 | ||
314 | // restore used vector registers | |
315 | ld1.4s {v0, v1, v2, v3}, [sp], #4*16 | |
316 | ld1.4s {v4, v5, v6, v7}, [sp], #4*16 | |
317 | ||
318 | L32_bytes: | |
319 | tst mlen, #32 | |
320 | b.eq L16_bytes | |
321 | ldp x9, x10, [data], #16 | |
322 | ldp x11, x12, [data], #16 | |
323 | adds x7, x7, x9 | |
324 | mov x9, #0 | |
325 | adcs x7, x7, x10 | |
326 | adcs x7, x7, x11 | |
327 | adcs x7, x7, x12 | |
328 | adc x7, x7, x9 | |
329 | ||
330 | L16_bytes: | |
331 | tst mlen, #16 | |
332 | b.eq L8_bytes | |
333 | ldp x9, x10, [data], #16 | |
334 | adds x7, x7, x9 | |
335 | mov x9, #0 | |
336 | adcs x7, x7, x10 | |
337 | adc x7, x7, x9 | |
338 | ||
339 | L8_bytes: | |
340 | tst mlen, #8 | |
341 | mov x10, #0 | |
342 | b.eq L4_bytes | |
343 | ldr x9,[data],#8 | |
344 | adds x7, x7, x9 | |
345 | adc x7, x7, x10 | |
346 | ||
347 | L4_bytes: | |
348 | tst mlen, #4 | |
349 | b.eq L2_bytes | |
350 | ldr w9,[data],#4 | |
351 | adds x7, x7, x9 | |
352 | adc x7, x7, x10 | |
353 | ||
354 | L2_bytes: | |
355 | tst mlen, #2 | |
356 | b.eq L_trailing_bytes | |
357 | ldrh w9,[data],#2 | |
358 | adds x7, x7, x9 | |
359 | adc x7, x7, x10 | |
360 | ||
361 | L_trailing_bytes: | |
362 | tst mlen, #1 | |
363 | b.eq L0_bytes | |
364 | ldrb w9,[data],#1 | |
365 | adds x7, x7, x9 | |
366 | adc x7, x7, x10 | |
367 | eor started_on_odd, started_on_odd, #1 | |
368 | ||
369 | L0_bytes: | |
370 | /* | |
371 | * if (needs_swap) | |
372 | * partial = (partial << 8) + (partial >> 56); | |
373 | */ | |
374 | cbz needs_swap, 1f | |
375 | ror x7, x7, #56 | |
376 | 1: | |
377 | /* | |
378 | * sum += (partial >> 32) + (partial & 0xffffffff); | |
379 | * sum = (sum >> 32) + (sum & 0xffffffff); | |
380 | * } | |
381 | */ | |
382 | ||
383 | add x3, x3, x7, lsr #32 | |
384 | mov w7, w7 | |
385 | add x3, x3, x7 | |
386 | mov w7, w3 | |
387 | add x3, x7, x3, lsr #32 | |
388 | ||
389 | L_continue: | |
390 | cmp len, #0 | |
391 | ldr ptr_m, [m, #M_NEXT] // m = m->m_next | |
392 | b.gt L_loop | |
393 | ||
394 | /* | |
395 | * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) + | |
396 | * ((sum >> 16) & 0xffff) + (sum & 0xffff); | |
397 | * final_acc = (final_acc >> 16) + (final_acc & 0xffff); | |
398 | * final_acc = (final_acc >> 16) + (final_acc & 0xffff); | |
399 | * return (final_acc & 0xffff); | |
400 | * } | |
401 | */ | |
402 | ||
403 | mov w4, #0x00ffff | |
404 | and x0, x4, x3, lsr #48 | |
405 | and x1, x4, x3, lsr #32 | |
406 | and x2, x4, x3, lsr #16 | |
407 | and x3, x4, x3 | |
408 | add w0, w0, w1 | |
409 | add w2, w2, w3 | |
410 | add w0, w0, w2 | |
411 | and w1, w4, w0, lsr #16 | |
412 | and w0, w4, w0 | |
413 | add w0, w0, w1 | |
414 | and w1, w4, w0, lsr #16 | |
415 | and w0, w4, w0 | |
416 | add w0, w0, w1 | |
417 | /* | |
418 | * If we were to 1's complement it (XOR with 0xffff): | |
419 | * | |
420 | * eor w0, w0, w4 | |
421 | */ | |
422 | and w0, w0, w4 | |
423 | ||
424 | ret lr | |
425 | ||
426 | Lin_cksum_whoops: | |
427 | adrp x0, Lin_cksum_whoops_str@page | |
428 | add x0, x0, Lin_cksum_whoops_str@pageoff | |
429 | bl #CKSUM_ERR | |
430 | mov x0, #-1 | |
431 | ret lr | |
432 | ||
433 | Lin_cksum_whoops_str: | |
434 | .asciz "os_cpu_in_cksum_mbuf: out of data\n" | |
435 | .align 5 |