]> git.saurik.com Git - apple/xnu.git/blob - bsd/dev/arm64/cpu_in_cksum.s
xnu-4903.241.1.tar.gz
[apple/xnu.git] / bsd / dev / arm64 / cpu_in_cksum.s
1 /*
2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
31 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
32 * on the 64-bit part in netinet/cpu_in_cksum.c
33 *
34 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
35 */
36
/*
 * CKSUM_ERR is the symbol this file calls (via bl) on its "out of data"
 * error path: _kprintf when KERNEL is defined, _fprintf_stderr otherwise.
 * Builds without KERNEL must define LIBSYSCALL_INTERFACE or the
 * preprocessor stops with an error.
 */
37 #ifdef KERNEL
38 #define CKSUM_ERR _kprintf
39 #else
40 #ifndef LIBSYSCALL_INTERFACE
41 #error "LIBSYSCALL_INTERFACE not defined"
42 #endif /* !LIBSYSCALL_INTERFACE */
43 #define CKSUM_ERR _fprintf_stderr
44 #endif /* !KERNEL */
45
46 /*
47 * XXX: adi@apple.com:
48 *
49 * Ugly, but we have little choice, since relying on genassym and <assym.s>
50 * is not possible unless this code lives in osfmk. Note also that this
51 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
52 * to be authentic; it only cares about 3 fields.
 *
 * The three fields are read at the byte offsets defined below: M_NEXT
 * (pointer to the next element of the chain), M_DATA (pointer to the
 * payload bytes) and M_LEN (loaded as a 32-bit length). The offsets are
 * hard-coded here, so they presumably have to be kept in sync by hand
 * with the real structure layout.
53 */
54 #if defined(__LP64__)
55 #define M_NEXT 0 // offset of the next-element pointer (m->m_next)
56 #define M_DATA 16 // 8-byte address, would be aligned to 8-byte boundary
57 #define M_LEN 24 // offset of the 32-bit length (m->m_len)
58 #else
59 #define M_NEXT 0 // offset of the next-element pointer (m->m_next)
60 #define M_DATA 8 // offset of the data pointer (4-byte address in this layout)
61 #define M_LEN 12 // offset of the 32-bit length (m->m_len)
62 #endif
63
64 .globl _os_cpu_in_cksum_mbuf
65 .text
66 .align 4
67 _os_cpu_in_cksum_mbuf:
68
69
70 /*
71 * 64-bit version.
72 *
73 * This function returns the partial 16-bit checksum accumulated in
74 * a 32-bit variable (withouth 1's complement); caller is responsible
75 * for folding the 32-bit sum into 16-bit and performinng the 1's
76 * complement if applicable
77 */
78
79 /*
80 * uint32_t
81 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
82 * {
83 * int mlen;
84 * uint64_t sum, partial;
85 * unsigned int final_acc;
86 * uint8_t *data;
87 * boolean_t needs_swap, started_on_odd;
88 *
89 * VERIFY(len >= 0);
90 * VERIFY(off >= 0);
91 *
92 * needs_swap = FALSE;
93 * started_on_odd = FALSE;
94 * sum = initial_sum;
95 */
96
97 #define m x0
98 #define len x1
99 #define off x2
100 #define sum x3
101 #define needs_swap x4
102 #define started_on_odd x5
103 #define mlen x6
104 #define Wmlen w6
105 #define t x7
106 #define data x8
107 #if defined(__LP64__)
108 #define ptr_m x0
109 #define ptr_data x8
110 #else
111 #define ptr_m w0
112 #define ptr_data w8
113 #endif
114
115
116 mov needs_swap, #0 // needs_swap = FALSE;
117 mov started_on_odd, #0 // started_on_odd = FALSE;
118 mov w3, w3 // clear higher half
119
120
121 /*
122 * for (;;) {
123 * if (PREDICT_FALSE(m == NULL)) {
124 * CKSUM_ERR("%s: out of data\n", __func__);
125 * return (-1);
126 * }
127 * mlen = m->m_len;
128 * if (mlen > off) {
129 * mlen -= off;
130 * data = mtod(m, uint8_t *) + off;
131 * goto post_initial_offset;
132 * }
133 * off -= mlen;
134 * if (len == 0)
135 * break;
136 * m = m->m_next;
137 * }
138 */
139
140 0:
141 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
142 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
143 cmp mlen, off
144 b.le 1f
145 ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *)
146 sub mlen, mlen, off // mlen -= off;
147 add data, data, off // data = mtod(m, uint8_t *) + off;
148 b L_post_initial_offset
149 1:
150 sub off, off, mlen
151 cbnz len, 2f
152 mov x0, x3
153 ret lr
154 2:
155 ldr ptr_m, [m, #M_NEXT]
156 b 0b
157
158 L_loop: // for (; len > 0; m = m->m_next) {
159 /*
160 * if (PREDICT_FALSE(m == NULL)) {
161 * CKSUM_ERR("%s: out of data\n", __func__);
162 * return (-1);
163 * }
164 * mlen = m->m_len;
165 * data = mtod(m, uint8_t *);
166 */
167 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
168 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
169 ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *)
170
171 L_post_initial_offset:
172 /*
173 * if (mlen == 0) continue;
174 * if (mlen > len) mlen = len;
175 * len -= mlen;
176 */
177
178 cbz mlen, L_continue
179 cmp mlen, len
180 csel mlen, mlen, len, le
181 sub len, len, mlen
182
183 /*
184 * partial = 0;
185 * if ((uintptr_t)data & 1) {
186 * started_on_odd = !started_on_odd;
187 * partial = *data << 8;
188 * ++data;
189 * --mlen;
190 * }
191 * needs_swap = started_on_odd;
192 */
193
194 tst data, #1
195 mov x7, #0
196 mov x10, #0
197 b.eq 1f
198 ldrb w7, [data], #1
199 eor started_on_odd, started_on_odd, #1
200 sub mlen, mlen, #1
201 lsl w7, w7, #8
202 1:
203
204
205 /*
206 * if ((uintptr_t)data & 2) {
207 * if (mlen < 2)
208 * goto trailing_bytes;
209 * partial += *(uint16_t *)(void *)data;
210 * data += 2;
211 * mlen -= 2;
212 * }
213 */
214 tst data, #2
215 mov needs_swap, started_on_odd
216 b.eq 1f
217 cmp mlen, #2
218 b.lt L_trailing_bytes
219 ldrh w9, [data], #2
220 sub mlen, mlen, #2
221 add w7, w7, w9
222 1:
223
224 /*
225 * while (mlen >= 64) {
226 * __builtin_prefetch(data + 32);
227 * __builtin_prefetch(data + 64);
228 * partial += *(uint32_t *)(void *)data;
229 * partial += *(uint32_t *)(void *)(data + 4);
230 * partial += *(uint32_t *)(void *)(data + 8);
231 * partial += *(uint32_t *)(void *)(data + 12);
232 * partial += *(uint32_t *)(void *)(data + 16);
233 * partial += *(uint32_t *)(void *)(data + 20);
234 * partial += *(uint32_t *)(void *)(data + 24);
235 * partial += *(uint32_t *)(void *)(data + 28);
236 * partial += *(uint32_t *)(void *)(data + 32);
237 * partial += *(uint32_t *)(void *)(data + 36);
238 * partial += *(uint32_t *)(void *)(data + 40);
239 * partial += *(uint32_t *)(void *)(data + 44);
240 * partial += *(uint32_t *)(void *)(data + 48);
241 * partial += *(uint32_t *)(void *)(data + 52);
242 * partial += *(uint32_t *)(void *)(data + 56);
243 * partial += *(uint32_t *)(void *)(data + 60);
244 * data += 64;
245 * mlen -= 64;
246 * // if (PREDICT_FALSE(partial & (3ULL << 62))) {
247 * // if (needs_swap)
248 * // partial = (partial << 8) +
249 * // (partial >> 56);
250 * // sum += (partial >> 32);
251 * // sum += (partial & 0xffffffff);
252 * // partial = 0;
253 * // }
254 * }
255 */
256
257 // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
258 subs mlen, mlen, #64
259 b.lt L32_bytes
260
261 // save used vector registers
262 sub sp, sp, #8*16
263 mov x11, sp
264 st1.4s {v0, v1, v2, v3}, [x11], #4*16
265 st1.4s {v4, v5, v6, v7}, [x11], #4*16
266
267 // spread partial into 8 8-byte registers in v0-v3
268 fmov s3, w7
269 eor.16b v0, v0, v0
270 eor.16b v1, v1, v1
271 eor.16b v2, v2, v2
272
273 // load the 1st 64 bytes (16 32-bit words)
274 ld1.4s {v4,v5,v6,v7},[data],#64
275
276 // branch to finish off if mlen<64
277 subs mlen, mlen, #64
278 b.lt L64_finishup
279
280 /*
281 * loop for loading and accumulating 16 32-bit words into
282 * 8 8-byte accumulators per iteration.
283 */
284 L64_loop:
285 subs mlen, mlen, #64 // mlen -= 64
286
287 uadalp.2d v0, v4
288 ld1.4s {v4},[data], #16
289
290 uadalp.2d v1, v5
291 ld1.4s {v5},[data], #16
292
293 uadalp.2d v2, v6
294 ld1.4s {v6},[data], #16
295
296 uadalp.2d v3, v7
297 ld1.4s {v7},[data], #16
298
299 b.ge L64_loop
300
301 L64_finishup:
302 uadalp.2d v0, v4
303 uadalp.2d v1, v5
304 uadalp.2d v2, v6
305 uadalp.2d v3, v7
306
307 add.2d v0, v0, v1
308 add.2d v2, v2, v3
309 addp.2d d0, v0
310 addp.2d d2, v2
311 add.2d v0, v0, v2
312 fmov x7, d0 // partial in x7 now
313
314 // restore used vector registers
315 ld1.4s {v0, v1, v2, v3}, [sp], #4*16
316 ld1.4s {v4, v5, v6, v7}, [sp], #4*16
317
318 L32_bytes:
319 tst mlen, #32
320 b.eq L16_bytes
321 ldp x9, x10, [data], #16
322 ldp x11, x12, [data], #16
323 adds x7, x7, x9
324 mov x9, #0
325 adcs x7, x7, x10
326 adcs x7, x7, x11
327 adcs x7, x7, x12
328 adc x7, x7, x9
329
330 L16_bytes:
331 tst mlen, #16
332 b.eq L8_bytes
333 ldp x9, x10, [data], #16
334 adds x7, x7, x9
335 mov x9, #0
336 adcs x7, x7, x10
337 adc x7, x7, x9
338
339 L8_bytes:
340 tst mlen, #8
341 mov x10, #0
342 b.eq L4_bytes
343 ldr x9,[data],#8
344 adds x7, x7, x9
345 adc x7, x7, x10
346
347 L4_bytes:
348 tst mlen, #4
349 b.eq L2_bytes
350 ldr w9,[data],#4
351 adds x7, x7, x9
352 adc x7, x7, x10
353
354 L2_bytes:
355 tst mlen, #2
356 b.eq L_trailing_bytes
357 ldrh w9,[data],#2
358 adds x7, x7, x9
359 adc x7, x7, x10
360
361 L_trailing_bytes:
362 tst mlen, #1
363 b.eq L0_bytes
364 ldrb w9,[data],#1
365 adds x7, x7, x9
366 adc x7, x7, x10
367 eor started_on_odd, started_on_odd, #1
368
369 L0_bytes:
370 /*
371 * if (needs_swap)
372 * partial = (partial << 8) + (partial >> 56);
373 */
374 cbz needs_swap, 1f
375 ror x7, x7, #56
376 1:
377 /*
378 * sum += (partial >> 32) + (partial & 0xffffffff);
379 * sum = (sum >> 32) + (sum & 0xffffffff);
380 * }
381 */
382
383 add x3, x3, x7, lsr #32
384 mov w7, w7
385 add x3, x3, x7
386 mov w7, w3
387 add x3, x7, x3, lsr #32
388
389 L_continue:
390 cmp len, #0
391 ldr ptr_m, [m, #M_NEXT] // m = m->m_next
392 b.gt L_loop
393
394 /*
395 * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
396 * ((sum >> 16) & 0xffff) + (sum & 0xffff);
397 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
398 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
399 * return (final_acc & 0xffff);
400 * }
401 */
402
403 mov w4, #0x00ffff
404 and x0, x4, x3, lsr #48
405 and x1, x4, x3, lsr #32
406 and x2, x4, x3, lsr #16
407 and x3, x4, x3
408 add w0, w0, w1
409 add w2, w2, w3
410 add w0, w0, w2
411 and w1, w4, w0, lsr #16
412 and w0, w4, w0
413 add w0, w0, w1
414 and w1, w4, w0, lsr #16
415 and w0, w4, w0
416 add w0, w0, w1
417 /*
418 * If we were to 1's complement it (XOR with 0xffff):
419 *
420 * eor w0, w0, w4
421 */
422 and w0, w0, w4
423
424 ret lr
425
426 Lin_cksum_whoops:
427 adrp x0, Lin_cksum_whoops_str@page
428 add x0, x0, Lin_cksum_whoops_str@pageoff
429 bl #CKSUM_ERR
430 mov x0, #-1
431 ret lr
432
433 Lin_cksum_whoops_str:
434 .asciz "os_cpu_in_cksum_mbuf: out of data\n"
435 .align 5