]> git.saurik.com Git - apple/xnu.git/blob - bsd/dev/arm64/cpu_in_cksum.s
xnu-4570.51.1.tar.gz
[apple/xnu.git] / bsd / dev / arm64 / cpu_in_cksum.s
1 /*
2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
31 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
32 * on the 64-bit part in netinet/cpu_in_cksum.c
33 *
34 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
35 */
36
/* CKSUM_ERR: routine used to report the "out of data" error in the mbuf walk below */
37 #ifdef KERNEL
38 #define CKSUM_ERR _kprintf              // kernel build: report via kprintf
39 #else
40 #ifndef LIBSYSCALL_INTERFACE
41 #error "LIBSYSCALL_INTERFACE not defined"
42 #endif /* !LIBSYSCALL_INTERFACE */
43 #define CKSUM_ERR _fprintf_stderr       // userland (Libsyscall) build: report to stderr
44 #endif /* !KERNEL */
45
46 /*
47 * XXX: adi@apple.com:
48 *
49 * Ugly, but we have little choice, since relying on genassym and <assym.s>
50 * is not possible unless this code lives in osfmk. Note also that this
51 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
52 * authentic; it only cares about 3 fields.
53 */
54 #define M_NEXT 0    // byte offset of m->m_next (pointer to next mbuf in chain)
55 #define M_DATA 16 // 8-byte address, would be aligned to 8-byte boundary
56 #define M_LEN 24    // byte offset of m->m_len (32-bit length of this mbuf's data)
57
58 .globl _os_cpu_in_cksum_mbuf
59 .text
60 .align 4
61 _os_cpu_in_cksum_mbuf:
// Register usage (AAPCS64): x0 = struct mbuf *m, w1 = len, w2 = off,
// w3 = initial_sum; 32-bit partial sum returned in w0, or -1 on error.
// Only x0-x12 and v0-v7 are used; no callee-saved GPRs are touched
// (v0-v7 are spilled/restored around the SIMD loop below).
62 
63 
64 /*
65 * 64-bit version.
66 *
67 * This function returns the partial 16-bit checksum accumulated in
68 * a 32-bit variable (without 1's complement); caller is responsible
69 * for folding the 32-bit sum into 16-bit and performing the 1's
70 * complement if applicable
71 */
72 
73 /*
74 * uint32_t
75 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
76 * {
77 * int mlen;
78 * uint64_t sum, partial;
79 * unsigned int final_acc;
80 * uint8_t *data;
81 * boolean_t needs_swap, started_on_odd;
82 *
83 * VERIFY(len >= 0);
84 * VERIFY(off >= 0);
85 *
86 * needs_swap = FALSE;
87 * started_on_odd = FALSE;
88 * sum = initial_sum;
89 */
90 
// Symbolic register names for the C locals above.
91 #define m x0
92 #define len x1
93 #define off x2
94 #define sum x3
95 #define needs_swap x4
96 #define started_on_odd x5
97 #define mlen x6
98 #define Wmlen w6
99 #define t x7
100 #define data x8
101
102 mov needs_swap, #0 // needs_swap = FALSE;
103 mov started_on_odd, #0 // started_on_odd = FALSE;
104 mov w3, w3 // clear higher half: x3 (sum) = zero-extended initial_sum
105 
106 
107 /*
108 * for (;;) {
109 * if (PREDICT_FALSE(m == NULL)) {
110 * CKSUM_ERR("%s: out of data\n", __func__);
111 * return (-1);
112 * }
113 * mlen = m->m_len;
114 * if (mlen > off) {
115 * mlen -= off;
116 * data = mtod(m, uint8_t *) + off;
117 * goto post_initial_offset;
118 * }
119 * off -= mlen;
120 * if (len == 0)
121 * break;
122 * m = m->m_next;
123 * }
124 */
125 
126 0:
127 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
128 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; (zero-extended into x6)
129 cmp mlen, off
130 b.le 1f // mlen <= off: this mbuf is wholly consumed by the offset
131 ldr data, [m, #M_DATA] // mtod(m, uint8_t *)
132 sub mlen, mlen, off // mlen -= off;
133 add data, data, off // data = mtod(m, uint8_t *) + off;
134 b L_post_initial_offset
135 1:
136 sub off, off, mlen // off -= mlen;
137 cbnz len, 2f // still want data: advance to next mbuf
138 mov x0, x3 // len == 0: nothing to sum; return current sum unchanged
139 ret lr
140 2:
141 ldr m, [m, #M_NEXT] // m = m->m_next;
142 b 0b
143
144 L_loop: // for (; len > 0; m = m->m_next) {
145 /*
146 * if (PREDICT_FALSE(m == NULL)) {
147 * CKSUM_ERR("%s: out of data\n", __func__);
148 * return (-1);
149 * }
150 * mlen = m->m_len;
151 * data = mtod(m, uint8_t *);
152 */
153 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
154 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
155 ldr data, [m, #M_DATA] // mtod(m, uint8_t *)
156 
157 L_post_initial_offset:
158 /*
159 * if (mlen == 0) continue;
160 * if (mlen > len) mlen = len;
161 * len -= mlen;
162 */
163 
164 cbz mlen, L_continue // empty mbuf: skip to next in chain
165 cmp mlen, len
166 csel mlen, mlen, len, le // mlen = min(mlen, len)
167 sub len, len, mlen // len -= mlen;
168 
169 /*
170 * partial = 0;
171 * if ((uintptr_t)data & 1) {
172 * started_on_odd = !started_on_odd;
173 * partial = *data << 8;
174 * ++data;
175 * --mlen;
176 * }
177 * needs_swap = started_on_odd;
178 */
179 
180 tst data, #1
181 mov x7, #0 // x7 = partial accumulator
182 mov x10, #0 // x10 = constant zero, used for carry folds below
183 b.eq 1f // data already 2-byte aligned
184 ldrb w7, [data], #1
185 eor started_on_odd, started_on_odd, #1 // started_on_odd = !started_on_odd
186 sub mlen, mlen, #1
187 lsl w7, w7, #8 // partial = *data << 8 (odd-address start)
188 1:
189 
190 
191 /*
192 * if ((uintptr_t)data & 2) {
193 * if (mlen < 2)
194 * goto trailing_bytes;
195 * partial += *(uint16_t *)(void *)data;
196 * data += 2;
197 * mlen -= 2;
198 * }
199 */
200 tst data, #2
201 mov needs_swap, started_on_odd // needs_swap = started_on_odd;
202 b.eq 1f // data already 4-byte aligned
203 cmp mlen, #2
204 b.lt L_trailing_bytes // fewer than 2 bytes left: handle as trailing byte
205 ldrh w9, [data], #2
206 sub mlen, mlen, #2
207 add w7, w7, w9 // partial += *(uint16_t *)data
208 1:
209
210 /*
211 * while (mlen >= 64) {
212 * __builtin_prefetch(data + 32);
213 * __builtin_prefetch(data + 64);
214 * partial += *(uint32_t *)(void *)data;
215 * partial += *(uint32_t *)(void *)(data + 4);
216 * partial += *(uint32_t *)(void *)(data + 8);
217 * partial += *(uint32_t *)(void *)(data + 12);
218 * partial += *(uint32_t *)(void *)(data + 16);
219 * partial += *(uint32_t *)(void *)(data + 20);
220 * partial += *(uint32_t *)(void *)(data + 24);
221 * partial += *(uint32_t *)(void *)(data + 28);
222 * partial += *(uint32_t *)(void *)(data + 32);
223 * partial += *(uint32_t *)(void *)(data + 36);
224 * partial += *(uint32_t *)(void *)(data + 40);
225 * partial += *(uint32_t *)(void *)(data + 44);
226 * partial += *(uint32_t *)(void *)(data + 48);
227 * partial += *(uint32_t *)(void *)(data + 52);
228 * partial += *(uint32_t *)(void *)(data + 56);
229 * partial += *(uint32_t *)(void *)(data + 60);
230 * data += 64;
231 * mlen -= 64;
232 * // if (PREDICT_FALSE(partial & (3ULL << 62))) {
233 * // if (needs_swap)
234 * // partial = (partial << 8) +
235 * // (partial >> 56);
236 * // sum += (partial >> 32);
237 * // sum += (partial & 0xffffffff);
238 * // partial = 0;
239 * // }
240 * }
241 */
242 
243 // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
// Note: mlen may go negative here and below; the tail code at L32_bytes
// relies on the low 6 bits being preserved (all decrements are multiples of 64).
244 subs mlen, mlen, #64
245 b.lt L32_bytes
246 
247 // save used vector registers
// NOTE(review): v0-v7 are caller-saved under AAPCS64; presumably spilled here
// because this code must not clobber the interrupted context's SIMD state
// when running in the kernel -- confirm.
248 sub sp, sp, #8*16
249 mov x11, sp
250 st1.4s {v0, v1, v2, v3}, [x11], #4*16
251 st1.4s {v4, v5, v6, v7}, [x11], #4*16
252 
253 // spread partial into 8 8-byte registers in v0-v3
254 fmov s3, w7 // v3 = {partial, 0, 0, 0}; fmov zeroes the upper lanes
255 eor.16b v0, v0, v0 // clear remaining accumulators
256 eor.16b v1, v1, v1
257 eor.16b v2, v2, v2
258 
259 // load the 1st 64 bytes (16 32-bit words)
260 ld1.4s {v4,v5,v6,v7},[data],#64
261 
262 // branch to finish off if mlen<64
263 subs mlen, mlen, #64
264 b.lt L64_finishup
265 
266 /*
267 * loop for loading and accumulating 16 32-bit words into
268 * 8 8-byte accumulators per iteration.
269 */
270 L64_loop:
271 subs mlen, mlen, #64 // mlen -= 64
272 
// uadalp.2d: add adjacent pairs of unsigned 32-bit lanes into the
// corresponding 64-bit accumulator lanes (no carry can be lost).
273 uadalp.2d v0, v4
274 ld1.4s {v4},[data], #16
275 
276 uadalp.2d v1, v5
277 ld1.4s {v5},[data], #16
278 
279 uadalp.2d v2, v6
280 ld1.4s {v6},[data], #16
281 
282 uadalp.2d v3, v7
283 ld1.4s {v7},[data], #16
284 
285 b.ge L64_loop
286 
287 L64_finishup:
// fold in the last 64 bytes already loaded in v4-v7
288 uadalp.2d v0, v4
289 uadalp.2d v1, v5
290 uadalp.2d v2, v6
291 uadalp.2d v3, v7
292 
// horizontal reduction of the 8 64-bit accumulator lanes into one scalar
293 add.2d v0, v0, v1
294 add.2d v2, v2, v3
295 addp.2d d0, v0 // d0 = v0[0] + v0[1]
296 addp.2d d2, v2 // d2 = v2[0] + v2[1]
297 add.2d v0, v0, v2 // lane 0 of v0 = d0 + d2 = grand total
298 fmov x7, d0 // partial in x7 now
299 
300 // restore used vector registers
301 ld1.4s {v0, v1, v2, v3}, [sp], #4*16
302 ld1.4s {v4, v5, v6, v7}, [sp], #4*16
303
304 L32_bytes:
// mlen is negative here (pre-decremented by multiples of 64 above), but its
// low 6 bits still equal the residual byte count, so tst on #32/#16/#8/#4/#2/#1
// selects each tail size correctly.
305 tst mlen, #32
306 b.eq L16_bytes
307 ldp x9, x10, [data], #16
308 ldp x11, x12, [data], #16
309 adds x7, x7, x9
310 mov x9, #0
311 adcs x7, x7, x10 // 64-bit one's-complement style: propagate carries
312 adcs x7, x7, x11
313 adcs x7, x7, x12
314 adc x7, x7, x9 // fold final carry back in (x9 == 0)
315 
316 L16_bytes:
317 tst mlen, #16
318 b.eq L8_bytes
319 ldp x9, x10, [data], #16
320 adds x7, x7, x9
321 mov x9, #0
322 adcs x7, x7, x10
323 adc x7, x7, x9 // fold final carry (x9 == 0)
324 
325 L8_bytes:
326 tst mlen, #8
327 mov x10, #0 // x10 = 0 for the carry folds below
328 b.eq L4_bytes
329 ldr x9,[data],#8
330 adds x7, x7, x9
331 adc x7, x7, x10 // fold carry
332 
333 L4_bytes:
334 tst mlen, #4
335 b.eq L2_bytes
336 ldr w9,[data],#4 // zero-extends into x9
337 adds x7, x7, x9
338 adc x7, x7, x10 // fold carry
339 
340 L2_bytes:
341 tst mlen, #2
342 b.eq L_trailing_bytes
343 ldrh w9,[data],#2
344 adds x7, x7, x9
345 adc x7, x7, x10 // fold carry
346 
347 L_trailing_bytes:
348 tst mlen, #1
349 b.eq L0_bytes
350 ldrb w9,[data],#1
351 adds x7, x7, x9
352 adc x7, x7, x10 // fold carry
353 eor started_on_odd, started_on_odd, #1 // odd byte count: flip parity for next mbuf
354
355 L0_bytes:
356 /*
357 * if (needs_swap)
358 * partial = (partial << 8) + (partial >> 56);
359 */
360 cbz needs_swap, 1f
361 ror x7, x7, #56 // rotate left by 8 == (partial << 8) | (partial >> 56)
362 1:
363 /*
364 * sum += (partial >> 32) + (partial & 0xffffffff);
365 * sum = (sum >> 32) + (sum & 0xffffffff);
366 * }
367 */
368 
369 add x3, x3, x7, lsr #32 // sum += partial >> 32
370 mov w7, w7 // partial &= 0xffffffff (clear high half)
371 add x3, x3, x7 // sum += partial & 0xffffffff
372 mov w7, w3 // w7 = sum & 0xffffffff
373 add x3, x7, x3, lsr #32 // sum = (sum >> 32) + (sum & 0xffffffff)
374 
375 L_continue:
376 cmp len, #0
377 ldr m, [m, #M_NEXT] // m = m->m_next (m is known non-NULL here)
378 b.gt L_loop // loop while len > 0
379 
380 /*
381 * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
382 * ((sum >> 16) & 0xffff) + (sum & 0xffff);
383 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
384 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
385 * return (final_acc & 0xffff);
386 * }
387 */
388 
389 mov w4, #0x00ffff // 16-bit mask for the folds below
390 and x0, x4, x3, lsr #48 // sum >> 48
391 and x1, x4, x3, lsr #32 // (sum >> 32) & 0xffff
392 and x2, x4, x3, lsr #16 // (sum >> 16) & 0xffff
393 and x3, x4, x3 // sum & 0xffff
394 add w0, w0, w1
395 add w2, w2, w3
396 add w0, w0, w2 // final_acc = sum of the four 16-bit fields
397 and w1, w4, w0, lsr #16 // fold once ...
398 and w0, w4, w0
399 add w0, w0, w1
400 and w1, w4, w0, lsr #16 // ... and once more to absorb the last carry
401 and w0, w4, w0
402 add w0, w0, w1
403 /*
404 * If we were to 1's complement it (XOR with 0xffff):
405 *
406 * eor w0, w0, w4
407 */
408 and w0, w0, w4 // return final_acc & 0xffff
409 
410 ret lr
411
// Error path: mbuf chain exhausted before len bytes were checksummed.
412 Lin_cksum_whoops:
413 adrp x0, Lin_cksum_whoops_str@page // page-relative address of message
414 add x0, x0, Lin_cksum_whoops_str@pageoff
415 bl #CKSUM_ERR // log the error (kprintf / fprintf_stderr)
416 mov x0, #-1 // return (uint32_t)-1 to the caller
417 ret lr
418 
419 Lin_cksum_whoops_str:
420 .asciz "os_cpu_in_cksum_mbuf: out of data\n"
421 .align 5