/*
 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
 *
 * This document is the property of Apple Inc.
 * It is considered confidential and proprietary.
 *
 * This document may not be reproduced or transmitted in any form,
 * in whole or in part, without the express written permission of
 * Apple Inc.
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__),
 * with the __arm64__ sections tagged ARM64_TODO. This revision is optimized
 * based on the 64-bit code in netinet/cpu_in_cksum.c.
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */

#ifdef KERNEL
#define	CKSUM_ERR _kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define	CKSUM_ERR _fprintf_stderr
#endif /* !KERNEL */

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk. Note also that this
 * routine expects an "mbuf-like" argument; it does not expect the mbuf to
 * be authentic, as it only cares about 3 fields.
 */
#define	M_NEXT	0
#define	M_DATA	16	// 8-byte address, would be aligned to 8-byte boundary
#define	M_LEN	24

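/*
 * For illustration only, a minimal C sketch of the layout those offsets
 * assume (the real struct mbuf has many more fields; only these three are
 * read here, and the name mbuf_like is hypothetical):
 *
 *	struct mbuf_like {
 *		struct mbuf_like *m_next;	// offset 0  (M_NEXT)
 *		uintptr_t         m_pad;	// offset 8  (not examined)
 *		uint8_t          *m_data;	// offset 16 (M_DATA)
 *		int32_t           m_len;	// offset 24 (M_LEN)
 *	};
 */
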
	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:

/*
 * 64-bit version.
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); the caller is responsible
 * for folding the 32-bit sum into 16 bits and performing the 1's
 * complement if applicable.
 */

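/*
 * Caller-side usage sketch (illustrative only; the helper name below is
 * hypothetical, not part of this file):
 *
 *	static uint16_t
 *	fold_and_complement(uint32_t sum)
 *	{
 *		sum = (sum >> 16) + (sum & 0xffff);	// fold carries into low 16 bits
 *		sum += (sum >> 16);			// fold the final carry
 *		return (uint16_t)(~sum & 0xffff);	// 1's complement
 *	}
 */
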
/*
 * uint32_t
 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
 * {
 *	int mlen;
 *	uint64_t sum, partial;
 *	unsigned int final_acc;
 *	uint8_t *data;
 *	boolean_t needs_swap, started_on_odd;
 *
 *	VERIFY(len >= 0);
 *	VERIFY(off >= 0);
 *
 *	needs_swap = FALSE;
 *	started_on_odd = FALSE;
 *	sum = initial_sum;
 */

#define	m		x0
#define	len		x1
#define	off		x2
#define	sum		x3
#define	needs_swap	x4
#define	started_on_odd	x5
#define	mlen		x6
#define	Wmlen		w6
#define	t		x7
#define	data		x8

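/*
 * Note: per the arm64 (AAPCS64) calling convention the four arguments
 * arrive in x0-x3, which is what the first four aliases name; x4-x8 are
 * caller-saved registers used as locals here.
 */
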
	mov	needs_swap, #0		// needs_swap = FALSE;
	mov	started_on_odd, #0	// started_on_odd = FALSE;
	mov	w3, w3			// sum = initial_sum (writing w3 clears the upper 32 bits of x3)

/*
 * for (;;) {
 *	if (PREDICT_FALSE(m == NULL)) {
 *		CKSUM_ERR("%s: out of data\n", __func__);
 *		return (-1);
 *	}
 *	mlen = m->m_len;
 *	if (mlen > off) {
 *		mlen -= off;
 *		data = mtod(m, uint8_t *) + off;
 *		goto post_initial_offset;
 *	}
 *	off -= mlen;
 *	if (len == 0)
 *		break;
 *	m = m->m_next;
 * }
 */

0:
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	cmp	mlen, off
	b.le	1f
	ldr	data, [m, #M_DATA]	// mtod(m, uint8_t *)
	sub	mlen, mlen, off		// mlen -= off;
	add	data, data, off		// data = mtod(m, uint8_t *) + off;
	b	L_post_initial_offset
1:
	sub	off, off, mlen
	cbnz	len, 2f
	mov	x0, x3
	ret	lr
2:
	ldr	m, [m, #M_NEXT]
	b	0b

L_loop:	// for (; len > 0; m = m->m_next) {
/*
 *	if (PREDICT_FALSE(m == NULL)) {
 *		CKSUM_ERR("%s: out of data\n", __func__);
 *		return (-1);
 *	}
 *	mlen = m->m_len;
 *	data = mtod(m, uint8_t *);
 */
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	ldr	data, [m, #M_DATA]	// mtod(m, uint8_t *)

L_post_initial_offset:
/*
 *	if (mlen == 0) continue;
 *	if (mlen > len) mlen = len;
 *	len -= mlen;
 */
	cbz	mlen, L_continue
	cmp	mlen, len
	csel	mlen, mlen, len, le	// mlen = MIN(mlen, len)
	sub	len, len, mlen

/*
 *	partial = 0;
 *	if ((uintptr_t)data & 1) {
 *		started_on_odd = !started_on_odd;
 *		partial = *data << 8;
 *		++data;
 *		--mlen;
 *	}
 *	needs_swap = started_on_odd;
 */
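/*
 * Note: the Internet checksum is defined over big-endian 16-bit words, so
 * starting on an odd address shifts every following byte into the opposite
 * byte lane. started_on_odd tracks that parity across mbufs, and the final
 * byte rotation under L0_bytes repairs it.
 */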

	tst	data, #1		// is data odd-aligned?
	mov	x7, #0			// partial = 0
	mov	x10, #0			// x10 = 0, zero source for add-with-carry below
	b.eq	1f
	ldrb	w7, [data], #1		// partial = *data++
	eor	started_on_odd, started_on_odd, #1
	sub	mlen, mlen, #1		// --mlen
	lsl	w7, w7, #8		// partial <<= 8
1:

/*
 *	if ((uintptr_t)data & 2) {
 *		if (mlen < 2)
 *			goto trailing_bytes;
 *		partial += *(uint16_t *)(void *)data;
 *		data += 2;
 *		mlen -= 2;
 *	}
 */
	tst	data, #2
	mov	needs_swap, started_on_odd
	b.eq	1f
	cmp	mlen, #2
	b.lt	L_trailing_bytes
	ldrh	w9, [data], #2		// *(uint16_t *)data, data += 2
	sub	mlen, mlen, #2		// mlen -= 2
	add	w7, w7, w9		// partial += *(uint16_t *)data
1:

/*
 *	while (mlen >= 64) {
 *		__builtin_prefetch(data + 32);
 *		__builtin_prefetch(data + 64);
 *		partial += *(uint32_t *)(void *)data;
 *		partial += *(uint32_t *)(void *)(data + 4);
 *		partial += *(uint32_t *)(void *)(data + 8);
 *		partial += *(uint32_t *)(void *)(data + 12);
 *		partial += *(uint32_t *)(void *)(data + 16);
 *		partial += *(uint32_t *)(void *)(data + 20);
 *		partial += *(uint32_t *)(void *)(data + 24);
 *		partial += *(uint32_t *)(void *)(data + 28);
 *		partial += *(uint32_t *)(void *)(data + 32);
 *		partial += *(uint32_t *)(void *)(data + 36);
 *		partial += *(uint32_t *)(void *)(data + 40);
 *		partial += *(uint32_t *)(void *)(data + 44);
 *		partial += *(uint32_t *)(void *)(data + 48);
 *		partial += *(uint32_t *)(void *)(data + 52);
 *		partial += *(uint32_t *)(void *)(data + 56);
 *		partial += *(uint32_t *)(void *)(data + 60);
 *		data += 64;
 *		mlen -= 64;
 *		// if (PREDICT_FALSE(partial & (3ULL << 62))) {
 *		//	if (needs_swap)
 *		//		partial = (partial << 8) +
 *		//		    (partial >> 56);
 *		//	sum += (partial >> 32);
 *		//	sum += (partial & 0xffffffff);
 *		//	partial = 0;
 *		// }
 *	}
 */
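/*
 * The vector code below implements the loop above: uadalp.2d accumulates
 * pairs of adjacent 32-bit lanes into 64-bit lanes, so each iteration
 * folds 16 32-bit words into the 8 64-bit accumulators held in v0-v3.
 * Since the accumulators are 64 bits wide, they cannot overflow for any
 * realistic mbuf length, which is why the commented-out overflow check
 * above is unnecessary here.
 */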

	// pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
	subs	mlen, mlen, #64
	b.lt	L32_bytes

	// save used vector registers
	sub	sp, sp, #8*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16

	// seed the 8 8-byte accumulators in v0-v3: partial goes into v3, the rest are zeroed
	fmov	s3, w7
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2

	// load the 1st 64 bytes (16 32-bit words)
	ld1.4s	{v4, v5, v6, v7}, [data], #64

	// branch to finish up if mlen < 64
	subs	mlen, mlen, #64
	b.lt	L64_finishup

/*
 * loop for loading and accumulating 16 32-bit words into
 * 8 8-byte accumulators per iteration.
 */
L64_loop:
	subs	mlen, mlen, #64		// mlen -= 64

	uadalp.2d	v0, v4		// accumulate v4 pairwise into v0
	ld1.4s	{v4}, [data], #16

	uadalp.2d	v1, v5
	ld1.4s	{v5}, [data], #16

	uadalp.2d	v2, v6
	ld1.4s	{v6}, [data], #16

	uadalp.2d	v3, v7
	ld1.4s	{v7}, [data], #16

	b.ge	L64_loop

L64_finishup:
	// fold in the 64 bytes already loaded in v4-v7
	uadalp.2d	v0, v4
	uadalp.2d	v1, v5
	uadalp.2d	v2, v6
	uadalp.2d	v3, v7

	// reduce the 8 64-bit accumulators to a single value
	add.2d	v0, v0, v1
	add.2d	v2, v2, v3
	addp.2d	d0, v0			// d0 = v0.d[0] + v0.d[1]
	addp.2d	d2, v2			// d2 = v2.d[0] + v2.d[1]
	add.2d	v0, v0, v2		// lane 0 of v0 = d0 + d2
	fmov	x7, d0			// partial in x7 now

	// restore used vector registers
	ld1.4s	{v0, v1, v2, v3}, [sp], #4*16
	ld1.4s	{v4, v5, v6, v7}, [sp], #4*16

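/*
 * Scalar tail: each block below consumes one power-of-two sized chunk,
 * keyed off the corresponding bit of the remaining length, and folds the
 * carry out of each 64-bit addition back into partial via adc.
 */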
L32_bytes:
	// mlen went negative by 64 above, but its low six bits still hold
	// the remaining byte count, so the bit tests below stay valid
	tst	mlen, #32
	b.eq	L16_bytes
	ldp	x9, x10, [data], #16
	ldp	x11, x12, [data], #16
	adds	x7, x7, x9		// partial += 4 doublewords, with carry
	mov	x9, #0
	adcs	x7, x7, x10
	adcs	x7, x7, x11
	adcs	x7, x7, x12
	adc	x7, x7, x9		// fold in the final carry

L16_bytes:
	tst	mlen, #16
	b.eq	L8_bytes
	ldp	x9, x10, [data], #16
	adds	x7, x7, x9
	mov	x9, #0
	adcs	x7, x7, x10
	adc	x7, x7, x9		// fold in the final carry

L8_bytes:
	tst	mlen, #8
	mov	x10, #0
	b.eq	L4_bytes
	ldr	x9, [data], #8
	adds	x7, x7, x9
	adc	x7, x7, x10

L4_bytes:
	tst	mlen, #4
	b.eq	L2_bytes
	ldr	w9, [data], #4
	adds	x7, x7, x9
	adc	x7, x7, x10

L2_bytes:
	tst	mlen, #2
	b.eq	L_trailing_bytes
	ldrh	w9, [data], #2
	adds	x7, x7, x9
	adc	x7, x7, x10

L_trailing_bytes:
	tst	mlen, #1
	b.eq	L0_bytes
	ldrb	w9, [data], #1
	adds	x7, x7, x9
	adc	x7, x7, x10
	eor	started_on_odd, started_on_odd, #1

L0_bytes:
/*
 *	if (needs_swap)
 *		partial = (partial << 8) + (partial >> 56);
 */
	cbz	needs_swap, 1f
	ror	x7, x7, #56		// 64-bit rotate left by 8 == ror #56
1:
/*
 *	sum += (partial >> 32) + (partial & 0xffffffff);
 *	sum = (sum >> 32) + (sum & 0xffffffff);
 * }
 */

	add	x3, x3, x7, lsr #32	// sum += partial >> 32
	mov	w7, w7			// partial &= 0xffffffff
	add	x3, x3, x7		// sum += partial
	mov	w7, w3			// w7 = sum & 0xffffffff
	add	x3, x7, x3, lsr #32	// sum = (sum >> 32) + (sum & 0xffffffff)

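/*
 * Folding sum back into 32 bits after each mbuf keeps the running total
 * small, so adding the next mbuf's partial cannot overflow 64 bits.
 */
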
L_continue:
	cmp	len, #0
	ldr	m, [m, #M_NEXT]		// m = m->m_next
	b.gt	L_loop

/*
 *	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
 *	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	return (final_acc & 0xffff);
 * }
 */
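/*
 * Worked example, for sum = 0x000123456789abcd:
 *	0x0001 + 0x2345 + 0x6789 + 0xabcd = 0x1369c
 *	(0x1369c >> 16) + (0x1369c & 0xffff) = 0x369d
 *	the second fold leaves 0x369d, which is returned
 */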

	mov	w4, #0x00ffff
	and	x0, x4, x3, lsr #48
	and	x1, x4, x3, lsr #32
	and	x2, x4, x3, lsr #16
	and	x3, x4, x3
	add	w0, w0, w1
	add	w2, w2, w3
	add	w0, w0, w2
	and	w1, w4, w0, lsr #16
	and	w0, w4, w0
	add	w0, w0, w1
	and	w1, w4, w0, lsr #16
	and	w0, w4, w0
	add	w0, w0, w1
/*
 * If we were to 1's complement it (XOR with 0xffff):
 *
 *	eor	w0, w0, w4
 */
	and	w0, w0, w4

	ret	lr


Lin_cksum_whoops:
	adrp	x0, Lin_cksum_whoops_str@page
	add	x0, x0, Lin_cksum_whoops_str@pageoff
	bl	#CKSUM_ERR
	mov	x0, #-1
	ret	lr

Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5