2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
31 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
32 * on the 64-bit part in netinet/cpu_in_cksum.c
34 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
// NOTE(review): this excerpt carries the original file's line numbers fused
// onto each line (extraction artifact); gaps in that numbering mean source
// lines are missing from this view.
//
// Error-reporting hook.  Two alternative definitions of CKSUM_ERR are
// visible; presumably they sit on opposite arms of a kernel-vs-userspace
// conditional whose #if/#else lines are not visible here — TODO confirm
// against the unabridged file.
38 #define CKSUM_ERR _kprintf
40 #ifndef LIBSYSCALL_INTERFACE
41 #error "LIBSYSCALL_INTERFACE not defined"
42 #endif /* !LIBSYSCALL_INTERFACE */
// Userspace (Libsyscall) builds report errors via fprintf_stderr instead.
43 #define CKSUM_ERR _fprintf_stderr
49 * Ugly, but we have little choice, since relying on genassym and <assym.s>
50 * is not possible unless this code lives in osfmk. Note also that this
51 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
52 * authentic; it only cares about 3 fields.
// Hard-coded byte offset of the mbuf data pointer within the "mbuf-like"
// struct.  The M_LEN/M_NEXT offsets used below are defined on lines that are
// not visible in this excerpt.
56 #define M_DATA 16 // 8-byte address, would be aligned to 8-byte boundary
//-----------------------------------------------------------------------
// uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
//                               uint32_t initial_sum)
//
// Walks an mbuf chain accumulating a 32-bit partial Internet checksum over
// 'len' bytes starting 'off' bytes into the first mbuf.  Returns the partial
// sum (no final 1's complement); returns -1 if the chain runs out of data.
//
// NOTE(review): the fused original-line numbers below are non-contiguous, so
// many instructions are missing from this excerpt — the register #define
// aliases for m/len/off/mlen/data, several compares and branches, the
// prologue/epilogue, and the return path.  Comments below describe only what
// the visible instructions establish; everything else is hedged.
//-----------------------------------------------------------------------
64 .globl _os_cpu_in_cksum_mbuf
67 _os_cpu_in_cksum_mbuf:
73 * This function returns the partial 16-bit checksum accumulated in
74 * a 32-bit variable (without 1's complement); caller is responsible
75 * for folding the 32-bit sum into 16-bit and performing the 1's
76 * complement if applicable
81 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
84 * uint64_t sum, partial;
85 * unsigned int final_acc;
87 * boolean_t needs_swap, started_on_odd;
93 * started_on_odd = FALSE;
// Register aliases for the two bookkeeping flags.  Aliases for the other
// operands (m, len, off, mlen, data, ptr_m, ptr_data, Wmlen) are defined on
// lines missing from this view.
101 #define needs_swap x4
102 #define started_on_odd x5
107 #if defined(__LP64__)
116 mov needs_swap, #0 // needs_swap = FALSE;
117 mov started_on_odd, #0 // started_on_odd = FALSE;
// Writing the W view zero-extends into x3, discarding any garbage in the
// upper half of the 32-bit initial_sum argument.
118 mov w3, w3 // clear higher half
123 * if (PREDICT_FALSE(m == NULL)) {
124 * CKSUM_ERR("%s: out of data\n", __func__);
130 * data = mtod(m, uint8_t *) + off;
131 * goto post_initial_offset;
// First-mbuf path: apply the caller's byte offset 'off' inside the first
// buffer, then join the shared per-buffer processing below.
141 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
142 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
145 ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *)
146 sub mlen, mlen, off // mlen -= off;
147 add data, data, off // data = mtod(m, uint8_t *) + off;
148 b L_post_initial_offset
// Advance to the next mbuf in the chain.
155 ldr ptr_m, [m, #M_NEXT]
// Per-mbuf loop: reload this buffer's length and data pointer each pass.
158 L_loop: // for (; len > 0; m = m->m_next) {
160 * if (PREDICT_FALSE(m == NULL)) {
161 * CKSUM_ERR("%s: out of data\n", __func__);
165 * data = mtod(m, uint8_t *);
167 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
168 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
169 ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *)
171 L_post_initial_offset:
173 * if (mlen == 0) continue;
174 * if (mlen > len) mlen = len;
// Clamp this buffer's byte count to the bytes still requested.  The compare
// that sets the 'le' condition is on a line not visible in this excerpt.
180 csel mlen, mlen, len, le
// Alignment prologue: a leading odd byte flips the parity flag so the final
// sum can be byte-swapped if accumulation started on an odd boundary.
185 * if ((uintptr_t)data & 1) {
186 * started_on_odd = !started_on_odd;
187 * partial = *data << 8;
191 * needs_swap = started_on_odd;
199 eor started_on_odd, started_on_odd, #1 // started_on_odd = !started_on_odd;
206 * if ((uintptr_t)data & 2) {
208 * goto trailing_bytes;
209 * partial += *(uint16_t *)(void *)data;
// Latch the parity decision for this buffer before the bulk loop.
215 mov needs_swap, started_on_odd
// Compare feeding this branch is not visible — presumably mlen vs. a
// minimum chunk size; too little data goes straight to the byte tail.
218 b.lt L_trailing_bytes
// Reference C for the 64-byte-per-iteration bulk loop implemented with NEON
// below.
225 * while (mlen >= 64) {
226 * __builtin_prefetch(data + 32);
227 * __builtin_prefetch(data + 64);
228 * partial += *(uint32_t *)(void *)data;
229 * partial += *(uint32_t *)(void *)(data + 4);
230 * partial += *(uint32_t *)(void *)(data + 8);
231 * partial += *(uint32_t *)(void *)(data + 12);
232 * partial += *(uint32_t *)(void *)(data + 16);
233 * partial += *(uint32_t *)(void *)(data + 20);
234 * partial += *(uint32_t *)(void *)(data + 24);
235 * partial += *(uint32_t *)(void *)(data + 28);
236 * partial += *(uint32_t *)(void *)(data + 32);
237 * partial += *(uint32_t *)(void *)(data + 36);
238 * partial += *(uint32_t *)(void *)(data + 40);
239 * partial += *(uint32_t *)(void *)(data + 44);
240 * partial += *(uint32_t *)(void *)(data + 48);
241 * partial += *(uint32_t *)(void *)(data + 52);
242 * partial += *(uint32_t *)(void *)(data + 56);
243 * partial += *(uint32_t *)(void *)(data + 60);
246 * // if (PREDICT_FALSE(partial & (3ULL << 62))) {
248 * // partial = (partial << 8) +
249 * // (partial >> 56);
250 * // sum += (partial >> 32);
251 * // sum += (partial & 0xffffffff);
257 // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
261 // save used vector registers
// x11 presumably points at a stack save area set up on a line not visible
// here — confirm against the unabridged file.  Matching restores via [sp]
// appear below.
264 st1.4s {v0, v1, v2, v3}, [x11], #4*16
265 st1.4s {v4, v5, v6, v7}, [x11], #4*16
267 // spread partial into 8 8-byte registers in v0-v3
273 // load the 1st 64 bytes (16 32-bit words)
274 ld1.4s {v4,v5,v6,v7},[data],#64
276 // branch to finish off if mlen<64
281 * loop for loading and accumulating 16 32-bit words into
282 * 8 8-byte accumulators per iteration.
// Bulk loop body (accumulate instructions between the loads are not visible
// in this excerpt).
285 subs mlen, mlen, #64 // mlen -= 64
288 ld1.4s {v4},[data], #16
291 ld1.4s {v5},[data], #16
294 ld1.4s {v6},[data], #16
297 ld1.4s {v7},[data], #16
// Reduction of the 8 vector accumulators down to d0 is on missing lines;
// the collapsed 64-bit partial lands back in a general register here.
312 fmov x7, d0 // partial in x7 now
314 // restore used vector registers
315 ld1.4s {v0, v1, v2, v3}, [sp], #4*16
316 ld1.4s {v4, v5, v6, v7}, [sp], #4*16
// Medium-width paths: scalar 16- and 32-byte chunks via paired 64-bit loads
// (the adds folding x9-x12 into the partial are on missing lines).
321 ldp x9, x10, [data], #16
322 ldp x11, x12, [data], #16
333 ldp x9, x10, [data], #16
356 b.eq L_trailing_bytes
// A trailing odd byte flips the parity flag again for the next buffer.
367 eor started_on_odd, started_on_odd, #1
372 * partial = (partial << 8) + (partial >> 56);
378 * sum += (partial >> 32) + (partial & 0xffffffff);
379 * sum = (sum >> 32) + (sum & 0xffffffff);
// Fold the 64-bit partial (x7) into the running sum (x3) per the reference
// C above; end-around carry folding keeps the sum within 32 bits.
383 add x3, x3, x7, lsr #32
387 add x3, x7, x3, lsr #32
391 ldr ptr_m, [m, #M_NEXT] // m = m->m_next
395 * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
396 * ((sum >> 16) & 0xffff) + (sum & 0xffff);
397 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
398 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
399 * return (final_acc & 0xffff);
// Final fold of the 64-bit sum into 16 bits.  x4/w4 presumably holds the
// 0xffff mask, loaded on a line not visible here — TODO confirm.
404 and x0, x4, x3, lsr #48
405 and x1, x4, x3, lsr #32
406 and x2, x4, x3, lsr #16
411 and w1, w4, w0, lsr #16
414 and w1, w4, w0, lsr #16
418 * If we were to 1's complement it (XOR with 0xffff):
// Error path: materialize the message address page-relative (adrp +
// :pageoff:) for the CKSUM_ERR call; the call itself and the -1 return are
// on lines not visible here.
427 adrp x0, Lin_cksum_whoops_str@page
428 add x0, x0, Lin_cksum_whoops_str@pageoff
433 Lin_cksum_whoops_str:
434 .asciz "os_cpu_in_cksum_mbuf: out of data\n"