 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO. This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c.
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
#ifdef KERNEL
#define CKSUM_ERR _kprintf
#else /* !KERNEL */
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define CKSUM_ERR _fprintf_stderr
#endif /* !KERNEL */
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk. Note also that this
 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
 * to be authentic; it only cares about 3 fields.

#define M_DATA 16 // 8-byte address, aligned to an 8-byte boundary
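/*
 * For illustration only (a sketch, not a definition used by this file):
 * the "mbuf-like" argument is assumed to be laid out roughly as below on
 * LP64, with the data pointer at offset M_DATA (16); the other field
 * names and offsets here are illustrative assumptions, not taken from
 * this file.
 *
 *	struct mbuf_like {
 *		struct mbuf_like *m_next;	// next buffer in the chain
 *		struct mbuf_like *m_nextpkt;	// unused by this routine
 *		uint8_t          *m_data;	// offset 16 == M_DATA
 *		int32_t           m_len;	// amount of data in this buffer
 *	};
 */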
	.globl	_os_cpu_in_cksum_mbuf
_os_cpu_in_cksum_mbuf:
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); the caller is responsible
 * for folding the 32-bit sum into 16 bits and performing the 1's
 * complement if applicable.
 *
 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
 *
 *	uint64_t sum, partial;
 *	unsigned int final_acc;
 *	boolean_t needs_swap, started_on_odd;
 *
 *	started_on_odd = FALSE;
#define needs_swap x4
#define started_on_odd x5
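/*
 * Caller-side usage (a sketch of the contract stated above, not code from
 * this file): the value returned here still needs the final fold and the
 * 1's complement, e.g.
 *
 *	uint32_t sum = os_cpu_in_cksum_mbuf(m, len, off, initial_sum);
 *	sum = (sum >> 16) + (sum & 0xffff);	// fold carries into low 16 bits
 *	sum += (sum >> 16);			// fold once more
 *	uint16_t cksum = ~sum & 0xffff;		// 1's complement, if applicable
 */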
#if defined(__LP64__)
	mov	needs_swap, #0		// needs_swap = FALSE;
	mov	started_on_odd, #0	// started_on_odd = FALSE;
	mov	w3, w3			// zero-extend initial_sum (clear the upper 32 bits of x3)
 *	if (PREDICT_FALSE(m == NULL)) {
 *		CKSUM_ERR("%s: out of data\n", __func__);
 *		return ((uint32_t)-1);
 *	}
 *	data = mtod(m, uint8_t *) + off;
 *	goto post_initial_offset;
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)
	sub	mlen, mlen, off		// mlen -= off;
	add	data, data, off		// data = mtod(m, uint8_t *) + off;
	b	L_post_initial_offset
	ldr	ptr_m, [m, #M_NEXT]	// m = m->m_next
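/*
 * Shape of the initial-offset walk in C (a sketch modeled on the 64-bit
 * code in netinet/cpu_in_cksum.c that this file cites as its reference;
 * not a verbatim copy):
 *
 *	for (;;) {
 *		if (PREDICT_FALSE(m == NULL)) {
 *			CKSUM_ERR("%s: out of data\n", __func__);
 *			return ((uint32_t)-1);
 *		}
 *		mlen = m->m_len;
 *		if (mlen > off) {
 *			mlen -= off;
 *			data = mtod(m, uint8_t *) + off;
 *			goto post_initial_offset;
 *		}
 *		off -= mlen;
 *		m = m->m_next;
 *	}
 */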
L_loop: // for (; len > 0; m = m->m_next) {
 *	if (PREDICT_FALSE(m == NULL)) {
 *		CKSUM_ERR("%s: out of data\n", __func__);
 *		return ((uint32_t)-1);
 *	}
 *	data = mtod(m, uint8_t *);
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)

L_post_initial_offset:
 *	if (mlen == 0) continue;
 *	if (mlen > len) mlen = len;
	csel	mlen, mlen, len, le	// mlen = (mlen > len) ? len : mlen
 *	if ((uintptr_t)data & 1) {
 *		started_on_odd = !started_on_odd;
 *		partial = *data << 8;
 *		++data;
 *		--mlen;
 *	}
 *	needs_swap = started_on_odd;
	eor	started_on_odd, started_on_odd, #1	// started_on_odd = !started_on_odd
 *	if ((uintptr_t)data & 2) {
 *		if (mlen < 2)
 *			goto trailing_bytes;
 *		partial += *(uint16_t *)(void *)data;
	mov	needs_swap, started_on_odd	// needs_swap = started_on_odd;
	b.lt	L_trailing_bytes		// if (mlen < 2) goto trailing_bytes;
 *	if ((uintptr_t)data & 4) {
 *		partial += *(uint32_t *)(void *)data;
	// align on 8-byte boundary if applicable
	adc	x7, x7, x10	// fold the carry from the preceding add into partial (x10 is still #0, as set above)
 *	while (mlen >= 64) {
 *		__builtin_prefetch(data + 32);
 *		__builtin_prefetch(data + 64);
 *		partial += *(uint32_t *)(void *)data;
 *		partial += *(uint32_t *)(void *)(data + 4);
 *		partial += *(uint32_t *)(void *)(data + 8);
 *		partial += *(uint32_t *)(void *)(data + 12);
 *		partial += *(uint32_t *)(void *)(data + 16);
 *		partial += *(uint32_t *)(void *)(data + 20);
 *		partial += *(uint32_t *)(void *)(data + 24);
 *		partial += *(uint32_t *)(void *)(data + 28);
 *		partial += *(uint32_t *)(void *)(data + 32);
 *		partial += *(uint32_t *)(void *)(data + 36);
 *		partial += *(uint32_t *)(void *)(data + 40);
 *		partial += *(uint32_t *)(void *)(data + 44);
 *		partial += *(uint32_t *)(void *)(data + 48);
 *		partial += *(uint32_t *)(void *)(data + 52);
 *		partial += *(uint32_t *)(void *)(data + 56);
 *		partial += *(uint32_t *)(void *)(data + 60);
 *
 *		// if (PREDICT_FALSE(partial & (3ULL << 62))) {
 *		//	partial = (partial << 8) +
 *		//	    (partial >> 56);
 *		//	sum += (partial >> 32);
 *		//	sum += (partial & 0xffffffff);
	// pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
	// save used vector registers
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16

	// spread partial into 8 8-byte registers in v0-v3

	// load the 1st 64 bytes (16 32-bit words)
	ld1.4s	{v4, v5, v6, v7}, [data], #64

	// branch to finish off if mlen < 64

	// loop for loading and accumulating 16 32-bit words into
	// 8 8-byte accumulators per iteration
	subs	mlen, mlen, #64		// mlen -= 64
	ld1.4s	{v4}, [data], #16
	ld1.4s	{v5}, [data], #16
	ld1.4s	{v6}, [data], #16
	ld1.4s	{v7}, [data], #16

	fmov	x7, d0			// partial in x7 now

	// restore used vector registers
	ld1.4s	{v0, v1, v2, v3}, [sp], #4*16
	ld1.4s	{v4, v5, v6, v7}, [sp], #4*16
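/*
 * What the vector section above does, in rough arm_neon.h intrinsics (a
 * sketch only; the actual register allocation and instruction selection
 * are what the assembly shows): each 64-byte block is loaded as 16
 * 32-bit words and widened into eight 64-bit accumulator lanes spread
 * across v0-v3, which are reduced back into the scalar partial at the end.
 *
 *	uint64x2_t a0 = vdupq_n_u64(0), a1 = a0, a2 = a0, a3 = a0;
 *	while (mlen >= 64) {
 *		uint32x4_t w0 = vld1q_u32((const uint32_t *)(void *)data);
 *		uint32x4_t w1 = vld1q_u32((const uint32_t *)(void *)(data + 16));
 *		uint32x4_t w2 = vld1q_u32((const uint32_t *)(void *)(data + 32));
 *		uint32x4_t w3 = vld1q_u32((const uint32_t *)(void *)(data + 48));
 *		a0 = vaddw_high_u32(vaddw_u32(a0, vget_low_u32(w0)), w0);
 *		a1 = vaddw_high_u32(vaddw_u32(a1, vget_low_u32(w1)), w1);
 *		a2 = vaddw_high_u32(vaddw_u32(a2, vget_low_u32(w2)), w2);
 *		a3 = vaddw_high_u32(vaddw_u32(a3, vget_low_u32(w3)), w3);
 *		data += 64; mlen -= 64;
 *	}
 *	uint64x2_t t = vaddq_u64(vaddq_u64(a0, a1), vaddq_u64(a2, a3));
 *	partial += vgetq_lane_u64(t, 0) + vgetq_lane_u64(t, 1);
 */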
	ldp	x9, x10, [data], #16	// load 16 bytes
	ldp	x11, x12, [data], #16	// load 16 more bytes

	ldp	x9, x10, [data], #16	// load 16 bytes

	b.eq	L_trailing_bytes
	eor	started_on_odd, started_on_odd, #1	// started_on_odd = !started_on_odd
 *	if (needs_swap)
 *		partial = (partial << 8) + (partial >> 56);
 *	sum += (partial >> 32) + (partial & 0xffffffff);
 *	sum = (sum >> 32) + (sum & 0xffffffff);
	add	x3, x3, x7, lsr #32	// sum += (partial >> 32)
	add	x3, x7, x3, lsr #32
	ldr	ptr_m, [m, #M_NEXT]	// m = m->m_next
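/*
 * Why a single rotate corrects for an odd starting offset (background
 * note): 1's-complement addition of 16-bit words is byte-order
 * independent (see RFC 1071), so a partial sum accumulated from an odd
 * byte boundary differs from the properly aligned sum only by a byte
 * swap, and rotating the 64-bit accumulator by 8 bits before it is
 * folded into sum, as in the pseudocode above, applies that swap in one
 * step. For example, 0x1234 + 0x5678 = 0x68ac, while the byte-swapped
 * words give 0x3412 + 0x7856 = 0xac68, which is exactly the byte swap
 * of 0x68ac.
 */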
 *	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
 *	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	return (final_acc & 0xffff);
	and	x0, x4, x3, lsr #48	// (sum >> 48) & 0xffff (x4 holds the 0xffff mask)
	and	x1, x4, x3, lsr #32	// (sum >> 32) & 0xffff
	and	x2, x4, x3, lsr #16	// (sum >> 16) & 0xffff

	and	w1, w4, w0, lsr #16	// (final_acc >> 16) & 0xffff

	and	w1, w4, w0, lsr #16	// (final_acc >> 16) & 0xffff
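/*
 * Worked example of the fold above (illustrative numbers only): with
 * sum = 0x000123456789abcd,
 *
 *	final_acc = 0x0001 + 0x2345 + 0x6789 + 0xabcd = 0x1369c
 *	final_acc = (0x1369c >> 16) + (0x1369c & 0xffff) = 0x369d
 *	final_acc = (0x369d >> 16) + (0x369d & 0xffff) = 0x369d
 *
 * and 0x369d is returned, still without the 1's complement.
 */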
 * If we were to 1's complement it (XOR with 0xffff):
	adrp	x0, Lin_cksum_whoops_str@page
	add	x0, x0, Lin_cksum_whoops_str@pageoff	// x0 = &Lin_cksum_whoops_str (argument for CKSUM_ERR)

Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"