2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
31 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
32 * on the 64-bit part in netinet/cpu_in_cksum.c
34 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
// CKSUM_ERR is the name the pseudo-code in this file uses for its error
// reporter. Two definitions are visible here; presumably a build conditional
// that is not shown in this view selects between them -- TODO confirm.
38 #define CKSUM_ERR _kprintf
// When this guard is preprocessed without LIBSYSCALL_INTERFACE defined, the
// #error below stops the build.
40 #ifndef LIBSYSCALL_INTERFACE
41 #error "LIBSYSCALL_INTERFACE not defined"
42 #endif /* !LIBSYSCALL_INTERFACE */
43 #define CKSUM_ERR _fprintf_stderr
49 * Ugly, but we have little choice, since relying on genassym and <assym.s>
50 * is not possible unless this code lives in osfmk. Note also that this
51 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
52 * authentic; it only cares about 3 fields.
// Byte offset of the data-pointer field; the routine reads it with
// `ldr data, [m, #M_DATA]`.
55 #define M_DATA 16 // 8-byte address, would be aligned to 8-byte boundary
// Make the entry point visible to the linker. The leading underscore
// presumably follows the Mach-O C symbol convention -- TODO confirm.
58 .globl _os_cpu_in_cksum_mbuf
61 _os_cpu_in_cksum_mbuf:
67 * This function returns the partial 16-bit checksum accumulated in
68 * a 32-bit variable (without 1's complement); caller is responsible
69 * for folding the 32-bit sum into 16-bit and performing the 1's
70 * complement if applicable
75 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
78 * uint64_t sum, partial;
79 * unsigned int final_acc;
81 * boolean_t needs_swap, started_on_odd;
87 * started_on_odd = FALSE;
// Register alias: the started_on_odd flag of the pseudo-code lives in x5.
96 #define started_on_odd x5
// Both flags start out FALSE.
102 mov needs_swap, #0 // needs_swap = FALSE;
103 mov started_on_odd, #0 // started_on_odd = FALSE;
// A write to w3 zeroes bits 63:32 of x3. initial_sum is the fourth parameter
// in the prototype above, so under the AAPCS64 argument order it arrives in
// w3/x3; after this move x3 holds it as a clean 64-bit value.
104 mov w3, w3 // clear higher half
109 * if (PREDICT_FALSE(m == NULL)) {
110 * CKSUM_ERR("%s: out of data\n", __func__);
116 * data = mtod(m, uint8_t *) + off;
117 * goto post_initial_offset;
// A NULL mbuf pointer diverts to the Lin_cksum_whoops error path.
127 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
// Wmlen looks like the 32-bit (w-register) view of mlen; its #define is not
// visible in this view -- TODO confirm.
128 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
131 ldr data, [m, #M_DATA] // mtod(m, uint8_t *)
// Apply the starting offset: shrink the usable length and advance the data
// pointer by the same amount.
132 sub mlen, mlen, off // mlen -= off;
133 add data, data, off // data = mtod(m, uint8_t *) + off;
// Enter the loop body past L_loop's reload of mlen/data so the adjusted
// values computed above are kept.
134 b L_post_initial_offset
// Top of the per-mbuf loop: check m, then reload mlen and data from it.
144 L_loop: // for (; len > 0; m = m->m_next) {
146 * if (PREDICT_FALSE(m == NULL)) {
147 * CKSUM_ERR("%s: out of data\n", __func__);
151 * data = mtod(m, uint8_t *);
// A NULL mbuf pointer diverts to the Lin_cksum_whoops error path.
153 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
// Fetch this mbuf's length and data pointer from their fixed offsets.
154 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
155 ldr data, [m, #M_DATA] // mtod(m, uint8_t *)
// Shared entry: reached by the branch taken after the initial offset
// adjustment, and by fall-through from L_loop for each reloaded mbuf.
157 L_post_initial_offset:
159 * if (mlen == 0) continue;
160 * if (mlen > len) mlen = len;
// Keep mlen when the preceding compare (not visible in this view) yields
// "le", otherwise take len -- the clamp described by the pseudo-code above.
166 csel mlen, mlen, len, le
171 * if ((uintptr_t)data & 1) {
172 * started_on_odd = !started_on_odd;
173 * partial = *data << 8;
177 * needs_swap = started_on_odd;
// XOR with 1 flips bit 0, i.e. started_on_odd = !started_on_odd.
185 eor started_on_odd, started_on_odd, #1
192 * if ((uintptr_t)data & 2) {
194 * goto trailing_bytes;
195 * partial += *(uint16_t *)(void *)data;
// needs_swap = started_on_odd, as in the pseudo-code above.
201 mov needs_swap, started_on_odd
// Taken on "lt" from a flag-setting instruction not visible in this view.
204 b.lt L_trailing_bytes
211 * while (mlen >= 64) {
212 * __builtin_prefetch(data + 32);
213 * __builtin_prefetch(data + 64);
214 * partial += *(uint32_t *)(void *)data;
215 * partial += *(uint32_t *)(void *)(data + 4);
216 * partial += *(uint32_t *)(void *)(data + 8);
217 * partial += *(uint32_t *)(void *)(data + 12);
218 * partial += *(uint32_t *)(void *)(data + 16);
219 * partial += *(uint32_t *)(void *)(data + 20);
220 * partial += *(uint32_t *)(void *)(data + 24);
221 * partial += *(uint32_t *)(void *)(data + 28);
222 * partial += *(uint32_t *)(void *)(data + 32);
223 * partial += *(uint32_t *)(void *)(data + 36);
224 * partial += *(uint32_t *)(void *)(data + 40);
225 * partial += *(uint32_t *)(void *)(data + 44);
226 * partial += *(uint32_t *)(void *)(data + 48);
227 * partial += *(uint32_t *)(void *)(data + 52);
228 * partial += *(uint32_t *)(void *)(data + 56);
229 * partial += *(uint32_t *)(void *)(data + 60);
232 * // if (PREDICT_FALSE(partial & (3ULL << 62))) {
234 * // partial = (partial << 8) +
235 * // (partial >> 56);
236 * // sum += (partial >> 32);
237 * // sum += (partial & 0xffffffff);
243 // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
247 // save used vector registers
// Store v0-v7 through x11, post-incrementing x11 by 64 bytes per instruction.
// x11 presumably points at scratch space reserved on the stack, since the
// matching reloads further down read from [sp] -- TODO confirm.
250 st1.4s {v0, v1, v2, v3}, [x11], #4*16
251 st1.4s {v4, v5, v6, v7}, [x11], #4*16
253 // spread partial into 8 8-byte registers in v0-v3
259 // load the 1st 64 bytes (16 32-bit words)
// One load fills v4-v7 and advances data by 64.
260 ld1.4s {v4,v5,v6,v7},[data],#64
262 // branch to finish off if mlen<64
267 * loop for loading and accumulating 16 32-bit words into
268 * 8 8-byte accumulators per iteration.
// subs also sets the condition flags; the branch that consumes them is not
// visible in this view.
271 subs mlen, mlen, #64 // mlen -= 64
// Refill v4-v7 with the next 64 bytes, 16 bytes per load, advancing data.
274 ld1.4s {v4},[data], #16
277 ld1.4s {v5},[data], #16
280 ld1.4s {v6},[data], #16
283 ld1.4s {v7},[data], #16
// Copy the low 64 bits of v0 into the general-purpose register x7.
298 fmov x7, d0 // partial in x7 now
300 // restore used vector registers
// Reload v0-v7 from the stack; each load post-increments sp by 64 bytes.
301 ld1.4s {v0, v1, v2, v3}, [sp], #4*16
302 ld1.4s {v4, v5, v6, v7}, [sp], #4*16
// Each ldp reads two 64-bit words into general-purpose registers and
// advances data by 16.
307 ldp x9, x10, [data], #16
308 ldp x11, x12, [data], #16
319 ldp x9, x10, [data], #16
// Taken on "eq" from a flag-setting instruction not visible in this view.
342 b.eq L_trailing_bytes
// Flip bit 0 of started_on_odd again.
353 eor started_on_odd, started_on_odd, #1
358 * partial = (partial << 8) + (partial >> 56);
364 * sum += (partial >> 32) + (partial & 0xffffffff);
365 * sum = (sum >> 32) + (sum & 0xffffffff);
// x3 += x7 >> 32. x7 carries partial (see the fmov above); x3 appears to be
// the running sum -- TODO confirm against the missing register #defines.
369 add x3, x3, x7, lsr #32
// x3 = x7 + (x3 >> 32).
373 add x3, x7, x3, lsr #32
// Follow the chain link stored at offset M_NEXT.
377 ldr m, [m, #M_NEXT] // m = m->m_next
// Final reduction of the 64-bit value in x3 towards 16 bits, following the
// pseudo-code below. Only part of the instruction sequence is visible here.
381 * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
382 * ((sum >> 16) & 0xffff) + (sum & 0xffff);
383 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
384 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
385 * return (final_acc & 0xffff);
// Each AND masks a right-shifted copy of x3 with x4. x4 presumably holds
// 0xffff; its initialisation is not visible in this view -- TODO confirm.
390 and x0, x4, x3, lsr #48
391 and x1, x4, x3, lsr #32
392 and x2, x4, x3, lsr #16
// w1 = w4 & (w0 >> 16), matching the (final_acc >> 16) term; the pseudo-code
// performs this fold twice, hence the repeated instruction.
397 and w1, w4, w0, lsr #16
400 and w1, w4, w0, lsr #16
404 * If we were to 1's complement it (XOR with 0xffff):
// Form the address of the message string in x0: adrp yields the address of
// its 4 KB page, and the add supplies the offset within that page.
413 adrp x0, Lin_cksum_whoops_str@page
414 add x0, x0, Lin_cksum_whoops_str@pageoff
// NUL-terminated message whose address is formed above.
419 Lin_cksum_whoops_str:
420 .asciz "os_cpu_in_cksum_mbuf: out of data\n"