2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
4 * This document is the property of Apple Inc.
5 * It is considered confidential and proprietary.
7 * This document may not be reproduced or transmitted in any form,
8 * in whole or in part, without the express written permission of
13 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
14 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
15 * on the 64-bit part in netinet/cpu_in_cksum.c
17 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
21 #define CKSUM_ERR _kprintf
23 #ifndef LIBSYSCALL_INTERFACE
24 #error "LIBSYSCALL_INTERFACE not defined"
25 #endif /* !LIBSYSCALL_INTERFACE */
26 #define CKSUM_ERR _fprintf_stderr
32 * Ugly, but we have little choice, since relying on genassym and <assym.s>
33 * is not possible unless this code lives in osfmk. Note also that this
34 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
35 * authentic; it only cares about 3 fields.
38 #define M_DATA 16 // 8-byte address, would be aligned to 8-byte boundary
41 .globl _os_cpu_in_cksum_mbuf
// NOTE(review): this excerpt is a non-contiguous sampling of the original
// file — the leading number on each line is the original file's own line
// number, and many instructions plus most register-alias #defines
// (m, len, off, data, mlen/Wmlen, needs_swap, sum) fall outside this view.
// Code is left byte-identical; only comments were added or repaired.
44 _os_cpu_in_cksum_mbuf:
50 * This function returns the partial 16-bit checksum accumulated in
51 * a 32-bit variable (without 1's complement); caller is responsible
52 * for folding the 32-bit sum into 16-bit and performing the 1's
53 * complement if applicable
58 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
61 * uint64_t sum, partial;
62 * unsigned int final_acc;
64 * boolean_t needs_swap, started_on_odd;
70 * started_on_odd = FALSE;
79 #define started_on_odd x5 // boolean: has an odd byte been consumed so far?
85 mov needs_swap, #0 // needs_swap = FALSE;
86 mov started_on_odd, #0 // started_on_odd = FALSE;
87 mov w3, w3 // clear higher half: writing w3 zero-extends into x3 (sum = initial_sum)
92 * if (PREDICT_FALSE(m == NULL)) {
93 * CKSUM_ERR("%s: out of data\n", __func__);
99 * data = mtod(m, uint8_t *) + off;
100 * goto post_initial_offset;
110 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
111 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
114 ldr data, [m, #M_DATA] // mtod(m, uint8_t *)
115 sub mlen, mlen, off // mlen -= off;
116 add data, data, off // data = mtod(m, uint8_t *) + off;
117 b L_post_initial_offset
127 L_loop: // for (; len > 0; m = m->m_next) {
129 * if (PREDICT_FALSE(m == NULL)) {
130 * CKSUM_ERR("%s: out of data\n", __func__);
134 * data = mtod(m, uint8_t *);
136 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
137 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
138 ldr data, [m, #M_DATA] // mtod(m, uint8_t *)
140 L_post_initial_offset:
142 * if (mlen == 0) continue;
143 * if (mlen > len) mlen = len;
// mlen = min(mlen, len): keep mlen when mlen <= len, otherwise clamp to len
149 csel mlen, mlen, len, le
154 * if ((uintptr_t)data & 1) {
155 * started_on_odd = !started_on_odd;
156 * partial = *data << 8;
160 * needs_swap = started_on_odd;
// toggle the odd-byte flag: started_on_odd = !started_on_odd
168 eor started_on_odd, started_on_odd, #1
175 * if ((uintptr_t)data & 2) {
177 * goto trailing_bytes;
178 * partial += *(uint16_t *)(void *)data;
// latch the byte-swap decision at this point (per pseudo-code above)
184 mov needs_swap, started_on_odd
// too few bytes for the bulk path — presumably guarded by a compare
// just outside this excerpt; confirm against the full source
187 b.lt L_trailing_bytes
194 * while (mlen >= 64) {
195 * __builtin_prefetch(data + 32);
196 * __builtin_prefetch(data + 64);
197 * partial += *(uint32_t *)(void *)data;
198 * partial += *(uint32_t *)(void *)(data + 4);
199 * partial += *(uint32_t *)(void *)(data + 8);
200 * partial += *(uint32_t *)(void *)(data + 12);
201 * partial += *(uint32_t *)(void *)(data + 16);
202 * partial += *(uint32_t *)(void *)(data + 20);
203 * partial += *(uint32_t *)(void *)(data + 24);
204 * partial += *(uint32_t *)(void *)(data + 28);
205 * partial += *(uint32_t *)(void *)(data + 32);
206 * partial += *(uint32_t *)(void *)(data + 36);
207 * partial += *(uint32_t *)(void *)(data + 40);
208 * partial += *(uint32_t *)(void *)(data + 44);
209 * partial += *(uint32_t *)(void *)(data + 48);
210 * partial += *(uint32_t *)(void *)(data + 52);
211 * partial += *(uint32_t *)(void *)(data + 56);
212 * partial += *(uint32_t *)(void *)(data + 60);
215 * // if (PREDICT_FALSE(partial & (3ULL << 62))) {
217 * // partial = (partial << 8) +
218 * // (partial >> 56);
219 * // sum += (partial >> 32);
220 * // sum += (partial & 0xffffffff);
226 // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
230 // save used vector registers
// NOTE(review): x11 presumably points at a stack save area established
// outside this excerpt (the matching restore below pops from [sp]) — confirm
233 st1.4s {v0, v1, v2, v3}, [x11], #4*16
234 st1.4s {v4, v5, v6, v7}, [x11], #4*16
236 // spread partial into 8 8-byte registers in v0-v3
242 // load the 1st 64 bytes (16 32-bit words)
243 ld1.4s {v4,v5,v6,v7},[data],#64
245 // branch to finish off if mlen<64
250 * loop for loading and accumulating 16 32-bit words into
251 * 8 8-byte accumulators per iteration.
254 subs mlen, mlen, #64 // mlen -= 64
257 ld1.4s {v4},[data], #16
260 ld1.4s {v5},[data], #16
263 ld1.4s {v6},[data], #16
266 ld1.4s {v7},[data], #16
// vector accumulators have been reduced (outside this view); move the
// scalar result to a general-purpose register
281 fmov x7, d0 // partial in x7 now
283 // restore used vector registers
284 ld1.4s {v0, v1, v2, v3}, [sp], #4*16
285 ld1.4s {v4, v5, v6, v7}, [sp], #4*16
// scalar path: accumulate 16 bytes per ldp pair
290 ldp x9, x10, [data], #16
291 ldp x11, x12, [data], #16
302 ldp x9, x10, [data], #16
325 b.eq L_trailing_bytes
// consumed an odd trailing byte: toggle the flag again
336 eor started_on_odd, started_on_odd, #1
341 * partial = (partial << 8) + (partial >> 56);
347 * sum += (partial >> 32) + (partial & 0xffffffff);
348 * sum = (sum >> 32) + (sum & 0xffffffff);
// fold the 64-bit partial (x7) into the running sum (x3), per the
// pseudo-code above; intervening instructions are outside this excerpt
352 add x3, x3, x7, lsr #32
356 add x3, x7, x3, lsr #32
// advance to the next mbuf in the chain and continue the outer loop
360 ldr m, [m, #M_NEXT] // m = m->m_next
364 * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
365 * ((sum >> 16) & 0xffff) + (sum & 0xffff);
366 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
367 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
368 * return (final_acc & 0xffff);
// extract the four 16-bit lanes of sum; x4 presumably holds the 0xffff
// mask, loaded outside this excerpt — TODO confirm
373 and x0, x4, x3, lsr #48
374 and x1, x4, x3, lsr #32
375 and x2, x4, x3, lsr #16
// fold final_acc twice: (final_acc >> 16) + (final_acc & 0xffff)
380 and w1, w4, w0, lsr #16
383 and w1, w4, w0, lsr #16
387 * If we were to 1's complement it (XOR with 0xffff):
// error path: materialize the address of the out-of-data message
// (page + page-offset pair) for the CKSUM_ERR call
396 adrp x0, Lin_cksum_whoops_str@page
397 add x0, x0, Lin_cksum_whoops_str@pageoff
// NUL-terminated message emitted via CKSUM_ERR when the mbuf chain runs
// out of data before `len` bytes have been checksummed
402 Lin_cksum_whoops_str:
403 .asciz "os_cpu_in_cksum_mbuf: out of data\n"