/*
 * Copyright (c) 2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*-
 * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <mach/boolean.h>
#include <machine/endian.h>
#include <sys/mcache.h>
#include <sys/mbuf.h>
#include <kern/debug.h>
#include <netinet/in.h>
#include <libkern/libkern.h>

int cpu_in_cksum(struct mbuf *, int, int, uint32_t);

#define	PREDICT_FALSE(_exp)	__builtin_expect((_exp), 0)

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network code and should be
 * modified for each CPU to be as fast as possible.
 *
 * A discussion of different implementation techniques can be found in
 * RFC 1071.
 *
 * The default implementation for 32-bit architectures uses a 32-bit
 * accumulator and operates on 16-bit operands.
 *
 * The default implementation for 64-bit architectures uses a 64-bit
 * accumulator and operates on 32-bit operands.
 *
 * Both versions are unrolled so that the core of the inner loop handles
 * 32-byte / 64-byte chunks.  After each iteration of the inner loop, a
 * partial reduction is done to avoid accumulator overflow on long packets.
 */

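/*
 * Illustrative sketch only (unused below; the helper name is hypothetical):
 * the RFC 1071 folding step that both implementations below end with.
 * Adding the high half of the accumulator back into the low half preserves
 * the one's-complement sum; two folds suffice because the first fold can
 * produce at most one further carry.
 */
static uint16_t __unused
in_cksum_fold_sketch(uint32_t acc)
{
	acc = (acc >> 16) + (acc & 0xffff);	/* fold carries into low 16 bits */
	acc = (acc >> 16) + (acc & 0xffff);	/* absorb the carry from the fold */
	return ((uint16_t)(~acc & 0xffff));	/* one's complement of the sum */
}
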
#if ULONG_MAX == 0xffffffffUL
/* 32-bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	boolean_t needs_swap, started_on_odd;

	VERIFY(len >= 0);
	VERIFY(off >= 0);

	needs_swap = FALSE;
	started_on_odd = FALSE;
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

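	/*
	 * Skip the first "off" bytes of the chain; the mbuf holding the
	 * first byte to be checksummed is entered at post_initial_offset.
	 */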
	for (;;) {
		if (PREDICT_FALSE(m == NULL)) {
			printf("%s: out of data\n", __func__);
			return (-1);
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (PREDICT_FALSE(m == NULL)) {
			printf("%s: out of data\n", __func__);
			return (-1);
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
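		/*
		 * If this mbuf starts at an odd offset within the packet, its
		 * 16-bit words are byte-swapped relative to the words already
		 * summed.  One's-complement addition is insensitive to word
		 * byte order, so summing proceeds as-is and the partial sum
		 * is rotated left by 8 bits afterwards to compensate.
		 */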
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			partial += *(uint16_t *)(void *)(data + 4);
			partial += *(uint16_t *)(void *)(data + 6);
			partial += *(uint16_t *)(void *)(data + 8);
			partial += *(uint16_t *)(void *)(data + 10);
			partial += *(uint16_t *)(void *)(data + 12);
			partial += *(uint16_t *)(void *)(data + 14);
			partial += *(uint16_t *)(void *)(data + 16);
			partial += *(uint16_t *)(void *)(data + 18);
			partial += *(uint16_t *)(void *)(data + 20);
			partial += *(uint16_t *)(void *)(data + 22);
			partial += *(uint16_t *)(void *)(data + 24);
			partial += *(uint16_t *)(void *)(data + 26);
			partial += *(uint16_t *)(void *)(data + 28);
			partial += *(uint16_t *)(void *)(data + 30);
			data += 32;
			mlen -= 32;
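			/*
			 * Fold "partial" into "sum" once either of its top
			 * two bits becomes set, well before the 32-bit
			 * accumulator could wrap and silently drop a carry.
			 */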
			if (PREDICT_FALSE(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) +
					    (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		if (mlen & 16) {
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			partial += *(uint16_t *)(void *)(data + 4);
			partial += *(uint16_t *)(void *)(data + 6);
			partial += *(uint16_t *)(void *)(data + 8);
			partial += *(uint16_t *)(void *)(data + 10);
			partial += *(uint16_t *)(void *)(data + 12);
			partial += *(uint16_t *)(void *)(data + 14);
			data += 16;
			mlen -= 16;
		}
		/*
		 * mlen is not updated below because the remaining tests
		 * use bit masks, which the updates would not affect.
		 */
		if (mlen & 8) {
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			partial += *(uint16_t *)(void *)(data + 4);
			partial += *(uint16_t *)(void *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)(void *)data;
			data += 2;
		}
		if (mlen & 1) {
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
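	/*
	 * Final fold: collapse the reduced sum to 16 bits and return its
	 * one's complement, as RFC 1071 describes.
	 */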
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return (~final_acc & 0xffff);
}

#else
/* 64-bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	boolean_t needs_swap, started_on_odd;

	VERIFY(len >= 0);
	VERIFY(off >= 0);

	needs_swap = FALSE;
	started_on_odd = FALSE;
	sum = initial_sum;

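	/* Skip the first "off" bytes, as in the 32-bit version above. */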
	for (;;) {
		if (PREDICT_FALSE(m == NULL)) {
			printf("%s: out of data\n", __func__);
			return (-1);
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (PREDICT_FALSE(m == NULL)) {
			printf("%s: out of data\n", __func__);
			return (-1);
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
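		/*
		 * Consume one 16-bit word, if present, so that the 32-bit
		 * loads in the unrolled loop below are 4-byte aligned.
		 */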
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)(void *)data;
			data += 2;
			mlen -= 2;
		}
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			partial += *(uint32_t *)(void *)(data + 8);
			partial += *(uint32_t *)(void *)(data + 12);
			partial += *(uint32_t *)(void *)(data + 16);
			partial += *(uint32_t *)(void *)(data + 20);
			partial += *(uint32_t *)(void *)(data + 24);
			partial += *(uint32_t *)(void *)(data + 28);
			partial += *(uint32_t *)(void *)(data + 32);
			partial += *(uint32_t *)(void *)(data + 36);
			partial += *(uint32_t *)(void *)(data + 40);
			partial += *(uint32_t *)(void *)(data + 44);
			partial += *(uint32_t *)(void *)(data + 48);
			partial += *(uint32_t *)(void *)(data + 52);
			partial += *(uint32_t *)(void *)(data + 56);
			partial += *(uint32_t *)(void *)(data + 60);
			data += 64;
			mlen -= 64;
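			/*
			 * As in the 32-bit version: fold "partial" into
			 * "sum" once either of its top two bits becomes set,
			 * well before the 64-bit accumulator could wrap.
			 */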
			if (PREDICT_FALSE(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) +
					    (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below because the remaining tests
		 * use bit masks, which the updates would not affect.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			partial += *(uint32_t *)(void *)(data + 8);
			partial += *(uint32_t *)(void *)(data + 12);
			partial += *(uint32_t *)(void *)(data + 16);
			partial += *(uint32_t *)(void *)(data + 20);
			partial += *(uint32_t *)(void *)(data + 24);
			partial += *(uint32_t *)(void *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			partial += *(uint32_t *)(void *)(data + 8);
			partial += *(uint32_t *)(void *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)(void *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)(void *)data;
			data += 2;
		}
trailing_bytes:
		if (mlen & 1) {
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
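	/*
	 * Final fold: collapse the 64-bit sum to 16 bits and return its
	 * one's complement, as RFC 1071 describes.
	 */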
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return (~final_acc & 0xffff);
}
#endif /* ULONG_MAX != 0xffffffffUL */