X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/765c9de3b4af7c2078d16a03812ae2c7c2b24938..eb6b6ca394357805f2bdba989abae309f718b4d8:/bsd/netinet/in_cksum.c

diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c
index 9d349d7b5..b4cd509ff 100644
--- a/bsd/netinet/in_cksum.c
+++ b/bsd/netinet/in_cksum.c
@@ -1,23 +1,29 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
  *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * Copyright (c) 1988, 1992, 1993
@@ -55,10 +61,14 @@
  */
 
 #include <sys/param.h>
+#include <machine/endian.h>
 #include <sys/mbuf.h>
-#include <sys/kdebug.h>
-
-#define DBG_FNC_IN_CKSUM	NETDBG_CODE(DBG_NETIP, (3 << 8))
+#include <kern/debug.h>
+#include <net/dlil.h>
+#include <netinet/in.h>
+#define _IP_VHL
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
 
 /*
  * Checksum routine for Internet Protocol family headers (Portable Version).
@@ -66,378 +76,564 @@
  * This routine is very heavily used in the network
  * code and should be modified for each CPU to be as fast as possible.
  */
-
-union s_util {
-        char    c[2];
-        u_short s;
-};
+#define REDUCE16 {                                                        \
+	q_util.q = sum;                                                   \
+	l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
+	sum = l_util.s[0] + l_util.s[1];                                  \
+	ADDCARRY(sum);                                                    \
+}
 
 union l_util {
-        u_int16_t s[2];
-        u_int32_t l;   
+	uint16_t s[2];
+	uint32_t l;
 };
 
 union q_util {
-        u_int16_t s[4];
-        u_int32_t l[2];
-        u_int64_t q;
-};    
-
-#define ADDCARRY(x)  (x > 65535 ? x -= 65535 : x)
-
-#define REDUCE32                                                          \
-    {                                                                     \
-        q_util.q = sum;                                                   \
-        sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3];      \
-    }
-#define REDUCE16                                                          \
-    {                                                                     \
-        q_util.q = sum;                                                   \
-        l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
-        sum = l_util.s[0] + l_util.s[1];                                  \
-        ADDCARRY(sum);                                                    \
-    }
-
-#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
-
-                
-#if defined(ppc)
-
-__inline unsigned short
-in_addword(u_short a, u_short b)
+	uint16_t s[4];
+	uint32_t l[2];
+	uint64_t q;
+};
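
The REDUCE16 macro above folds a 64-bit accumulator to a 16-bit 1's complement
sum: the four 16-bit lanes are added through q_util, the two halves of that
32-bit result are added through l_util, and ADDCARRY folds the final carry.
A minimal standalone sketch of the same arithmetic (ADDCARRY is restated here
as an assumption; after this change it is defined outside this file):

    #include <stdint.h>
    #include <stdio.h>

    /* Assumption: ADDCARRY as used by REDUCE16 above. */
    #define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)

    union l_util { uint16_t s[2]; uint32_t l; };
    union q_util { uint16_t s[4]; uint32_t l[2]; uint64_t q; };

    int main(void)
    {
        union q_util q_util;
        union l_util l_util;
        uint64_t sum = 0x0001000200030004ULL;   /* arbitrary accumulator */

        q_util.q = sum;                 /* add the four 16-bit lanes */
        l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3];
        sum = l_util.s[0] + l_util.s[1];        /* add the two halves */
        ADDCARRY(sum);                          /* fold the last carry */
        printf("folded sum: 0x%04x\n", (unsigned int)sum);
        return 0;
    }
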
+
+extern uint32_t os_cpu_in_cksum(const void *, uint32_t, uint32_t);
+
+/*
+ * Perform a 16-bit 1's complement sum on a contiguous span.
+ */
+uint16_t
+b_sum16(const void *buf, int len)
 {
-        union l_util l_util;   
-	u_int32_t sum = a + b;
-	REDUCE;
-	return (sum);
+	return os_cpu_in_cksum(buf, len, 0);
 }
 
-__inline unsigned short
-in_pseudo(u_int a, u_int b, u_int c)
+uint16_t inet_cksum_simple(struct mbuf *, int);
+/*
+ * For the exported _in_cksum symbol in the BSDKernel symbol set.
+ */
+uint16_t
+inet_cksum_simple(struct mbuf *m, int len)
 {
-        u_int64_t sum;
-        union q_util q_util;
-        union l_util l_util;   
+	return inet_cksum(m, 0, 0, len);
+}
 
-        sum = (u_int64_t) a + b + c;
-        REDUCE16;
-        return (sum);
+uint16_t
+in_addword(uint16_t a, uint16_t b)
+{
+	uint64_t sum = a + b;
 
+	ADDCARRY(sum);
+	return sum;
 }
 
-int
-in_cksum(m, len)
-	register struct mbuf *m;
-	register int len;
+uint16_t
+in_pseudo(uint32_t a, uint32_t b, uint32_t c)
 {
-	register u_short *w;
-	register int sum = 0;
-	register int mlen = 0;
-	int starting_on_odd  = 0;
+	uint64_t sum;
+	union q_util q_util;
+	union l_util l_util;
 
+	sum = (uint64_t)a + b + c;
+	REDUCE16;
+	return sum;
+}
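
in_pseudo() above computes the folded 1's complement sum of the IPv4 pseudo
header, which TCP and UDP checksums add on top of the segment sum.  A hedged
usage sketch mirroring the call made by inet_cksum() further down (the
function name tcp_pseudo_seed and its parameters are illustrative, not part
of this file):

    #include <stdint.h>
    #include <arpa/inet.h>      /* htonl() */

    extern uint16_t in_pseudo(uint32_t, uint32_t, uint32_t);

    /* src/dst are IPv4 addresses in network byte order; tcp_len is the
     * TCP header plus payload length in host byte order. */
    uint32_t
    tcp_pseudo_seed(uint32_t src, uint32_t dst, uint32_t tcp_len)
    {
        return in_pseudo(src, dst, htonl(tcp_len + 6 /* IPPROTO_TCP */));
    }
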
 
-	KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0);
+uint16_t
+in_pseudo64(uint64_t a, uint64_t b, uint64_t c)
+{
+	uint64_t sum;
+	union q_util q_util;
+	union l_util l_util;
 
-	for (;m && len; m = m->m_next) {
-		if (m->m_len == 0)
-			continue;
-		mlen = m->m_len;
-		w = mtod(m, u_short *);
+	sum = a + b + c;
+	REDUCE16;
+	return sum;
+}
 
-		if (len < mlen)
-			mlen = len;
+/*
+ * May be used on an IP header with options.
+ */
+uint16_t
+in_cksum_hdr_opt(const struct ip *ip)
+{
+	return ~b_sum16(ip, (IP_VHL_HL(ip->ip_vhl) << 2)) & 0xffff;
+}
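
With _IP_VHL defined before including <netinet/ip.h>, struct ip exposes the
combined version/header-length byte as ip_vhl; IP_VHL_HL() extracts the low
nibble (header length in 32-bit words) and the << 2 above converts words to
bytes.  A standalone check of that arithmetic, with the macro restated as an
assumption:

    #include <assert.h>
    #include <stdint.h>

    /* Assumption: matches the _IP_VHL helper in <netinet/ip.h>. */
    #define IP_VHL_HL(vhl)  ((vhl) & 0x0f)

    int main(void)
    {
        uint8_t vhl = 0x46;     /* version 4, header length 6 words */

        /* 6 words << 2 == 24 bytes: 20-byte header + 4 bytes of options */
        assert((IP_VHL_HL(vhl) << 2) == 24);
        return 0;
    }
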
 
-		sum = xsum_assym(w, mlen, sum, starting_on_odd);
-		len -= mlen;
-		if (mlen & 0x1)
-		{
-		    if (starting_on_odd)
-			starting_on_odd = 0;
-		    else
-			starting_on_odd = 1;
-		}
+/*
+ * A wrapper around the simple in_cksum_hdr() and the more complicated
+ * inet_cksum(); the former is chosen if the IP header is simple,
+ * contiguous and 32-bit aligned.  Also does some stats accounting.
+ */
+uint16_t
+ip_cksum_hdr_dir(struct mbuf *m, uint32_t hlen, int out)
+{
+	struct ip *ip = mtod(m, struct ip *);
+
+	if (out) {
+		ipstat.ips_snd_swcsum++;
+		ipstat.ips_snd_swcsum_bytes += hlen;
+	} else {
+		ipstat.ips_rcv_swcsum++;
+		ipstat.ips_rcv_swcsum_bytes += hlen;
+	}
+
+	if (hlen == sizeof(*ip) &&
+	    m->m_len >= sizeof(*ip) && IP_HDR_ALIGNED_P(ip)) {
+		return in_cksum_hdr(ip);
 	}
 
-	KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_END, 0,0,0,0,0);
-	return (~sum & 0xffff);
+	return inet_cksum(m, 0, 0, hlen);
 }
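
The fast path above requires an option-less header (hlen == sizeof(*ip)) that
is contiguous in the first mbuf and passes IP_HDR_ALIGNED_P().  On
strict-alignment CPUs that predicate amounts to a 4-byte alignment test (and
is trivially true on x86); a sketch under that assumption, with the helper
name ip_hdr_aligned chosen here for illustration:

    #include <stdbool.h>
    #include <stdint.h>

    /* Assumption: the shape of IP_HDR_ALIGNED_P() from <netinet/ip_var.h>
     * on strict-alignment platforms. */
    bool
    ip_hdr_aligned(const void *p)
    {
        return ((uintptr_t)p & 3) == 0;
    }
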
 
-u_short
-in_cksum_skip(m, len, skip)
-        register struct mbuf *m;
-        register int len;
-        register int skip;
+uint16_t
+ip_cksum_hdr_dir_buffer(const void *buffer, uint32_t hlen, uint32_t len,
+    int out)
 {
-	register u_short *w;
-	register int sum = 0;
-	register int mlen = 0;
-	int starting_on_odd  = 0;
-
-	len -= skip;
-        for (; skip && m; m = m->m_next) {
-                if (m->m_len > skip) {
-                        mlen = m->m_len - skip;
-			w = (u_short *)(m->m_data+skip);
-                        goto skip_start;
-                } else {    
-                        skip -= m->m_len;
-                }
-        }
-	for (;m && len; m = m->m_next) {
-		if (m->m_len == 0)
-			continue;
-		mlen = m->m_len;
-		w = mtod(m, u_short *);
+	const struct ip *ip = buffer;
+
+	if (out) {
+		ipstat.ips_snd_swcsum++;
+		ipstat.ips_snd_swcsum_bytes += hlen;
+	} else {
+		ipstat.ips_rcv_swcsum++;
+		ipstat.ips_rcv_swcsum_bytes += hlen;
+	}
 
-skip_start:
-		if (len < mlen)
-			mlen = len;
-		sum = xsum_assym(w, mlen, sum, starting_on_odd);
-		len -= mlen;
-		if (mlen & 0x1)
-		{
-		    if (starting_on_odd)
-			starting_on_odd = 0;
-		    else
-			starting_on_odd = 1;
+	if (hlen == sizeof(*ip) &&
+	    len >= sizeof(*ip) && IP_HDR_ALIGNED_P(ip)) {
+		return in_cksum_hdr(ip);
+	}
+
+	return inet_cksum_buffer(buffer, 0, 0, hlen);
+}
+
+/*
+ * m MUST contain at least an IP header if nxt is specified;
+ * nxt is the upper layer protocol number;
+ * off is the offset at which the TCP/UDP/ICMP header starts;
+ * len is the total length of the transport segment (e.g. TCP header + TCP payload)
+ */
+uint16_t
+inet_cksum(struct mbuf *m, uint32_t nxt, uint32_t off, uint32_t len)
+{
+	uint32_t sum;
+
+	sum = m_sum16(m, off, len);
+
+	/* include pseudo header checksum? */
+	if (nxt != 0) {
+		struct ip *ip;
+		unsigned char buf[sizeof((*ip))] __attribute__((aligned(8)));
+		uint32_t mlen;
+
+		/*
+		 * Sanity check
+		 *
+		 * Use m_length2() instead of m_length(), as we cannot rely on
+		 * the caller setting m_pkthdr.len correctly if the mbuf is
+		 * an M_PKTHDR one.
+		 */
+		if ((mlen = m_length2(m, NULL)) < sizeof(*ip)) {
+			panic("%s: mbuf %p too short (%d) for IPv4 header",
+			    __func__, m, mlen);
+			/* NOTREACHED */
 		}
+
+		/*
+		 * If the IP header is not contiguous, or not 32-bit
+		 * aligned, copy it to a local buffer.  Note here that we
+		 * expect the data pointer to point to the IP header.
+		 */
+		if ((sizeof(*ip) > m->m_len) ||
+		    !IP_HDR_ALIGNED_P(mtod(m, caddr_t))) {
+			m_copydata(m, 0, sizeof(*ip), (caddr_t)buf);
+			ip = (struct ip *)(void *)buf;
+		} else {
+			ip = (struct ip *)(void *)(m->m_data);
+		}
+
+		/* add pseudo header checksum */
+		sum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+		    htonl(len + nxt));
+
+		/* fold in carry bits */
+		ADDCARRY(sum);
 	}
 
-	return (~sum & 0xffff);
+	return ~sum & 0xffff;
 }
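
Putting the pieces together: a transport checksum is the 1's complement of
(segment sum + pseudo header sum), folded to 16 bits.  A self-contained
sketch of that recipe for a tiny UDP segment -- illustrative arithmetic only,
not the kernel's m_sum16()/os_cpu_in_cksum() path:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Portable 16-bit 1's complement sum over a contiguous buffer. */
    static uint32_t
    sum16(const uint8_t *p, size_t len)
    {
        uint32_t s = 0;
        uint16_t w;

        while (len > 1) {
            memcpy(&w, p, 2);
            s += w;
            p += 2;
            len -= 2;
        }
        if (len) {              /* odd trailing byte, zero-padded */
            w = 0;
            memcpy(&w, p, 1);
            s += w;
        }
        return s;
    }

    int main(void)
    {
        /* UDP header (src port 12345, dst port 53, length 12, checksum 0)
         * plus 4 bytes of payload, all in network byte order. */
        uint8_t seg[12] = {
            0x30, 0x39, 0x00, 0x35, 0x00, 0x0c, 0x00, 0x00,
            'p', 'i', 'n', 'g'
        };
        /* Pseudo header: 192.168.0.1 -> 192.168.0.2, zero, protocol 17
         * (UDP), UDP length 12 -- also in network byte order. */
        uint8_t ph[12] = { 192, 168, 0, 1, 192, 168, 0, 2, 0, 17, 0, 12 };
        uint32_t sum = sum16(seg, sizeof(seg)) + sum16(ph, sizeof(ph));

        while (sum >> 16)       /* fold carries, as ADDCARRY does */
            sum = (sum >> 16) + (sum & 0xffff);
        printf("udp checksum: 0x%04x\n", (unsigned int)(~sum & 0xffff));
        return 0;
    }
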
-#else
-
-u_short 
-in_addword(u_short a, u_short b)
-{       
-        union l_util l_util;   
-        u_int32_t sum = a + b;
-        REDUCE(sum);
-        return (sum);
-}       
-
-u_short
-in_pseudo(u_int a, u_int b, u_int c)
+
+/*
+ * buffer MUST contain at least an IP header if nxt is specified;
+ * nxt is the upper layer protocol number;
+ * off is the offset at which the TCP/UDP/ICMP header starts;
+ * len is the total length of the transport segment (e.g. TCP header + TCP payload)
+ */
+uint16_t
+inet_cksum_buffer(const void *buffer, uint32_t nxt, uint32_t off,
+    uint32_t len)
 {
-        u_int64_t sum;  
-        union q_util q_util;
-        union l_util l_util;   
+	uint32_t sum;
 
-        sum = (u_int64_t) a + b + c;
-        REDUCE16;
-        return (sum);
+	if (off >= len) {
+		panic("%s: off (%d) >= len (%d)", __func__, off, len);
+	}
+
+	sum = b_sum16(&((const uint8_t *)buffer)[off], len);
+
+	/* include pseudo header checksum? */
+	if (nxt != 0) {
+		const struct ip *ip;
+		unsigned char buf[sizeof((*ip))] __attribute__((aligned(8)));
+
+		/*
+		 * If the IP header is not contiguous, or not 32-bit
+		 * aligned, copy it to a local buffer.  Note here that we
+		 * expect the data pointer to point to the IP header.
+		 */
+		if (!IP_HDR_ALIGNED_P(buffer)) {
+			memcpy(buf, buffer, sizeof(*ip));
+			ip = (const struct ip *)(const void *)buf;
+		} else {
+			ip = (const struct ip *)buffer;
+		}
+
+		/* add pseudo header checksum */
+		sum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+		    htonl(len + nxt));
+
+		/* fold in carry bits */
+		ADDCARRY(sum);
+	}
+
+	return ~sum & 0xffff;
 }
 
+#if DEBUG || DEVELOPMENT
+#include <pexpert/pexpert.h>
 
-int
-in_cksum(m, len)
-	register struct mbuf *m;
-	register int len;
-{
-	register u_short *w;
-	register int sum = 0;
-	register int mlen = 0;
-	int byte_swapped = 0;
-	union s_util s_util;
-	union l_util l_util;   
+#define CKSUM_ERR kprintf
 
-	KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0);
+/*
+ * The following routines implement the portable reference implementation
+ * of os_cpu_in_cksum_mbuf().  This is currently used only for validating
+ * the correctness of the platform-specific implementation, at boot time
+ * in dlil_verify_sum16().  It returns the sum folded to 16 bits, without
+ * taking the final 1's complement.
+ */
+#if !defined(__LP64__)
+/* 32-bit version */
+uint32_t
+in_cksum_mbuf_ref(struct mbuf *m, int len, int off, uint32_t initial_sum)
+{
+	int mlen;
+	uint32_t sum, partial;
+	unsigned int final_acc;
+	uint8_t *data;
+	boolean_t needs_swap, started_on_odd;
+
+	VERIFY(len >= 0);
+	VERIFY(off >= 0);
+
+	needs_swap = FALSE;
+	started_on_odd = FALSE;
+	sum = (initial_sum >> 16) + (initial_sum & 0xffff);
+
+	for (;;) {
+		if (__improbable(m == NULL)) {
+			CKSUM_ERR("%s: out of data\n", __func__);
+			return (uint32_t)-1;
+		}
+		mlen = m->m_len;
+		if (mlen > off) {
+			mlen -= off;
+			data = mtod(m, uint8_t *) + off;
+			goto post_initial_offset;
+		}
+		off -= mlen;
+		if (len == 0) {
+			break;
+		}
+		m = m->m_next;
+	}
 
-	for (;m && len; m = m->m_next) {
-		if (m->m_len == 0)
+	for (; len > 0; m = m->m_next) {
+		if (__improbable(m == NULL)) {
+			CKSUM_ERR("%s: out of data\n", __func__);
+			return (uint32_t)-1;
+		}
+		mlen = m->m_len;
+		data = mtod(m, uint8_t *);
+post_initial_offset:
+		if (mlen == 0) {
 			continue;
-		w = mtod(m, u_short *);
-		if (mlen == -1) {
-			/*
-			 * The first byte of this mbuf is the continuation
-			 * of a word spanning between this mbuf and the
-			 * last mbuf.
-			 *
-			 * s_util.c[0] is already saved when scanning previous
-			 * mbuf.
-			 */
-			s_util.c[1] = *(char *)w;
-			sum += s_util.s;
-			w = (u_short *)((char *)w + 1);
-			mlen = m->m_len - 1;
-			len--;
-		} else
-			mlen = m->m_len;
-		if (len < mlen)
+		}
+		if (mlen > len) {
 			mlen = len;
+		}
 		len -= mlen;
+
+		partial = 0;
+		if ((uintptr_t)data & 1) {
+			/* Align on word boundary */
+			started_on_odd = !started_on_odd;
+#if BYTE_ORDER == LITTLE_ENDIAN
+			partial = *data << 8;
+#else /* BYTE_ORDER != LITTLE_ENDIAN */
+			partial = *data;
+#endif /* BYTE_ORDER != LITTLE_ENDIAN */
+			++data;
+			--mlen;
+		}
+		needs_swap = started_on_odd;
+		while (mlen >= 32) {
+			__builtin_prefetch(data + 32);
+			partial += *(uint16_t *)(void *)data;
+			partial += *(uint16_t *)(void *)(data + 2);
+			partial += *(uint16_t *)(void *)(data + 4);
+			partial += *(uint16_t *)(void *)(data + 6);
+			partial += *(uint16_t *)(void *)(data + 8);
+			partial += *(uint16_t *)(void *)(data + 10);
+			partial += *(uint16_t *)(void *)(data + 12);
+			partial += *(uint16_t *)(void *)(data + 14);
+			partial += *(uint16_t *)(void *)(data + 16);
+			partial += *(uint16_t *)(void *)(data + 18);
+			partial += *(uint16_t *)(void *)(data + 20);
+			partial += *(uint16_t *)(void *)(data + 22);
+			partial += *(uint16_t *)(void *)(data + 24);
+			partial += *(uint16_t *)(void *)(data + 26);
+			partial += *(uint16_t *)(void *)(data + 28);
+			partial += *(uint16_t *)(void *)(data + 30);
+			data += 32;
+			mlen -= 32;
+			if (__improbable(partial & 0xc0000000)) {
+				if (needs_swap) {
+					partial = (partial << 8) +
+					    (partial >> 24);
+				}
+				sum += (partial >> 16);
+				sum += (partial & 0xffff);
+				partial = 0;
+			}
+		}
+		if (mlen & 16) {
+			partial += *(uint16_t *)(void *)data;
+			partial += *(uint16_t *)(void *)(data + 2);
+			partial += *(uint16_t *)(void *)(data + 4);
+			partial += *(uint16_t *)(void *)(data + 6);
+			partial += *(uint16_t *)(void *)(data + 8);
+			partial += *(uint16_t *)(void *)(data + 10);
+			partial += *(uint16_t *)(void *)(data + 12);
+			partial += *(uint16_t *)(void *)(data + 14);
+			data += 16;
+			mlen -= 16;
+		}
 		/*
-		 * Force to even boundary.
+		 * mlen is not updated below, as the remaining tests
+		 * use bit masks, which are not affected.
 		 */
-		if ((1 & (int) w) && (mlen > 0)) {
-			REDUCE;
-			sum <<= 8;
-			s_util.c[0] = *(u_char *)w;
-			w = (u_short *)((char *)w + 1);
-			mlen--;
-			byte_swapped = 1;
+		if (mlen & 8) {
+			partial += *(uint16_t *)(void *)data;
+			partial += *(uint16_t *)(void *)(data + 2);
+			partial += *(uint16_t *)(void *)(data + 4);
+			partial += *(uint16_t *)(void *)(data + 6);
+			data += 8;
+		}
+		if (mlen & 4) {
+			partial += *(uint16_t *)(void *)data;
+			partial += *(uint16_t *)(void *)(data + 2);
+			data += 4;
+		}
+		if (mlen & 2) {
+			partial += *(uint16_t *)(void *)data;
+			data += 2;
+		}
+		if (mlen & 1) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+			partial += *data;
+#else /* BYTE_ORDER != LITTLE_ENDIAN */
+			partial += *data << 8;
+#endif /* BYTE_ORDER != LITTLE_ENDIAN */
+			started_on_odd = !started_on_odd;
 		}
+
+		if (needs_swap) {
+			partial = (partial << 8) + (partial >> 24);
+		}
+		sum += (partial >> 16) + (partial & 0xffff);
 		/*
-		 * Unroll the loop to make overhead from
-		 * branches &c small.
+		 * Reduce sum to allow potential byte swap
+		 * in the next iteration without carry.
 		 */
-		while ((mlen -= 32) >= 0) {
-			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
-			sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
-			sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
-			sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
-			w += 16;
-		}
-		mlen += 32;
-		while ((mlen -= 8) >= 0) {
-			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
-			w += 4;
-		}
-		mlen += 8;
-		if (mlen == 0 && byte_swapped == 0)
-			continue;
-		REDUCE;
-		while ((mlen -= 2) >= 0) {
-			sum += *w++;
-		}
-		if (byte_swapped) {
-			REDUCE;
-			sum <<= 8;
-			byte_swapped = 0;
-			if (mlen == -1) {
-				s_util.c[1] = *(char *)w;
-				sum += s_util.s;
-				mlen = 0;
-			} else
-				mlen = -1;
-		} else if (mlen == -1)
-			s_util.c[0] = *(char *)w;
-	}
-	if (len)
-		printf("cksum: out of data\n");
-	if (mlen == -1) {
-		/* The last mbuf has odd # of bytes. Follow the
-		   standard (the odd byte may be shifted left by 8 bits
-		   or not as determined by endian-ness of the machine) */
-		s_util.c[1] = 0;
-		sum += s_util.s;
+		sum = (sum >> 16) + (sum & 0xffff);
 	}
-	REDUCE;
-	KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_END, 0,0,0,0,0);
-	return (~sum & 0xffff);
+	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
+	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
+	return final_acc & 0xffff;
 }
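
A note on the started_on_odd/needs_swap machinery above: when a chunk begins
at an odd byte offset in the overall stream, its 16-bit words are byte-swapped
relative to the stream's word grid.  A 1's complement sum computed over the
swapped view equals the byte-swap of the true sum, so the code sums each chunk
as-is and rotates the partial by 8 bits before merging (the mid-loop
reductions keep the high bits clear so the rotate cannot drop carries).
A standalone check of that identity on sample words:

    #include <assert.h>
    #include <stdint.h>

    static uint16_t
    fold(uint32_t s)
    {
        while (s >> 16)
            s = (s >> 16) + (s & 0xffff);
        return (uint16_t)s;
    }

    static uint16_t
    swap16(uint16_t v)
    {
        return (uint16_t)((v << 8) | (v >> 8));
    }

    int main(void)
    {
        uint16_t a = 0x1234, b = 0xcdef;
        uint16_t s = fold((uint32_t)a + b);
        uint16_t t = fold((uint32_t)swap16(a) + swap16(b));

        /* Sum of swapped words == swap of the summed words. */
        assert(t == swap16(s));
        return 0;
    }
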
 
-int
-in_cksum_skip(m, len, skip)
-	register struct mbuf *m;
-	register u_short len;
-	register u_short skip;
+#else /* __LP64__ */
+/* 64-bit version */
+uint32_t
+in_cksum_mbuf_ref(struct mbuf *m, int len, int off, uint32_t initial_sum)
 {
-	register u_short *w;
-	register int sum = 0;
-	register int mlen = 0;
-	int byte_swapped = 0;
-	union s_util s_util;
-	union l_util l_util;   
-
-	KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0);
-
-	len -= skip;
-        for (; skip && m; m = m->m_next) {
-                if (m->m_len > skip) {
-                        mlen = m->m_len - skip;
-			w = (u_short *)(m->m_data+skip);
-                        goto skip_start;
-                } else {    
-                        skip -= m->m_len;
-                }
-        }
-	for (;m && len; m = m->m_next) {
-		if (m->m_len == 0)
-			continue;
-		w = mtod(m, u_short *);
-
-		if (mlen == -1) {
-			/*
-			 * The first byte of this mbuf is the continuation
-			 * of a word spanning between this mbuf and the
-			 * last mbuf.
-			 *
-			 * s_util.c[0] is already saved when scanning previous
-			 * mbuf.
-			 */
-			s_util.c[1] = *(char *)w;
-			sum += s_util.s;
-			w = (u_short *)((char *)w + 1);
-			mlen = m->m_len - 1;
-			len--;
-		} else {
-		  mlen = m->m_len;
+	int mlen;
+	uint64_t sum, partial;
+	unsigned int final_acc;
+	uint8_t *data;
+	boolean_t needs_swap, started_on_odd;
+
+	VERIFY(len >= 0);
+	VERIFY(off >= 0);
+
+	needs_swap = FALSE;
+	started_on_odd = FALSE;
+	sum = initial_sum;
+
+	for (;;) {
+		if (__improbable(m == NULL)) {
+			CKSUM_ERR("%s: out of data\n", __func__);
+			return (uint32_t)-1;
+		}
+		mlen = m->m_len;
+		if (mlen > off) {
+			mlen -= off;
+			data = mtod(m, uint8_t *) + off;
+			goto post_initial_offset;
 		}
-skip_start:
-		if (len < mlen)
-		    mlen = len;
+		off -= mlen;
+		if (len == 0) {
+			break;
+		}
+		m = m->m_next;
+	}
 
+	for (; len > 0; m = m->m_next) {
+		if (__improbable(m == NULL)) {
+			CKSUM_ERR("%s: out of data\n", __func__);
+			return (uint32_t)-1;
+		}
+		mlen = m->m_len;
+		data = mtod(m, uint8_t *);
+post_initial_offset:
+		if (mlen == 0) {
+			continue;
+		}
+		if (mlen > len) {
+			mlen = len;
+		}
 		len -= mlen;
+
+		partial = 0;
+		if ((uintptr_t)data & 1) {
+			/* Align on word boundary */
+			started_on_odd = !started_on_odd;
+#if BYTE_ORDER == LITTLE_ENDIAN
+			partial = *data << 8;
+#else /* BYTE_ORDER != LITTLE_ENDIAN */
+			partial = *data;
+#endif /* BYTE_ORDER != LITTLE_ENDIAN */
+			++data;
+			--mlen;
+		}
+		needs_swap = started_on_odd;
+		if ((uintptr_t)data & 2) {
+			if (mlen < 2) {
+				goto trailing_bytes;
+			}
+			partial += *(uint16_t *)(void *)data;
+			data += 2;
+			mlen -= 2;
+		}
+		while (mlen >= 64) {
+			__builtin_prefetch(data + 32);
+			__builtin_prefetch(data + 64);
+			partial += *(uint32_t *)(void *)data;
+			partial += *(uint32_t *)(void *)(data + 4);
+			partial += *(uint32_t *)(void *)(data + 8);
+			partial += *(uint32_t *)(void *)(data + 12);
+			partial += *(uint32_t *)(void *)(data + 16);
+			partial += *(uint32_t *)(void *)(data + 20);
+			partial += *(uint32_t *)(void *)(data + 24);
+			partial += *(uint32_t *)(void *)(data + 28);
+			partial += *(uint32_t *)(void *)(data + 32);
+			partial += *(uint32_t *)(void *)(data + 36);
+			partial += *(uint32_t *)(void *)(data + 40);
+			partial += *(uint32_t *)(void *)(data + 44);
+			partial += *(uint32_t *)(void *)(data + 48);
+			partial += *(uint32_t *)(void *)(data + 52);
+			partial += *(uint32_t *)(void *)(data + 56);
+			partial += *(uint32_t *)(void *)(data + 60);
+			data += 64;
+			mlen -= 64;
+			if (__improbable(partial & (3ULL << 62))) {
+				if (needs_swap) {
+					partial = (partial << 8) +
+					    (partial >> 56);
+				}
+				sum += (partial >> 32);
+				sum += (partial & 0xffffffff);
+				partial = 0;
+			}
+		}
 		/*
-		 * Force to even boundary.
+		 * mlen is not updated below, as the remaining tests
+		 * use bit masks, which are not affected.
 		 */
-		if ((1 & (int) w) && (mlen > 0)) {
-			REDUCE;
-			sum <<= 8;
-			s_util.c[0] = *(u_char *)w;
-			w = (u_short *)((char *)w + 1);
-			mlen--;
-			byte_swapped = 1;
+		if (mlen & 32) {
+			partial += *(uint32_t *)(void *)data;
+			partial += *(uint32_t *)(void *)(data + 4);
+			partial += *(uint32_t *)(void *)(data + 8);
+			partial += *(uint32_t *)(void *)(data + 12);
+			partial += *(uint32_t *)(void *)(data + 16);
+			partial += *(uint32_t *)(void *)(data + 20);
+			partial += *(uint32_t *)(void *)(data + 24);
+			partial += *(uint32_t *)(void *)(data + 28);
+			data += 32;
+		}
+		if (mlen & 16) {
+			partial += *(uint32_t *)(void *)data;
+			partial += *(uint32_t *)(void *)(data + 4);
+			partial += *(uint32_t *)(void *)(data + 8);
+			partial += *(uint32_t *)(void *)(data + 12);
+			data += 16;
+		}
+		if (mlen & 8) {
+			partial += *(uint32_t *)(void *)data;
+			partial += *(uint32_t *)(void *)(data + 4);
+			data += 8;
+		}
+		if (mlen & 4) {
+			partial += *(uint32_t *)(void *)data;
+			data += 4;
 		}
+		if (mlen & 2) {
+			partial += *(uint16_t *)(void *)data;
+			data += 2;
+		}
+trailing_bytes:
+		if (mlen & 1) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+			partial += *data;
+#else /* BYTE_ORDER != LITTLE_ENDIAN */
+			partial += *data << 8;
+#endif /* BYTE_ORDER != LITTLE_ENDIAN */
+			started_on_odd = !started_on_odd;
+		}
+
+		if (needs_swap) {
+			partial = (partial << 8) + (partial >> 56);
+		}
+		sum += (partial >> 32) + (partial & 0xffffffff);
 		/*
-		 * Unroll the loop to make overhead from
-		 * branches &c small.
+		 * Reduce sum to allow potential byte swap
+		 * in the next iteration without carry.
 		 */
-		while ((mlen -= 32) >= 0) {
-			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
-			sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
-			sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
-			sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
-			w += 16;
-		}
-		mlen += 32;
-		while ((mlen -= 8) >= 0) {
-			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
-			w += 4;
-		}
-		mlen += 8;
-		if (mlen == 0 && byte_swapped == 0)
-			continue;
-		REDUCE;
-		while ((mlen -= 2) >= 0) {
-			sum += *w++;
-		}
-		if (byte_swapped) {
-			REDUCE;
-			sum <<= 8;
-			byte_swapped = 0;
-			if (mlen == -1) {
-				s_util.c[1] = *(char *)w;
-				sum += s_util.s;
-				mlen = 0;
-			} else
-				mlen = -1;
-		} else if (mlen == -1)
-			s_util.c[0] = *(char *)w;
-	}
-	if (len)
-		printf("cksum: out of data\n");
-	if (mlen == -1) {
-		/* The last mbuf has odd # of bytes. Follow the
-		   standard (the odd byte may be shifted left by 8 bits
-		   or not as determined by endian-ness of the machine) */
-		s_util.c[1] = 0;
-		sum += s_util.s;
+		sum = (sum >> 32) + (sum & 0xffffffff);
 	}
-	REDUCE;
-	KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_END, 0,0,0,0,0);
-	return (~sum & 0xffff);
+	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
+	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
+	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
+	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
+	return final_acc & 0xffff;
 }
-
-#endif
+#endif /* __LP64__ */
+#endif /* DEBUG || DEVELOPMENT */
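
One detail worth calling out in the 64-bit variant's tail: the accumulator is
collapsed as four 16-bit lanes and then folded twice, because the first fold
of the lane sum (at most 0x3fffc) can itself carry past 16 bits.  A
standalone sketch of that closing sequence:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t sum = 0xffffffffffffffffULL;   /* worst-case accumulator */
        unsigned int final_acc;

        final_acc = (unsigned int)((sum >> 48) + ((sum >> 32) & 0xffff) +
            ((sum >> 16) & 0xffff) + (sum & 0xffff));
        final_acc = (final_acc >> 16) + (final_acc & 0xffff);  /* fold */
        final_acc = (final_acc >> 16) + (final_acc & 0xffff);  /* carry */
        printf("0x%04x\n", final_acc & 0xffff);
        return 0;
    }
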