X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/ff6e181ae92fc6f1e89841290f461d1f2f9badd9..d190cdc3f5544636abb56dc1874be391d3e1b148:/bsd/netinet/in_cksum.c

diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c
index ac8b2648c..bc302ae30 100644
--- a/bsd/netinet/in_cksum.c
+++ b/bsd/netinet/in_cksum.c
@@ -1,14 +1,19 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
  *
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
  *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
@@ -18,7 +23,7 @@
  * Please see the License for the specific language governing rights and
  * limitations under the License.
  *
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * Copyright (c) 1988, 1992, 1993
@@ -56,10 +61,14 @@
  */
 
 #include <sys/param.h>
+#include <machine/endian.h>
 #include <sys/mbuf.h>
-#include <sys/kdebug.h>
-
-#define DBG_FNC_IN_CKSUM	NETDBG_CODE(DBG_NETIP, (3 << 8))
+#include <kern/debug.h>
+#include <net/dlil.h>
+#include <netinet/in.h>
+#define _IP_VHL
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
 
 /*
  * Checksum routine for Internet Protocol family headers (Portable Version).
@@ -67,378 +76,415 @@
  * This routine is very heavily used in the network
  * code and should be modified for each CPU to be as fast as possible.
  */
-
-union s_util {
-	char c[2];
-	u_short s;
-};
+#define REDUCE16 {							  \
+	q_util.q = sum;							  \
+	l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
+	sum = l_util.s[0] + l_util.s[1];				  \
+	ADDCARRY(sum);							  \
+}
 
 union l_util {
-	u_int16_t s[2];
-	u_int32_t l;
+	uint16_t s[2];
+	uint32_t l;
 };
 
 union q_util {
-	u_int16_t s[4];
-	u_int32_t l[2];
-	u_int64_t q;
-};
-
-#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
-
-#define REDUCE32 \
-	{ \
-	q_util.q = sum; \
-	sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
-	}
-#define REDUCE16 \
-	{ \
-	q_util.q = sum; \
-	l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
-	sum = l_util.s[0] + l_util.s[1]; \
-	ADDCARRY(sum); \
-	}
-
-#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
-
-
-#if defined(ppc)
-
-__inline unsigned short
-in_addword(u_short a, u_short b)
-{
-	union l_util l_util;
-	u_int32_t sum = a + b;
-	REDUCE;
-	return (sum);
-}
+	uint16_t s[4];
+	uint32_t l[2];
+	uint64_t q;
+};
 
-__inline unsigned short
-in_pseudo(u_int a, u_int b, u_int c)
-{
-	u_int64_t sum;
-	union q_util q_util;
-	union l_util l_util;
+#define PREDICT_FALSE(_exp)	__builtin_expect((_exp), 0)
 
-	sum = (u_int64_t) a + b + c;
-	REDUCE16;
-	return (sum);
+static uint16_t in_cksumdata(const void *buf, int len);
 
-}
+/*
+ * Portable version of 16-bit 1's complement sum function that works
+ * on a contiguous buffer.  This is used mainly for instances where
+ * the caller is certain about the buffer requirements, e.g. for IP
+ * header checksum calculation, though it is capable of being used
+ * on any arbitrary data span.  The platform-specific cpu_in_cksum()
+ * routine might be better optimized, so use that instead for large
+ * data span.
+ *
+ * The logic is borrowed from <bsd/netinet/cpu_in_cksum.c>
+ */
+
-int
-in_cksum(m, len)
-	register struct mbuf *m;
-	register int len;
+#if ULONG_MAX == 0xffffffffUL
+/* 32-bit version */
+static uint16_t
+in_cksumdata(const void *buf, int mlen)
 {
-	register u_short *w;
-	register int sum = 0;
-	register int mlen = 0;
-	int starting_on_odd = 0;
-
-
-	KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0);
-
-	for (;m && len; m = m->m_next) {
-		if (m->m_len == 0)
-			continue;
-		mlen = m->m_len;
-		w = mtod(m, u_short *);
-
-		if (len < mlen)
-			mlen = len;
-
-		sum = xsum_assym(w, mlen, sum, starting_on_odd);
-		len -= mlen;
-		if (mlen & 0x1)
-		{
-			if (starting_on_odd)
-				starting_on_odd = 0;
-			else
-				starting_on_odd = 1;
+	uint32_t sum, partial;
+	unsigned int final_acc;
+	const uint8_t *data = (const uint8_t *)buf;
+	boolean_t needs_swap, started_on_odd;
+
+	VERIFY(mlen >= 0);
+
+	needs_swap = FALSE;
+	started_on_odd = FALSE;
+
+	sum = 0;
+	partial = 0;
+
+	if ((uintptr_t)data & 1) {
+		/* Align on word boundary */
+		started_on_odd = !started_on_odd;
+#if BYTE_ORDER == LITTLE_ENDIAN
+		partial = *data << 8;
+#else
+		partial = *data;
+#endif
+		++data;
+		--mlen;
+	}
+	needs_swap = started_on_odd;
+	while (mlen >= 32) {
+		__builtin_prefetch(data + 32);
+		partial += *(const uint16_t *)(const void *)data;
+		partial += *(const uint16_t *)(const void *)(data + 2);
+		partial += *(const uint16_t *)(const void *)(data + 4);
+		partial += *(const uint16_t *)(const void *)(data + 6);
+		partial += *(const uint16_t *)(const void *)(data + 8);
+		partial += *(const uint16_t *)(const void *)(data + 10);
+		partial += *(const uint16_t *)(const void *)(data + 12);
+		partial += *(const uint16_t *)(const void *)(data + 14);
+		partial += *(const uint16_t *)(const void *)(data + 16);
+		partial += *(const uint16_t *)(const void *)(data + 18);
+		partial += *(const uint16_t *)(const void *)(data + 20);
+		partial += *(const uint16_t *)(const void *)(data + 22);
+		partial += *(const uint16_t *)(const void *)(data + 24);
+		partial += *(const uint16_t *)(const void *)(data + 26);
+		partial += *(const uint16_t *)(const void *)(data + 28);
+		partial += *(const uint16_t *)(const void *)(data + 30);
+		data += 32;
+		mlen -= 32;
+		if (PREDICT_FALSE(partial & 0xc0000000)) {
+			if (needs_swap)
+				partial = (partial << 8) +
+				    (partial >> 24);
+			sum += (partial >> 16);
+			sum += (partial & 0xffff);
+			partial = 0;
 		}
 	}
+	if (mlen & 16) {
+		partial += *(const uint16_t *)(const void *)data;
+		partial += *(const uint16_t *)(const void *)(data + 2);
+		partial += *(const uint16_t *)(const void *)(data + 4);
+		partial += *(const uint16_t *)(const void *)(data + 6);
+		partial += *(const uint16_t *)(const void *)(data + 8);
+		partial += *(const uint16_t *)(const void *)(data + 10);
+		partial += *(const uint16_t *)(const void *)(data + 12);
+		partial += *(const uint16_t *)(const void *)(data + 14);
+		data += 16;
+		mlen -= 16;
+	}
+	/*
+	 * mlen is not updated below as the remaining tests
+	 * are using bit masks, which are not affected.
+	 */
+	if (mlen & 8) {
+		partial += *(const uint16_t *)(const void *)data;
+		partial += *(const uint16_t *)(const void *)(data + 2);
+		partial += *(const uint16_t *)(const void *)(data + 4);
+		partial += *(const uint16_t *)(const void *)(data + 6);
+		data += 8;
+	}
+	if (mlen & 4) {
+		partial += *(const uint16_t *)(const void *)data;
+		partial += *(const uint16_t *)(const void *)(data + 2);
+		data += 4;
+	}
+	if (mlen & 2) {
+		partial += *(const uint16_t *)(const void *)data;
+		data += 2;
+	}
+	if (mlen & 1) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+		partial += *data;
+#else
+		partial += *data << 8;
+#endif
+		started_on_odd = !started_on_odd;
+	}
 
-	KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_END, 0,0,0,0,0);
-	return (~sum & 0xffff);
+	if (needs_swap)
+		partial = (partial << 8) + (partial >> 24);
+	sum += (partial >> 16) + (partial & 0xffff);
+	sum = (sum >> 16) + (sum & 0xffff);
+
+	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
+	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
+
+	return (final_acc);
 }
 
-u_short
-in_cksum_skip(m, len, skip)
-	register struct mbuf *m;
-	register int len;
-	register int skip;
+#else
+/* 64-bit version */
+static uint16_t
+in_cksumdata(const void *buf, int mlen)
 {
-	register u_short *w;
-	register int sum = 0;
-	register int mlen = 0;
-	int starting_on_odd = 0;
-
-	len -= skip;
-	for (; skip && m; m = m->m_next) {
-		if (m->m_len > skip) {
-			mlen = m->m_len - skip;
-			w = (u_short *)(m->m_data+skip);
-			goto skip_start;
-		} else {
-			skip -= m->m_len;
-		}
-	}
-	for (;m && len; m = m->m_next) {
-		if (m->m_len == 0)
-			continue;
-		mlen = m->m_len;
-		w = mtod(m, u_short *);
-
-skip_start:
-		if (len < mlen)
-			mlen = len;
-		sum = xsum_assym(w, mlen, sum, starting_on_odd);
-		len -= mlen;
-		if (mlen & 0x1)
-		{
-			if (starting_on_odd)
-				starting_on_odd = 0;
-			else
-				starting_on_odd = 1;
+	uint64_t sum, partial;
+	unsigned int final_acc;
+	const uint8_t *data = (const uint8_t *)buf;
+	boolean_t needs_swap, started_on_odd;
+
+	VERIFY(mlen >= 0);
+
+	needs_swap = FALSE;
+	started_on_odd = FALSE;
+
+	sum = 0;
+	partial = 0;
+
+	if ((uintptr_t)data & 1) {
+		/* Align on word boundary */
+		started_on_odd = !started_on_odd;
+#if BYTE_ORDER == LITTLE_ENDIAN
+		partial = *data << 8;
+#else
+		partial = *data;
+#endif
+		++data;
+		--mlen;
+	}
+	needs_swap = started_on_odd;
+	if ((uintptr_t)data & 2) {
+		if (mlen < 2)
+			goto trailing_bytes;
+		partial += *(const uint16_t *)(const void *)data;
+		data += 2;
+		mlen -= 2;
+	}
+	while (mlen >= 64) {
+		__builtin_prefetch(data + 32);
+		__builtin_prefetch(data + 64);
+		partial += *(const uint32_t *)(const void *)data;
+		partial += *(const uint32_t *)(const void *)(data + 4);
+		partial += *(const uint32_t *)(const void *)(data + 8);
+		partial += *(const uint32_t *)(const void *)(data + 12);
+		partial += *(const uint32_t *)(const void *)(data + 16);
+		partial += *(const uint32_t *)(const void *)(data + 20);
+		partial += *(const uint32_t *)(const void *)(data + 24);
+		partial += *(const uint32_t *)(const void *)(data + 28);
+		partial += *(const uint32_t *)(const void *)(data + 32);
+		partial += *(const uint32_t *)(const void *)(data + 36);
+		partial += *(const uint32_t *)(const void *)(data + 40);
+		partial += *(const uint32_t *)(const void *)(data + 44);
+		partial += *(const uint32_t *)(const void *)(data + 48);
+		partial += *(const uint32_t *)(const void *)(data + 52);
+		partial += *(const uint32_t *)(const void *)(data + 56);
+		partial += *(const uint32_t *)(const void *)(data + 60);
+		data += 64;
+		mlen -= 64;
+		if (PREDICT_FALSE(partial & (3ULL << 62))) {
+			if (needs_swap)
+				partial = (partial << 8) +
+				    (partial >> 56);
+			sum += (partial >> 32);
+			sum += (partial & 0xffffffff);
+			partial = 0;
 		}
 	}
+	/*
+	 * mlen is not updated below as the remaining tests
+	 * are using bit masks, which are not affected.
+	 */
+	if (mlen & 32) {
+		partial += *(const uint32_t *)(const void *)data;
+		partial += *(const uint32_t *)(const void *)(data + 4);
+		partial += *(const uint32_t *)(const void *)(data + 8);
+		partial += *(const uint32_t *)(const void *)(data + 12);
+		partial += *(const uint32_t *)(const void *)(data + 16);
+		partial += *(const uint32_t *)(const void *)(data + 20);
+		partial += *(const uint32_t *)(const void *)(data + 24);
+		partial += *(const uint32_t *)(const void *)(data + 28);
+		data += 32;
+	}
+	if (mlen & 16) {
+		partial += *(const uint32_t *)(const void *)data;
+		partial += *(const uint32_t *)(const void *)(data + 4);
+		partial += *(const uint32_t *)(const void *)(data + 8);
+		partial += *(const uint32_t *)(const void *)(data + 12);
+		data += 16;
+	}
+	if (mlen & 8) {
+		partial += *(const uint32_t *)(const void *)data;
+		partial += *(const uint32_t *)(const void *)(data + 4);
+		data += 8;
+	}
+	if (mlen & 4) {
+		partial += *(const uint32_t *)(const void *)data;
+		data += 4;
+	}
+	if (mlen & 2) {
+		partial += *(const uint16_t *)(const void *)data;
+		data += 2;
+	}
+trailing_bytes:
+	if (mlen & 1) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+		partial += *data;
+#else
+		partial += *data << 8;
+#endif
+		started_on_odd = !started_on_odd;
+	}
 
-	return (~sum & 0xffff);
+	if (needs_swap)
+		partial = (partial << 8) + (partial >> 56);
+	sum += (partial >> 32) + (partial & 0xffffffff);
+	sum = (sum >> 32) + (sum & 0xffffffff);
+
+	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
+	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
+	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
+	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
+
+	return (final_acc);
 }
 
-#else
+#endif /* ULONG_MAX != 0xffffffffUL */
 
-u_short
-in_addword(u_short a, u_short b)
-{
-	union l_util l_util;
-	u_int32_t sum = a + b;
-	REDUCE(sum);
-	return (sum);
-}
+/*
+ * Perform 16-bit 1's complement sum on a contiguous span.
+ */
+uint16_t
+b_sum16(const void *buf, int len)
+{
+	return (in_cksumdata(buf, len));
+}
+
+uint16_t inet_cksum_simple(struct mbuf *, int);
+/*
+ * For the exported _in_cksum symbol in BSDKernel symbol set.
+ */ +uint16_t +inet_cksum_simple(struct mbuf *m, int len) +{ + return (inet_cksum(m, 0, 0, len)); +} -u_short -in_pseudo(u_int a, u_int b, u_int c) +uint16_t +in_addword(uint16_t a, uint16_t b) { - u_int64_t sum; + uint64_t sum = a + b; + + ADDCARRY(sum); + return (sum); +} + +uint16_t +in_pseudo(uint32_t a, uint32_t b, uint32_t c) +{ + uint64_t sum; union q_util q_util; - union l_util l_util; + union l_util l_util; - sum = (u_int64_t) a + b + c; + sum = (uint64_t)a + b + c; REDUCE16; return (sum); } +uint16_t +in_pseudo64(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t sum; + union q_util q_util; + union l_util l_util; -int -in_cksum(m, len) - register struct mbuf *m; - register int len; + sum = a + b + c; + REDUCE16; + return (sum); +} + +/* + * May be used on IP header with options. + */ +uint16_t +in_cksum_hdr_opt(const struct ip *ip) { - register u_short *w; - register int sum = 0; - register int mlen = 0; - int byte_swapped = 0; - union s_util s_util; - union l_util l_util; - - KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0); - - for (;m && len; m = m->m_next) { - if (m->m_len == 0) - continue; - w = mtod(m, u_short *); - if (mlen == -1) { - /* - * The first byte of this mbuf is the continuation - * of a word spanning between this mbuf and the - * last mbuf. - * - * s_util.c[0] is already saved when scanning previous - * mbuf. - */ - s_util.c[1] = *(char *)w; - sum += s_util.s; - w = (u_short *)((char *)w + 1); - mlen = m->m_len - 1; - len--; - } else - mlen = m->m_len; - if (len < mlen) - mlen = len; - len -= mlen; - /* - * Force to even boundary. - */ - if ((1 & (int) w) && (mlen > 0)) { - REDUCE; - sum <<= 8; - s_util.c[0] = *(u_char *)w; - w = (u_short *)((char *)w + 1); - mlen--; - byte_swapped = 1; - } - /* - * Unroll the loop to make overhead from - * branches &c small. - */ - while ((mlen -= 32) >= 0) { - sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; - sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; - sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; - sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; - w += 16; - } - mlen += 32; - while ((mlen -= 8) >= 0) { - sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; - w += 4; - } - mlen += 8; - if (mlen == 0 && byte_swapped == 0) - continue; - REDUCE; - while ((mlen -= 2) >= 0) { - sum += *w++; - } - if (byte_swapped) { - REDUCE; - sum <<= 8; - byte_swapped = 0; - if (mlen == -1) { - s_util.c[1] = *(char *)w; - sum += s_util.s; - mlen = 0; - } else - mlen = -1; - } else if (mlen == -1) - s_util.c[0] = *(char *)w; - } - if (len) - printf("cksum: out of data\n"); - if (mlen == -1) { - /* The last mbuf has odd # of bytes. Follow the - standard (the odd byte may be shifted left by 8 bits - or not as determined by endian-ness of the machine) */ - s_util.c[1] = 0; - sum += s_util.s; + return (~b_sum16(ip, (IP_VHL_HL(ip->ip_vhl) << 2)) & 0xffff); +} + +/* + * A wrapper around the simple in_cksum_hdr() and the more complicated + * inet_cksum(); the former is chosen if the IP header is simple, + * contiguous and 32-bit aligned. Also does some stats accounting. 
+ */ +uint16_t +ip_cksum_hdr_dir(struct mbuf *m, uint32_t hlen, int out) +{ + struct ip *ip = mtod(m, struct ip *); + + if (out) { + ipstat.ips_snd_swcsum++; + ipstat.ips_snd_swcsum_bytes += hlen; + } else { + ipstat.ips_rcv_swcsum++; + ipstat.ips_rcv_swcsum_bytes += hlen; } - REDUCE; - KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_END, 0,0,0,0,0); - return (~sum & 0xffff); + + if (hlen == sizeof (*ip) && + m->m_len >= sizeof (*ip) && IP_HDR_ALIGNED_P(ip)) + return (in_cksum_hdr(ip)); + + return (inet_cksum(m, 0, 0, hlen)); } -int -in_cksum_skip(m, len, skip) - register struct mbuf *m; - register u_short len; - register u_short skip; +/* + * m MUST contain at least an IP header, if nxt is specified; + * nxt is the upper layer protocol number; + * off is an offset where TCP/UDP/ICMP header starts; + * len is a total length of a transport segment (e.g. TCP header + TCP payload) + */ +uint16_t +inet_cksum(struct mbuf *m, uint32_t nxt, uint32_t off, uint32_t len) { - register u_short *w; - register int sum = 0; - register int mlen = 0; - int byte_swapped = 0; - union s_util s_util; - union l_util l_util; - - KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0); - - len -= skip; - for (; skip && m; m = m->m_next) { - if (m->m_len > skip) { - mlen = m->m_len - skip; - w = (u_short *)(m->m_data+skip); - goto skip_start; - } else { - skip -= m->m_len; - } - } - for (;m && len; m = m->m_next) { - if (m->m_len == 0) - continue; - w = mtod(m, u_short *); - - if (mlen == -1) { - /* - * The first byte of this mbuf is the continuation - * of a word spanning between this mbuf and the - * last mbuf. - * - * s_util.c[0] is already saved when scanning previous - * mbuf. - */ - s_util.c[1] = *(char *)w; - sum += s_util.s; - w = (u_short *)((char *)w + 1); - mlen = m->m_len - 1; - len--; - } else { - mlen = m->m_len; - } -skip_start: - if (len < mlen) - mlen = len; + uint32_t sum; + + sum = m_sum16(m, off, len); + + /* include pseudo header checksum? */ + if (nxt != 0) { + struct ip *ip; + unsigned char buf[sizeof ((*ip))] __attribute__((aligned(8))); + uint32_t mlen; - len -= mlen; /* - * Force to even boundary. + * Sanity check + * + * Use m_length2() instead of m_length(), as we cannot rely on + * the caller setting m_pkthdr.len correctly, if the mbuf is + * a M_PKTHDR one. */ - if ((1 & (int) w) && (mlen > 0)) { - REDUCE; - sum <<= 8; - s_util.c[0] = *(u_char *)w; - w = (u_short *)((char *)w + 1); - mlen--; - byte_swapped = 1; + if ((mlen = m_length2(m, NULL)) < sizeof (*ip)) { + panic("%s: mbuf %p too short (%d) for IPv4 header", + __func__, m, mlen); + /* NOTREACHED */ } + /* - * Unroll the loop to make overhead from - * branches &c small. + * In case the IP header is not contiguous, or not 32-bit + * aligned, copy it to a local buffer. Note here that we + * expect the data pointer to point to the IP header. 
*/ - while ((mlen -= 32) >= 0) { - sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; - sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; - sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; - sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; - w += 16; - } - mlen += 32; - while ((mlen -= 8) >= 0) { - sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; - w += 4; - } - mlen += 8; - if (mlen == 0 && byte_swapped == 0) - continue; - REDUCE; - while ((mlen -= 2) >= 0) { - sum += *w++; + if ((sizeof (*ip) > m->m_len) || + !IP_HDR_ALIGNED_P(mtod(m, caddr_t))) { + m_copydata(m, 0, sizeof (*ip), (caddr_t)buf); + ip = (struct ip *)(void *)buf; + } else { + ip = (struct ip *)(void *)(m->m_data); } - if (byte_swapped) { - REDUCE; - sum <<= 8; - byte_swapped = 0; - if (mlen == -1) { - s_util.c[1] = *(char *)w; - sum += s_util.s; - mlen = 0; - } else - mlen = -1; - } else if (mlen == -1) - s_util.c[0] = *(char *)w; - } - if (len) - printf("cksum: out of data\n"); - if (mlen == -1) { - /* The last mbuf has odd # of bytes. Follow the - standard (the odd byte may be shifted left by 8 bits - or not as determined by endian-ness of the machine) */ - s_util.c[1] = 0; - sum += s_util.s; + + /* add pseudo header checksum */ + sum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htonl(len + nxt)); + + /* fold in carry bits */ + ADDCARRY(sum); } - REDUCE; - KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_END, 0,0,0,0,0); + return (~sum & 0xffff); } - -#endif
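
The REDUCE16 macro that this change hoists to the top of the file, and the final_acc folding at the end of each in_cksumdata() variant, rely on the same identity: a 1's complement sum may be computed in a wide accumulator and folded afterwards, adding each carry-out back into the low-order bits. A minimal standalone sketch of the fold, assuming a 64-bit accumulator (fold64 is a hypothetical name, not a symbol in this file):

#include <stdint.h>

/* Fold a 64-bit 1's complement accumulator to 16 bits, mirroring REDUCE16
 * and the final_acc computation in in_cksumdata(): add the four 16-bit
 * lanes, then keep folding the carry back in until the result fits. */
static uint16_t
fold64(uint64_t sum)
{
	sum = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return ((uint16_t)sum);
}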
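
The in_cksumdata() loops exploit the order-independence of 1's complement addition: 16-bit (or, in the 64-bit build, 32-bit) words are accumulated into a wider partial sum, and the fold is deferred until the top bits of partial show that another unrolled pass could overflow, which is the PREDICT_FALSE branch. An odd starting address is handled by summing as though the stream began one byte earlier and byte-swapping the result at the end (started_on_odd/needs_swap). A simplified userland sketch of the deferred-fold idea, with memcpy standing in for the kernel's cast loads and without the byte-swap handling (sum16_sketch is a hypothetical name):

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* Simplified in_cksumdata(): accumulate 16-bit words into a 32-bit
 * partial sum; fold into sum only when the top bits say it might wrap. */
static uint16_t
sum16_sketch(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0, partial = 0;

	while (len >= 2) {
		uint16_t w;

		memcpy(&w, p, sizeof (w));	/* unaligned-safe 16-bit load */
		partial += w;
		p += 2;
		len -= 2;
		if (partial & 0xc0000000) {	/* fold before it can overflow */
			sum += (partial >> 16) + (partial & 0xffff);
			partial = 0;
		}
	}
	if (len != 0) {
		/* Trailing odd byte; a big-endian host would shift it
		 * left by 8 instead, as the kernel code above does. */
		partial += *p;
	}
	sum += (partial >> 16) + (partial & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return ((uint16_t)sum);
}

The wire checksum is the complement of such a sum, which is what inet_cksum() returns; conversely, summing a received IPv4 header over its full IP_VHL_HL(ip_vhl) << 2 bytes, checksum field included, yields 0xffff, which is why a zero result from the in_cksum_hdr_opt() computation means the header verifies.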
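
in_addword(), rewritten above to widen into a uint64_t and ADDCARRY the result, is the primitive for combining two 16-bit partial checksums: plain addition with the carry wrapped back around (end-around carry). ADDCARRY subtracts 65535 rather than masking; the two are equivalent, since for x > 65535 the value x - 65535 equals (x & 0xffff) + 1. A sketch of the same operation (add16 is a hypothetical name):

#include <stdint.h>

/* 1's complement addition of two 16-bit partial sums, as in_addword():
 * widen, add, and fold the carry-out back into the low 16 bits. */
static uint16_t
add16(uint16_t a, uint16_t b)
{
	uint32_t sum = (uint32_t)a + b;

	return ((uint16_t)((sum >> 16) + (sum & 0xffff)));
}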
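
inet_cksum() seeds a transport checksum with the IPv4 pseudo header by calling in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(len + nxt)): the two 32-bit addresses plus one 32-bit word carrying the segment length and protocol number, folded by REDUCE16. A self-contained sketch of that arithmetic, assuming the addresses are already in network byte order (pseudo_hdr_sum is a hypothetical name):

#include <arpa/inet.h>
#include <stdint.h>

/* 1's complement sum of the IPv4 pseudo header, matching what
 * in_pseudo(src, dst, htonl(len + proto)) computes. */
static uint16_t
pseudo_hdr_sum(uint32_t src, uint32_t dst, uint8_t proto, uint16_t len)
{
	uint64_t sum = (uint64_t)src + dst + htonl((uint32_t)len + proto);

	/* REDUCE16: fold 64 -> 32 -> 16 bits with end-around carry */
	sum = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return ((uint16_t)sum);
}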
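
Putting the pieces together the way inet_cksum() does for a contiguous UDP segment: sum the transport header plus payload, add the pseudo-header sum with end-around carry, and complement. A hypothetical driver built on the sketches above (udp_cksum_sketch is not a kernel symbol; note that per RFC 768 a computed value of zero is transmitted as 0xffff on the wire):

/* Hypothetical userland walk-through combining the sketches above. */
uint16_t
udp_cksum_sketch(uint32_t src, uint32_t dst, const void *seg, uint16_t len)
{
	uint32_t sum = sum16_sketch(seg, len);		/* UDP header + data */

	sum += pseudo_hdr_sum(src, dst, 17, len);	/* IPPROTO_UDP == 17 */
	sum = (sum >> 16) + (sum & 0xffff);		/* ADDCARRY */
	return ((uint16_t)(~sum & 0xffff));
}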