+#define PREDICT_FALSE(_exp) __builtin_expect((_exp), 0)
+
+static uint16_t in_cksumdata(const void *buf, int len);
+
+/*
+ * Portable version of 16-bit 1's complement sum function that works
+ * on a contiguous buffer. It is used mainly in cases where the
+ * caller already knows the buffer layout, e.g. for IP header
+ * checksum calculation, though it can be used on any arbitrary
+ * data span. The platform-specific cpu_in_cksum() routine may be
+ * better optimized, so prefer that one for large data spans.
+ *
+ * The logic is borrowed from <bsd/netinet/cpu_in_cksum.c>
+ */
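+
+/*
+ * Illustrative usage (an assumption for this note, not part of this
+ * change): a caller computing an IPv4 header checksum over a
+ * contiguous, even-aligned header would typically zero the checksum
+ * field, sum the header, and store the one's complement of the
+ * result, e.g.:
+ *
+ *     ip->ip_sum = 0;
+ *     ip->ip_sum = ~in_cksumdata(ip, ip->ip_hl << 2) & 0xffff;
+ *
+ * where "ip" points to a BSD struct ip; the exact fields and final
+ * fold depend on the caller's environment.
+ */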
+
+#if ULONG_MAX == 0xffffffffUL
+/* 32-bit version */
+static uint16_t
+in_cksumdata(const void *buf, int mlen)
+{
+ uint32_t sum, partial;
+ unsigned int final_acc;
+ uint8_t *data = (void *)buf;
+ boolean_t needs_swap, started_on_odd;
+
+ VERIFY(mlen >= 0);
+
+ needs_swap = FALSE;
+ started_on_odd = FALSE;
+
+ sum = 0;
+ partial = 0;
+
+ if ((uintptr_t)data & 1) {
+ /* Align on word boundary */
+ started_on_odd = !started_on_odd;
+#if BYTE_ORDER == LITTLE_ENDIAN
+ partial = *data << 8;
+#else
+ partial = *data;
+#endif
+ ++data;
+ --mlen;
+ }
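+ /*
+ * If the span started at an odd address, the aligned 16-bit loads
+ * below pair bytes one position off from the logical byte stream,
+ * so the accumulated sum must be byte-swapped when it is folded.
+ */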
+ needs_swap = started_on_odd;
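+ /*
+ * Main loop: sum 32 bytes (sixteen 16-bit words) per iteration,
+ * prefetching the next chunk to hide memory latency.
+ */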
+ while (mlen >= 32) {
+ __builtin_prefetch(data + 32);
+ partial += *(uint16_t *)(void *)data;
+ partial += *(uint16_t *)(void *)(data + 2);
+ partial += *(uint16_t *)(void *)(data + 4);
+ partial += *(uint16_t *)(void *)(data + 6);
+ partial += *(uint16_t *)(void *)(data + 8);
+ partial += *(uint16_t *)(void *)(data + 10);
+ partial += *(uint16_t *)(void *)(data + 12);
+ partial += *(uint16_t *)(void *)(data + 14);
+ partial += *(uint16_t *)(void *)(data + 16);
+ partial += *(uint16_t *)(void *)(data + 18);
+ partial += *(uint16_t *)(void *)(data + 20);
+ partial += *(uint16_t *)(void *)(data + 22);
+ partial += *(uint16_t *)(void *)(data + 24);
+ partial += *(uint16_t *)(void *)(data + 26);
+ partial += *(uint16_t *)(void *)(data + 28);
+ partial += *(uint16_t *)(void *)(data + 30);
+ data += 32;
+ mlen -= 32;
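+ /*
+ * Fold partial into sum before the 32-bit accumulator can wrap:
+ * each iteration adds at most 16 * 0xffff, so folding whenever
+ * the top two bits become set keeps partial from overflowing.
+ */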
+ if (PREDICT_FALSE(partial & 0xc0000000)) {
+ if (needs_swap)
+ partial = (partial << 8) +
+ (partial >> 24);
+ sum += (partial >> 16);
+ sum += (partial & 0xffff);
+ partial = 0;
+ }
+ }
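+ /*
+ * At most 31 bytes remain; handle them in decreasing power-of-two
+ * chunks selected by bit tests on mlen.
+ */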
+ if (mlen & 16) {
+ partial += *(uint16_t *)(void *)data;
+ partial += *(uint16_t *)(void *)(data + 2);
+ partial += *(uint16_t *)(void *)(data + 4);
+ partial += *(uint16_t *)(void *)(data + 6);
+ partial += *(uint16_t *)(void *)(data + 8);
+ partial += *(uint16_t *)(void *)(data + 10);
+ partial += *(uint16_t *)(void *)(data + 12);
+ partial += *(uint16_t *)(void *)(data + 14);
+ data += 16;
+ mlen -= 16;
+ }
+ /*
+ * mlen is not updated below as the remaining tests
+ * are using bit masks, which are not affected.
+ */
+ if (mlen & 8) {
+ partial += *(uint16_t *)(void *)data;
+ partial += *(uint16_t *)(void *)(data + 2);
+ partial += *(uint16_t *)(void *)(data + 4);
+ partial += *(uint16_t *)(void *)(data + 6);
+ data += 8;
+ }
+ if (mlen & 4) {
+ partial += *(uint16_t *)(void *)data;
+ partial += *(uint16_t *)(void *)(data + 2);
+ data += 4;
+ }
+ if (mlen & 2) {
+ partial += *(uint16_t *)(void *)data;
+ data += 2;
+ }
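+ /*
+ * Trailing odd byte: add it in the appropriate half of a 16-bit
+ * word for this byte order, and flip the odd-offset parity.
+ */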
+ if (mlen & 1) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ partial += *data;
+#else
+ partial += *data << 8;
+#endif
+ started_on_odd = !started_on_odd;
+ }