+ final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
+ final_acc = (final_acc >> 16) + (final_acc & 0xffff);
+
+ return (final_acc);
+}
+
+#else
+ /*
+  * 64-bit version.
+  *
+  * Sum the mlen bytes at buf as 16-bit words with end-around carry,
+  * accumulating in 64-bit arithmetic and folding down to 16 bits at
+  * the end.
+  *
+  * buf may have any alignment: a leading odd byte and a leading 16-bit
+  * word are consumed separately so that every 32-bit load below is
+  * 4-byte aligned.  mlen must be non-negative (checked by VERIFY); a
+  * zero-length buffer yields 0.
+  *
+  * The value returned is the folded sum itself; it is not complemented
+  * here.
+  */
+ static uint16_t
+ in_cksumdata(const void *buf, int mlen)
+ {
+ 	uint64_t sum, partial;
+ 	unsigned int final_acc;
+ 	uint8_t *data = (void *)buf;
+ 	boolean_t needs_swap, started_on_odd;
+
+ 	VERIFY(mlen >= 0);
+
+ 	/*
+ 	 * Nothing to sum.  Returning here also keeps the odd-address
+ 	 * prologue below from reading a byte that is not part of the
+ 	 * buffer and from driving mlen to -1, which would make every
+ 	 * bit-mask test on mlen further down succeed.
+ 	 */
+ 	if (mlen == 0)
+ 		return (0);
+
+ 	needs_swap = FALSE;
+ 	started_on_odd = FALSE;
+
+ 	sum = 0;
+ 	partial = 0;
+
+ 	if ((uintptr_t)data & 1) {
+ 		/* Align on word boundary */
+ 		started_on_odd = !started_on_odd;
+ #if BYTE_ORDER == LITTLE_ENDIAN
+ 		partial = *data << 8;
+ #else
+ 		partial = *data;
+ #endif
+ 		++data;
+ 		--mlen;
+ 	}
+ 	/*
+ 	 * If the buffer began on an odd address, everything accumulated
+ 	 * in partial sits in the opposite byte lane and has to be rotated
+ 	 * by 8 bits before it is added to sum.
+ 	 */
+ 	needs_swap = started_on_odd;
+ 	if ((uintptr_t)data & 2) {
+ 		/* Consume one 16-bit word to reach 4-byte alignment. */
+ 		if (mlen < 2)
+ 			goto trailing_bytes;
+ 		partial += *(uint16_t *)(void *)data;
+ 		data += 2;
+ 		mlen -= 2;
+ 	}
+ 	/* Main loop: sixteen aligned 32-bit loads (64 bytes) per pass. */
+ 	while (mlen >= 64) {
+ 		__builtin_prefetch(data + 32);
+ 		__builtin_prefetch(data + 64);
+ 		partial += *(uint32_t *)(void *)data;
+ 		partial += *(uint32_t *)(void *)(data + 4);
+ 		partial += *(uint32_t *)(void *)(data + 8);
+ 		partial += *(uint32_t *)(void *)(data + 12);
+ 		partial += *(uint32_t *)(void *)(data + 16);
+ 		partial += *(uint32_t *)(void *)(data + 20);
+ 		partial += *(uint32_t *)(void *)(data + 24);
+ 		partial += *(uint32_t *)(void *)(data + 28);
+ 		partial += *(uint32_t *)(void *)(data + 32);
+ 		partial += *(uint32_t *)(void *)(data + 36);
+ 		partial += *(uint32_t *)(void *)(data + 40);
+ 		partial += *(uint32_t *)(void *)(data + 44);
+ 		partial += *(uint32_t *)(void *)(data + 48);
+ 		partial += *(uint32_t *)(void *)(data + 52);
+ 		partial += *(uint32_t *)(void *)(data + 56);
+ 		partial += *(uint32_t *)(void *)(data + 60);
+ 		data += 64;
+ 		mlen -= 64;
+ 		/*
+ 		 * Once either of the top two bits of partial is set, flush
+ 		 * it into sum (as two 32-bit halves) and restart from zero
+ 		 * so the 64-bit accumulator cannot wrap.
+ 		 */
+ 		if (PREDICT_FALSE(partial & (3ULL << 62))) {
+ 			if (needs_swap)
+ 				partial = (partial << 8) +
+ 				    (partial >> 56);
+ 			sum += (partial >> 32);
+ 			sum += (partial & 0xffffffff);
+ 			partial = 0;
+ 		}
+ 	}
+ 	/*
+ 	 * mlen is not updated below as the remaining tests
+ 	 * are using bit masks, which are not affected.
+ 	 */
+ 	if (mlen & 32) {
+ 		partial += *(uint32_t *)(void *)data;
+ 		partial += *(uint32_t *)(void *)(data + 4);
+ 		partial += *(uint32_t *)(void *)(data + 8);
+ 		partial += *(uint32_t *)(void *)(data + 12);
+ 		partial += *(uint32_t *)(void *)(data + 16);
+ 		partial += *(uint32_t *)(void *)(data + 20);
+ 		partial += *(uint32_t *)(void *)(data + 24);
+ 		partial += *(uint32_t *)(void *)(data + 28);
+ 		data += 32;
+ 	}
+ 	if (mlen & 16) {
+ 		partial += *(uint32_t *)(void *)data;
+ 		partial += *(uint32_t *)(void *)(data + 4);
+ 		partial += *(uint32_t *)(void *)(data + 8);
+ 		partial += *(uint32_t *)(void *)(data + 12);
+ 		data += 16;
+ 	}
+ 	if (mlen & 8) {
+ 		partial += *(uint32_t *)(void *)data;
+ 		partial += *(uint32_t *)(void *)(data + 4);
+ 		data += 8;
+ 	}
+ 	if (mlen & 4) {
+ 		partial += *(uint32_t *)(void *)data;
+ 		data += 4;
+ 	}
+ 	if (mlen & 2) {
+ 		partial += *(uint16_t *)(void *)data;
+ 		data += 2;
+ 	}
+ trailing_bytes:
+ 	if (mlen & 1) {
+ 		/* Final odd byte goes into the first-byte lane of a word. */
+ #if BYTE_ORDER == LITTLE_ENDIAN
+ 		partial += *data;
+ #else
+ 		partial += *data << 8;
+ #endif
+ 		/* started_on_odd is not read again in this function. */
+ 		started_on_odd = !started_on_odd;
+ 	}
+
+ 	/* Undo the byte-lane shift (rotate left by 8) if we began odd. */
+ 	if (needs_swap)
+ 		partial = (partial << 8) + (partial >> 56);
+ 	/* Fold partial into sum, then fold sum's upper half into its lower. */
+ 	sum += (partial >> 32) + (partial & 0xffffffff);
+ 	sum = (sum >> 32) + (sum & 0xffffffff);
+
+ 	/* Add the four 16-bit lanes, then fold the carries in twice. */
+ 	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
+ 	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
+ 	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
+ 	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
+
+ 	return (final_acc);
+ }
+#endif /* ULONG_MAX != 0xffffffffUL */