- register u_short *w;
- register int sum = 0;
- register int mlen = 0;
- int starting_on_odd = 0;
-
- len -= skip;
- for (; skip && m; m = m->m_next) {
- if (m->m_len > skip) {
- mlen = m->m_len - skip;
- w = (u_short *)(m->m_data+skip);
- goto skip_start;
- } else {
- skip -= m->m_len;
- }
- }
- for (;m && len; m = m->m_next) {
- if (m->m_len == 0)
- continue;
- mlen = m->m_len;
- w = mtod(m, u_short *);
-
-skip_start:
- if (len < mlen)
- mlen = len;
- sum = xsum_assym(w, mlen, sum, starting_on_odd);
- len -= mlen;
- if (mlen & 0x1)
- {
- if (starting_on_odd)
- starting_on_odd = 0;
- else
- starting_on_odd = 1;
+ uint64_t sum, partial;
+ unsigned int final_acc;
+ const uint8_t *data = (const uint8_t *)buf;
+ boolean_t needs_swap, started_on_odd;
+
+ VERIFY(mlen >= 0);
+
+ needs_swap = FALSE;
+ started_on_odd = FALSE;
+
+ sum = 0;
+ partial = 0;
+
+ if ((uintptr_t)data & 1) {
+ /* Align on word boundary */
+ started_on_odd = !started_on_odd;
+#if BYTE_ORDER == LITTLE_ENDIAN
+ partial = *data << 8;
+#else
+ partial = *data;
+#endif
+ ++data;
+ --mlen;
+ }
+ needs_swap = started_on_odd;
+ if ((uintptr_t)data & 2) {
+ if (mlen < 2)
+ goto trailing_bytes;
+ partial += *(const uint16_t *)(const void *)data;
+ data += 2;
+ mlen -= 2;
+ }
+ while (mlen >= 64) {
+ __builtin_prefetch(data + 32);
+ __builtin_prefetch(data + 64);
+ partial += *(const uint32_t *)(const void *)data;
+ partial += *(const uint32_t *)(const void *)(data + 4);
+ partial += *(const uint32_t *)(const void *)(data + 8);
+ partial += *(const uint32_t *)(const void *)(data + 12);
+ partial += *(const uint32_t *)(const void *)(data + 16);
+ partial += *(const uint32_t *)(const void *)(data + 20);
+ partial += *(const uint32_t *)(const void *)(data + 24);
+ partial += *(const uint32_t *)(const void *)(data + 28);
+ partial += *(const uint32_t *)(const void *)(data + 32);
+ partial += *(const uint32_t *)(const void *)(data + 36);
+ partial += *(const uint32_t *)(const void *)(data + 40);
+ partial += *(const uint32_t *)(const void *)(data + 44);
+ partial += *(const uint32_t *)(const void *)(data + 48);
+ partial += *(const uint32_t *)(const void *)(data + 52);
+ partial += *(const uint32_t *)(const void *)(data + 56);
+ partial += *(const uint32_t *)(const void *)(data + 60);
+ data += 64;
+ mlen -= 64;
+ if (PREDICT_FALSE(partial & (3ULL << 62))) {
+ if (needs_swap)
+ partial = (partial << 8) +
+ (partial >> 56);
+ sum += (partial >> 32);
+ sum += (partial & 0xffffffff);
+ partial = 0;