bsd/net/flowhash.c

   1 /*
   2  * Copyright (c) 2011-2012 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  * http://code.google.com/p/smhasher/
  31  *
  32  * Copyright (c) 2009-2011 Austin Appleby.
  33  *
  34  * MurmurHash3 was written by Austin Appleby, and is placed in the public
  35  * domain. The author hereby disclaims copyright to this source code.
  36  */
  37
  38 /*
  39  * http://burtleburtle.net/bob/hash/
  40  *
  41  * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
  42  *
  43  * You can use this free for any purpose.  It's in the public domain.
  44  * It has no warranty.
  45  */
  46
  47 #include <stdbool.h>
  48 #include <sys/types.h>
  49 #include <machine/endian.h>
  50 #include <net/flowhash.h>
  51
  52 static inline u_int32_t getblock32(const u_int32_t *, int);
  53 static inline u_int64_t getblock64(const u_int64_t *, int);
  54 static inline u_int32_t mh3_fmix32(u_int32_t);
  55 static inline u_int64_t mh3_fmix64(u_int64_t);
  56
  57 #define ALIGNED16(v)    ((((uintptr_t)(v)) & 1) == 0)
  58 #define ALIGNED32(v)    ((((uintptr_t)(v)) & 3) == 0)
  59 #define ALIGNED64(v)    ((((uintptr_t)(v)) & 7) == 0)
  60
  61 #define ROTL32(x, r)    (((x) << (r)) | ((x) >> (32 - (r))))
  62 #define ROTL64(x, r)    (((x) << (r)) | ((x) >> (64 - (r))))
  63
  64 /*
  65  * The following hash algorithms are selected based on performance:
  66  *
  67  * 64-bit:      MurmurHash3_x64_128
  68  * 32-bit:      JHash
  69  */
  70 #if   defined(__LP64__)
  71 net_flowhash_fn_t *net_flowhash = net_flowhash_mh3_x64_128;
  72 #else /* !__LP64__ */
  73 net_flowhash_fn_t *net_flowhash = net_flowhash_jhash;
  74 #endif /* !__LP64__ */
  75
  76 #if defined(__i386__) || defined(__x86_64__) || defined(__arm64__)
  77 static inline u_int32_t
  78 getblock32(const u_int32_t *p, int i)
  79 {
  80         return (p[i]);
  81 }
  82
  83 static inline u_int64_t
  84 getblock64(const u_int64_t *p, int i)
  85 {
  86         return (p[i]);
  87 }
  88 #else /* !__i386__ && !__x86_64__ && !__arm64__*/
  89 static inline u_int32_t
  90 getblock32(const u_int32_t *p, int i)
  91 {
  92         const u_int8_t *bytes = (u_int8_t *)(void *)(uintptr_t)(p + i);
  93         u_int32_t value;
  94
  95         if (ALIGNED32(p)) {
  96                 value = p[i];
  97         } else {
  98 #if BYTE_ORDER == BIG_ENDIAN
  99                 value =
 100                     (((u_int32_t)bytes[0]) << 24) |
 101                     (((u_int32_t)bytes[1]) << 16) |
 102                     (((u_int32_t)bytes[2]) << 8) |
 103                     ((u_int32_t)bytes[3]);
 104 #else /* LITTLE_ENDIAN */
 105                 value =
 106                     (((u_int32_t)bytes[3]) << 24) |
 107                     (((u_int32_t)bytes[2]) << 16) |
 108                     (((u_int32_t)bytes[1]) << 8) |
 109                     ((u_int32_t)bytes[0]);
 110 #endif /* LITTLE_ENDIAN */
 111         }
 112         return (value);
 113 }
 114
 115 static inline u_int64_t
 116 getblock64(const u_int64_t *p, int i)
 117 {
 118         const u_int8_t *bytes = (const u_int8_t *)(void *)(uintptr_t)(p + i);
 119         u_int64_t value;
 120
 121         if (ALIGNED64(p)) {
 122                 value = p[i];
 123         } else {
 124 #if BYTE_ORDER == BIG_ENDIAN
 125                 value =
 126                     (((u_int64_t)bytes[0]) << 56) |
 127                     (((u_int64_t)bytes[1]) << 48) |
 128                     (((u_int64_t)bytes[2]) << 40) |
 129                     (((u_int64_t)bytes[3]) << 32) |
 130                     (((u_int64_t)bytes[4]) << 24) |
 131                     (((u_int64_t)bytes[5]) << 16) |
 132                     (((u_int64_t)bytes[6]) << 8) |
 133                     ((u_int64_t)bytes[7]);
 134 #else /* LITTLE_ENDIAN */
 135                 value =
 136                     (((u_int64_t)bytes[7]) << 56) |
 137                     (((u_int64_t)bytes[6]) << 48) |
 138                     (((u_int64_t)bytes[5]) << 40) |
 139                     (((u_int64_t)bytes[4]) << 32) |
 140                     (((u_int64_t)bytes[3]) << 24) |
 141                     (((u_int64_t)bytes[2]) << 16) |
 142                     (((u_int64_t)bytes[1]) << 8) |
 143                     ((u_int64_t)bytes[0]);
 144 #endif /* LITTLE_ENDIAN */
 145         }
 146         return (value);
 147 }
 148 #endif /* !__i386__ && !__x86_64 && !__arm64__ */
 149
 150 static inline u_int32_t
 151 mh3_fmix32(u_int32_t h)
 152 {
 153         h ^= h >> 16;
 154         h *= 0x85ebca6b;
 155         h ^= h >> 13;
 156         h *= 0xc2b2ae35;
 157         h ^= h >> 16;
 158
 159         return (h);
 160 }
 161
 162 static inline u_int64_t
 163 mh3_fmix64(u_int64_t k)
 164 {
 165         k ^= k >> 33;
 166         k *= 0xff51afd7ed558ccdLLU;
 167         k ^= k >> 33;
 168         k *= 0xc4ceb9fe1a85ec53LLU;
 169         k ^= k >> 33;
 170
 171         return (k);
 172 }
 173
 174 /*
 175  * MurmurHash3_x86_32
 176  */
 177 #define MH3_X86_32_C1   0xcc9e2d51
 178 #define MH3_X86_32_C2   0x1b873593
 179
 180 u_int32_t
 181 net_flowhash_mh3_x86_32(const void *key, u_int32_t len, const u_int32_t seed)
 182 {
 183         const u_int8_t *data = (const u_int8_t *)key;
 184         const u_int32_t nblocks = len / 4;
 185         const u_int32_t *blocks;
 186         const u_int8_t *tail;
 187         u_int32_t h1 = seed, k1;
 188         int i;
 189
 190         /* body */
 191         blocks = (const u_int32_t *)(const void *)(data + nblocks * 4);
 192
 193         for (i = -nblocks; i; i++) {
 194                 k1 = getblock32(blocks, i);
 195
 196                 k1 *= MH3_X86_32_C1;
 197                 k1 = ROTL32(k1, 15);
 198                 k1 *= MH3_X86_32_C2;
 199
 200                 h1 ^= k1;
 201                 h1 = ROTL32(h1, 13);
 202                 h1 = h1 * 5 + 0xe6546b64;
 203         }
 204
 205         /* tail */
 206         tail = (const u_int8_t *)(const void *)(data + nblocks * 4);
 207         k1 = 0;
 208
 209         switch (len & 3) {
 210         case 3:
 211                 k1 ^= tail[2] << 16;
 212                 /* FALLTHRU */
 213         case 2:
 214                 k1 ^= tail[1] << 8;
 215                 /* FALLTHRU */
 216         case 1:
 217                 k1 ^= tail[0];
 218                 k1 *= MH3_X86_32_C1;
 219                 k1 = ROTL32(k1, 15);
 220                 k1 *= MH3_X86_32_C2;
 221                 h1 ^= k1;
 222         };
 223
 224         /* finalization */
 225         h1 ^= len;
 226
 227         h1 = mh3_fmix32(h1);
 228
 229         return (h1);
 230 }
 231
 232 /*
 233  * MurmurHash3_x64_128
 234  */
 235 #define MH3_X64_128_C1  0x87c37b91114253d5LLU
 236 #define MH3_X64_128_C2  0x4cf5ad432745937fLLU
 237
 238 u_int32_t
 239 net_flowhash_mh3_x64_128(const void *key, u_int32_t len, const u_int32_t seed)
 240 {
 241         const u_int8_t *data = (const u_int8_t *)key;
 242         const u_int32_t nblocks = len / 16;
 243         const u_int64_t *blocks;
 244         const u_int8_t *tail;
 245         u_int64_t h1 = seed, k1;
 246         u_int64_t h2 = seed, k2;
 247         u_int32_t i;
 248
 249         /* body */
 250         blocks = (const u_int64_t *)(const void *)data;
 251
 252         for (i = 0; i < nblocks; i++) {
 253                 k1 = getblock64(blocks, i * 2 + 0);
 254                 k2 = getblock64(blocks, i * 2 + 1);
 255
 256                 k1 *= MH3_X64_128_C1;
 257 #if defined(__x86_64__)
 258         __asm__ ( "rol   $31, %[k1]\n\t" :[k1] "+r" (k1) : :);
 259 #elif defined(__arm64__)
 260         __asm__ ( "ror   %[k1], %[k1], #(64-31)\n\t" :[k1] "+r" (k1) : :);
 261 #else /* !__x86_64__ && !__arm64__ */
 262                 k1 = ROTL64(k1, 31);
 263 #endif /* !__x86_64__ && !__arm64__ */
 264                 k1 *= MH3_X64_128_C2;
 265                 h1 ^= k1;
 266
 267 #if defined(__x86_64__)
 268         __asm__ ( "rol   $27, %[h1]\n\t" :[h1] "+r" (h1) : :);
 269 #elif defined(__arm64__)
 270         __asm__ ( "ror   %[h1], %[h1], #(64-27)\n\t" :[h1] "+r" (h1) : :);
 271 #else /* !__x86_64__ && !__arm64__ */
 272         h1 = ROTL64(h1, 27);
 273 #endif /* !__x86_64__ && !__arm64__ */
 274                 h1 += h2;
 275                 h1 = h1 * 5 + 0x52dce729;
 276
 277                 k2 *= MH3_X64_128_C2;
 278 #if defined(__x86_64__)
 279         __asm__ ( "rol   $33, %[k2]\n\t" :[k2] "+r" (k2) : :);
 280 #elif defined(__arm64__)
 281         __asm__ ( "ror   %[k2], %[k2], #(64-33)\n\t" :[k2] "+r" (k2) : :);
 282 #else /* !__x86_64__ && !__arm64__ */
 283         k2 = ROTL64(k2, 33);
 284 #endif /* !__x86_64__ && !__arm64__ */
 285                 k2 *= MH3_X64_128_C1;
 286                 h2 ^= k2;
 287
 288 #if defined(__x86_64__)
 289         __asm__ ( "rol   $31, %[h2]\n\t" :[h2] "+r" (h2) : :);
 290 #elif defined(__arm64__)
 291         __asm__ ( "ror   %[h2], %[h2], #(64-31)\n\t" :[h2] "+r" (h2) : :);
 292 #else /* !__x86_64__ && !__arm64__ */
 293         h2 = ROTL64(h2, 31);
 294 #endif /* !__x86_64__ && !__arm64__ */
 295                 h2 += h1;
 296                 h2 = h2 * 5+ 0x38495ab5;
 297         }
 298
 299         /* tail */
 300         tail = (const u_int8_t *)(const void *)(data + nblocks * 16);
 301         k1 = 0;
 302         k2 = 0;
 303
 304         switch (len & 15) {
 305         case 15:
 306                 k2 ^= ((u_int64_t)tail[14]) << 48;
 307                 /* FALLTHRU */
 308         case 14:
 309                 k2 ^= ((u_int64_t)tail[13]) << 40;
 310                 /* FALLTHRU */
 311         case 13:
 312                 k2 ^= ((u_int64_t)tail[12]) << 32;
 313                 /* FALLTHRU */
 314         case 12:
 315                 k2 ^= ((u_int64_t)tail[11]) << 24;
 316                 /* FALLTHRU */
 317         case 11:
 318                 k2 ^= ((u_int64_t)tail[10]) << 16;
 319                 /* FALLTHRU */
 320         case 10:
 321                 k2 ^= ((u_int64_t)tail[9]) << 8;
 322                 /* FALLTHRU */
 323         case 9:
 324                 k2 ^= ((u_int64_t)tail[8]) << 0;
 325                 k2 *= MH3_X64_128_C2;
 326 #if defined(__x86_64__)
 327         __asm__ ( "rol   $33, %[k2]\n\t" :[k2] "+r" (k2) : :);
 328 #elif defined(__arm64__)
 329         __asm__ ( "ror   %[k2], %[k2], #(64-33)\n\t" :[k2] "+r" (k2) : :);
 330 #else /* !__x86_64__ && !__arm64__ */
 331         k2 = ROTL64(k2, 33);
 332 #endif /* !__x86_64__ && !__arm64__ */
 333                 k2 *= MH3_X64_128_C1;
 334                 h2 ^= k2;
 335                 /* FALLTHRU */
 336         case 8:
 337                 k1 ^= ((u_int64_t)tail[7]) << 56;
 338                 /* FALLTHRU */
 339         case 7:
 340                 k1 ^= ((u_int64_t)tail[6]) << 48;
 341                 /* FALLTHRU */
 342         case 6:
 343                 k1 ^= ((u_int64_t)tail[5]) << 40;
 344                 /* FALLTHRU */
 345         case 5:
 346                 k1 ^= ((u_int64_t)tail[4]) << 32;
 347                 /* FALLTHRU */
 348         case 4:
 349                 k1 ^= ((u_int64_t)tail[3]) << 24;
 350                 /* FALLTHRU */
 351         case 3:
 352                 k1 ^= ((u_int64_t)tail[2]) << 16;
 353                 /* FALLTHRU */
 354         case 2:
 355                 k1 ^= ((u_int64_t)tail[1]) << 8;
 356                 /* FALLTHRU */
 357         case 1:
 358                 k1 ^= ((u_int64_t)tail[0]) << 0;
 359                 k1 *= MH3_X64_128_C1;
 360 #if defined(__x86_64__)
 361         __asm__ ( "rol   $31, %[k1]\n\t" :[k1] "+r" (k1) : :);
 362 #elif defined(__arm64__)
 363         __asm__ ( "ror   %[k1], %[k1], #(64-31)\n\t" :[k1] "+r" (k1) : :);
 364 #else /* !__x86_64__ && !__arm64__ */
 365         k1 = ROTL64(k1, 31);
 366 #endif /* !__x86_64__ && !__arm64__ */
 367                 k1 *= MH3_X64_128_C2;
 368                 h1 ^= k1;
 369         };
 370
 371         /* finalization */
 372         h1 ^= len;
 373         h2 ^= len;
 374
 375         h1 += h2;
 376         h2 += h1;
 377
 378         h1 = mh3_fmix64(h1);
 379         h2 = mh3_fmix64(h2);
 380
 381         h1 += h2;
 382         h2 += h1;
 383
 384         /* throw all but lowest 32-bit */
 385         return (h1 & 0xffffffff);
 386 }
 387
 388 #define JHASH_INIT      0xdeadbeef
 389
 390 #define JHASH_MIX(a, b, c) {                    \
 391         a -= c;  a ^= ROTL32(c, 4);   c += b;   \
 392         b -= a;  b ^= ROTL32(a, 6);   a += c;   \
 393         c -= b;  c ^= ROTL32(b, 8);   b += a;   \
 394         a -= c;  a ^= ROTL32(c, 16);  c += b;   \
 395         b -= a;  b ^= ROTL32(a, 19);  a += c;   \
 396         c -= b;  c ^= ROTL32(b, 4);   b += a;   \
 397 }
 398
 399 #define JHASH_FINAL(a, b, c) {                  \
 400         c ^= b;  c -= ROTL32(b, 14);            \
 401         a ^= c;  a -= ROTL32(c, 11);            \
 402         b ^= a;  b -= ROTL32(a, 25);            \
 403         c ^= b;  c -= ROTL32(b, 16);            \
 404         a ^= c;  a -= ROTL32(c, 4);             \
 405         b ^= a;  b -= ROTL32(a, 14);            \
 406         c ^= b;  c -= ROTL32(b, 24);            \
 407 }
 408
 409 #if BYTE_ORDER == BIG_ENDIAN
 410 /*
 411  * hashbig()
 412  */
 413 u_int32_t
 414 net_flowhash_jhash(const void *key, u_int32_t len, const u_int32_t seed)
 415 {
 416         u_int32_t a, b, c;
 417
 418         /* Set up the internal state */
 419         a = b = c = JHASH_INIT + len + seed;
 420
 421         if (ALIGNED32(key)) {
 422                 /* read 32-bit chunks */
 423                 const u_int32_t *k = (const u_int32_t *)key;
 424
 425                 /*
 426                  * all but last block:
 427                  * aligned reads and affect 32 bits of (a,b,c)
 428                  */
 429                 while (len > 12) {
 430                         a += k[0];
 431                         b += k[1];
 432                         c += k[2];
 433                         JHASH_MIX(a, b, c);
 434                         len -= 12;
 435                         k += 3;
 436                 }
 437
 438                 /*
 439                  * handle the last (probably partial) block
 440                  *
 441                  * "k[2] << 8" actually reads beyond the end of the string,
 442                  * but then shifts out the part it's not allowed to read.
 443                  * Because the string is aligned, the illegal read is in
 444                  * the same word as the rest of the string.  The masking
 445                  * trick does make the hash noticably faster for short
 446                  * strings (like English words).
 447                  */
 448                 switch (len) {
 449                 case 12:
 450                         c += k[2];
 451                         b += k[1];
 452                         a += k[0];
 453                         break;
 454
 455                 case 11:
 456                         c += k[2] & 0xffffff00;
 457                         b += k[1];
 458                         a += k[0];
 459                         break;
 460
 461                 case 10:
 462                         c += k[2] & 0xffff0000;
 463                         b += k[1];
 464                         a += k[0];
 465                         break;
 466
 467                 case 9:
 468                         c += k[2] & 0xff000000;
 469                         b += k[1];
 470                         a += k[0];
 471                         break;
 472
 473                 case 8:
 474                         b += k[1];
 475                         a += k[0];
 476                         break;
 477
 478                 case 7:
 479                         b += k[1] & 0xffffff00;
 480                         a += k[0];
 481                         break;
 482
 483                 case 6:
 484                         b += k[1] & 0xffff0000;
 485                         a += k[0];
 486                         break;
 487
 488                 case 5:
 489                         b += k[1] & 0xff000000;
 490                         a += k[0];
 491                         break;
 492
 493                 case 4:
 494                         a += k[0];
 495                         break;
 496
 497                 case 3:
 498                         a += k[0] & 0xffffff00;
 499                         break;
 500
 501                 case 2:
 502                         a += k[0] & 0xffff0000;
 503                         break;
 504
 505                 case 1:
 506                         a += k[0] & 0xff000000;
 507                         break;
 508
 509                 case 0:
 510                         /* zero length requires no mixing */
 511                         return (c);
 512                 }
 513
 514                 JHASH_FINAL(a, b, c);
 515
 516                 return (c);
 517         }
 518
 519         /* need to read the key one byte at a time */
 520         const u_int8_t *k = (const u_int8_t *)key;
 521
 522         /* all but the last block: affect some 32 bits of (a,b,c) */
 523         while (len > 12) {
 524                 a += ((u_int32_t)k[0]) << 24;
 525                 a += ((u_int32_t)k[1]) << 16;
 526                 a += ((u_int32_t)k[2]) << 8;
 527                 a += ((u_int32_t)k[3]);
 528                 b += ((u_int32_t)k[4]) << 24;
 529                 b += ((u_int32_t)k[5]) << 16;
 530                 b += ((u_int32_t)k[6]) << 8;
 531                 b += ((u_int32_t)k[7]);
 532                 c += ((u_int32_t)k[8]) << 24;
 533                 c += ((u_int32_t)k[9]) << 16;
 534                 c += ((u_int32_t)k[10]) << 8;
 535                 c += ((u_int32_t)k[11]);
 536                 JHASH_MIX(a, b, c);
 537                 len -= 12;
 538                 k += 12;
 539         }
 540
 541         /* last block: affect all 32 bits of (c) */
 542         switch (len) {
 543         case 12:
 544                 c += k[11];
 545                 /* FALLTHRU */
 546         case 11:
 547                 c += ((u_int32_t)k[10]) << 8;
 548                 /* FALLTHRU */
 549         case 10:
 550                 c += ((u_int32_t)k[9]) << 16;
 551                 /* FALLTHRU */
 552         case 9:
 553                 c += ((u_int32_t)k[8]) << 24;
 554                 /* FALLTHRU */
 555         case 8:
 556                 b += k[7];
 557                 /* FALLTHRU */
 558         case 7:
 559                 b += ((u_int32_t)k[6]) << 8;
 560                 /* FALLTHRU */
 561         case 6:
 562                 b += ((u_int32_t)k[5]) << 16;
 563                 /* FALLTHRU */
 564         case 5:
 565                 b += ((u_int32_t)k[4]) << 24;
 566                 /* FALLTHRU */
 567         case 4:
 568                 a += k[3];
 569                 /* FALLTHRU */
 570         case 3:
 571                 a += ((u_int32_t)k[2]) << 8;
 572                 /* FALLTHRU */
 573         case 2:
 574                 a += ((u_int32_t)k[1]) << 16;
 575                 /* FALLTHRU */
 576         case 1:
 577                 a += ((u_int32_t)k[0]) << 24;
 578                 break;
 579
 580         case 0:
 581                 /* zero length requires no mixing */
 582                 return (c);
 583         }
 584
 585         JHASH_FINAL(a, b, c);
 586
 587         return (c);
 588 }
 589 #else /* LITTLE_ENDIAN */
 590 /*
 591  * hashlittle()
 592  */
 593 u_int32_t
 594 net_flowhash_jhash(const void *key, u_int32_t len, const u_int32_t seed)
 595 {
 596         u_int32_t a, b, c;
 597
 598         /* Set up the internal state */
 599         a = b = c = JHASH_INIT + len + seed;
 600
 601 #if defined(__i386__) || defined(__x86_64__)
 602         /*
 603          * On i386/x86_64, it is faster to read 32-bit chunks if the key
 604          * is aligned 32-bit OR not 16-bit, and perform 16-bit reads if it
 605          * is aligned 16-bit.
 606          */
 607         if (ALIGNED32(key) || !ALIGNED16(key)) {
 608 #else /* !defined(__i386__) && !defined(__x86_64__) */
 609         if (ALIGNED32(key)) {
 610 #endif /* !defined(__i386__) && !defined(__x86_64__) */
 611                 /* read 32-bit chunks */
 612                 const u_int32_t *k = (const u_int32_t *)key;
 613
 614                 /*
 615                  * all but last block:
 616                  * aligned reads and affect 32 bits of (a,b,c)
 617                  */
 618                 while (len > 12) {
 619                         a += k[0];
 620                         b += k[1];
 621                         c += k[2];
 622                         JHASH_MIX(a, b, c);
 623                         len -= 12;
 624                         k += 3;
 625                 }
 626
 627                 /*
 628                  * handle the last (probably partial) block
 629                  *
 630                  * "k[2] & 0xffffff" actually reads beyond the end of the
 631                  * string, but then masks off the part it's not allowed
 632                  * to read.  Because the string is aligned, the masked-off
 633                  * tail is in the same word as the rest of the string.
 634                  * The masking trick does make the hash noticably faster
 635                  * for short strings (like English words).
 636                  */
 637                 switch (len) {
 638                 case 12:
 639                         c += k[2];
 640                         b += k[1];
 641                         a += k[0];
 642                         break;
 643
 644                 case 11:
 645                         c += k[2] & 0xffffff;
 646                         b += k[1];
 647                         a += k[0];
 648                         break;
 649
 650                 case 10:
 651                         c += k[2] & 0xffff;
 652                         b += k[1];
 653                         a += k[0];
 654                         break;
 655
 656                 case 9:
 657                         c += k[2] & 0xff;
 658                         b += k[1];
 659                         a += k[0];
 660                         break;
 661
 662                 case 8:
 663                         b += k[1];
 664                         a += k[0];
 665                         break;
 666
 667                 case 7:
 668                         b += k[1] & 0xffffff;
 669                         a += k[0];
 670                         break;
 671
 672                 case 6:
 673                         b += k[1] & 0xffff;
 674                         a += k[0];
 675                         break;
 676
 677                 case 5:
 678                         b += k[1] & 0xff;
 679                         a += k[0];
 680                         break;
 681
 682                 case 4:
 683                         a += k[0];
 684                         break;
 685
 686                 case 3:
 687                         a += k[0] & 0xffffff;
 688                         break;
 689
 690                 case 2:
 691                         a += k[0] & 0xffff;
 692                         break;
 693
 694                 case 1:
 695                         a += k[0] & 0xff;
 696                         break;
 697
 698                 case 0:
 699                         /* zero length requires no mixing */
 700                         return (c);
 701                 }
 702
 703                 JHASH_FINAL(a, b, c);
 704
 705                 return (c);
 706         }
 707 #if !defined(__i386__) && !defined(__x86_64__)
 708         else if (ALIGNED16(key)) {
 709 #endif /* !defined(__i386__) && !defined(__x86_64__) */
 710                 /* read 16-bit chunks */
 711                 const u_int16_t *k = (const u_int16_t *)key;
 712                 const u_int8_t *k8;
 713
 714                 /* all but last block: aligned reads and different mixing */
 715                 while (len > 12) {
 716                         a += k[0] + (((u_int32_t)k[1]) << 16);
 717                         b += k[2] + (((u_int32_t)k[3]) << 16);
 718                         c += k[4] + (((u_int32_t)k[5]) << 16);
 719                         JHASH_MIX(a, b, c);
 720                         len -= 12;
 721                         k += 6;
 722                 }
 723
 724                 /* handle the last (probably partial) block */
 725                 k8 = (const u_int8_t *)k;
 726                 switch (len) {
 727                 case 12:
 728                         c += k[4] + (((u_int32_t)k[5]) << 16);
 729                         b += k[2] + (((u_int32_t)k[3]) << 16);
 730                         a += k[0] + (((u_int32_t)k[1]) << 16);
 731                         break;
 732
 733                 case 11:
 734                         c += ((u_int32_t)k8[10]) << 16;
 735                         /* FALLTHRU */
 736                 case 10:
 737                         c += k[4];
 738                         b += k[2] + (((u_int32_t)k[3]) << 16);
 739                         a += k[0] + (((u_int32_t)k[1]) << 16);
 740                         break;
 741
 742                 case 9:
 743                         c += k8[8];
 744                         /* FALLTHRU */
 745                 case 8:
 746                         b += k[2] + (((u_int32_t)k[3]) << 16);
 747                         a += k[0] + (((u_int32_t)k[1]) << 16);
 748                         break;
 749
 750                 case 7:
 751                         b += ((u_int32_t)k8[6]) << 16;
 752                         /* FALLTHRU */
 753                 case 6:
 754                         b += k[2];
 755                         a += k[0] + (((u_int32_t)k[1]) << 16);
 756                         break;
 757
 758                 case 5:
 759                         b += k8[4];
 760                         /* FALLTHRU */
 761                 case 4:
 762                         a += k[0] + (((u_int32_t)k[1]) << 16);
 763                         break;
 764
 765                 case 3:
 766                         a += ((u_int32_t)k8[2]) << 16;
 767                         /* FALLTHRU */
 768                 case 2:
 769                         a += k[0];
 770                         break;
 771
 772                 case 1:
 773                         a += k8[0];
 774                         break;
 775
 776                 case 0:
 777                         /* zero length requires no mixing */
 778                         return (c);
 779                 }
 780
 781                 JHASH_FINAL(a, b, c);
 782
 783                 return (c);
 784 #if !defined(__i386__) && !defined(__x86_64__)
 785         }
 786
 787         /* need to read the key one byte at a time */
 788         const u_int8_t *k = (const u_int8_t *)key;
 789
 790         /* all but the last block: affect some 32 bits of (a,b,c) */
 791         while (len > 12) {
 792                 a += k[0];
 793                 a += ((u_int32_t)k[1]) << 8;
 794                 a += ((u_int32_t)k[2]) << 16;
 795                 a += ((u_int32_t)k[3]) << 24;
 796                 b += k[4];
 797                 b += ((u_int32_t)k[5]) << 8;
 798                 b += ((u_int32_t)k[6]) << 16;
 799                 b += ((u_int32_t)k[7]) << 24;
 800                 c += k[8];
 801                 c += ((u_int32_t)k[9]) << 8;
 802                 c += ((u_int32_t)k[10]) << 16;
 803                 c += ((u_int32_t)k[11]) << 24;
 804                 JHASH_MIX(a, b, c);
 805                 len -= 12;
 806                 k += 12;
 807         }
 808
 809         /* last block: affect all 32 bits of (c) */
 810         switch (len) {
 811         case 12:
 812                 c += ((u_int32_t)k[11]) << 24;
 813                 /* FALLTHRU */
 814         case 11:
 815                 c += ((u_int32_t)k[10]) << 16;
 816                 /* FALLTHRU */
 817         case 10:
 818                 c += ((u_int32_t)k[9]) << 8;
 819                 /* FALLTHRU */
 820         case 9:
 821                 c += k[8];
 822                 /* FALLTHRU */
 823         case 8:
 824                 b += ((u_int32_t)k[7]) << 24;
 825                 /* FALLTHRU */
 826         case 7:
 827                 b += ((u_int32_t)k[6]) << 16;
 828                 /* FALLTHRU */
 829         case 6:
 830                 b += ((u_int32_t)k[5]) << 8;
 831                 /* FALLTHRU */
 832         case 5:
 833                 b += k[4];
 834                 /* FALLTHRU */
 835         case 4:
 836                 a += ((u_int32_t)k[3]) << 24;
 837                 /* FALLTHRU */
 838         case 3:
 839                 a += ((u_int32_t)k[2]) << 16;
 840                 /* FALLTHRU */
 841         case 2:
 842                 a += ((u_int32_t)k[1]) << 8;
 843                 /* FALLTHRU */
 844         case 1:
 845                 a += k[0];
 846                 break;
 847
 848         case 0:
 849                 /* zero length requires no mixing */
 850                 return (c);
 851         }
 852
 853         JHASH_FINAL(a, b, c);
 854
 855         return (c);
 856 #endif /* !defined(__i386__) && !defined(__x86_64__) */
 857 }
 858 #endif /* LITTLE_ENDIAN */