Hash function switched to murmurhash2.

author antirez <antirez@gmail.com>

Wed, 3 Oct 2012 17:14:46 +0000 (19:14 +0200)

committer antirez <antirez@gmail.com>

Fri, 5 Oct 2012 09:16:40 +0000 (11:16 +0200)
author antirez <antirez@gmail.com>
Wed, 3 Oct 2012 17:14:46 +0000 (19:14 +0200)
committer antirez <antirez@gmail.com>
Fri, 5 Oct 2012 09:16:40 +0000 (11:16 +0200)
diff --git a/src/dict.c b/src/dict.c

index 66bb983a83aaf426bac9ab0cea93d8d0d03c4212..ec58e820073065dbee96d69a1d1a8135f2ef3f7a 100644 (file)
--- a/src/dict.c
+++ b/src/dict.c
@@ -85,29 +85,73 @@ unsigned int dictIdentityHashFunction(unsigned int key)
      return key;
  }
  
-static int dict_hash_function_seed = 5381;
+static uint32_t dict_hash_function_seed = 5381;
  
-void dictSetHashFunctionSeed(unsigned int seed) {
+void dictSetHashFunctionSeed(uint32_t seed) {
      dict_hash_function_seed = seed;
  }
  
-unsigned int dictGetHashFunctionSeed(void) {
+uint32_t dictGetHashFunctionSeed(void) {
      return dict_hash_function_seed;
  }
  
-/* Generic hash function (a popular one from Bernstein).
- * I tested a few and this was the best. */
-unsigned int dictGenHashFunction(const unsigned char *buf, int len) {
-    unsigned int hash = dict_hash_function_seed;
+/* MurmurHash2, by Austin Appleby
+ * Note - This code makes a few assumptions about how your machine behaves -
+ * 1. We can read a 4-byte value from any address without crashing
+ * 2. sizeof(int) == 4
+ *
+ * And it has a few limitations -
+ *
+ * 1. It will not work incrementally.
+ * 2. It will not produce the same results on little-endian and big-endian
+ *    machines.
+ */
+unsigned int dictGenHashFunction(const void *key, int len) {
+    /* 'm' and 'r' are mixing constants generated offline.
+     They're not really 'magic', they just happen to work well.  */
+    uint32_t seed = dict_hash_function_seed;
+    const uint32_t m = 0x5bd1e995;
+    const int r = 24;
  
-    while (len--)
-        hash = ((hash << 5) + hash) + (*buf++); /* hash * 33 + c */
-    return hash;
+    /* Initialize the hash to a 'random' value */
+    uint32_t h = seed ^ len;
+
+    /* Mix 4 bytes at a time into the hash */
+    const unsigned char *data = (const unsigned char *)key;
+
+    while(len >= 4) {
+        uint32_t k = *(uint32_t*)data;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h *= m;
+        h ^= k;
+
+        data += 4;
+        len -= 4;
+    }
+
+    /* Handle the last few bytes of the input array  */
+    switch(len) {
+    case 3: h ^= data[2] << 16;
+    case 2: h ^= data[1] << 8;
+    case 1: h ^= data[0]; h *= m;
+    };
+
+    /* Do a few final mixes of the hash to ensure the last few
+     * bytes are well-incorporated. */
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return (unsigned int)h;
  }
  
-/* And a case insensitive version */
+/* And a case insensitive hash function (based on djb hash) */
  unsigned int dictGenCaseHashFunction(const unsigned char *buf, int len) {
-    unsigned int hash = dict_hash_function_seed;
+    unsigned int hash = (unsigned int)dict_hash_function_seed;
  
      while (len--)
          hash = ((hash << 5) + hash) + (tolower(*buf++)); /* hash * 33 + c */
diff --git a/src/dict.h b/src/dict.h

index 5f85695354471b29ab074f974368a673a5bfc45b..f480ae539232fbe374feeb96661a3088d8b12e20 100644 (file)
--- a/src/dict.h
+++ b/src/dict.h
@@ -155,7 +155,7 @@ dictEntry *dictNext(dictIterator *iter);
  void dictReleaseIterator(dictIterator *iter);
  dictEntry *dictGetRandomKey(dict *d);
  void dictPrintStats(dict *d);
-unsigned int dictGenHashFunction(const unsigned char *buf, int len);
+unsigned int dictGenHashFunction(const void *key, int len);
  unsigned int dictGenCaseHashFunction(const unsigned char *buf, int len);
  void dictEmpty(dict *d);
  void dictEnableResize(void);
author	antirez <antirez@gmail.com>
	Wed, 3 Oct 2012 17:14:46 +0000 (19:14 +0200)
committer	antirez <antirez@gmail.com>
	Fri, 5 Oct 2012 09:16:40 +0000 (11:16 +0200)
src/dict.c		patch | blob | blame | history
src/dict.h		patch | blob | blame | history