negative caching implemented

author antirez <antirez@gmail.com>

Fri, 31 Dec 2010 16:32:59 +0000 (17:32 +0100)

committer antirez <antirez@gmail.com>

Fri, 31 Dec 2010 16:32:59 +0000 (17:32 +0100)
author antirez <antirez@gmail.com>
Fri, 31 Dec 2010 16:32:59 +0000 (17:32 +0100)
committer antirez <antirez@gmail.com>
Fri, 31 Dec 2010 16:32:59 +0000 (17:32 +0100)
diff --git a/src/db.c b/src/db.c

index 48663e9c97df48d96467f7fb30a5b4c8a843a905..1a30034e811306e2848c4d27cee2c2511b3789d2 100644 (file)
--- a/src/db.c
+++ b/src/db.c
@@ -38,6 +38,8 @@ robj *lookupKey(redisDb *db, robj *key) {
           * async loading of this key, what may happen is that the old
           * key is loaded in memory if this gets deleted in the meantime. */
          if (server.ds_enabled && cacheKeyMayExist(db,key)) {
+            redisLog(REDIS_DEBUG,"Force loading key %s via lookup",
+                key->ptr);
              val = dsGet(db,key,&expire);
              if (val) {
                  int retval = dbAdd(db,key,val);
@@ -142,14 +144,13 @@ robj *dbRandomKey(redisDb *db) {
  
  /* Delete a key, value, and associated expiration entry if any, from the DB */
  int dbDelete(redisDb *db, robj *key) {
-    /* If VM is enabled make sure to awake waiting clients for this key:
-     * deleting the key will kill the I/O thread bringing the key from swap
-     * to memory, so the client will never be notified and unblocked if we
-     * don't do it now. */
+    /* If diskstore is enabled make sure to awake waiting clients for this key
+     * as it is not really useful to wait for a key already deleted to be
+     * loaded from disk. */
      if (server.ds_enabled) handleClientsBlockedOnSwappedKey(db,key);
  
-    /* FIXME: we need to delete the IO Job loading the key, or simply we can
-     * wait for it to finish. */
+    /* Mark this key as non existing on disk as well */
+    cacheSetKeyDoesNotExistRemember(db,key);
  
      /* Deleting an entry from the expires dict will not free the sds of
       * the key, because it is shared with the main dictionary. */
diff --git a/src/dscache.c b/src/dscache.c

index 7be3bf86ac27aff7154abcf0d8986919e025b020..a56fa118bba6807220fcae4c5c3e30428eb816fa 100644 (file)
--- a/src/dscache.c
+++ b/src/dscache.c
@@ -105,6 +105,10 @@
   *   value so it will be evicted later.
   *
   *   Are there other patterns like this where we load stale data?
+ *
+ *   Also, make sure that key preloading is ONLY done for keys that are
+ *   not marked as cacheKeyDoesNotExist(), otherwise, again, we can load
+ *   data from disk that should instead be deleted.
   */
  
  /* Virtual Memory is composed mainly of two subsystems:
@@ -259,7 +263,72 @@ int dsCanTouchDiskStore(void) {
      return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
  }
  
-/* =================== Virtual Memory - Threaded I/O  ======================= */
+/* ==================== Disk store negative caching  ========================
+ *
+ * When disk store is enabled, we need negative caching, that is, to remember
+ * keys that are for sure *not* on the disk key-value store.
+ *
+ * This is useful for two reasons:
+ *
+ * 1) Without negative caching cache misses will cost us a disk lookup, even
+ *    if the same non existing key is accessed again and again. We negative
+ *    caching we remember that the key is not on disk, so if it's not in memory
+ *    and we have a negative cache entry, we don't try a disk access at all.
+ *
+ * 2) Negative caching is the way to fix a specific race condition. For instance
+ *    think at the following sequence of commands:
+ *
+ *    SET foo bar
+ *    DEL foo
+ *    GET foo
+ *    
+ *    After the SET, we'll mark the value as dirty, so it will be flushed
+ *    on disk at some time. Later the key is deleted, so will be removed
+ *    from memory. Another job will be created to remove the key from the disk
+ *    store, but the removal is not synchronous, so may happen later in time.
+ *
+ *    Finally we have a GET foo operation. This operation may result in
+ *    reading back a value from disk that is not updated data, as the deletion
+ *    operaiton against the disk KV store was still not completed, so we
+ *    read old data.
+ *
+ * Remembering that the given key is deleted is important. We can discard this
+ * information once the key was really removed from the disk.
+ *
+ * So actually there are two kind of negative caching entries: entries that
+ * can be evicted when we need to reclaim memory, and entries that will
+ * not be evicted, for all the time we need this information to be available.
+ *
+ * The API allows to create both kind of negative caching. */
+
+int cacheKeyMayExist(redisDb *db, robj *key) {
+    return dictFind(db->io_negcache,key) == NULL;
+}
+
+void cacheSetKeyMayExist(redisDb *db, robj *key) {
+    dictDelete(db->io_negcache,key);
+}
+
+void cacheSetKeyDoesNotExist(redisDb *db, robj *key) {
+    struct dictEntry *de;
+
+    /* Don't overwrite negative cached entries with val set to 0, as this
+     * entries were created with cacheSetKeyDoesNotExistRemember(). */
+    de = dictFind(db->io_negcache,key);
+    if (de != NULL && dictGetEntryVal(de) == NULL) return;
+
+    if (dictReplace(db->io_negcache,key,(void*)time(NULL))) {
+        incrRefCount(key);
+    }
+}
+
+void cacheSetKeyDoesNotExistRemember(redisDb *db, robj *key) {
+    if (dictReplace(db->io_negcache,key,NULL)) {
+        incrRefCount(key);
+    }
+}
+
+/* ================== Disk store cache - Threaded I/O  ====================== */
  
  void freeIOJob(iojob *j) {
      decrRefCount(j->key);
@@ -310,15 +379,20 @@ void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
              if (j->val != NULL) {
                  /* Note: the key may already be here if between the time
                   * this key loading was scheduled and now there was the
-                 * need to blocking load the key for a key lookup. */
-                if (dbAdd(j->db,j->key,j->val) == REDIS_OK) {
+                 * need to blocking load the key for a key lookup.
+                 *
+                 * Also we don't add a key that was deleted in the
+                 * meantime and should not be on disk either. */
+                if (cacheKeyMayExist(j->db,j->key) &&
+                    dbAdd(j->db,j->key,j->val) == REDIS_OK)
+                {
                      incrRefCount(j->val);
                      if (j->expire != -1) setExpire(j->db,j->key,j->expire);
                  }
              } else {
                  /* The key does not exist. Create a negative cache entry
                   * for this key. */
-                /* FIXME: add this entry into the negative cache */
+                cacheSetKeyDoesNotExist(j->db,j->key);
              }
              /* Handle clients waiting for this key to be loaded. */
              handleClientsBlockedOnSwappedKey(j->db,j->key);
@@ -327,6 +401,12 @@ void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
              if (j->val) {
                  redisAssert(j->val->storage == REDIS_DS_SAVING);
                  j->val->storage = REDIS_DS_MEMORY;
+                cacheSetKeyMayExist(j->db,j->key);
+            } else {
+                /* Key deleted. Probably we have this key marked as
+                 * non existing, and impossible to evict, in our negative
+                 * cache entry. Add it as a normal negative cache entry. */
+                cacheSetKeyMayExist(j->db,j->key);
              }
              freeIOJob(j);
          }
@@ -585,15 +665,6 @@ void cacheCron(void) {
      }
  }
  
-/* ============ Negative caching for diskstore objects ====================== */
-/* Since accesses to keys that don't exist with disk store cost us a disk
- * access, we need to cache names of keys that do not exist but are frequently
- * accessed. */
-int cacheKeyMayExist(redisDb *db, robj *key) {
-    /* FIXME: for now we just always return true. */
-    return 1;
-}
-
  /* ============ Virtual Memory - Blocking clients on missing keys =========== */
  
  /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
@@ -624,6 +695,9 @@ int waitForSwappedKey(redisClient *c, robj *key) {
      de = dictFind(c->db->dict,key->ptr);
      if (de != NULL) return 0;
  
+    /* Don't wait for keys we are sure are not on disk either */
+    if (!cacheKeyMayExist(c->db,key)) return 0;
+
      /* Add the key to the list of keys this client is waiting for.
       * This maps clients to keys they are waiting for. */
      listAddNodeTail(c->io_keys,key);
@@ -645,13 +719,6 @@ int waitForSwappedKey(redisClient *c, robj *key) {
      listAddNodeTail(l,c);
  
      /* Are we already loading the key from disk? If not create a job */
-    /* FIXME: if a given client was blocked for this key (so job already
-     * created) but the client was freed, there may be a job loading this
-     * key even if de == NULL. Does this creates some race condition?
-     *
-     * Example: after the first load the key gets a DEL that will schedule
-     * a write. But the write will happen later, the duplicated load will
-     * fire and we'll get again the key in memory. */
      if (de == NULL)
          dsCreateIOJob(REDIS_IOJOB_LOAD,c->db,key,NULL);
      return 1;
diff --git a/src/redis.c b/src/redis.c

index bb917f50f0c7db3ca346dd12dfa971638bf8c704..65cf3ace9c045b1fbcffa6347aba51f37dc87626 100644 (file)
--- a/src/redis.c
+++ b/src/redis.c
@@ -345,7 +345,7 @@ unsigned int dictEncObjHash(const void *key) {
      }
  }
  
-/* Sets type */
+/* Sets type and diskstore negative caching hash table */
  dictType setDictType = {
      dictEncObjHash,            /* hash function */
      NULL,                      /* key dup */
@@ -854,8 +854,10 @@ void initServer() {
          server.db[j].expires = dictCreate(&keyptrDictType,NULL);
          server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
          server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
-        if (server.ds_enabled)
+        if (server.ds_enabled) {
              server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
+            server.db[j].io_negcache = dictCreate(&setDictType,NULL);
+        }
          server.db[j].id = j;
      }
      server.pubsub_channels = dictCreate(&keylistDictType,NULL);
diff --git a/src/redis.h b/src/redis.h

index 15c192cfa02c6a2a72b2c098451047a6ef8b2146..4a6a5dc6d055630ae3df40c0614ee8c7c05376de 100644 (file)
--- a/src/redis.h
+++ b/src/redis.h
@@ -269,6 +269,7 @@ typedef struct redisDb {
      dict *expires;              /* Timeout of keys with a timeout set */
      dict *blocking_keys;        /* Keys with clients waiting for data (BLPOP) */
      dict *io_keys;              /* Keys with clients waiting for VM I/O */
+    dict *io_negcache;          /* Negative caching for disk store */
      dict *watched_keys;         /* WATCHED keys for MULTI/EXEC CAS */
      int id;
  } redisDb;
@@ -809,6 +810,9 @@ int cacheFreeOneEntry(void);
  void cacheScheduleForFlush(redisDb *db, robj *key);
  void cacheCron(void);
  int cacheKeyMayExist(redisDb *db, robj *key);
+void cacheSetKeyExists(redisDb *db, robj *key);
+void cacheSetKeyDoesNotExist(redisDb *db, robj *key);
+void cacheSetKeyDoesNotExistRemember(redisDb *db, robj *key);
  
  /* Set data type */
  robj *setTypeCreate(robj *value);
author	antirez <antirez@gmail.com>
	Fri, 31 Dec 2010 16:32:59 +0000 (17:32 +0100)
committer	antirez <antirez@gmail.com>
	Fri, 31 Dec 2010 16:32:59 +0000 (17:32 +0100)
src/db.c		patch \| blob \| blame \| history
src/dscache.c		patch \| blob \| blame \| history
src/redis.c		patch \| blob \| blame \| history
src/redis.h		patch \| blob \| blame \| history