/* Log levels */
#define REDIS_DEBUG 0
-#define REDIS_NOTICE 1
-#define REDIS_WARNING 2
+#define REDIS_VERBOSE 1
+#define REDIS_NOTICE 2
+#define REDIS_WARNING 3
/* Anti-warning macro... */
#define REDIS_NOTUSED(V) ((void) V)
/* The VM object structure */
struct redisObjectVM {
- off_t offset; /* the page at witch the object is stored on disk */
- int pages; /* number of pages used on disk */
+ off_t page; /* the page at witch the object is stored on disk */
+ off_t usedpages; /* number of pages used on disk */
+ time_t atime; /* Last access time */
} vm;
/* The actual Redis Object */
void *ptr;
unsigned char type;
unsigned char encoding;
- unsigned char storage; /* where? REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
- unsigned char notused;
+ unsigned char storage; /* If this object is a key, where is the value?
+ * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
+ unsigned char vtype; /* If this object is a key, and value is swapped out,
+ * this is the type of the swapped out object. */
int refcount;
/* VM fields, this are only allocated if VM is active, otherwise the
* object allocation function will just allocate
_var.type = REDIS_STRING; \
_var.encoding = REDIS_ENCODING_RAW; \
_var.ptr = _ptr; \
+ if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
} while(0);
typedef struct redisDb {
redisClient *master; /* client that is master for this slave */
int replstate;
unsigned int maxclients;
- unsigned long maxmemory;
+ unsigned long long maxmemory;
unsigned int blockedclients;
/* Sort parameters - qsort_r() is only available under BSD so we
* have to take this state global, in order to pass it to sortCompare() */
int vm_enabled;
off_t vm_page_size;
off_t vm_pages;
- long vm_max_memory;
+ unsigned long long vm_max_memory;
/* Virtual memory state */
FILE *vm_fp;
int vm_fd;
off_t vm_next_page; /* Next probably empty page */
off_t vm_near_pages; /* Number of pages allocated sequentially */
unsigned char *vm_bitmap; /* Bitmap of free/used pages */
+ time_t unixtime; /* Unix time sampled every second. */
+ /* Virtual memory stats */
+ unsigned long long vm_stats_used_pages;
+ unsigned long long vm_stats_swapped_objects;
+ unsigned long long vm_stats_swapouts;
+ unsigned long long vm_stats_swapins;
};
typedef void redisCommandProc(redisClient *c);
static void incrRefCount(robj *o);
static int rdbSaveBackground(char *filename);
static robj *createStringObject(char *ptr, size_t len);
+static robj *dupStringObject(robj *o);
static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
static int syncWithMaster(void);
static int removeExpire(redisDb *db, robj *key);
static int expireIfNeeded(redisDb *db, robj *key);
static int deleteIfVolatile(redisDb *db, robj *key);
+static int deleteIfSwapped(redisDb *db, robj *key);
static int deleteKey(redisDb *db, robj *key);
static time_t getExpire(redisDb *db, robj *key);
static int setExpire(redisDb *db, robj *key, time_t when);
static void unblockClient(redisClient *c);
static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
static void vmInit(void);
+static void vmMarkPagesFree(off_t page, off_t count);
+static robj *vmLoadObject(robj *key);
+static robj *vmPreviewObject(robj *key);
+static int vmSwapOneObject(void);
+static int vmCanSwapOut(void);
+static void freeOneObjectFromFreelist(void);
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
{
DICT_NOTUSED(privdata);
+ if (val == NULL) return; /* Values of swapped out keys as set to NULL */
decrRefCount(val);
}
!(c->flags & REDIS_MASTER) && /* no timeout for masters */
(now - c->lastinteraction > server.maxidletime))
{
- redisLog(REDIS_DEBUG,"Closing idle client");
+ redisLog(REDIS_VERBOSE,"Closing idle client");
freeClient(c);
} else if (c->flags & REDIS_BLOCKED) {
if (c->blockingto != 0 && c->blockingto < now) {
for (j = 0; j < server.dbnum; j++) {
if (htNeedsResize(server.db[j].dict)) {
- redisLog(REDIS_DEBUG,"The hash table %d is too sparse, resize it...",j);
+ redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
dictResize(server.db[j].dict);
- redisLog(REDIS_DEBUG,"Hash table %d resized.",j);
+ redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
}
if (htNeedsResize(server.db[j].expires))
dictResize(server.db[j].expires);
REDIS_NOTUSED(id);
REDIS_NOTUSED(clientData);
+ /* We take a cached value of the unix time in the global state because
+ * with virtual memory and aging we need to store the current time
+ * in objects at every object access, and accuracy is not needed.
+ * To access a global var is faster than calling time(NULL) */
+ server.unixtime = time(NULL);
+
/* Update the global state with the amount of used memory */
server.usedmemory = zmalloc_used_memory();
used = dictSize(server.db[j].dict);
vkeys = dictSize(server.db[j].expires);
if (!(loops % 5) && (used || vkeys)) {
- redisLog(REDIS_DEBUG,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
+ redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
/* dictPrintStats(server.dict); */
}
}
/* Show information about connected clients */
if (!(loops % 5)) {
- redisLog(REDIS_DEBUG,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
+ redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
listLength(server.clients)-listLength(server.slaves),
listLength(server.slaves),
server.usedmemory,
/* Continue to expire if at the end of the cycle more than 25%
* of the keys were expired. */
do {
- int num = dictSize(db->expires);
+ long num = dictSize(db->expires);
time_t now = time(NULL);
expired = 0;
} while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
}
+ /* Swap a few keys on disk if we are over the memory limit and VM
+ * is enabled. Try to free objects from the free list first. */
+ if (vmCanSwapOut()) {
+ while (server.vm_enabled && zmalloc_used_memory() >
+ server.vm_max_memory)
+ {
+ if (listLength(server.objfreelist)) {
+ freeOneObjectFromFreelist();
+ } else if (vmSwapOneObject() == REDIS_ERR) {
+ if ((loops % 30) == 0 && zmalloc_used_memory() >
+ (server.vm_max_memory+server.vm_max_memory/10)) {
+ redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
+ }
+ break;
+ }
+ }
+ }
+
/* Check if we should connect to a MASTER */
if (server.replstate == REDIS_REPL_CONNECT) {
redisLog(REDIS_NOTICE,"Connecting to MASTER...");
static void initServerConfig() {
server.dbnum = REDIS_DEFAULT_DBNUM;
server.port = REDIS_SERVERPORT;
- server.verbosity = REDIS_DEBUG;
+ server.verbosity = REDIS_VERBOSE;
server.maxidletime = REDIS_MAXIDLETIME;
server.saveparams = NULL;
server.logfile = NULL; /* NULL = log on standard output */
server.stat_numcommands = 0;
server.stat_numconnections = 0;
server.stat_starttime = time(NULL);
+ server.unixtime = time(NULL);
aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
if (server.appendonly) {
}
} else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
+ else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
else {
if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
err = "argument must be 'yes' or 'no'"; goto loaderr;
}
+ } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
+ server.vm_max_memory = strtoll(argv[1], NULL, 10);
+ } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
+ server.vm_page_size = strtoll(argv[1], NULL, 10);
+ } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
+ server.vm_pages = strtoll(argv[1], NULL, 10);
} else {
err = "Bad directive or wrong number of arguments"; goto loaderr;
}
if (errno == EAGAIN) {
nwritten = 0;
} else {
- redisLog(REDIS_DEBUG,
+ redisLog(REDIS_VERBOSE,
"Error writing to client: %s", strerror(errno));
freeClient(c);
return;
/* write all collected blocks at once */
if((nwritten = writev(fd, iov, ion)) < 0) {
if (errno != EAGAIN) {
- redisLog(REDIS_DEBUG,
+ redisLog(REDIS_VERBOSE,
"Error writing to client: %s", strerror(errno));
freeClient(c);
return;
}
return;
} else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
- redisLog(REDIS_DEBUG, "Client protocol error");
+ redisLog(REDIS_VERBOSE, "Client protocol error");
freeClient(c);
return;
}
if (errno == EAGAIN) {
nread = 0;
} else {
- redisLog(REDIS_DEBUG, "Reading from client: %s",strerror(errno));
+ redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
freeClient(c);
return;
}
} else if (nread == 0) {
- redisLog(REDIS_DEBUG, "Client closed connection");
+ redisLog(REDIS_VERBOSE, "Client closed connection");
freeClient(c);
return;
}
c->replstate == REDIS_REPL_ONLINE) &&
aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
sendReplyToClient, c) == AE_ERR) return;
+
+ if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
+ obj = dupStringObject(obj);
+ obj->refcount = 0; /* getDecodedObject() will increment the refcount */
+ }
listAddNodeTail(c->reply,getDecodedObject(obj));
}
cfd = anetAccept(server.neterr, fd, cip, &cport);
if (cfd == AE_ERR) {
- redisLog(REDIS_DEBUG,"Accepting client connection: %s", server.neterr);
+ redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
return;
}
- redisLog(REDIS_DEBUG,"Accepted %s:%d", cip, cport);
+ redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
if ((c = createClient(cfd)) == NULL) {
redisLog(REDIS_WARNING,"Error allocating resoures for the client");
close(cfd); /* May be already closed, just ingore errors */
o->encoding = REDIS_ENCODING_RAW;
o->ptr = ptr;
o->refcount = 1;
+ if (server.vm_enabled) {
+ o->vm.atime = server.unixtime;
+ o->storage = REDIS_VM_MEMORY;
+ }
return o;
}
return createObject(REDIS_STRING,sdsnewlen(ptr,len));
}
+/* Return a newly created string object holding a copy of the bytes of 'o'.
+ * sdslen() is applied to o->ptr, so 'o' must carry an sds string there. */
+static robj *dupStringObject(robj *o) {
+    sds payload = o->ptr;
+
+    return createStringObject(payload,sdslen(payload));
+}
+
static robj *createListObject(void) {
list *l = listCreate();
}
static void incrRefCount(robj *o) {
+ redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY); /* with VM enabled, only objects whose storage is REDIS_VM_MEMORY may gain a reference */
o->refcount++;
-#ifdef DEBUG_REFCOUNT
- if (o->type == REDIS_STRING)
- printf("Increment '%s'(%p), now is: %d\n",o->ptr,o,o->refcount);
-#endif
}
static void decrRefCount(void *obj) {
robj *o = obj;
-#ifdef DEBUG_REFCOUNT
- if (o->type == REDIS_STRING)
- printf("Decrement '%s'(%p), now is: %d\n",o->ptr,o,o->refcount-1);
-#endif
+ /* REDIS_VM_SWAPPED */
+ if (server.vm_enabled && o->storage == REDIS_VM_SWAPPED) {
+ redisAssert(o->refcount == 1);
+ redisAssert(o->type == REDIS_STRING);
+ freeStringObject(o);
+ vmMarkPagesFree(o->vm.page,o->vm.usedpages);
+ if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
+ !listAddNodeHead(server.objfreelist,o))
+ zfree(o);
+ server.vm_stats_swapped_objects--;
+ return;
+ }
+ /* REDIS_VM_MEMORY */
if (--(o->refcount) == 0) {
switch(o->type) {
case REDIS_STRING: freeStringObject(o); break;
static robj *lookupKey(redisDb *db, robj *key) {
dictEntry *de = dictFind(db->dict,key);
- return de ? dictGetEntryVal(de) : NULL;
+ if (de) {
+ robj *key = dictGetEntryKey(de); /* shadows the argument: this is the key object stored in the dict entry */
+ robj *val = dictGetEntryVal(de);
+
+ if (server.vm_enabled) {
+ if (key->storage == REDIS_VM_MEMORY) {
+ /* Update the access time of the key for the aging algorithm. */
+ key->vm.atime = server.unixtime;
+ } else {
+ /* The value was swapped out to disk: the entry holds NULL, so load
+ * the object back into memory and store it in the entry again. */
+ redisAssert(val == NULL);
+ val = vmLoadObject(key);
+ dictGetEntryVal(de) = val;
+ }
+ }
+ return val;
+ } else {
+ return NULL;
+ }
}
static robj *lookupKeyRead(redisDb *db, robj *key) {
if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
if (rdbSaveTime(fp,expiretime) == -1) goto werr;
}
- /* Save the key and associated value */
- if (rdbSaveType(fp,o->type) == -1) goto werr;
- if (rdbSaveStringObject(fp,key) == -1) goto werr;
- /* Save the actual value */
- if (rdbSaveObject(fp,o) == -1) goto werr;
+ /* Save the key and associated value. This requires special
+ * handling if the value is swapped out. */
+ if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY) {
+ /* Save type, key, value */
+ if (rdbSaveType(fp,o->type) == -1) goto werr;
+ if (rdbSaveStringObject(fp,key) == -1) goto werr;
+ if (rdbSaveObject(fp,o) == -1) goto werr;
+ } else {
+ robj *po, *newkey;
+ /* Get a preview of the object in memory */
+ po = vmPreviewObject(key);
+ /* Also duplicate the key object, to pass around a standard
+ * string object. */
+ newkey = dupStringObject(key);
+ /* Save type, key, value */
+ if (rdbSaveType(fp,key->vtype) == -1) goto werr;
+ if (rdbSaveStringObject(fp,newkey) == -1) goto werr;
+ if (rdbSaveObject(fp,po) == -1) goto werr;
+ /* Remove the loaded object from memory */
+ decrRefCount(po);
+ decrRefCount(newkey);
+ }
}
dictReleaseIterator(di);
}
*
* isencoded is set to 1 if the readed length is not actually a length but
* an "encoding type", check the above comments for more info */
-static uint32_t rdbLoadLen(FILE *fp, int rdbver, int *isencoded) {
+static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
unsigned char buf[2];
uint32_t len;
+ int type;
if (isencoded) *isencoded = 0;
- if (rdbver == 0) {
+ if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
+ type = (buf[0]&0xC0)>>6;
+ if (type == REDIS_RDB_6BITLEN) {
+ /* Read a 6 bit len */
+ return buf[0]&0x3F;
+ } else if (type == REDIS_RDB_ENCVAL) {
+ /* Read a 6 bit len encoding type */
+ if (isencoded) *isencoded = 1;
+ return buf[0]&0x3F;
+ } else if (type == REDIS_RDB_14BITLEN) {
+ /* Read a 14 bit len */
+ if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
+ return ((buf[0]&0x3F)<<8)|buf[1];
+ } else {
+ /* Read a 32 bit len */
if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
return ntohl(len);
- } else {
- int type;
-
- if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
- type = (buf[0]&0xC0)>>6;
- if (type == REDIS_RDB_6BITLEN) {
- /* Read a 6 bit len */
- return buf[0]&0x3F;
- } else if (type == REDIS_RDB_ENCVAL) {
- /* Read a 6 bit len encoding type */
- if (isencoded) *isencoded = 1;
- return buf[0]&0x3F;
- } else if (type == REDIS_RDB_14BITLEN) {
- /* Read a 14 bit len */
- if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
- return ((buf[0]&0x3F)<<8)|buf[1];
- } else {
- /* Read a 32 bit len */
- if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
- return ntohl(len);
- }
}
}
return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
}
-static robj *rdbLoadLzfStringObject(FILE*fp, int rdbver) {
+static robj *rdbLoadLzfStringObject(FILE*fp) {
unsigned int len, clen;
unsigned char *c = NULL;
sds val = NULL;
- if ((clen = rdbLoadLen(fp,rdbver,NULL)) == REDIS_RDB_LENERR) return NULL;
- if ((len = rdbLoadLen(fp,rdbver,NULL)) == REDIS_RDB_LENERR) return NULL;
+ if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
+ if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
if ((c = zmalloc(clen)) == NULL) goto err;
if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
if (fread(c,clen,1,fp) == 0) goto err;
return NULL;
}
-static robj *rdbLoadStringObject(FILE*fp, int rdbver) {
+static robj *rdbLoadStringObject(FILE*fp) {
int isencoded;
uint32_t len;
sds val;
- len = rdbLoadLen(fp,rdbver,&isencoded);
+ len = rdbLoadLen(fp,&isencoded);
if (isencoded) {
switch(len) {
case REDIS_RDB_ENC_INT8:
case REDIS_RDB_ENC_INT32:
return tryObjectSharing(rdbLoadIntegerObject(fp,len));
case REDIS_RDB_ENC_LZF:
- return tryObjectSharing(rdbLoadLzfStringObject(fp,rdbver));
+ return tryObjectSharing(rdbLoadLzfStringObject(fp));
default:
redisAssert(0!=0);
}
}
}
+/* Load a Redis object of the specified type from the specified file.
+ * On success a newly allocated object is returned, otherwise NULL.
+ *
+ * 'type' is one of REDIS_STRING, REDIS_LIST, REDIS_SET, REDIS_ZSET; any
+ * other value trips redisAssert().
+ *
+ * On a read error everything allocated so far for the value is released
+ * before returning NULL: the partially filled container is dropped with
+ * decrRefCount() (which is expected to release the elements already
+ * stored in it), and so are the element just read and the score buffer
+ * that were not yet linked into the container. */
+static robj *rdbLoadObject(int type, FILE *fp) {
+    robj *o;
+
+    if (type == REDIS_STRING) {
+        /* Read string value */
+        if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
+        tryObjectEncoding(o);
+    } else if (type == REDIS_LIST || type == REDIS_SET) {
+        /* Read list/set value */
+        uint32_t listlen;
+
+        if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
+        o = (type == REDIS_LIST) ? createListObject() : createSetObject();
+        /* Load every single element of the list/set */
+        while(listlen--) {
+            robj *ele;
+
+            if ((ele = rdbLoadStringObject(fp)) == NULL) {
+                decrRefCount(o); /* don't leak the partial container */
+                return NULL;
+            }
+            tryObjectEncoding(ele);
+            if (type == REDIS_LIST) {
+                listAddNodeTail((list*)o->ptr,ele);
+            } else {
+                dictAdd((dict*)o->ptr,ele,NULL);
+            }
+        }
+    } else if (type == REDIS_ZSET) {
+        /* Read sorted set value */
+        uint32_t zsetlen;
+        zset *zs;
+
+        if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
+        o = createZsetObject();
+        zs = o->ptr;
+        /* Load every single element of the sorted set */
+        while(zsetlen--) {
+            robj *ele;
+            double *score;
+
+            if ((ele = rdbLoadStringObject(fp)) == NULL) {
+                decrRefCount(o);
+                return NULL;
+            }
+            tryObjectEncoding(ele);
+            /* The score buffer is allocated only once the element is
+             * available, so the failure above has nothing else to free. */
+            score = zmalloc(sizeof(double));
+            if (rdbLoadDoubleValue(fp,score) == -1) {
+                /* Neither 'score' nor 'ele' were added to the zset yet. */
+                zfree(score);
+                decrRefCount(ele);
+                decrRefCount(o);
+                return NULL;
+            }
+            dictAdd(zs->dict,ele,score);
+            zslInsert(zs->zsl,*score,ele);
+            incrRefCount(ele); /* added to skiplist */
+        }
+    } else {
+        redisAssert(0 != 0);
+    }
+    return o;
+}
+
static int rdbLoad(char *filename) {
FILE *fp;
robj *keyobj = NULL;
redisDb *db = server.db+0;
char buf[1024];
time_t expiretime = -1, now = time(NULL);
+ long long loadedkeys = 0;
fp = fopen(filename,"r");
if (!fp) return REDIS_ERR;
return REDIS_ERR;
}
rdbver = atoi(buf+5);
- if (rdbver > 1) {
+ if (rdbver != 1) {
fclose(fp);
redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
return REDIS_ERR;
if (type == REDIS_EOF) break;
/* Handle SELECT DB opcode as a special case */
if (type == REDIS_SELECTDB) {
- if ((dbid = rdbLoadLen(fp,rdbver,NULL)) == REDIS_RDB_LENERR)
+ if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
goto eoferr;
if (dbid >= (unsigned)server.dbnum) {
redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
continue;
}
/* Read key */
- if ((keyobj = rdbLoadStringObject(fp,rdbver)) == NULL) goto eoferr;
-
- if (type == REDIS_STRING) {
- /* Read string value */
- if ((o = rdbLoadStringObject(fp,rdbver)) == NULL) goto eoferr;
- tryObjectEncoding(o);
- } else if (type == REDIS_LIST || type == REDIS_SET) {
- /* Read list/set value */
- uint32_t listlen;
-
- if ((listlen = rdbLoadLen(fp,rdbver,NULL)) == REDIS_RDB_LENERR)
- goto eoferr;
- o = (type == REDIS_LIST) ? createListObject() : createSetObject();
- /* Load every single element of the list/set */
- while(listlen--) {
- robj *ele;
-
- if ((ele = rdbLoadStringObject(fp,rdbver)) == NULL) goto eoferr;
- tryObjectEncoding(ele);
- if (type == REDIS_LIST) {
- listAddNodeTail((list*)o->ptr,ele);
- } else {
- dictAdd((dict*)o->ptr,ele,NULL);
- }
- }
- } else if (type == REDIS_ZSET) {
- /* Read list/set value */
- uint32_t zsetlen;
- zset *zs;
-
- if ((zsetlen = rdbLoadLen(fp,rdbver,NULL)) == REDIS_RDB_LENERR)
- goto eoferr;
- o = createZsetObject();
- zs = o->ptr;
- /* Load every single element of the list/set */
- while(zsetlen--) {
- robj *ele;
- double *score = zmalloc(sizeof(double));
-
- if ((ele = rdbLoadStringObject(fp,rdbver)) == NULL) goto eoferr;
- tryObjectEncoding(ele);
- if (rdbLoadDoubleValue(fp,score) == -1) goto eoferr;
- dictAdd(zs->dict,ele,score);
- zslInsert(zs->zsl,*score,ele);
- incrRefCount(ele); /* added to skiplist */
- }
- } else {
- redisAssert(0 != 0);
- }
+ if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
+ /* Read value */
+ if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
/* Add the new object in the hash table */
retval = dictAdd(d,keyobj,o);
if (retval == DICT_ERR) {
expiretime = -1;
}
keyobj = o = NULL;
+ /* Handle swapping while loading big datasets when VM is on */
+ loadedkeys++;
+ if (server.vm_enabled && (loadedkeys % 5000) == 0) {
+ while (zmalloc_used_memory() > server.vm_max_memory) {
+ if (vmSwapOneObject() == REDIS_ERR) break;
+ }
+ }
}
fclose(fp);
return REDIS_OK;
retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
if (retval == DICT_ERR) {
if (!nx) {
+ /* If the key is about a swapped value, we want a new key object
+ * to overwrite the old. So we delete the old key in the database.
+ * This will also make sure that swap pages about the old object
+ * will be marked as free. */
+ if (deleteIfSwapped(c->db,c->argv[1]))
+ incrRefCount(c->argv[1]);
dictReplace(c->db->dict,c->argv[1],c->argv[2]);
incrRefCount(c->argv[2]);
} else {
"redis_version:%s\r\n"
"arch_bits:%s\r\n"
"multiplexing_api:%s\r\n"
+ "process_id:%ld\r\n"
"uptime_in_seconds:%ld\r\n"
"uptime_in_days:%ld\r\n"
"connected_clients:%d\r\n"
"bgrewriteaof_in_progress:%d\r\n"
"total_connections_received:%lld\r\n"
"total_commands_processed:%lld\r\n"
+ "vm_enabled:%d\r\n"
"role:%s\r\n"
,REDIS_VERSION,
(sizeof(long) == 8) ? "64" : "32",
aeGetApiName(),
+ (long) getpid(),
uptime,
uptime/(3600*24),
listLength(server.clients)-listLength(server.slaves),
server.bgrewritechildpid != -1,
server.stat_numconnections,
server.stat_numcommands,
+ server.vm_enabled != 0,
server.masterhost == NULL ? "master" : "slave"
);
if (server.masterhost) {
server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
);
}
+ if (server.vm_enabled) {
+ info = sdscatprintf(info,
+ "vm_conf_max_memory:%llu\r\n"
+ "vm_conf_page_size:%llu\r\n"
+ "vm_conf_pages:%llu\r\n"
+ "vm_stats_used_pages:%llu\r\n"
+ "vm_stats_swapped_objects:%llu\r\n"
+ "vm_stats_swappin_count:%llu\r\n"
+ "vm_stats_swappout_count:%llu\r\n"
+ ,(unsigned long long) server.vm_max_memory,
+ (unsigned long long) server.vm_page_size,
+ (unsigned long long) server.vm_pages,
+ (unsigned long long) server.vm_stats_used_pages,
+ (unsigned long long) server.vm_stats_swapped_objects,
+ (unsigned long long) server.vm_stats_swapins,
+ (unsigned long long) server.vm_stats_swapouts
+ );
+ }
for (j = 0; j < server.dbnum; j++) {
long long keys, vkeys;
return;
}
if ((nwritten = write(fd,buf,buflen)) == -1) {
- redisLog(REDIS_DEBUG,"Write error sending DB to slave: %s",
+ redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
strerror(errno));
freeClient(slave);
return;
/* ============================ Maxmemory directive ======================== */
+/* Free one object from the pre-allocated objects free list. This is useful
+ * under low mem conditions as by default we take 1 million free objects
+ * allocated.
+ *
+ * The head node of server.objfreelist is unlinked and the object it holds
+ * is released with zfree(). If the list has no head node (empty list) the
+ * function does nothing. */
+static void freeOneObjectFromFreelist(void) {
+    robj *o;
+    listNode *head = listFirst(server.objfreelist);
+
+    if (head == NULL) return; /* nothing to free */
+    o = listNodeValue(head);
+    listDelNode(server.objfreelist,head);
+    zfree(o);
+}
+
/* This function gets called when 'maxmemory' is set on the config file to limit
* the max memory used by the server, and we are out of memory.
* This function will try to, in order:
static void freeMemoryIfNeeded(void) {
while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
if (listLength(server.objfreelist)) {
- robj *o;
-
- listNode *head = listFirst(server.objfreelist);
- o = listNodeValue(head);
- listDelNode(server.objfreelist,head);
- zfree(o);
+ freeOneObjectFromFreelist();
} else {
int j, k, freed = 0;
struct redisClient *fakeClient;
FILE *fp = fopen(filename,"r");
struct redis_stat sb;
+ unsigned long long loadedkeys = 0;
if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
return REDIS_ERR;
/* Clean up, ready for the next command */
for (j = 0; j < argc; j++) decrRefCount(argv[j]);
zfree(argv);
+ /* Handle swapping while loading big datasets when VM is on */
+ loadedkeys++;
+ if (server.vm_enabled && (loadedkeys % 5000) == 0) {
+ while (zmalloc_used_memory() > server.vm_max_memory) {
+ if (vmSwapOneObject() == REDIS_ERR) break;
+ }
+ }
}
fclose(fp);
freeFakeClient(fakeClient);
/* Iterate this DB writing every entry */
while((de = dictNext(di)) != NULL) {
- robj *key = dictGetEntryKey(de);
- robj *o = dictGetEntryVal(de);
- time_t expiretime = getExpire(db,key);
+ robj *key, *o;
+ time_t expiretime;
+ int swapped;
+
+ key = dictGetEntryKey(de);
+ if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY) {
+ o = dictGetEntryVal(de);
+ swapped = 0;
+ } else {
+ o = vmPreviewObject(key);
+ key = dupStringObject(key);
+ swapped = 1;
+ }
+ expiretime = getExpire(db,key);
/* Save the key and associated value */
if (o->type == REDIS_STRING) {
if (fwriteBulk(fp,key) == 0) goto werr;
if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
}
+ /* We created a few temp objects if the key->value pair
+ * was about a swapped out object. Free both. */
+ if (swapped) {
+ decrRefCount(key);
+ decrRefCount(o);
+ }
}
dictReleaseIterator(di);
}
server.vm_fd = fileno(server.vm_fp);
server.vm_next_page = 0;
server.vm_near_pages = 0;
+ server.vm_stats_used_pages = 0;
+ server.vm_stats_swapped_objects = 0;
+ server.vm_stats_swapouts = 0;
+ server.vm_stats_swapins = 0;
totsize = server.vm_pages*server.vm_page_size;
redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
if (ftruncate(server.vm_fd,totsize) == -1) {
} else {
redisLog(REDIS_NOTICE,"Swap file allocated with success");
}
- server.vm_bitmap = zmalloc((server.vm_near_pages+7)/8);
- memset(server.vm_bitmap,0,(server.vm_near_pages+7)/8);
+ server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
+ redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
+ (long long) (server.vm_pages+7)/8, server.vm_pages);
+ memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
/* Try to remove the swap file, so the OS will really delete it from the
* file system when Redis exists. */
unlink("/tmp/redisvm");
off_t byte = page/8;
int bit = page&7;
server.vm_bitmap[byte] |= 1<<bit;
+ redisLog(REDIS_DEBUG,"Mark used: %lld (byte:%lld bit:%d)\n",
+ (long long)page, (long long)byte, bit);
}
/* Mark N contiguous pages as used, with 'page' being the first. */
off_t j;
for (j = 0; j < count; j++)
- vmMarkPageUsed(page+count);
+ vmMarkPageUsed(page+j);
+ server.vm_stats_used_pages += count;
}
/* Mark the page as free */
off_t j;
for (j = 0; j < count; j++)
- vmMarkPageFree(page+count);
+ vmMarkPageFree(page+j);
+ server.vm_stats_used_pages -= count;
}
/* Test if the page is free */
static int vmFreePage(off_t page) {
off_t byte = page/8;
int bit = page&7;
- return server.vm_bitmap[byte] & bit;
+ return (server.vm_bitmap[byte] & (1<<bit)) == 0; /* the page's bit is clear when the page is free: 1 = free, 0 = used */
}
/* Find N contiguous free pages storing the first page of the cluster in *first.
- * Returns 1 if it was able to find N contiguous pages, otherwise 0 is
- * returned.
+ * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
+ * REDIS_ERR is returned.
*
* This function uses a simple algorithm: we try to allocate
* REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
while(offset < server.vm_pages) {
off_t this = base+offset;
+ redisLog(REDIS_DEBUG, "THIS: %lld (%c)\n", (long long) this, vmFreePage(this) ? 'F' : 'X');
/* If we overflow, restart from page zero */
if (this >= server.vm_pages) {
this -= server.vm_pages;
numfree++;
/* Already got N free pages? Return to the caller, with success */
if (numfree == n) {
- *first = this;
- return 1;
+ *first = this-(n-1);
+ server.vm_next_page = this+1;
+ return REDIS_OK;
}
} else {
/* The current one is not a free page */
offset++;
}
}
- return 0;
+ return REDIS_ERR;
+}
+
+/* Swap the 'val' object relative to 'key' into disk. Store all the information
+ * needed to later retrieve the object into the key object.
+ *
+ * 'key' must be in memory (storage REDIS_VM_MEMORY) and not shared
+ * (refcount 1); both conditions are asserted.
+ *
+ * Returns REDIS_OK on success. In that case the key is flagged as
+ * REDIS_VM_SWAPPED, it records page / number of pages / value type, the
+ * reference to 'val' is dropped and the pages are marked as used.
+ *
+ * Returns REDIS_ERR if we can't find enough contiguous empty pages, if
+ * the seek fails, or if the object can't be written to the swap file.
+ * On error 'key' and 'val' are not modified, so the value is still
+ * available in memory. */
+static int vmSwapObject(robj *key, robj *val) {
+    off_t pages = rdbSavedObjectPages(val);
+    off_t page;
+
+    redisAssert(key->storage == REDIS_VM_MEMORY);
+    redisAssert(key->refcount == 1);
+    if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
+    if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
+        redisLog(REDIS_WARNING,
+            "Critical VM problem in vmSwapObject(): can't seek: %s",
+            strerror(errno));
+        return REDIS_ERR;
+    }
+    /* The in-memory value is released below, so make sure the serialized
+     * copy was really handed to the swap file before touching the key:
+     * a failed write must not turn into a lost value. */
+    if (rdbSaveObject(server.vm_fp,val) == -1 ||
+        fflush(server.vm_fp) == EOF)
+    {
+        redisLog(REDIS_WARNING,
+            "Critical VM problem in vmSwapObject(): can't write: %s",
+            strerror(errno));
+        return REDIS_ERR;
+    }
+    key->vm.page = page;
+    key->vm.usedpages = pages;
+    key->storage = REDIS_VM_SWAPPED;
+    key->vtype = val->type;
+    decrRefCount(val); /* Deallocate the object from memory. */
+    vmMarkPagesUsed(page,pages);
+    redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
+        (unsigned char*) key->ptr,
+        (long long) page, (long long) pages);
+    server.vm_stats_swapped_objects++;
+    server.vm_stats_swapouts++;
+    return REDIS_OK;
+}
+
+/* Read back from the swap file the value belonging to 'key' and return it
+ * as a newly allocated object. The key must be flagged REDIS_VM_SWAPPED.
+ *
+ * With 'preview' false the key goes back to REDIS_VM_MEMORY, its access
+ * time is refreshed, its pages are marked free and the swapped objects
+ * counter is decremented. With 'preview' true the key and the page
+ * bitmap are left alone: only the unserialized object is handed back.
+ * The swap-ins counter is incremented in both cases.
+ *
+ * A failing seek or load is logged and terminates the process. */
+static robj *vmGenericLoadObject(robj *key, int preview) {
+    off_t where;
+    robj *loaded;
+
+    redisAssert(key->storage == REDIS_VM_SWAPPED);
+    where = key->vm.page*server.vm_page_size;
+    if (fseeko(server.vm_fp,where,SEEK_SET) == -1) {
+        redisLog(REDIS_WARNING,
+            "Unrecoverable VM problem in vmLoadObject(): can't seek: %s",
+            strerror(errno));
+        exit(1);
+    }
+    loaded = rdbLoadObject(key->vtype,server.vm_fp);
+    if (loaded == NULL) {
+        redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmLoadObject(): can't load object from swap file: %s", strerror(errno));
+        exit(1);
+    }
+    if (preview) {
+        redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
+            (unsigned char*) key->ptr);
+    } else {
+        key->storage = REDIS_VM_MEMORY;
+        key->vm.atime = server.unixtime;
+        vmMarkPagesFree(key->vm.page,key->vm.usedpages);
+        redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
+            (unsigned char*) key->ptr);
+        server.vm_stats_swapped_objects--;
+    }
+    server.vm_stats_swapins++;
+    return loaded;
+}
+
+/* Plain object loading, from swap to memory: calls vmGenericLoadObject()
+ * with preview set to 0 and returns the object it returns. */
+static robj *vmLoadObject(robj *key) {
+ return vmGenericLoadObject(key,0);
+}
+
+/* Just load the value stored on disk, without modifying the key: calls
+ * vmGenericLoadObject() with preview set to 1.
+ * This is useful when we want to perform some operation on the value
+ * without really bringing it back from swap to memory, like while saving
+ * the dataset or rewriting the append only log. */
+static robj *vmPreviewObject(robj *key) {
+ return vmGenericLoadObject(key,1);
+}
+
+/* How a good candidate is this object for swapping?
+ * The better candidate it is, the greater the returned value.
+ *
+ * Currently we try to perform a fast estimation of the object size in
+ * memory, and combine it with aging informations.
+ *
+ * Basically swappability = idle-time * log(estimated size)
+ *
+ * Bigger objects are preferred over smaller objects, but not
+ * proportionally, this is why we use the logarithm. This algorithm is
+ * just a first try and will probably be tuned later.
+ *
+ * The idle time is server.unixtime minus the atime stored in the object;
+ * when it is zero or negative the function returns 0. For lists the size
+ * of the first element is taken as the size of every element, for sets
+ * and sorted sets the size of one random element is. */
+static double computeObjectSwappability(robj *o) {
+    time_t age = server.unixtime - o->vm.atime;
+    long asize = 0;
+    list *l;
+    listNode *ln;
+    dict *d;
+    struct dictEntry *de;
+    int z;
+
+    if (age <= 0) return 0;
+    switch(o->type) {
+    case REDIS_STRING:
+        if (o->encoding != REDIS_ENCODING_RAW) {
+            asize = sizeof(*o);
+        } else {
+            asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
+        }
+        break;
+    case REDIS_LIST:
+        l = o->ptr;
+        ln = listFirst(l);
+
+        asize = sizeof(list);
+        if (ln) {
+            robj *ele = ln->value;
+            long elesize;
+
+            elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
+                    (sizeof(*o)+sdslen(ele->ptr)) :
+                    sizeof(*o);
+            asize += (sizeof(listNode)+elesize)*listLength(l);
+        }
+        break;
+    case REDIS_SET:
+    case REDIS_ZSET:
+        z = (o->type == REDIS_ZSET);
+        d = z ? ((zset*)o->ptr)->dict : o->ptr;
+
+        asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
+        if (z) asize += sizeof(zset)-sizeof(dict);
+        if (dictSize(d)) {
+            long elesize;
+            robj *ele;
+
+            de = dictGetRandomKey(d);
+            ele = dictGetEntryKey(de);
+            elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
+                    (sizeof(*o)+sdslen(ele->ptr)) :
+                    sizeof(*o);
+            asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
+            if (z) asize += sizeof(zskiplistNode)*dictSize(d);
+        }
+        break;
+    }
+    /* The idle time is the linear factor, the size only weighs in through
+     * its logarithm: using the size for both factors would ignore aging. */
+    return (double)age*log(1+asize);
+}
+
+/* Try to swap an object that's a good candidate for swapping.
+ * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
+ * to swap any object at all.
+ *
+ * For every non empty database five random keys are sampled. Keys whose
+ * value is not in memory are not counted as a sample, for at most 1000
+ * tries per database. The sampled entry with the highest swappability
+ * is then swapped out with vmSwapObject(), after replacing a shared key
+ * object (refcount > 1) with a private copy. On success the value
+ * stored in the dict entry is set to NULL. */
+static int vmSwapOneObject(void) {
+    int j, i;
+    struct dictEntry *best = NULL;
+    double best_swappability = 0;
+    robj *key, *val;
+
+    for (j = 0; j < server.dbnum; j++) {
+        redisDb *db = server.db+j;
+        int maxtries = 1000;
+
+        if (dictSize(db->dict) == 0) continue;
+        for (i = 0; i < 5; i++) {
+            dictEntry *de;
+            double swappability;
+
+            if (maxtries) maxtries--;
+            de = dictGetRandomKey(db->dict);
+            key = dictGetEntryKey(de);
+            val = dictGetEntryVal(de);
+            if (key->storage != REDIS_VM_MEMORY) {
+                if (maxtries) i--; /* don't count this try */
+                continue;
+            }
+            /* The access time is refreshed on the key object at lookup
+             * time, while the swappability is computed on the value:
+             * hand the key's atime over so that aging is honoured. */
+            val->vm.atime = key->vm.atime;
+            swappability = computeObjectSwappability(val);
+            if (!best || swappability > best_swappability) {
+                best = de;
+                best_swappability = swappability;
+            }
+        }
+    }
+    if (best == NULL) {
+        redisLog(REDIS_DEBUG,"No swappable key found!");
+        return REDIS_ERR;
+    }
+    key = dictGetEntryKey(best);
+    val = dictGetEntryVal(best);
+
+    redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
+        key->ptr, best_swappability);
+
+    /* Unshare the key if needed */
+    if (key->refcount > 1) {
+        robj *newkey = dupStringObject(key);
+        decrRefCount(key);
+        key = dictGetEntryKey(best) = newkey;
+    }
+    /* Swap it */
+    if (vmSwapObject(key,val) == REDIS_OK) {
+        dictGetEntryVal(best) = NULL;
+        return REDIS_OK;
+    } else {
+        return REDIS_ERR;
+    }
+}
+
+/* Tell the caller whether swapping objects out is safe right now.
+ * It is not while a background save or a background append only file
+ * rewrite child is running, that is, while server.bgsavechildpid or
+ * server.bgrewritechildpid is different from -1.
+ * Returns 1 when swapping out is allowed, 0 otherwise. */
+static int vmCanSwapOut(void) {
+    if (server.bgsavechildpid != -1) return 0;
+    if (server.bgrewritechildpid != -1) return 0;
+    return 1;
+}
+
+/* Delete a key if swapped. Returns 1 if the key was found, was swapped
+ * and was deleted. Otherwise 0 is returned.
+ *
+ * When VM is disabled no key can be swapped, and object creation only
+ * sets the 'storage' field when VM is enabled, so in that case 0 is
+ * returned without inspecting the field. */
+static int deleteIfSwapped(redisDb *db, robj *key) {
+    dictEntry *de;
+    robj *foundkey;
+
+    if (!server.vm_enabled) return 0;
+    if ((de = dictFind(db->dict,key)) == NULL) return 0;
+    foundkey = dictGetEntryKey(de);
+    if (foundkey->storage == REDIS_VM_MEMORY) return 0;
+    deleteKey(db,key);
+    return 1;
}
/* ================================= Debugging ============================== */
"+Key at:%p refcount:%d, value at:%p refcount:%d encoding:%d serializedlength:%lld\r\n",
(void*)key, key->refcount, (void*)val, val->refcount,
val->encoding, rdbSavedObjectLen(val)));
+ } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
+ dictEntry *de = dictFind(c->db->dict,c->argv[2]);
+ robj *key, *val;
+
+ if (!server.vm_enabled) {
+ addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
+ return;
+ }
+ if (!de) {
+ addReply(c,shared.nokeyerr);
+ return;
+ }
+ key = dictGetEntryKey(de);
+ val = dictGetEntryVal(de);
+ /* If the key is shared we want to create a copy */
+ if (key->refcount > 1) {
+ robj *newkey = dupStringObject(key);
+ decrRefCount(key);
+ key = dictGetEntryKey(de) = newkey;
+ }
+ /* Swap it */
+ if (key->storage != REDIS_VM_MEMORY) {
+ addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
+ } else if (vmSwapObject(key,val) == REDIS_OK) {
+ dictGetEntryVal(de) = NULL;
+ addReply(c,shared.ok);
+ } else {
+ addReply(c,shared.err);
+ }
} else {
addReplySds(c,sdsnew(
- "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|RELOAD]\r\n"));
+ "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
}
}