#include <sys/uio.h>
#include <limits.h>
#include <math.h>
+#include <pthread.h>
#if defined(__sun)
#include "solarisfixes.h"
* Check vmFindContiguousPages() to know more about this magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
+#define REDIS_VM_MAX_THREADS 32
/* Client flags */
#define REDIS_CLOSE 1 /* This client connection should be closed ASAP */
#define REDIS_MONITOR 8 /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 16 /* This client is in a MULTI context */
#define REDIS_BLOCKED 32 /* The client is waiting in a blocking operation */
+#define REDIS_IO_WAIT 64 /* The client is waiting for Virtual Memory I/O */
/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0 /* No active replication */
/* Log levels */
#define REDIS_DEBUG 0
-#define REDIS_NOTICE 1
-#define REDIS_WARNING 2
+#define REDIS_VERBOSE 1
+#define REDIS_NOTICE 2
+#define REDIS_WARNING 3
/* Anti-warning macro... */
#define REDIS_NOTUSED(V) ((void) V)
int blockingkeysnum; /* Number of blocking keys */
time_t blockingto; /* Blocking operation timeout. If UNIX current time
* is >= blockingto then the operation timed out. */
+ list *io_keys; /* Keys this client is waiting to be loaded from the
+ * swap file in order to continue. */
} redisClient;
struct saveparam {
off_t vm_near_pages; /* Number of pages allocated sequentially */
unsigned char *vm_bitmap; /* Bitmap of free/used pages */
time_t unixtime; /* Unix time sampled every second. */
+ /* Virtual memory I/O threads stuff */
+ /* An I/O thread process an element taken from the io_jobs queue and
+ * put the result of the operation in the io_done list. While the
+ * job is being processed, it's put on io_processing queue. */
+ list *io_newjobs; /* List of VM I/O jobs yet to be processed */
+ list *io_processing; /* List of VM I/O jobs being processed */
+ list *io_processed; /* List of VM I/O jobs already processed */
+ list *io_clients; /* All the clients waiting for SWAP I/O operations */
+ pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
+ int io_active_threads; /* Number of running I/O threads */
+ int vm_max_threads; /* Max number of I/O threads running at the same time */
+ /* Our main thread is blocked on the event loop, locking for sockets ready
+ * to be read or written, so when a threaded I/O operation is ready to be
+ * processed by the main thread, the I/O thread will use a unix pipe to
+ * awake the main thread. The followings are the two pipe FDs. */
+ int io_ready_pipe_read;
+ int io_ready_pipe_write;
+ /* Virtual memory stats */
+ unsigned long long vm_stats_used_pages;
+ unsigned long long vm_stats_swapped_objects;
+ unsigned long long vm_stats_swapouts;
+ unsigned long long vm_stats_swapins;
+ FILE *devnull;
};
typedef void redisCommandProc(redisClient *c);
static double R_Zero, R_PosInf, R_NegInf, R_Nan;
+/* VM threaded I/O request message */
+#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
+#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
+#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
+typedef struct iojon {
+ int type; /* Request type, REDIS_IOJOB_* */
+ redisDb *db;/* Redis database */
+ robj *key; /* This I/O request is about swapping this key */
+ robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
+ * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
+ off_t page; /* Swap page where to read/write the object */
+ off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
+ int canceled; /* True if this command was canceled by blocking side of VM */
+ pthread_t thread; /* ID of the thread processing this entry */
+} iojob;
+
/*================================ Prototypes =============================== */
static void freeStringObject(robj *o);
static void vmMarkPagesFree(off_t page, off_t count);
static robj *vmLoadObject(robj *key);
static robj *vmPreviewObject(robj *key);
-static int vmSwapOneObject(void);
+static int vmSwapOneObjectBlocking(void);
+static int vmSwapOneObjectThreaded(void);
static int vmCanSwapOut(void);
+static void freeOneObjectFromFreelist(void);
+static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
+static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
+static void vmCancelThreadedIOJob(robj *o);
+static void lockThreadedIO(void);
+static void unlockThreadedIO(void);
+static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
+static void freeIOJob(iojob *j);
+static void queueIOJob(iojob *j);
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
!(c->flags & REDIS_MASTER) && /* no timeout for masters */
(now - c->lastinteraction > server.maxidletime))
{
- redisLog(REDIS_DEBUG,"Closing idle client");
+ redisLog(REDIS_VERBOSE,"Closing idle client");
freeClient(c);
} else if (c->flags & REDIS_BLOCKED) {
if (c->blockingto != 0 && c->blockingto < now) {
for (j = 0; j < server.dbnum; j++) {
if (htNeedsResize(server.db[j].dict)) {
- redisLog(REDIS_DEBUG,"The hash table %d is too sparse, resize it...",j);
+ redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
dictResize(server.db[j].dict);
- redisLog(REDIS_DEBUG,"Hash table %d resized.",j);
+ redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
}
if (htNeedsResize(server.db[j].expires))
dictResize(server.db[j].expires);
used = dictSize(server.db[j].dict);
vkeys = dictSize(server.db[j].expires);
if (!(loops % 5) && (used || vkeys)) {
- redisLog(REDIS_DEBUG,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
+ redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
/* dictPrintStats(server.dict); */
}
}
/* Show information about connected clients */
if (!(loops % 5)) {
- redisLog(REDIS_DEBUG,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
+ redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
listLength(server.clients)-listLength(server.slaves),
listLength(server.slaves),
server.usedmemory,
}
/* Swap a few keys on disk if we are over the memory limit and VM
- * is enbled. */
+ * is enbled. Try to free objects from the free list first. */
if (vmCanSwapOut()) {
while (server.vm_enabled && zmalloc_used_memory() >
- server.vm_max_memory) {
- if (vmSwapOneObject() == REDIS_ERR) {
- if (zmalloc_used_memory() >
- (server.vm_max_memory+server.vm_max_memory/10)) {
- redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
+ server.vm_max_memory)
+ {
+ if (listLength(server.objfreelist)) {
+ freeOneObjectFromFreelist();
+ } else {
+ if (vmSwapOneObjectThreaded() == REDIS_ERR) {
+ if ((loops % 30) == 0 && zmalloc_used_memory() >
+ (server.vm_max_memory+server.vm_max_memory/10)) {
+ redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
+ }
}
+ /* Note that we freed just one object, because anyway when
+ * the I/O thread in charge to swap this object out will
+ * do its work, the handler of completed jobs will try to swap
+ * more objects if we are out of memory. */
break;
}
}
static void initServerConfig() {
server.dbnum = REDIS_DEFAULT_DBNUM;
server.port = REDIS_SERVERPORT;
- server.verbosity = REDIS_DEBUG;
+ server.verbosity = REDIS_VERBOSE;
server.maxidletime = REDIS_MAXIDLETIME;
server.saveparams = NULL;
server.logfile = NULL; /* NULL = log on standard output */
server.vm_page_size = 256; /* 256 bytes per page */
server.vm_pages = 1024*1024*100; /* 104 millions of pages */
server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
+ server.vm_max_threads = 4;
resetServerSaveParams();
signal(SIGPIPE, SIG_IGN);
setupSigSegvAction();
+ server.devnull = fopen("/dev/null","w");
+ if (server.devnull == NULL) {
+ redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
+ exit(1);
+ }
server.clients = listCreate();
server.slaves = listCreate();
server.monitors = listCreate();
server.stat_starttime = time(NULL);
server.unixtime = time(NULL);
aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
+ if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
+ acceptHandler, NULL) == AE_ERR) oom("creating file event");
if (server.appendonly) {
server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
}
} else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
+ else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
else {
server.vm_page_size = strtoll(argv[1], NULL, 10);
} else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
server.vm_pages = strtoll(argv[1], NULL, 10);
+ } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
+ server.vm_max_threads = strtoll(argv[1], NULL, 10);
} else {
err = "Bad directive or wrong number of arguments"; goto loaderr;
}
listRelease(c->reply);
freeClientArgv(c);
close(c->fd);
+ /* Remove from the list of clients */
ln = listSearchKey(server.clients,c);
redisAssert(ln != NULL);
listDelNode(server.clients,ln);
+ /* Remove from the list of clients waiting for VM operations */
+ if (server.vm_enabled && listLength(c->io_keys)) {
+ ln = listSearchKey(server.io_clients,c);
+ if (ln) listDelNode(server.io_clients,ln);
+ listRelease(c->io_keys);
+ }
+ listRelease(c->io_keys);
+ /* Other cleanup */
if (c->flags & REDIS_SLAVE) {
if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
close(c->repldbfd);
if (errno == EAGAIN) {
nwritten = 0;
} else {
- redisLog(REDIS_DEBUG,
+ redisLog(REDIS_VERBOSE,
"Error writing to client: %s", strerror(errno));
freeClient(c);
return;
/* write all collected blocks at once */
if((nwritten = writev(fd, iov, ion)) < 0) {
if (errno != EAGAIN) {
- redisLog(REDIS_DEBUG,
+ redisLog(REDIS_VERBOSE,
"Error writing to client: %s", strerror(errno));
freeClient(c);
return;
* would not be called at all, but after the execution of the first commands
* in the input buffer the client may be blocked, and the "goto again"
* will try to reiterate. The following line will make it return asap. */
- if (c->flags & REDIS_BLOCKED) return;
+ if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
if (c->bulklen == -1) {
/* Read the first line of the query */
char *p = strchr(c->querybuf,'\n');
}
return;
} else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
- redisLog(REDIS_DEBUG, "Client protocol error");
+ redisLog(REDIS_VERBOSE, "Client protocol error");
freeClient(c);
return;
}
if (errno == EAGAIN) {
nread = 0;
} else {
- redisLog(REDIS_DEBUG, "Reading from client: %s",strerror(errno));
+ redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
freeClient(c);
return;
}
} else if (nread == 0) {
- redisLog(REDIS_DEBUG, "Client closed connection");
+ redisLog(REDIS_VERBOSE, "Client closed connection");
freeClient(c);
return;
}
c->authenticated = 0;
c->replstate = REDIS_REPL_NONE;
c->reply = listCreate();
- c->blockingkeys = NULL;
- c->blockingkeysnum = 0;
listSetFreeMethod(c->reply,decrRefCount);
listSetDupMethod(c->reply,dupClientReplyValue);
+ c->blockingkeys = NULL;
+ c->blockingkeysnum = 0;
+ c->io_keys = listCreate();
+ listSetFreeMethod(c->io_keys,decrRefCount);
if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
readQueryFromClient, c) == AE_ERR) {
freeClient(c);
cfd = anetAccept(server.neterr, fd, cip, &cport);
if (cfd == AE_ERR) {
- redisLog(REDIS_DEBUG,"Accepting client connection: %s", server.neterr);
+ redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
return;
}
- redisLog(REDIS_DEBUG,"Accepted %s:%d", cip, cport);
+ redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
if ((c = createClient(cfd)) == NULL) {
redisLog(REDIS_WARNING,"Error allocating resoures for the client");
close(cfd); /* May be already closed, just ingore errors */
}
static robj *dupStringObject(robj *o) {
+ assert(o->encoding == REDIS_ENCODING_RAW);
return createStringObject(o->ptr,sdslen(o->ptr));
}
static void decrRefCount(void *obj) {
robj *o = obj;
- /* REDIS_VM_SWAPPED */
- if (server.vm_enabled && o->storage == REDIS_VM_SWAPPED) {
- redisAssert(o->refcount == 1);
+ /* Object is swapped out, or in the process of being loaded. */
+ if (server.vm_enabled &&
+ (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
+ {
+ if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
+ redisAssert(o->refcount == 1);
+ }
+ if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
redisAssert(o->type == REDIS_STRING);
freeStringObject(o);
vmMarkPagesFree(o->vm.page,o->vm.usedpages);
if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
!listAddNodeHead(server.objfreelist,o))
zfree(o);
+ server.vm_stats_swapped_objects--;
return;
}
- /* REDIS_VM_MEMORY */
+ /* Object is in memory, or in the process of being swapped out. */
if (--(o->refcount) == 0) {
+ if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
+ vmCancelThreadedIOJob(obj);
switch(o->type) {
case REDIS_STRING: freeStringObject(o); break;
case REDIS_LIST: freeListObject(o); break;
robj *val = dictGetEntryVal(de);
if (server.vm_enabled) {
- if (key->storage == REDIS_VM_MEMORY) {
+ if (key->storage == REDIS_VM_MEMORY ||
+ key->storage == REDIS_VM_SWAPPING)
+ {
+ /* If we were swapping the object out, stop it, this key
+ * was requested. */
+ if (key->storage == REDIS_VM_SWAPPING)
+ vmCancelThreadedIOJob(key);
/* Update the access time of the key for the aging algorithm. */
key->vm.atime = server.unixtime;
} else {
static int rdbSaveStringObject(FILE *fp, robj *obj) {
int retval;
- obj = getDecodedObject(obj);
- retval = rdbSaveStringObjectRaw(fp,obj);
- decrRefCount(obj);
+ if (obj->storage == REDIS_VM_MEMORY &&
+ obj->encoding != REDIS_ENCODING_RAW)
+ {
+ obj = getDecodedObject(obj);
+ retval = rdbSaveStringObjectRaw(fp,obj);
+ decrRefCount(obj);
+ } else {
+ /* This is a fast path when we are sure the object is not encoded.
+ * Note that's any *faster* actually as we needed to add the conditional
+ * but because this may happen in a background process we don't want
+ * to touch the object fields with incr/decrRefCount in order to
+ * preveny copy on write of pages.
+ *
+ * Also incrRefCount() will have a failing assert() if we try to call
+ * it against an object with storage != REDIS_VM_MEMORY. */
+ retval = rdbSaveStringObjectRaw(fp,obj);
+ }
return retval;
}
* the rdbSaveObject() function. Currently we use a trick to get
* this length with very little changes to the code. In the future
* we could switch to a faster solution. */
-static off_t rdbSavedObjectLen(robj *o) {
- static FILE *fp = NULL;
-
- if (fp == NULL) fp = fopen("/dev/null","w");
- assert(fp != NULL);
-
+static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
+ if (fp == NULL) fp = server.devnull;
rewind(fp);
assert(rdbSaveObject(fp,o) != 1);
return ftello(fp);
}
/* Return the number of pages required to save this object in the swap file */
-static off_t rdbSavedObjectPages(robj *o) {
- off_t bytes = rdbSavedObjectLen(o);
+static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
+ off_t bytes = rdbSavedObjectLen(o,fp);
return (bytes+(server.vm_page_size-1))/server.vm_page_size;
}
}
/* Save the key and associated value. This requires special
* handling if the value is swapped out. */
- if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY) {
+ if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
+ key->storage == REDIS_VM_SWAPPING) {
/* Save type, key, value */
if (rdbSaveType(fp,o->type) == -1) goto werr;
if (rdbSaveStringObject(fp,key) == -1) goto werr;
if (rdbSaveObject(fp,o) == -1) goto werr;
} else {
- robj *po, *newkey;
+ /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
+ robj *po;
/* Get a preview of the object in memory */
po = vmPreviewObject(key);
- /* Also duplicate the key object, to pass around a standard
- * string object. */
- newkey = dupStringObject(key);
/* Save type, key, value */
if (rdbSaveType(fp,key->vtype) == -1) goto werr;
- if (rdbSaveStringObject(fp,newkey) == -1) goto werr;
+ if (rdbSaveStringObject(fp,key) == -1) goto werr;
if (rdbSaveObject(fp,po) == -1) goto werr;
/* Remove the loaded object from memory */
decrRefCount(po);
- decrRefCount(newkey);
}
}
dictReleaseIterator(di);
redisDb *db = server.db+0;
char buf[1024];
time_t expiretime = -1, now = time(NULL);
+ long long loadedkeys = 0;
fp = fopen(filename,"r");
if (!fp) return REDIS_ERR;
expiretime = -1;
}
keyobj = o = NULL;
+ /* Handle swapping while loading big datasets when VM is on */
+ loadedkeys++;
+ if (server.vm_enabled && (loadedkeys % 5000) == 0) {
+ while (zmalloc_used_memory() > server.vm_max_memory) {
+ if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
+ }
+ }
}
fclose(fp);
return REDIS_OK;
zfree(vector);
}
+/* Convert an amount of bytes into a human readable string in the form
+ * of 100B, 2G, 100M, 4K, and so forth. */
+static void bytesToHuman(char *s, unsigned long long n) {
+ double d;
+
+ if (n < 1024) {
+ /* Bytes */
+ sprintf(s,"%lluB",n);
+ return;
+ } else if (n < (1024*1024)) {
+ d = (double)n/(1024);
+ sprintf(s,"%.2fK",d);
+ } else if (n < (1024LL*1024*1024)) {
+ d = (double)n/(1024*1024);
+ sprintf(s,"%.2fM",d);
+ } else if (n < (1024LL*1024*1024*1024)) {
+ d = (double)n/(1024LL*1024*1024);
+ sprintf(s,"%.2fM",d);
+ }
+}
+
/* Create the string returned by the INFO command. This is decoupled
* by the INFO command itself as we need to report the same information
* on memory corruption problems. */
sds info;
time_t uptime = time(NULL)-server.stat_starttime;
int j;
-
+ char hmem[64];
+
+ bytesToHuman(hmem,server.usedmemory);
info = sdscatprintf(sdsempty(),
"redis_version:%s\r\n"
"arch_bits:%s\r\n"
"multiplexing_api:%s\r\n"
+ "process_id:%ld\r\n"
"uptime_in_seconds:%ld\r\n"
"uptime_in_days:%ld\r\n"
"connected_clients:%d\r\n"
"connected_slaves:%d\r\n"
"blocked_clients:%d\r\n"
"used_memory:%zu\r\n"
+ "used_memory_human:%s\r\n"
"changes_since_last_save:%lld\r\n"
"bgsave_in_progress:%d\r\n"
"last_save_time:%ld\r\n"
"bgrewriteaof_in_progress:%d\r\n"
"total_connections_received:%lld\r\n"
"total_commands_processed:%lld\r\n"
+ "vm_enabled:%d\r\n"
"role:%s\r\n"
,REDIS_VERSION,
(sizeof(long) == 8) ? "64" : "32",
aeGetApiName(),
+ (long) getpid(),
uptime,
uptime/(3600*24),
listLength(server.clients)-listLength(server.slaves),
listLength(server.slaves),
server.blockedclients,
server.usedmemory,
+ hmem,
server.dirty,
server.bgsavechildpid != -1,
server.lastsave,
server.bgrewritechildpid != -1,
server.stat_numconnections,
server.stat_numcommands,
+ server.vm_enabled != 0,
server.masterhost == NULL ? "master" : "slave"
);
if (server.masterhost) {
server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
);
}
+ if (server.vm_enabled) {
+ info = sdscatprintf(info,
+ "vm_conf_max_memory:%llu\r\n"
+ "vm_conf_page_size:%llu\r\n"
+ "vm_conf_pages:%llu\r\n"
+ "vm_stats_used_pages:%llu\r\n"
+ "vm_stats_swapped_objects:%llu\r\n"
+ "vm_stats_swappin_count:%llu\r\n"
+ "vm_stats_swappout_count:%llu\r\n"
+ "vm_stats_io_newjobs_len:%lu\r\n"
+ "vm_stats_io_processing_len:%lu\r\n"
+ "vm_stats_io_processed_len:%lu\r\n"
+ "vm_stats_io_waiting_clients:%lu\r\n"
+ ,(unsigned long long) server.vm_max_memory,
+ (unsigned long long) server.vm_page_size,
+ (unsigned long long) server.vm_pages,
+ (unsigned long long) server.vm_stats_used_pages,
+ (unsigned long long) server.vm_stats_swapped_objects,
+ (unsigned long long) server.vm_stats_swapins,
+ (unsigned long long) server.vm_stats_swapouts,
+ (unsigned long) listLength(server.io_newjobs),
+ (unsigned long) listLength(server.io_processing),
+ (unsigned long) listLength(server.io_processed),
+ (unsigned long) listLength(server.io_clients)
+ );
+ }
for (j = 0; j < server.dbnum; j++) {
long long keys, vkeys;
return;
}
if ((nwritten = write(fd,buf,buflen)) == -1) {
- redisLog(REDIS_DEBUG,"Write error sending DB to slave: %s",
+ redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
strerror(errno));
freeClient(slave);
return;
/* ============================ Maxmemory directive ======================== */
+/* Free one object form the pre-allocated objects free list. This is useful
+ * under low mem conditions as by default we take 1 million free objects
+ * allocated. */
+static void freeOneObjectFromFreelist(void) {
+ robj *o;
+
+ listNode *head = listFirst(server.objfreelist);
+ o = listNodeValue(head);
+ listDelNode(server.objfreelist,head);
+ zfree(o);
+}
+
/* This function gets called when 'maxmemory' is set on the config file to limit
* the max memory used by the server, and we are out of memory.
* This function will try to, in order:
static void freeMemoryIfNeeded(void) {
while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
if (listLength(server.objfreelist)) {
- robj *o;
-
- listNode *head = listFirst(server.objfreelist);
- o = listNodeValue(head);
- listDelNode(server.objfreelist,head);
- zfree(o);
+ freeOneObjectFromFreelist();
} else {
int j, k, freed = 0;
struct redisClient *fakeClient;
FILE *fp = fopen(filename,"r");
struct redis_stat sb;
+ unsigned long long loadedkeys = 0;
if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
return REDIS_ERR;
/* Clean up, ready for the next command */
for (j = 0; j < argc; j++) decrRefCount(argv[j]);
zfree(argv);
+ /* Handle swapping while loading big datasets when VM is on */
+ loadedkeys++;
+ if (server.vm_enabled && (loadedkeys % 5000) == 0) {
+ while (zmalloc_used_memory() > server.vm_max_memory) {
+ if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
+ }
+ }
}
fclose(fp);
freeFakeClient(fakeClient);
/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
static int fwriteBulk(FILE *fp, robj *obj) {
char buf[128];
- obj = getDecodedObject(obj);
+ int decrrc = 0;
+
+ if (obj->storage == REDIS_VM_MEMORY && obj->encoding != REDIS_ENCODING_RAW){
+ obj = getDecodedObject(obj);
+ decrrc = 1;
+ }
snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
goto err;
if (fwrite("\r\n",2,1,fp) == 0) goto err;
- decrRefCount(obj);
+ if (decrrc) decrRefCount(obj);
return 1;
err:
- decrRefCount(obj);
+ if (decrrc) decrRefCount(obj);
return 0;
}
int swapped;
key = dictGetEntryKey(de);
- if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY) {
+ /* If the value for this key is swapped, load a preview in memory.
+ * We use a "swapped" flag to remember if we need to free the
+ * value object instead to just increment the ref count anyway
+ * in order to avoid copy-on-write of pages if we are forked() */
+ if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
+ key->storage == REDIS_VM_SWAPPING) {
o = dictGetEntryVal(de);
swapped = 0;
} else {
o = vmPreviewObject(key);
- key = dupStringObject(key);
swapped = 1;
}
expiretime = getExpire(db,key);
if (fwriteBulk(fp,key) == 0) goto werr;
if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
}
- /* We created a few temp objects if the key->value pair
- * was about a swapped out object. Free both. */
- if (swapped) {
- decrRefCount(key);
- decrRefCount(o);
- }
+ if (swapped) decrRefCount(o);
}
dictReleaseIterator(di);
}
unlink(tmpfile);
}
-/* =============================== Virtual Memory =========================== */
+/* Virtual Memory is composed mainly of two subsystems:
+ * - Blocking Virutal Memory
+ * - Threaded Virtual Memory I/O
+ * The two parts are not fully decoupled, but functions are split among two
+ * different sections of the source code (delimited by comments) in order to
+ * make more clear what functionality is about the blocking VM and what about
+ * the threaded (not blocking) VM.
+ *
+ * Redis VM design:
+ *
+ * Redis VM is a blocking VM (one that blocks reading swapped values from
+ * disk into memory when a value swapped out is needed in memory) that is made
+ * unblocking by trying to examine the command argument vector in order to
+ * load in background values that will likely be needed in order to exec
+ * the command. The command is executed only once all the relevant keys
+ * are loaded into memory.
+ *
+ * This basically is almost as simple of a blocking VM, but almost as parallel
+ * as a fully non-blocking VM.
+ */
+
+/* =================== Virtual Memory - Blocking Side ====================== */
static void vmInit(void) {
off_t totsize;
+ int pipefds[2];
server.vm_fp = fopen("/tmp/redisvm","w+b");
if (server.vm_fp == NULL) {
server.vm_fd = fileno(server.vm_fp);
server.vm_next_page = 0;
server.vm_near_pages = 0;
+ server.vm_stats_used_pages = 0;
+ server.vm_stats_swapped_objects = 0;
+ server.vm_stats_swapouts = 0;
+ server.vm_stats_swapins = 0;
totsize = server.vm_pages*server.vm_page_size;
redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
if (ftruncate(server.vm_fd,totsize) == -1) {
redisLog(REDIS_NOTICE,"Swap file allocated with success");
}
server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
- redisLog(REDIS_DEBUG,"Allocated %lld bytes page table for %lld pages",
+ redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
(long long) (server.vm_pages+7)/8, server.vm_pages);
memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
/* Try to remove the swap file, so the OS will really delete it from the
* file system when Redis exists. */
unlink("/tmp/redisvm");
+
+ /* Initialize threaded I/O (used by Virtual Memory) */
+ server.io_newjobs = listCreate();
+ server.io_processing = listCreate();
+ server.io_processed = listCreate();
+ server.io_clients = listCreate();
+ pthread_mutex_init(&server.io_mutex,NULL);
+ server.io_active_threads = 0;
+ if (pipe(pipefds) == -1) {
+ redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
+ ,strerror(errno));
+ exit(1);
+ }
+ server.io_ready_pipe_read = pipefds[0];
+ server.io_ready_pipe_write = pipefds[1];
+ redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
+ /* Listen for events in the threaded I/O pipe */
+ if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
+ vmThreadedIOCompletedJob, NULL) == AE_ERR)
+ oom("creating file event");
}
/* Mark the page as used */
off_t byte = page/8;
int bit = page&7;
server.vm_bitmap[byte] |= 1<<bit;
- printf("Mark used: %lld (byte:%lld bit:%d)\n", (long long)page,
- (long long)byte, bit);
+ redisLog(REDIS_DEBUG,"Mark used: %lld (byte:%lld bit:%d)\n",
+ (long long)page, (long long)byte, bit);
}
/* Mark N contiguous pages as used, with 'page' being the first. */
for (j = 0; j < count; j++)
vmMarkPageUsed(page+j);
+ server.vm_stats_used_pages += count;
}
/* Mark the page as free */
for (j = 0; j < count; j++)
vmMarkPageFree(page+j);
+ server.vm_stats_used_pages -= count;
}
/* Test if the page is free */
while(offset < server.vm_pages) {
off_t this = base+offset;
- printf("THIS: %lld (%c)\n", (long long) this, vmFreePage(this) ? 'F' : 'X');
+ redisLog(REDIS_DEBUG, "THIS: %lld (%c)\n", (long long) this, vmFreePage(this) ? 'F' : 'X');
/* If we overflow, restart from page zero */
if (this >= server.vm_pages) {
this -= server.vm_pages;
* needed to later retrieve the object into the key object.
* If we can't find enough contiguous empty pages to swap the object on disk
* REDIS_ERR is returned. */
-static int vmSwapObject(robj *key, robj *val) {
- off_t pages = rdbSavedObjectPages(val);
+static int vmSwapObjectBlocking(robj *key, robj *val) {
+ off_t pages = rdbSavedObjectPages(val,NULL);
off_t page;
assert(key->storage == REDIS_VM_MEMORY);
if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
redisLog(REDIS_WARNING,
- "Critical VM problem in vmSwapObject(): can't seek: %s",
+ "Critical VM problem in vmSwapObjectBlocking(): can't seek: %s",
strerror(errno));
return REDIS_ERR;
}
redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
(unsigned char*) key->ptr,
(unsigned long long) page, (unsigned long long) pages);
+ server.vm_stats_swapped_objects++;
+ server.vm_stats_swapouts++;
+ fflush(server.vm_fp);
return REDIS_OK;
}
vmMarkPagesFree(key->vm.page,key->vm.usedpages);
redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
(unsigned char*) key->ptr);
+ server.vm_stats_swapped_objects--;
+ } else {
+ redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
+ (unsigned char*) key->ptr);
}
+ server.vm_stats_swapins++;
return val;
}
/* Plain object loading, from swap to memory */
static robj *vmLoadObject(robj *key) {
+ /* If we are loading the object in background, stop it, we
+ * need to load this object synchronously ASAP. */
+ if (key->storage == REDIS_VM_LOADING)
+ vmCancelThreadedIOJob(key);
return vmGenericLoadObject(key,0);
}
/* Try to swap an object that's a good candidate for swapping.
* Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
- * to swap any object at all. */
-static int vmSwapOneObject(void) {
+ * to swap any object at all.
+ *
+ * If 'usethreaded' is true, Redis will try to swap the object in background
+ * using I/O threads. */
+static int vmSwapOneObject(int usethreads) {
int j, i;
struct dictEntry *best = NULL;
double best_swappability = 0;
+ redisDb *best_db = NULL;
robj *key, *val;
for (j = 0; j < server.dbnum; j++) {
if (!best || swappability > best_swappability) {
best = de;
best_swappability = swappability;
+ best_db = db;
}
}
}
key = dictGetEntryKey(best) = newkey;
}
/* Swap it */
- if (vmSwapObject(key,val) == REDIS_OK) {
- dictGetEntryVal(best) = NULL;
+ if (usethreads) {
+ vmSwapObjectThreaded(key,val,best_db);
return REDIS_OK;
} else {
- return REDIS_ERR;
+ if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
+ dictGetEntryVal(best) = NULL;
+ return REDIS_OK;
+ } else {
+ return REDIS_ERR;
+ }
}
}
+static int vmSwapOneObjectBlocking() {
+ return vmSwapOneObject(0);
+}
+
+static int vmSwapOneObjectThreaded() {
+ return vmSwapOneObject(1);
+}
+
/* Return true if it's safe to swap out objects in a given moment.
* Basically we don't want to swap objects out while there is a BGSAVE
* or a BGAEOREWRITE running in backgroud. */
return 1;
}
+/* =================== Virtual Memory - Threaded I/O ======================= */
+
+static void freeIOJob(iojob *j) {
+ if (j->type == REDIS_IOJOB_PREPARE_SWAP ||
+ j->type == REDIS_IOJOB_DO_SWAP)
+ decrRefCount(j->val);
+ decrRefCount(j->key);
+ zfree(j);
+}
+
+/* Every time a thread finished a Job, it writes a byte into the write side
+ * of an unix pipe in order to "awake" the main thread, and this function
+ * is called. */
+static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
+ int mask)
+{
+ char buf[1];
+ int retval;
+ REDIS_NOTUSED(el);
+ REDIS_NOTUSED(mask);
+ REDIS_NOTUSED(privdata);
+
+ /* For every byte we read in the read side of the pipe, there is one
+ * I/O job completed to process. */
+ while((retval = read(fd,buf,1)) == 1) {
+ iojob *j;
+ listNode *ln;
+ robj *key;
+ struct dictEntry *de;
+
+ redisLog(REDIS_DEBUG,"Processing I/O completed job");
+ assert(listLength(server.io_processed) != 0);
+
+ /* Get the processed element (the oldest one) */
+ lockThreadedIO();
+ ln = listFirst(server.io_processed);
+ j = ln->value;
+ listDelNode(server.io_processed,ln);
+ unlockThreadedIO();
+ /* If this job is marked as canceled, just ignore it */
+ if (j->canceled) {
+ freeIOJob(j);
+ continue;
+ }
+ /* Post process it in the main thread, as there are things we
+ * can do just here to avoid race conditions and/or invasive locks */
+ redisLog(REDIS_DEBUG,"Job type: %d, key at %p (%s) refcount: %d\n", j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
+ if (j->key->refcount <= 0) {
+ printf("Ooops ref count is <= 0!\n");
+ exit(1);
+ }
+ de = dictFind(j->db->dict,j->key);
+ assert(de != NULL);
+ key = dictGetEntryKey(de);
+ if (j->type == REDIS_IOJOB_LOAD) {
+ /* Key loaded, bring it at home */
+ key->storage = REDIS_VM_MEMORY;
+ key->vm.atime = server.unixtime;
+ vmMarkPagesFree(key->vm.page,key->vm.usedpages);
+ redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
+ (unsigned char*) key->ptr);
+ server.vm_stats_swapped_objects--;
+ server.vm_stats_swapins++;
+ freeIOJob(j);
+ } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
+ /* Now we know the amount of pages required to swap this object.
+ * Let's find some space for it, and queue this task again
+ * rebranded as REDIS_IOJOB_DO_SWAP. */
+ if (vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR) {
+ /* Ooops... no space! */
+ freeIOJob(j);
+ } else {
+ j->type = REDIS_IOJOB_DO_SWAP;
+ lockThreadedIO();
+ queueIOJob(j);
+ unlockThreadedIO();
+ }
+ } else if (j->type == REDIS_IOJOB_DO_SWAP) {
+ robj *val;
+
+ /* Key swapped. We can finally free some memory. */
+ val = dictGetEntryVal(de);
+ key->vm.page = j->page;
+ key->vm.usedpages = j->pages;
+ key->storage = REDIS_VM_SWAPPED;
+ key->vtype = j->val->type;
+ decrRefCount(val); /* Deallocate the object from memory. */
+ dictGetEntryVal(de) = NULL;
+ vmMarkPagesUsed(j->page,j->pages);
+ redisLog(REDIS_DEBUG,
+ "VM: object %s swapped out at %lld (%lld pages) (threaded)",
+ (unsigned char*) key->ptr,
+ (unsigned long long) j->page, (unsigned long long) j->pages);
+ server.vm_stats_swapped_objects++;
+ server.vm_stats_swapouts++;
+ freeIOJob(j);
+ /* Put a few more swap requests in queue if we are still
+ * out of memory */
+ if (zmalloc_used_memory() > server.vm_max_memory) {
+ int more = 1;
+ while(more) {
+ lockThreadedIO();
+ more = listLength(server.io_newjobs) <
+ (unsigned) server.vm_max_threads;
+ unlockThreadedIO();
+ /* Don't waste CPU time if swappable objects are rare. */
+ if (vmSwapOneObjectThreaded() == REDIS_ERR) break;
+ }
+ }
+ }
+ }
+ if (retval < 0 && errno != EAGAIN) {
+ redisLog(REDIS_WARNING,
+ "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
+ strerror(errno));
+ }
+}
+
+static void lockThreadedIO(void) {
+ pthread_mutex_lock(&server.io_mutex);
+}
+
+static void unlockThreadedIO(void) {
+ pthread_mutex_unlock(&server.io_mutex);
+}
+
+/* Remove the specified object from the threaded I/O queue if still not
+ * processed, otherwise make sure to flag it as canceled. */
+static void vmCancelThreadedIOJob(robj *o) {
+ list *lists[3] = {
+ server.io_newjobs, server.io_processing, server.io_processed
+ };
+ int i;
+
+ assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
+ lockThreadedIO();
+ /* Search for a matching key in one of the queues */
+ for (i = 0; i < 3; i++) {
+ listNode *ln;
+
+ listRewind(lists[i]);
+ while ((ln = listYield(lists[i])) != NULL) {
+ iojob *job = ln->value;
+
+ if (compareStringObjects(job->key,o) == 0) {
+ switch(i) {
+ case 0: /* io_newjobs */
+ /* If the job was not yet processed the best thing to do
+ * is to remove it from the queue at all */
+ decrRefCount(job->key);
+ if (job->type == REDIS_IOJOB_PREPARE_SWAP ||
+ job->type == REDIS_IOJOB_DO_SWAP)
+ decrRefCount(job->val);
+ listDelNode(lists[i],ln);
+ zfree(job);
+ break;
+ case 1: /* io_processing */
+ case 2: /* io_processed */
+ job->canceled = 1;
+ break;
+ }
+ if (o->storage == REDIS_VM_LOADING)
+ o->storage = REDIS_VM_SWAPPED;
+ else if (o->storage == REDIS_VM_SWAPPING)
+ o->storage = REDIS_VM_MEMORY;
+ unlockThreadedIO();
+ return;
+ }
+ }
+ }
+ unlockThreadedIO();
+ assert(1 != 1); /* We should never reach this */
+}
+
+static void *IOThreadEntryPoint(void *arg) {
+ iojob *j;
+ listNode *ln;
+ REDIS_NOTUSED(arg);
+
+ pthread_detach(pthread_self());
+ while(1) {
+ /* Get a new job to process */
+ lockThreadedIO();
+ if (listLength(server.io_newjobs) == 0) {
+ /* No new jobs in queue, exit. */
+ printf("Thread %lld exiting, nothing to do\n",
+ (long long) pthread_self());
+ server.io_active_threads--;
+ unlockThreadedIO();
+ return NULL;
+ }
+ ln = listFirst(server.io_newjobs);
+ j = ln->value;
+ listDelNode(server.io_newjobs,ln);
+ /* Add the job in the processing queue */
+ j->thread = pthread_self();
+ listAddNodeTail(server.io_processing,j);
+ ln = listLast(server.io_processing); /* We use ln later to remove it */
+ unlockThreadedIO();
+ printf("Thread %lld got a new job: %p about key '%s'\n",
+ (long long) pthread_self(), (void*)j, (char*)j->key->ptr);
+
+ /* Process the Job */
+ if (j->type == REDIS_IOJOB_LOAD) {
+ } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
+ FILE *fp = fopen("/dev/null","w+");
+ j->pages = rdbSavedObjectPages(j->val,fp);
+ fclose(fp);
+ } else if (j->type == REDIS_IOJOB_DO_SWAP) {
+ }
+
+ /* Done: insert the job into the processed queue */
+ printf("Thread %lld completed the job: %p\n",
+ (long long) pthread_self(), (void*)j);
+ lockThreadedIO();
+ listDelNode(server.io_processing,ln);
+ listAddNodeTail(server.io_processed,j);
+ unlockThreadedIO();
+
+ /* Signal the main thread there is new stuff to process */
+ assert(write(server.io_ready_pipe_write,"x",1) == 1);
+ }
+ return NULL; /* never reached */
+}
+
+static void spawnIOThread(void) {
+ pthread_t thread;
+
+ pthread_create(&thread,NULL,IOThreadEntryPoint,NULL);
+ server.io_active_threads++;
+}
+
+/* This function must be called while with threaded IO locked */
+static void queueIOJob(iojob *j) {
+ listAddNodeTail(server.io_newjobs,j);
+ if (server.io_active_threads < server.vm_max_threads)
+ spawnIOThread();
+}
+
+static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
+ iojob *j;
+
+ assert(key->storage == REDIS_VM_MEMORY);
+ assert(key->refcount == 1);
+
+ j = zmalloc(sizeof(*j));
+ j->type = REDIS_IOJOB_PREPARE_SWAP;
+ j->db = db;
+ j->key = dupStringObject(key);
+ j->val = val;
+ incrRefCount(val);
+ j->canceled = 0;
+ j->thread = (pthread_t) -1;
+ key->storage = REDIS_VM_SWAPPING;
+
+ lockThreadedIO();
+ queueIOJob(j);
+ unlockThreadedIO();
+ return REDIS_OK;
+}
+
/* ================================= Debugging ============================== */
static void debugCommand(redisClient *c) {
}
key = dictGetEntryKey(de);
val = dictGetEntryVal(de);
- addReplySds(c,sdscatprintf(sdsempty(),
- "+Key at:%p refcount:%d, value at:%p refcount:%d encoding:%d serializedlength:%lld\r\n",
+ if (server.vm_enabled && (key->storage == REDIS_VM_MEMORY ||
+ key->storage == REDIS_VM_SWAPPING)) {
+ addReplySds(c,sdscatprintf(sdsempty(),
+ "+Key at:%p refcount:%d, value at:%p refcount:%d "
+ "encoding:%d serializedlength:%lld\r\n",
(void*)key, key->refcount, (void*)val, val->refcount,
- val->encoding, rdbSavedObjectLen(val)));
+ val->encoding, rdbSavedObjectLen(val,NULL)));
+ } else {
+ addReplySds(c,sdscatprintf(sdsempty(),
+ "+Key at:%p refcount:%d, value swapped at: page %llu "
+ "using %llu pages\r\n",
+ (void*)key, key->refcount, (unsigned long long) key->vm.page,
+ (unsigned long long) key->vm.usedpages));
+ }
} else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
dictEntry *de = dictFind(c->db->dict,c->argv[2]);
robj *key, *val;
/* Swap it */
if (key->storage != REDIS_VM_MEMORY) {
addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
- } else if (vmSwapObject(key,val) == REDIS_OK) {
+ } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
dictGetEntryVal(de) = NULL;
addReply(c,shared.ok);
} else {
if (rdbLoad(server.dbfilename) == REDIS_OK)
redisLog(REDIS_NOTICE,"DB loaded from disk");
}
- if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
- acceptHandler, NULL) == AE_ERR) oom("creating file event");
redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
aeMain(server.el);
aeDeleteEventLoop(server.el);