#include "lzf.h" /* LZF compression library */
#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
+/* #define REDIS_HELGRIND_FRIENDLY */
+#if defined(__GNUC__) && defined(REDIS_HELGRIND_FRIENDLY)
+#warning "Remember to undef REDIS_HELGRIND_FRIENDLY before to commit"
+#endif
+
/* Error codes */
#define REDIS_OK 0
#define REDIS_ERR -1
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
+#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
+/* The following is the number of completed I/O jobs to process when the
+ * handler is called. 1 is the minimum, and also the default, as it blocks
+ * other clients for as little time as possible. While Virtual
+ * Memory I/O operations are performed by threads, these operations must
+ * be processed by the main thread when completed to take effect. */
+#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
/* Client flags */
#define REDIS_CLOSE 1 /* This client connection should be closed ASAP */
pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
+ pthread_attr_t io_threads_attr; /* attributes for threads creation */
int io_active_threads; /* Number of running I/O threads */
int vm_max_threads; /* Max number of I/O threads running at the same time */
/* Our main thread is blocked on the event loop, locking for sockets ready
static void queueIOJob(iojob *j);
static int vmWriteObjectOnSwap(robj *o, off_t page);
static robj *vmReadObjectFromSwap(off_t page, int type);
+static void waitZeroActiveThreads(void);
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
return hash;
}
+/* Sets type (Db->expires uses keyptrDictType) */
static dictType setDictType = {
dictEncObjHash, /* hash function */
NULL, /* key dup */
NULL /* val destructor */
};
+/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
static dictType zsetDictType = {
dictEncObjHash, /* hash function */
NULL, /* key dup */
dictVanillaFree /* val destructor of malloc(sizeof(double)) */
};
+/* Db->dict */
static dictType hashDictType = {
dictObjHash, /* hash function */
NULL, /* key dup */
dictRedisObjectDestructor /* val destructor */
};
+/* Db->expires: dictionary type used to create the per-database expires
+ * table (server.db[j].expires).
+ *
+ * Keys are hashed with dictObjHash and compared with dictObjKeyCompare;
+ * dictRedisObjectDestructor is registered as the key destructor. No key dup,
+ * val dup or val destructor callback is installed, so the table neither
+ * copies keys/values on insert nor frees values itself. */
+static dictType keyptrDictType = {
+ dictObjHash, /* hash function */
+ NULL, /* key dup */
+ NULL, /* val dup */
+ dictObjKeyCompare, /* key compare */
+ dictRedisObjectDestructor, /* key destructor */
+ NULL /* val destructor */
+};
+
/* Keylist hash table type has unencoded redis objects as keys and
* lists as values. It's used for blocking operations (BLPOP) */
static dictType keylistDictType = {
redisClient *c;
listNode *ln;
time_t now = time(NULL);
+ listIter li;
- listRewind(server.clients);
- while ((ln = listYield(server.clients)) != NULL) {
+ listRewind(server.clients,&li);
+ while ((ln = listNext(&li)) != NULL) {
c = listNodeValue(ln);
if (server.maxidletime &&
!(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
while (server.vm_enabled && zmalloc_used_memory() >
server.vm_max_memory)
{
+ int retval;
+
if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
- if (vmSwapOneObjectThreaded() == REDIS_ERR) {
- if ((loops % 30) == 0 && zmalloc_used_memory() >
- (server.vm_max_memory+server.vm_max_memory/10)) {
- redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
- }
+ retval = (server.vm_max_threads == 0) ?
+ vmSwapOneObjectBlocking() :
+ vmSwapOneObjectThreaded();
+ if (retval == REDIS_ERR && (loops % 30) == 0 &&
+ zmalloc_used_memory() >
+ (server.vm_max_memory+server.vm_max_memory/10))
+ {
+ redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
}
- /* Note that we freed just one object, because anyway when
- * the I/O thread in charge to swap this object out will
- * do its work, the handler of completed jobs will try to swap
- * more objects if we are out of memory. */
- break;
+ /* Note that when using threaded I/O we free just one object,
+ * because anyway when the I/O thread in charge of swapping this
+ * object out finishes, the handler of completed jobs
+ * will try to swap more objects if we are still out of memory. */
+ if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
}
}
}
for (j = 0; j < server.dbnum; j++) {
server.db[j].dict = dictCreate(&hashDictType,NULL);
- server.db[j].expires = dictCreate(&setDictType,NULL);
+ server.db[j].expires = dictCreate(&keyptrDictType,NULL);
server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
server.db[j].id = j;
}
int copylen = 0;
char buf[GLUEREPLY_UP_TO];
listNode *ln;
+ listIter li;
robj *o;
- listRewind(c->reply);
- while((ln = listYield(c->reply))) {
+ listRewind(c->reply,&li);
+ while((ln = listNext(&li))) {
int objlen;
o = ln->value;
static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
listNode *ln;
+ listIter li;
int outc = 0, j;
robj **outv;
/* (args*2)+1 is enough room for args, spaces, newlines */
* be sure to free objects if there is no slave in a replication state
* able to be feed with commands */
for (j = 0; j < outc; j++) incrRefCount(outv[j]);
- listRewind(slaves);
- while((ln = listYield(slaves))) {
+ listRewind(slaves,&li);
+ while((ln = listNext(&li))) {
redisClient *slave = ln->value;
/* Don't feed slaves that are still waiting for BGSAVE to start */
o->ptr = ptr;
o->refcount = 1;
if (server.vm_enabled) {
+ /* Note that this code may run in the context of an I/O thread
+ * and accessing server.unixtime without locks is in theory an error.
+ * But in practice this is safe, and even if we read
+ * garbage Redis will not fail, as it's just statistical info */
o->vm.atime = server.unixtime;
o->storage = REDIS_VM_MEMORY;
}
static int rdbSaveStringObject(FILE *fp, robj *obj) {
int retval;
- if (obj->storage == REDIS_VM_MEMORY &&
- obj->encoding != REDIS_ENCODING_RAW)
- {
+ /* Avoid incr/decr ref count business when possible.
+ * This plays well with copy-on-write given that we are probably
+ * in a child process (BGSAVE). Also this makes sure key objects
+ * of swapped objects are not incrRefCount-ed (an assert does not allow
+ * this in order to avoid bugs) */
+ if (obj->encoding != REDIS_ENCODING_RAW) {
obj = getDecodedObject(obj);
retval = rdbSaveStringObjectRaw(fp,obj);
decrRefCount(obj);
} else {
- /* This is a fast path when we are sure the object is not encoded.
- * Note that's any *faster* actually as we needed to add the conditional
- * but because this may happen in a background process we don't want
- * to touch the object fields with incr/decrRefCount in order to
- * preveny copy on write of pages.
- *
- * Also incrRefCount() will have a failing assert() if we try to call
- * it against an object with storage != REDIS_VM_MEMORY. */
retval = rdbSaveStringObjectRaw(fp,obj);
}
return retval;
} else if (o->type == REDIS_LIST) {
/* Save a list value */
list *list = o->ptr;
+ listIter li;
listNode *ln;
- listRewind(list);
if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
- while((ln = listYield(list))) {
+ listRewind(list,&li);
+ while((ln = listNext(&li))) {
robj *eleobj = listNodeValue(ln);
if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
pid_t childpid;
if (server.bgsavechildpid != -1) return REDIS_ERR;
+ if (server.vm_enabled) waitZeroActiveThreads();
if ((childpid = fork()) == 0) {
/* Child */
close(server.fd);
if (sortval->type == REDIS_LIST) {
list *list = sortval->ptr;
listNode *ln;
+ listIter li;
- listRewind(list);
- while((ln = listYield(list))) {
+ listRewind(list,&li);
+ while((ln = listNext(&li))) {
robj *ele = ln->value;
vector[j].obj = ele;
vector[j].u.score = 0;
addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
for (j = start; j <= end; j++) {
listNode *ln;
+ listIter li;
+
if (!getop) {
addReplyBulkLen(c,vector[j].obj);
addReply(c,vector[j].obj);
addReply(c,shared.crlf);
}
- listRewind(operations);
- while((ln = listYield(operations))) {
+ listRewind(operations,&li);
+ while((ln = listNext(&li))) {
redisSortOperation *sop = ln->value;
robj *val = lookupKeyByPattern(c->db,sop->pattern,
vector[j].obj);
/* STORE option specified, set the sorting result as a List object */
for (j = start; j <= end; j++) {
listNode *ln;
+ listIter li;
+
if (!getop) {
listAddNodeTail(listPtr,vector[j].obj);
incrRefCount(vector[j].obj);
}
- listRewind(operations);
- while((ln = listYield(operations))) {
+ listRewind(operations,&li);
+ while((ln = listNext(&li))) {
redisSortOperation *sop = ln->value;
robj *val = lookupKeyByPattern(c->db,sop->pattern,
vector[j].obj);
);
}
if (server.vm_enabled) {
+ lockThreadedIO();
info = sdscatprintf(info,
"vm_conf_max_memory:%llu\r\n"
"vm_conf_page_size:%llu\r\n"
"vm_stats_io_processing_len:%lu\r\n"
"vm_stats_io_processed_len:%lu\r\n"
"vm_stats_io_waiting_clients:%lu\r\n"
+ "vm_stats_io_active_threads:%lu\r\n"
,(unsigned long long) server.vm_max_memory,
(unsigned long long) server.vm_page_size,
(unsigned long long) server.vm_pages,
(unsigned long) listLength(server.io_newjobs),
(unsigned long) listLength(server.io_processing),
(unsigned long) listLength(server.io_processed),
- (unsigned long) listLength(server.io_clients)
+ (unsigned long) listLength(server.io_clients),
+ (unsigned long) server.io_active_threads
);
+ unlockThreadedIO();
}
for (j = 0; j < server.dbnum; j++) {
long long keys, vkeys;
* registering differences since the server forked to save */
redisClient *slave;
listNode *ln;
+ listIter li;
- listRewind(server.slaves);
- while((ln = listYield(server.slaves))) {
+ listRewind(server.slaves,&li);
+ while((ln = listNext(&li))) {
slave = ln->value;
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
}
static void updateSlavesWaitingBgsave(int bgsaveerr) {
listNode *ln;
int startbgsave = 0;
+ listIter li;
- listRewind(server.slaves);
- while((ln = listYield(server.slaves))) {
+ listRewind(server.slaves,&li);
+ while((ln = listNext(&li))) {
redisClient *slave = ln->value;
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
}
if (startbgsave) {
if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
- listRewind(server.slaves);
+ listIter li;
+
+ listRewind(server.slaves,&li);
redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
- while((ln = listYield(server.slaves))) {
+ while((ln = listNext(&li))) {
redisClient *slave = ln->value;
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
char buf[128];
int decrrc = 0;
- if (obj->storage == REDIS_VM_MEMORY && obj->encoding != REDIS_ENCODING_RAW){
+ /* Avoid the incr/decr ref count business if possible to help
+ * copy-on-write (we are often in a child process when this function
+ * is called).
+ * Also makes sure that key objects don't get incrRefCount-ed when VM
+ * is enabled */
+ if (obj->encoding != REDIS_ENCODING_RAW) {
obj = getDecodedObject(obj);
decrrc = 1;
}
/* Emit the RPUSHes needed to rebuild the list */
list *list = o->ptr;
listNode *ln;
+ listIter li;
- listRewind(list);
- while((ln = listYield(list))) {
+ listRewind(list,&li);
+ while((ln = listNext(&li))) {
char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
robj *eleobj = listNodeValue(ln);
pid_t childpid;
if (server.bgrewritechildpid != -1) return REDIS_ERR;
+ if (server.vm_enabled) waitZeroActiveThreads();
if ((childpid = fork()) == 0) {
/* Child */
char tmpfile[256];
static void vmInit(void) {
off_t totsize;
int pipefds[2];
+ size_t stacksize;
+
+ if (server.vm_max_threads != 0)
+ zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
server.vm_fp = fopen("/tmp/redisvm","w+b");
if (server.vm_fp == NULL) {
server.io_ready_pipe_read = pipefds[0];
server.io_ready_pipe_write = pipefds[1];
redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
+ /* LZF requires a lot of stack */
+ pthread_attr_init(&server.io_threads_attr);
+ pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
+ while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
+ pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
/* Listen for events in the threaded I/O pipe */
if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
vmThreadedIOCompletedJob, NULL) == AE_ERR)
* note: I implemented this function just after watching an episode of
* Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
*/
-static int vmFindContiguousPages(off_t *first, int n) {
+static int vmFindContiguousPages(off_t *first, off_t n) {
off_t base, offset = 0, since_jump = 0, numfree = 0;
if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
de = dictGetRandomKey(db->dict);
key = dictGetEntryKey(de);
val = dictGetEntryVal(de);
- if (key->storage != REDIS_VM_MEMORY) {
+ /* Only swap objects that are currently in memory.
+ *
+ * Also don't swap shared objects if threaded VM is on, as we
+ * try to ensure that the main thread does not touch the
+ * object while the I/O thread is using it, but we can't
+ * control other keys without adding additional mutex. */
+ if (key->storage != REDIS_VM_MEMORY ||
+ (server.vm_max_threads != 0 && val->refcount != 1)) {
if (maxtries) i--; /* don't count this try */
continue;
}
{
char buf[1];
int retval;
+ int processed = 0;
REDIS_NOTUSED(el);
REDIS_NOTUSED(mask);
REDIS_NOTUSED(privdata);
struct dictEntry *de;
redisLog(REDIS_DEBUG,"Processing I/O completed job");
- assert(listLength(server.io_processed) != 0);
/* Get the processed element (the oldest one) */
lockThreadedIO();
+ assert(listLength(server.io_processed) != 0);
ln = listFirst(server.io_processed);
j = ln->value;
listDelNode(server.io_processed,ln);
/* Ooops... no space! */
freeIOJob(j);
} else {
+ /* Note that we need to mark these pages as used now;
+ * if the job is canceled, we'll mark them as free
+ * again. */
+ vmMarkPagesUsed(j->page,j->pages);
j->type = REDIS_IOJOB_DO_SWAP;
lockThreadedIO();
queueIOJob(j);
key->vtype = j->val->type;
decrRefCount(val); /* Deallocate the object from memory. */
dictGetEntryVal(de) = NULL;
- vmMarkPagesUsed(j->page,j->pages);
redisLog(REDIS_DEBUG,
"VM: object %s swapped out at %lld (%lld pages) (threaded)",
(unsigned char*) key->ptr,
}
}
}
- return; /* XXX REMOVE ME */
+ processed++;
+ if (processed == REDIS_MAX_COMPLETED_JOBS_PROCESSED) return;
}
if (retval < 0 && errno != EAGAIN) {
redisLog(REDIS_WARNING,
int i;
assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
+again:
lockThreadedIO();
/* Search for a matching key in one of the queues */
for (i = 0; i < 3; i++) {
listNode *ln;
+ listIter li;
- listRewind(lists[i]);
- while ((ln = listYield(lists[i])) != NULL) {
+ listRewind(lists[i],&li);
+ while ((ln = listNext(&li)) != NULL) {
iojob *job = ln->value;
if (job->canceled) continue; /* Skip this, already canceled. */
if (compareStringObjects(job->key,o) == 0) {
- redisLog(REDIS_DEBUG,"*** CANCELED %p (%s)\n",
- (void*)job, (char*)o->ptr);
+ redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (LIST ID %d)\n",
+ (void*)job, (char*)o->ptr, i);
+ /* Mark the pages as free since the swap didn't happen
+ * or happened but is now discarded. */
+ if (job->type == REDIS_IOJOB_DO_SWAP)
+ vmMarkPagesFree(job->page,job->pages);
+ /* Cancel the job. It depends on the list the job is
+ * living in. */
switch(i) {
case 0: /* io_newjobs */
/* If the job was yet not processed the best thing to do
listDelNode(lists[i],ln);
break;
case 1: /* io_processing */
+ /* Oh Shi- the thread is messing with the Job, and
+ * probably with the object if this is a
+ * PREPARE_SWAP or DO_SWAP job. Better to wait for the
+ * job to move into the next queue... */
+ if (job->type != REDIS_IOJOB_LOAD) {
+ /* Yes, we try again and again until the job
+ * is completed. */
+ unlockThreadedIO();
+ /* But let's wait some time for the I/O thread
+ * to finish with this job. After all this condition
+ * should be very rare. */
+ usleep(1);
+ goto again;
+ } else {
+ job->canceled = 1;
+ break;
+ }
case 2: /* io_processed */
+ /* The job was already processed, that's easy...
+ * just mark it as canceled so that we'll ignore it
+ * when processing completed jobs. */
job->canceled = 1;
break;
}
+ /* Finally we have to adjust the storage type of the object
+ * in order to "UNDO" the operation. */
if (o->storage == REDIS_VM_LOADING)
o->storage = REDIS_VM_SWAPPED;
else if (o->storage == REDIS_VM_SWAPPING)
/* Get a new job to process */
lockThreadedIO();
if (listLength(server.io_newjobs) == 0) {
+#ifdef REDIS_HELGRIND_FRIENDLY
+ /* No new jobs? Wait and retry: to be Helgrind-friendly
+ * (valgrind --tool=helgrind) we need to keep
+ * the same threads running instead of creating/destroying threads
+ * as needed (otherwise valgrind will fail) */
+ unlockThreadedIO();
+ usleep(1); /* Give some time for the I/O thread to work. */
+ continue;
+#endif
/* No new jobs in queue, exit. */
- redisLog(REDIS_DEBUG,"Thread %lld exiting, nothing to do\n",
+ redisLog(REDIS_DEBUG,"Thread %lld exiting, nothing to do",
(long long) pthread_self());
server.io_active_threads--;
unlockThreadedIO();
listAddNodeTail(server.io_processing,j);
ln = listLast(server.io_processing); /* We use ln later to remove it */
unlockThreadedIO();
- redisLog(REDIS_DEBUG,"Thread %lld got a new job (type %d): %p about key '%s'\n",
+ redisLog(REDIS_DEBUG,"Thread %lld got a new job (type %d): %p about key '%s'",
(long long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
/* Process the Job */
}
/* Done: insert the job into the processed queue */
- redisLog(REDIS_DEBUG,"Thread %lld completed the job: %p (key %s)\n",
+ redisLog(REDIS_DEBUG,"Thread %lld completed the job: %p (key %s)",
(long long) pthread_self(), (void*)j, (char*)j->key->ptr);
lockThreadedIO();
listDelNode(server.io_processing,ln);
static void spawnIOThread(void) {
pthread_t thread;
- pthread_create(&thread,NULL,IOThreadEntryPoint,NULL);
+ pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
server.io_active_threads++;
}
+/* Block the caller until no I/O thread is running.
+ *
+ * Used right before fork() for BGSAVE / BGREWRITEAOF. The thread counter is
+ * sampled while holding the threaded I/O lock; the lock is always released
+ * before sleeping between two polls. */
+static void waitZeroActiveThreads(void) {
+ int active;
+
+ for (;;) {
+ lockThreadedIO();
+ active = server.io_active_threads;
+ unlockThreadedIO();
+ if (active == 0) break;
+ usleep(10000); /* 10 milliseconds */
+ }
+}
+
+
/* This function must be called while with threaded IO locked */
static void queueIOJob(iojob *j) {
redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",