unsigned long long vm_stats_swapped_objects;
unsigned long long vm_stats_swapouts;
unsigned long long vm_stats_swapins;
+ FILE *devnull;
};
typedef void redisCommandProc(redisClient *c);
static double R_Zero, R_PosInf, R_NegInf, R_Nan;
/* VM threaded I/O request message */
-#define REDIS_IOJOB_LOAD 0
-#define REDIS_IOJOB_SWAP 1
+#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
+#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
+#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
typedef struct iojon {
int type; /* Request type, REDIS_IOJOB_* */
- int dbid; /* Redis database ID */
+ redisDb *db;/* Redis database */
robj *key; /* This I/O request is about swapping this key */
- robj *val; /* the value to swap for REDIS_IOREQ_SWAP, otherwise this
+ robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
* field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
off_t page; /* Swap page where to read/write the object */
+ off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
int canceled; /* True if this command was canceled by blocking side of VM */
pthread_t thread; /* ID of the thread processing this entry */
} iojob;
static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmCancelThreadedIOJob(robj *o);
+static void lockThreadedIO(void);
+static void unlockThreadedIO(void);
+static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
+static void freeIOJob(iojob *j);
+static void queueIOJob(iojob *j);
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
signal(SIGPIPE, SIG_IGN);
setupSigSegvAction();
+ server.devnull = fopen("/dev/null","w");
+ if (server.devnull == NULL) {
+ redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
+ exit(1);
+ }
server.clients = listCreate();
server.slaves = listCreate();
server.monitors = listCreate();
aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
acceptHandler, NULL) == AE_ERR) oom("creating file event");
- if (server.vm_enabled) {
- /* Listen for events in the threaded I/O pipe */
- if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
- vmThreadedIOCompletedJob, NULL) == AE_ERR)
- oom("creating file event");
- }
if (server.appendonly) {
server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
listRelease(c->reply);
- listRelease(c->io_keys);
freeClientArgv(c);
close(c->fd);
/* Remove from the list of clients */
if (ln) listDelNode(server.io_clients,ln);
listRelease(c->io_keys);
}
+ listRelease(c->io_keys);
/* Other cleanup */
if (c->flags & REDIS_SLAVE) {
if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
}
static robj *dupStringObject(robj *o) {
+ assert(o->encoding == REDIS_ENCODING_RAW);
return createStringObject(o->ptr,sdslen(o->ptr));
}
* the rdbSaveObject() function. Currently we use a trick to get
* this length with very little changes to the code. In the future
* we could switch to a faster solution. */
-static off_t rdbSavedObjectLen(robj *o) {
- static FILE *fp = NULL;
-
- if (fp == NULL) fp = fopen("/dev/null","w");
- assert(fp != NULL);
-
+static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
+ if (fp == NULL) fp = server.devnull;
rewind(fp);
assert(rdbSaveObject(fp,o) != 1);
return ftello(fp);
}
/* Return the number of pages required to save this object in the swap file */
-static off_t rdbSavedObjectPages(robj *o) {
- off_t bytes = rdbSavedObjectLen(o);
+static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
+ off_t bytes = rdbSavedObjectLen(o,fp);
return (bytes+(server.vm_page_size-1))/server.vm_page_size;
}
if (rdbSaveObject(fp,o) == -1) goto werr;
} else {
/* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
- robj *po, *newkey;
+ robj *po;
/* Get a preview of the object in memory */
po = vmPreviewObject(key);
- /* Also duplicate the key object, to pass around a standard
- * string object. */
- newkey = dupStringObject(key);
/* Save type, key, value */
if (rdbSaveType(fp,key->vtype) == -1) goto werr;
- if (rdbSaveStringObject(fp,newkey) == -1) goto werr;
+ if (rdbSaveStringObject(fp,key) == -1) goto werr;
if (rdbSaveObject(fp,po) == -1) goto werr;
/* Remove the loaded object from memory */
decrRefCount(po);
- decrRefCount(newkey);
}
}
dictReleaseIterator(di);
"vm_stats_swapped_objects:%llu\r\n"
"vm_stats_swappin_count:%llu\r\n"
"vm_stats_swappout_count:%llu\r\n"
+ "vm_stats_io_newjobs_len:%lu\r\n"
+ "vm_stats_io_processing_len:%lu\r\n"
+ "vm_stats_io_processed_len:%lu\r\n"
+ "vm_stats_io_waiting_clients:%lu\r\n"
,(unsigned long long) server.vm_max_memory,
(unsigned long long) server.vm_page_size,
(unsigned long long) server.vm_pages,
(unsigned long long) server.vm_stats_used_pages,
(unsigned long long) server.vm_stats_swapped_objects,
(unsigned long long) server.vm_stats_swapins,
- (unsigned long long) server.vm_stats_swapouts
+ (unsigned long long) server.vm_stats_swapouts,
+ (unsigned long) listLength(server.io_newjobs),
+ (unsigned long) listLength(server.io_processing),
+ (unsigned long) listLength(server.io_processed),
+ (unsigned long) listLength(server.io_clients)
);
}
for (j = 0; j < server.dbnum; j++) {
/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
static int fwriteBulk(FILE *fp, robj *obj) {
char buf[128];
- obj = getDecodedObject(obj);
+ int decrrc = 0;
+
+ if (obj->storage == REDIS_VM_MEMORY && obj->encoding != REDIS_ENCODING_RAW){
+ obj = getDecodedObject(obj);
+ decrrc = 1;
+ }
snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
goto err;
if (fwrite("\r\n",2,1,fp) == 0) goto err;
- decrRefCount(obj);
+ if (decrrc) decrRefCount(obj);
return 1;
err:
- decrRefCount(obj);
+ if (decrrc) decrRefCount(obj);
return 0;
}
int swapped;
key = dictGetEntryKey(de);
+ /* If the value for this key is swapped, load a preview in memory.
+ * We use a "swapped" flag to remember if we need to free the
+ * value object instead to just increment the ref count anyway
+ * in order to avoid copy-on-write of pages if we are forked() */
if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
key->storage == REDIS_VM_SWAPPING) {
o = dictGetEntryVal(de);
swapped = 0;
} else {
o = vmPreviewObject(key);
- key = dupStringObject(key);
swapped = 1;
}
expiretime = getExpire(db,key);
if (fwriteBulk(fp,key) == 0) goto werr;
if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
}
- /* We created a few temp objects if the key->value pair
- * was about a swapped out object. Free both. */
- if (swapped) {
- decrRefCount(key);
- decrRefCount(o);
- }
+ if (swapped) decrRefCount(o);
}
dictReleaseIterator(di);
}
server.io_ready_pipe_read = pipefds[0];
server.io_ready_pipe_write = pipefds[1];
redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
+ /* Listen for events in the threaded I/O pipe */
+ if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
+ vmThreadedIOCompletedJob, NULL) == AE_ERR)
+ oom("creating file event");
}
/* Mark the page as used */
* If we can't find enough contiguous empty pages to swap the object on disk
* REDIS_ERR is returned. */
static int vmSwapObjectBlocking(robj *key, robj *val) {
- off_t pages = rdbSavedObjectPages(val);
+ off_t pages = rdbSavedObjectPages(val,NULL);
off_t page;
assert(key->storage == REDIS_VM_MEMORY);
return REDIS_OK;
}
-static int vmSwapObjectThreaded(robj *key, robj *val) {
-
- key = key;
- val = val;
- return REDIS_OK;
-}
-
/* Load the value object relative to the 'key' object from swap to memory.
* The newly allocated object is returned.
*
int j, i;
struct dictEntry *best = NULL;
double best_swappability = 0;
+ redisDb *best_db = NULL;
robj *key, *val;
for (j = 0; j < server.dbnum; j++) {
if (!best || swappability > best_swappability) {
best = de;
best_swappability = swappability;
+ best_db = db;
}
}
}
}
/* Swap it */
if (usethreads) {
- vmSwapObjectThreaded(key,val);
+ vmSwapObjectThreaded(key,val,best_db);
return REDIS_OK;
} else {
if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
/* =================== Virtual Memory - Threaded I/O ======================= */
+static void freeIOJob(iojob *j) {
+ if (j->type == REDIS_IOJOB_PREPARE_SWAP ||
+ j->type == REDIS_IOJOB_DO_SWAP)
+ decrRefCount(j->val);
+ decrRefCount(j->key);
+ zfree(j);
+}
+
/* Every time a thread finished a Job, it writes a byte into the write side
* of an unix pipe in order to "awake" the main thread, and this function
* is called. */
/* For every byte we read in the read side of the pipe, there is one
* I/O job completed to process. */
while((retval = read(fd,buf,1)) == 1) {
+ iojob *j;
+ listNode *ln;
+ robj *key;
+ struct dictEntry *de;
+
redisLog(REDIS_DEBUG,"Processing I/O completed job");
+ assert(listLength(server.io_processed) != 0);
+
+ /* Get the processed element (the oldest one) */
+ lockThreadedIO();
+ ln = listFirst(server.io_processed);
+ j = ln->value;
+ listDelNode(server.io_processed,ln);
+ unlockThreadedIO();
+ /* If this job is marked as canceled, just ignore it */
+ if (j->canceled) {
+ freeIOJob(j);
+ continue;
+ }
+ /* Post process it in the main thread, as there are things we
+ * can do just here to avoid race conditions and/or invasive locks */
+ redisLog(REDIS_DEBUG,"Job type: %d, key at %p (%s) refcount: %d\n", j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
+ if (j->key->refcount <= 0) {
+ printf("Ooops ref count is <= 0!\n");
+ exit(1);
+ }
+ de = dictFind(j->db->dict,j->key);
+ assert(de != NULL);
+ key = dictGetEntryKey(de);
+ if (j->type == REDIS_IOJOB_LOAD) {
+ /* Key loaded, bring it at home */
+ key->storage = REDIS_VM_MEMORY;
+ key->vm.atime = server.unixtime;
+ vmMarkPagesFree(key->vm.page,key->vm.usedpages);
+ redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
+ (unsigned char*) key->ptr);
+ server.vm_stats_swapped_objects--;
+ server.vm_stats_swapins++;
+ freeIOJob(j);
+ } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
+ /* Now we know the amount of pages required to swap this object.
+ * Let's find some space for it, and queue this task again
+ * rebranded as REDIS_IOJOB_DO_SWAP. */
+ if (vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR) {
+ /* Ooops... no space! */
+ freeIOJob(j);
+ } else {
+ j->type = REDIS_IOJOB_DO_SWAP;
+ lockThreadedIO();
+ queueIOJob(j);
+ unlockThreadedIO();
+ }
+ } else if (j->type == REDIS_IOJOB_DO_SWAP) {
+ robj *val;
+
+ /* Key swapped. We can finally free some memory. */
+ val = dictGetEntryVal(de);
+ key->vm.page = j->page;
+ key->vm.usedpages = j->pages;
+ key->storage = REDIS_VM_SWAPPED;
+ key->vtype = j->val->type;
+ decrRefCount(val); /* Deallocate the object from memory. */
+ dictGetEntryVal(de) = NULL;
+ vmMarkPagesUsed(j->page,j->pages);
+ redisLog(REDIS_DEBUG,
+ "VM: object %s swapped out at %lld (%lld pages) (threaded)",
+ (unsigned char*) key->ptr,
+ (unsigned long long) j->page, (unsigned long long) j->pages);
+ server.vm_stats_swapped_objects++;
+ server.vm_stats_swapouts++;
+ freeIOJob(j);
+ /* Put a few more swap requests in queue if we are still
+ * out of memory */
+ if (zmalloc_used_memory() > server.vm_max_memory) {
+ int more = 1;
+ while(more) {
+ lockThreadedIO();
+ more = listLength(server.io_newjobs) <
+ (unsigned) server.vm_max_threads;
+ unlockThreadedIO();
+ /* Don't waste CPU time if swappable objects are rare. */
+ if (vmSwapOneObjectThreaded() == REDIS_ERR) break;
+ }
+ }
+ }
}
if (retval < 0 && errno != EAGAIN) {
redisLog(REDIS_WARNING,
/* If the job was not yet processed the best thing to do
* is to remove it from the queue at all */
decrRefCount(job->key);
- if (job->type == REDIS_IOJOB_SWAP)
+ if (job->type == REDIS_IOJOB_PREPARE_SWAP ||
+ job->type == REDIS_IOJOB_DO_SWAP)
decrRefCount(job->val);
listDelNode(lists[i],ln);
zfree(job);
assert(1 != 1); /* We should never reach this */
}
+static void *IOThreadEntryPoint(void *arg) {
+ iojob *j;
+ listNode *ln;
+ REDIS_NOTUSED(arg);
+
+ pthread_detach(pthread_self());
+ while(1) {
+ /* Get a new job to process */
+ lockThreadedIO();
+ if (listLength(server.io_newjobs) == 0) {
+ /* No new jobs in queue, exit. */
+ printf("Thread %lld exiting, nothing to do\n",
+ (long long) pthread_self());
+ server.io_active_threads--;
+ unlockThreadedIO();
+ return NULL;
+ }
+ ln = listFirst(server.io_newjobs);
+ j = ln->value;
+ listDelNode(server.io_newjobs,ln);
+ /* Add the job in the processing queue */
+ j->thread = pthread_self();
+ listAddNodeTail(server.io_processing,j);
+ ln = listLast(server.io_processing); /* We use ln later to remove it */
+ unlockThreadedIO();
+ printf("Thread %lld got a new job: %p about key '%s'\n",
+ (long long) pthread_self(), (void*)j, (char*)j->key->ptr);
+
+ /* Process the Job */
+ if (j->type == REDIS_IOJOB_LOAD) {
+ } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
+ FILE *fp = fopen("/dev/null","w+");
+ j->pages = rdbSavedObjectPages(j->val,fp);
+ fclose(fp);
+ } else if (j->type == REDIS_IOJOB_DO_SWAP) {
+ }
+
+ /* Done: insert the job into the processed queue */
+ printf("Thread %lld completed the job: %p\n",
+ (long long) pthread_self(), (void*)j);
+ lockThreadedIO();
+ listDelNode(server.io_processing,ln);
+ listAddNodeTail(server.io_processed,j);
+ unlockThreadedIO();
+
+ /* Signal the main thread there is new stuff to process */
+ assert(write(server.io_ready_pipe_write,"x",1) == 1);
+ }
+ return NULL; /* never reached */
+}
+
+static void spawnIOThread(void) {
+ pthread_t thread;
+
+ pthread_create(&thread,NULL,IOThreadEntryPoint,NULL);
+ server.io_active_threads++;
+}
+
+/* This function must be called while with threaded IO locked */
+static void queueIOJob(iojob *j) {
+ listAddNodeTail(server.io_newjobs,j);
+ if (server.io_active_threads < server.vm_max_threads)
+ spawnIOThread();
+}
+
+static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
+ iojob *j;
+
+ assert(key->storage == REDIS_VM_MEMORY);
+ assert(key->refcount == 1);
+
+ j = zmalloc(sizeof(*j));
+ j->type = REDIS_IOJOB_PREPARE_SWAP;
+ j->db = db;
+ j->key = dupStringObject(key);
+ j->val = val;
+ incrRefCount(val);
+ j->canceled = 0;
+ j->thread = (pthread_t) -1;
+ key->storage = REDIS_VM_SWAPPING;
+
+ lockThreadedIO();
+ queueIOJob(j);
+ unlockThreadedIO();
+ return REDIS_OK;
+}
+
/* ================================= Debugging ============================== */
static void debugCommand(redisClient *c) {
}
key = dictGetEntryKey(de);
val = dictGetEntryVal(de);
- if (server.vm_enabled && key->storage == REDIS_VM_MEMORY) {
+ if (server.vm_enabled && (key->storage == REDIS_VM_MEMORY ||
+ key->storage == REDIS_VM_SWAPPING)) {
addReplySds(c,sdscatprintf(sdsempty(),
"+Key at:%p refcount:%d, value at:%p refcount:%d "
"encoding:%d serializedlength:%lld\r\n",
(void*)key, key->refcount, (void*)val, val->refcount,
- val->encoding, rdbSavedObjectLen(val)));
+ val->encoding, rdbSavedObjectLen(val,NULL)));
} else {
addReplySds(c,sdscatprintf(sdsempty(),
"+Key at:%p refcount:%d, value swapped at: page %llu "