+ if (slave->repldboff == 0) {
+ /* Write the bulk write count before transferring the DB. In theory
+ * we don't know how much room there is in the output buffer of the
+ * socket, but in practice SO_SNDLOWAT (the minimum count for output
+ * operations) will never be smaller than the few bytes we need. */
+ sds bulkcount;
+
+ bulkcount = sdscatprintf(sdsempty(),"$%llu\r\n",(unsigned long long)
+ slave->repldbsize);
+ if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
+ {
+ sdsfree(bulkcount);
+ freeClient(slave);
+ return;
+ }
+ sdsfree(bulkcount);
+ }
+ lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
+ buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
+ if (buflen <= 0) {
+ redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
+ (buflen == 0) ? "premature EOF" : strerror(errno));
+ freeClient(slave);
+ return;
+ }
+ if ((nwritten = write(fd,buf,buflen)) == -1) {
+ redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
+ strerror(errno));
+ freeClient(slave);
+ return;
+ }
+ slave->repldboff += nwritten;
+ if (slave->repldboff == slave->repldbsize) {
+ close(slave->repldbfd);
+ slave->repldbfd = -1;
+ aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
+ slave->replstate = REDIS_REPL_ONLINE;
+ if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
+ sendReplyToClient, slave) == AE_ERR) {
+ freeClient(slave);
+ return;
+ }
+ addReplySds(slave,sdsempty());
+ redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
+ }
+}
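+
+/* On the wire the transfer performed above looks like this (the payload
+ * size is hypothetical):
+ *
+ *   slave  -> master:   SYNC\r\n
+ *   master -> slave:    $1048576\r\n  followed by 1048576 raw bytes of RDB
+ *
+ * The "$<count>\r\n" header written above is what syncWithMaster() below
+ * reads back with syncReadLine() before streaming the payload into a
+ * temp file. */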
+
+/* This function is called at the end of every background saving.
+ * The argument bgsaveerr is REDIS_OK if the background saving succeeded,
+ * otherwise REDIS_ERR is passed to the function.
+ *
+ * The goal of this function is to handle slaves waiting for a successful
+ * background saving in order to perform non-blocking synchronization. */
+static void updateSlavesWaitingBgsave(int bgsaveerr) {
+ listNode *ln;
+ int startbgsave = 0;
+ listIter li;
+
+ listRewind(server.slaves,&li);
+ while((ln = listNext(&li))) {
+ redisClient *slave = ln->value;
+
+ if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
+ startbgsave = 1;
+ slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
+ } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
+ struct redis_stat buf;
+
+ if (bgsaveerr != REDIS_OK) {
+ freeClient(slave);
+ redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
+ continue;
+ }
+ if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
+ redis_fstat(slave->repldbfd,&buf) == -1) {
+ freeClient(slave);
+ redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
+ continue;
+ }
+ slave->repldboff = 0;
+ slave->repldbsize = buf.st_size;
+ slave->replstate = REDIS_REPL_SEND_BULK;
+ aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
+ if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
+ freeClient(slave);
+ continue;
+ }
+ }
+ }
+ if (startbgsave) {
+ if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
+ listIter li;
+
+ listRewind(server.slaves,&li);
+ redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
+ while((ln = listNext(&li))) {
+ redisClient *slave = ln->value;
+
+ if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
+ freeClient(slave);
+ }
+ }
+ }
+}
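+
+/* For reference, a slave connection walks through the replication states
+ * in this order:
+ *
+ *   REDIS_REPL_WAIT_BGSAVE_START -> REDIS_REPL_WAIT_BGSAVE_END
+ *       (updateSlavesWaitingBgsave, when a new BGSAVE is started for it)
+ *   REDIS_REPL_WAIT_BGSAVE_END   -> REDIS_REPL_SEND_BULK
+ *       (updateSlavesWaitingBgsave, when the BGSAVE completes successfully)
+ *   REDIS_REPL_SEND_BULK         -> REDIS_REPL_ONLINE
+ *       (sendBulkToSlave, once the whole dump has been transferred)
+ */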
+
+static int syncWithMaster(void) {
+ char buf[1024], tmpfile[256], authcmd[1024];
+ int dumpsize;
+ int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
+ int dfd;
+
+ if (fd == -1) {
+ redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
+ strerror(errno));
+ return REDIS_ERR;
+ }
+
+ /* AUTH with the master if required. */
+ if(server.masterauth) {
+ snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
+ if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
+ close(fd);
+ redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
+ strerror(errno));
+ return REDIS_ERR;
+ }
+ /* Read the AUTH result. */
+ if (syncReadLine(fd,buf,1024,3600) == -1) {
+ close(fd);
+ redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
+ strerror(errno));
+ return REDIS_ERR;
+ }
+ if (buf[0] != '+') {
+ close(fd);
+ redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
+ return REDIS_ERR;
+ }
+ }
+
+ /* Issue the SYNC command */
+ if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
+ close(fd);
+ redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
+ strerror(errno));
+ return REDIS_ERR;
+ }
+ /* Read the bulk write count */
+ if (syncReadLine(fd,buf,1024,3600) == -1) {
+ close(fd);
+ redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
+ strerror(errno));
+ return REDIS_ERR;
+ }
+ if (buf[0] != '$') {
+ close(fd);
+ redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
+ return REDIS_ERR;
+ }
+ dumpsize = atoi(buf+1);
+ redisLog(REDIS_NOTICE,"Receiving %d bytes data dump from MASTER",dumpsize);
+ /* Read the bulk write data on a temp file */
+ snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
+ dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
+ if (dfd == -1) {
+ close(fd);
+ redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
+ return REDIS_ERR;
+ }
+ while(dumpsize) {
+ int nread, nwritten;
+
+ nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
+ if (nread == -1) {
+ redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
+ strerror(errno));
+ close(fd);
+ close(dfd);
+ return REDIS_ERR;
+ }
+ nwritten = write(dfd,buf,nread);
+ if (nwritten == -1) {
+ redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
+ close(fd);
+ close(dfd);
+ return REDIS_ERR;
+ }
+ dumpsize -= nread;
+ }
+ close(dfd);
+ if (rename(tmpfile,server.dbfilename) == -1) {
+ redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
+ unlink(tmpfile);
+ close(fd);
+ return REDIS_ERR;
+ }
+ emptyDb();
+ if (rdbLoad(server.dbfilename) != REDIS_OK) {
+ redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
+ close(fd);
+ return REDIS_ERR;
+ }
+ server.master = createClient(fd);
+ server.master->flags |= REDIS_MASTER;
+ server.master->authenticated = 1;
+ server.replstate = REDIS_REPL_CONNECTED;
+ return REDIS_OK;
+}
+
+static void slaveofCommand(redisClient *c) {
+ if (!strcasecmp(c->argv[1]->ptr,"no") &&
+ !strcasecmp(c->argv[2]->ptr,"one")) {
+ if (server.masterhost) {
+ sdsfree(server.masterhost);
+ server.masterhost = NULL;
+ if (server.master) freeClient(server.master);
+ server.replstate = REDIS_REPL_NONE;
+ redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
+ }
+ } else {
+ sdsfree(server.masterhost);
+ server.masterhost = sdsdup(c->argv[1]->ptr);
+ server.masterport = atoi(c->argv[2]->ptr);
+ if (server.master) freeClient(server.master);
+ server.replstate = REDIS_REPL_CONNECT;
+ redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
+ server.masterhost, server.masterport);
+ }
+ addReply(c,shared.ok);
+}
+
+/* ============================ Maxmemory directive ======================== */
+
+/* Try to free one object from the pre-allocated objects free list.
+ * This is useful under low memory conditions as by default we keep up to
+ * 1 million free objects allocated. On success REDIS_OK is returned,
+ * otherwise REDIS_ERR. */
+static int tryFreeOneObjectFromFreelist(void) {
+ robj *o;
+
+ if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
+ if (listLength(server.objfreelist)) {
+ listNode *head = listFirst(server.objfreelist);
+ o = listNodeValue(head);
+ listDelNode(server.objfreelist,head);
+ if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
+ zfree(o);
+ return REDIS_OK;
+ } else {
+ if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
+ return REDIS_ERR;
+ }
+}
+
+/* This function gets called when 'maxmemory' is set in the config file to
+ * limit the max memory used by the server, and we are out of memory.
+ * This function will try, in order, to:
+ *
+ * - Free objects from the free list
+ * - Remove keys with an EXPIRE set
+ *
+ * If it is not possible to free enough memory to reach used-memory < maxmemory,
+ * the server will start refusing commands that would enlarge the memory
+ * usage even more.
+ */
+static void freeMemoryIfNeeded(void) {
+ while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
+ int j, k, freed = 0;
+
+ if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
+ for (j = 0; j < server.dbnum; j++) {
+ int minttl = -1;
+ robj *minkey = NULL;
+ struct dictEntry *de;
+
+ if (dictSize(server.db[j].expires)) {
+ freed = 1;
+ /* From a sample of three keys drop the one nearest to
+ * the natural expire */
+ for (k = 0; k < 3; k++) {
+ time_t t;
+
+ de = dictGetRandomKey(server.db[j].expires);
+ t = (time_t) dictGetEntryVal(de);
+ if (minttl == -1 || t < minttl) {
+ minkey = dictGetEntryKey(de);
+ minttl = t;
+ }
+ }
+ deleteKey(server.db+j,minkey);
+ }
+ }
+ if (!freed) return; /* nothing to free... */
+ }
+}
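+
+/* A worked example of the sampling above (hypothetical numbers): if the
+ * three random keys picked from a DB's expires dict expire at unix times
+ * 1700000300, 1700000150 and 1700000200, minkey ends up pointing to the
+ * key expiring at 1700000150, which is the one deleted. */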
+
+/* ============================== Append Only file ========================== */
+
+static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
+ sds buf = sdsempty();
+ int j;
+ ssize_t nwritten;
+ time_t now;
+ robj *tmpargv[3];
+
+ /* The DB this command was targeting is not the same as the last command
+ * we appended, so we need to issue a SELECT command. */
+ if (dictid != server.appendseldb) {
+ char seldb[64];
+
+ snprintf(seldb,sizeof(seldb),"%d",dictid);
+ buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
+ (unsigned long)strlen(seldb),seldb);
+ server.appendseldb = dictid;
+ }
+
+ /* "Fix" the argv vector if the command is EXPIRE. We want to translate
+ * EXPIREs into EXPIREATs calls */
+ if (cmd->proc == expireCommand) {
+ long when;
+
+ tmpargv[0] = createStringObject("EXPIREAT",8);
+ tmpargv[1] = argv[1];
+ incrRefCount(argv[1]);
+ when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
+ tmpargv[2] = createObject(REDIS_STRING,
+ sdscatprintf(sdsempty(),"%ld",when));
+ argv = tmpargv;
+ }
+
+ /* Append the actual command */
+ buf = sdscatprintf(buf,"*%d\r\n",argc);
+ for (j = 0; j < argc; j++) {
+ robj *o = argv[j];
+
+ o = getDecodedObject(o);
+ buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
+ buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
+ buf = sdscatlen(buf,"\r\n",2);
+ decrRefCount(o);
+ }
+
+ /* Free the objects from the modified argv for EXPIREAT */
+ if (cmd->proc == expireCommand) {
+ for (j = 0; j < 3; j++)
+ decrRefCount(argv[j]);
+ }
+
+ /* We want to perform a single write. This should be guaranteed atomic,
+ * at least if the filesystem we are writing to is a real physical one.
+ * While this will save us against the server being killed, I don't think
+ * there is much we can do about the whole server stopping for power
+ * problems or the like. */
+ nwritten = write(server.appendfd,buf,sdslen(buf));
+ if (nwritten != (signed)sdslen(buf)) {
+ /* Ooops, we are in trouble. The best thing to do for now is
+ * to simply exit rather than give the illusion that everything is
+ * working as expected. */
+ if (nwritten == -1) {
+ redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
+ } else {
+ redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
+ }
+ exit(1);
+ }
+ /* If a background append only file rewriting is in progress we want to
+ * accumulate the differences between the child DB and the current one
+ * in a buffer, so that when the child process has done its work we
+ * can append the differences to the new append only file. */
+ if (server.bgrewritechildpid != -1)
+ server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
+
+ sdsfree(buf);
+ now = time(NULL);
+ if (server.appendfsync == APPENDFSYNC_ALWAYS ||
+ (server.appendfsync == APPENDFSYNC_EVERYSEC &&
+ now-server.lastfsync > 1))
+ {
+ fsync(server.appendfd); /* Let's try to get this data on the disk */
+ server.lastfsync = now;
+ }
+}
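+
+/* As an example of the format produced above (hypothetical keys and times):
+ * running "SET foo bar" against DB 5, when the last appended command used a
+ * different DB, appends the following bytes to the file:
+ *
+ *   *2\r\n$6\r\nSELECT\r\n$1\r\n5\r\n*3\r\n$3\r\nSET\r\n$3\r\nfoo\r\n$3\r\nbar\r\n
+ *
+ * while "EXPIRE foo 10" issued at unix time 1300000000 is translated and
+ * logged as "EXPIREAT foo 1300000010" in the same format. */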
+
+/* In Redis commands are always executed in the context of a client, so in
+ * order to load the append only file we need to create a fake client. */
+static struct redisClient *createFakeClient(void) {
+ struct redisClient *c = zmalloc(sizeof(*c));
+
+ selectDb(c,0);
+ c->fd = -1;
+ c->querybuf = sdsempty();
+ c->argc = 0;
+ c->argv = NULL;
+ c->flags = 0;
+ /* We set the fake client as a slave waiting for the synchronization
+ * so that Redis will not try to send replies to this client. */
+ c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
+ c->reply = listCreate();
+ listSetFreeMethod(c->reply,decrRefCount);
+ listSetDupMethod(c->reply,dupClientReplyValue);
+ return c;
+}
+
+static void freeFakeClient(struct redisClient *c) {
+ sdsfree(c->querybuf);
+ listRelease(c->reply);
+ zfree(c);
+}
+
+/* Replay the append log file. On success REDIS_OK is returned. On non fatal
+ * error (the append only file is zero-length) REDIS_ERR is returned. On
+ * fatal error an error message is logged and the program exits. */
+int loadAppendOnlyFile(char *filename) {
+ struct redisClient *fakeClient;
+ FILE *fp = fopen(filename,"r");
+ struct redis_stat sb;
+ unsigned long long loadedkeys = 0;
+
+ if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
+ return REDIS_ERR;
+
+ if (fp == NULL) {
+ redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
+ exit(1);
+ }
+
+ fakeClient = createFakeClient();
+ while(1) {
+ int argc, j;
+ unsigned long len;
+ robj **argv;
+ char buf[128];
+ sds argsds;
+ struct redisCommand *cmd;
+
+ if (fgets(buf,sizeof(buf),fp) == NULL) {
+ if (feof(fp))
+ break;
+ else
+ goto readerr;
+ }
+ if (buf[0] != '*') goto fmterr;
+ argc = atoi(buf+1);
+ argv = zmalloc(sizeof(robj*)*argc);
+ for (j = 0; j < argc; j++) {
+ if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
+ if (buf[0] != '$') goto fmterr;
+ len = strtol(buf+1,NULL,10);
+ argsds = sdsnewlen(NULL,len);
+ if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
+ argv[j] = createObject(REDIS_STRING,argsds);
+ if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
+ }
+
+ /* Command lookup */
+ cmd = lookupCommand(argv[0]->ptr);
+ if (!cmd) {
+ redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
+ exit(1);
+ }
+ /* Try object sharing and encoding */
+ if (server.shareobjects) {
+ int j;
+ for(j = 1; j < argc; j++)
+ argv[j] = tryObjectSharing(argv[j]);
+ }
+ if (cmd->flags & REDIS_CMD_BULK)
+ tryObjectEncoding(argv[argc-1]);
+ /* Run the command in the context of a fake client */
+ fakeClient->argc = argc;
+ fakeClient->argv = argv;
+ cmd->proc(fakeClient);
+ /* Discard the reply objects list from the fake client */
+ while(listLength(fakeClient->reply))
+ listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
+ /* Clean up, ready for the next command */
+ for (j = 0; j < argc; j++) decrRefCount(argv[j]);
+ zfree(argv);
+ /* Handle swapping while loading big datasets when VM is on */
+ loadedkeys++;
+ if (server.vm_enabled && (loadedkeys % 5000) == 0) {
+ while (zmalloc_used_memory() > server.vm_max_memory) {
+ if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
+ }
+ }
+ }
+ fclose(fp);
+ freeFakeClient(fakeClient);
+ return REDIS_OK;
+
+readerr:
+ if (feof(fp)) {
+ redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
+ } else {
+ redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
+ }
+ exit(1);
+fmterr:
+ redisLog(REDIS_WARNING,"Bad file format reading the append only file");
+ exit(1);
+}
+
+/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
+static int fwriteBulk(FILE *fp, robj *obj) {
+ char buf[128];
+ int decrrc = 0;
+
+ /* Avoid the incr/decr ref count business if possible in order to help
+ * copy-on-write (we are often in a child process when this function
+ * is called).
+ * This also makes sure that key objects don't get incrRefCount-ed when
+ * VM is enabled. */
+ if (obj->encoding != REDIS_ENCODING_RAW) {
+ obj = getDecodedObject(obj);
+ decrrc = 1;
+ }
+ snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
+ if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
+ if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
+ goto err;
+ if (fwrite("\r\n",2,1,fp) == 0) goto err;
+ if (decrrc) decrRefCount(obj);
+ return 1;
+err:
+ if (decrrc) decrRefCount(obj);
+ return 0;
+}
+
+/* Write a double value in bulk format $<count>\r\n<payload>\r\n */
+static int fwriteBulkDouble(FILE *fp, double d) {
+ char buf[128], dbuf[128];
+
+ snprintf(dbuf,sizeof(dbuf),"%.17g\r\n",d);
+ snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(dbuf)-2);
+ if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
+ if (fwrite(dbuf,strlen(dbuf),1,fp) == 0) return 0;
+ return 1;
+}
+
+/* Write a long value in bulk format $<count>\r\n<payload>\r\n */
+static int fwriteBulkLong(FILE *fp, long l) {
+ char buf[128], lbuf[128];
+
+ snprintf(lbuf,sizeof(lbuf),"%ld\r\n",l);
+ snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(lbuf)-2);
+ if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
+ if (fwrite(lbuf,strlen(lbuf),1,fp) == 0) return 0;
+ return 1;
+}
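+
+/* For example, assuming fp is a valid open FILE pointer:
+ *
+ *   fwriteBulk(fp,key)         writes "$5\r\nmykey\r\n" when key holds "mykey"
+ *   fwriteBulkDouble(fp,3.5)   writes "$3\r\n3.5\r\n"
+ *   fwriteBulkLong(fp,42)      writes "$2\r\n42\r\n"
+ */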
+
+/* Write a sequence of commands able to fully rebuild the dataset into
+ * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
+static int rewriteAppendOnlyFile(char *filename) {
+ dictIterator *di = NULL;
+ dictEntry *de;
+ FILE *fp;
+ char tmpfile[256];
+ int j;
+ time_t now = time(NULL);
+
+ /* Note that we have to use a different temp name here compared to the
+ * one used by rewriteAppendOnlyFileBackground() function. */
+ snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
+ fp = fopen(tmpfile,"w");
+ if (!fp) {
+ redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
+ return REDIS_ERR;
+ }
+ for (j = 0; j < server.dbnum; j++) {
+ char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
+ redisDb *db = server.db+j;
+ dict *d = db->dict;
+ if (dictSize(d) == 0) continue;
+ di = dictGetIterator(d);
+ if (!di) {
+ fclose(fp);
+ return REDIS_ERR;
+ }
+
+ /* SELECT the new DB */
+ if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
+ if (fwriteBulkLong(fp,j) == 0) goto werr;
+
+ /* Iterate this DB writing every entry */
+ while((de = dictNext(di)) != NULL) {
+ robj *key, *o;
+ time_t expiretime;
+ int swapped;
+
+ key = dictGetEntryKey(de);
+ /* If the value for this key is swapped, load a preview in memory.
+ * We use a "swapped" flag to remember that we need to free the
+ * value object afterwards, instead of just incrementing the ref count,
+ * in order to avoid copy-on-write of pages if we are forked(). */
+ if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
+ key->storage == REDIS_VM_SWAPPING) {
+ o = dictGetEntryVal(de);
+ swapped = 0;
+ } else {
+ o = vmPreviewObject(key);
+ swapped = 1;
+ }
+ expiretime = getExpire(db,key);
+
+ /* Save the key and associated value */
+ if (o->type == REDIS_STRING) {
+ /* Emit a SET command */
+ char cmd[]="*3\r\n$3\r\nSET\r\n";
+ if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+ /* Key and value */
+ if (fwriteBulk(fp,key) == 0) goto werr;
+ if (fwriteBulk(fp,o) == 0) goto werr;
+ } else if (o->type == REDIS_LIST) {
+ /* Emit the RPUSHes needed to rebuild the list */
+ list *list = o->ptr;
+ listNode *ln;
+ listIter li;
+
+ listRewind(list,&li);
+ while((ln = listNext(&li))) {
+ char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
+ robj *eleobj = listNodeValue(ln);
+
+ if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+ if (fwriteBulk(fp,key) == 0) goto werr;
+ if (fwriteBulk(fp,eleobj) == 0) goto werr;
+ }
+ } else if (o->type == REDIS_SET) {
+ /* Emit the SADDs needed to rebuild the set */
+ dict *set = o->ptr;
+ dictIterator *di = dictGetIterator(set);
+ dictEntry *de;
+
+ while((de = dictNext(di)) != NULL) {
+ char cmd[]="*3\r\n$4\r\nSADD\r\n";
+ robj *eleobj = dictGetEntryKey(de);
+
+ if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+ if (fwriteBulk(fp,key) == 0) goto werr;
+ if (fwriteBulk(fp,eleobj) == 0) goto werr;
+ }
+ dictReleaseIterator(di);
+ } else if (o->type == REDIS_ZSET) {
+ /* Emit the ZADDs needed to rebuild the sorted set */
+ zset *zs = o->ptr;
+ dictIterator *di = dictGetIterator(zs->dict);
+ dictEntry *de;
+
+ while((de = dictNext(di)) != NULL) {
+ char cmd[]="*4\r\n$4\r\nZADD\r\n";
+ robj *eleobj = dictGetEntryKey(de);
+ double *score = dictGetEntryVal(de);
+
+ if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+ if (fwriteBulk(fp,key) == 0) goto werr;
+ if (fwriteBulkDouble(fp,*score) == 0) goto werr;
+ if (fwriteBulk(fp,eleobj) == 0) goto werr;
+ }
+ dictReleaseIterator(di);
+ } else {
+ redisAssert(0 != 0);
+ }
+ /* Save the expire time */
+ if (expiretime != -1) {
+ char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
+ /* If this key is already expired skip it */
+ if (expiretime < now) continue;
+ if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+ if (fwriteBulk(fp,key) == 0) goto werr;
+ if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
+ }
+ if (swapped) decrRefCount(o);
+ }
+ dictReleaseIterator(di);
+ }
+
+ /* Make sure data will not remain on the OS's output buffers */
+ fflush(fp);
+ fsync(fileno(fp));
+ fclose(fp);
+
+ /* Use RENAME to make sure the DB file is changed atomically only
+ * if the generated DB file is ok. */
+ if (rename(tmpfile,filename) == -1) {
+ redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
+ unlink(tmpfile);
+ return REDIS_ERR;
+ }
+ redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
+ return REDIS_OK;
+
+werr:
+ fclose(fp);
+ unlink(tmpfile);
+ redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
+ if (di) dictReleaseIterator(di);
+ return REDIS_ERR;
+}
+
+/* This is how rewriting of the append only file in background works:
+ *
+ * 1) The user calls BGREWRITEAOF
+ * 2) Redis calls this function, which forks():
+ * 2a) the child rewrites the append only file into a temp file.
+ * 2b) the parent accumulates differences in server.bgrewritebuf.
+ * 3) When the child has finished '2a' it exits.
+ * 4) The parent traps the exit code and, if it's OK, appends the
+ * data accumulated in server.bgrewritebuf to the temp file, and
+ * finally rename(2)s the temp file to the actual file name.
+ * The new file is then reopened as the new append only file. Profit!
+ */
+static int rewriteAppendOnlyFileBackground(void) {
+ pid_t childpid;
+
+ if (server.bgrewritechildpid != -1) return REDIS_ERR;
+ if (server.vm_enabled) waitEmptyIOJobsQueue();
+ if ((childpid = fork()) == 0) {
+ /* Child */
+ char tmpfile[256];
+
+ if (server.vm_enabled) vmReopenSwapFile();
+ close(server.fd);
+ snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
+ if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
+ _exit(0);
+ } else {
+ _exit(1);
+ }
+ } else {
+ /* Parent */
+ if (childpid == -1) {
+ redisLog(REDIS_WARNING,
+ "Can't rewrite append only file in background: fork: %s",
+ strerror(errno));
+ return REDIS_ERR;
+ }
+ redisLog(REDIS_NOTICE,
+ "Background append only file rewriting started by pid %d",childpid);
+ server.bgrewritechildpid = childpid;
+ /* We set appendseldb to -1 in order to force the next call to
+ * feedAppendOnlyFile() to issue a SELECT command, so the differences
+ * accumulated by the parent into server.bgrewritebuf will start
+ * with a SELECT statement and it will be safe to merge. */
+ server.appendseldb = -1;
+ return REDIS_OK;
+ }
+ return REDIS_OK; /* unreached */
+}
+
+static void bgrewriteaofCommand(redisClient *c) {
+ if (server.bgrewritechildpid != -1) {
+ addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
+ return;
+ }
+ if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
+ char *status = "+Background append only file rewriting started\r\n";
+ addReplySds(c,sdsnew(status));
+ } else {
+ addReply(c,shared.err);
+ }
+}
+
+static void aofRemoveTempFile(pid_t childpid) {
+ char tmpfile[256];
+
+ snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) childpid);
+ unlink(tmpfile);
+}
+
+/* Virtual Memory is composed mainly of two subsystems:
+ * - Blocking Virtual Memory
+ * - Threaded Virtual Memory I/O
+ * The two parts are not fully decoupled, but functions are split among two
+ * different sections of the source code (delimited by comments) in order to
+ * make clearer which functionality belongs to the blocking VM and which to
+ * the threaded (non blocking) VM.
+ *
+ * Redis VM design:
+ *
+ * Redis VM is a blocking VM (one that blocks reading swapped values from
+ * disk into memory when a swapped out value is needed in memory) that is made
+ * non blocking by examining the command argument vector in order to load in
+ * the background the values that will likely be needed in order to exec
+ * the command. The command is executed only once all the relevant keys
+ * are loaded into memory.
+ *
+ * This is basically almost as simple as a blocking VM, but almost as parallel
+ * as a fully non-blocking VM.
+ */
+
+/* =================== Virtual Memory - Blocking Side ====================== */
+
+/* Substitute the first occurrence of '%p' with the process pid in the
+ * swap file name. */
+static void expandVmSwapFilename(void) {
+ char *p = strstr(server.vm_swap_file,"%p");
+ sds new;
+
+ if (!p) return;
+ new = sdsempty();
+ *p = '\0';
+ new = sdscat(new,server.vm_swap_file);
+ new = sdscatprintf(new,"%ld",(long) getpid());
+ new = sdscat(new,p+2);
+ zfree(server.vm_swap_file);
+ server.vm_swap_file = new;
+}
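+
+/* For example (hypothetical values), a configured swap file name of
+ * "redis-%p.swap" becomes "redis-1234.swap" in a server running with
+ * pid 1234. */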
+
+static void vmInit(void) {
+ off_t totsize;
+ int pipefds[2];
+ size_t stacksize;
+
+ if (server.vm_max_threads != 0)
+ zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
+
+ expandVmSwapFilename();
+ redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
+ if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
+ server.vm_fp = fopen(server.vm_swap_file,"w+b");
+ }
+ if (server.vm_fp == NULL) {
+ redisLog(REDIS_WARNING,
+ "Impossible to open the swap file: %s. Exiting.",
+ strerror(errno));
+ exit(1);
+ }
+ server.vm_fd = fileno(server.vm_fp);
+ server.vm_next_page = 0;
+ server.vm_near_pages = 0;
+ server.vm_stats_used_pages = 0;
+ server.vm_stats_swapped_objects = 0;
+ server.vm_stats_swapouts = 0;
+ server.vm_stats_swapins = 0;
+ totsize = server.vm_pages*server.vm_page_size;
+ redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
+ if (ftruncate(server.vm_fd,totsize) == -1) {
+ redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
+ strerror(errno));
+ exit(1);
+ } else {
+ redisLog(REDIS_NOTICE,"Swap file allocated with success");
+ }
+ server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
+ redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
+ (long long) (server.vm_pages+7)/8, server.vm_pages);
+ memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
+
+ /* Initialize threaded I/O (used by Virtual Memory) */
+ server.io_newjobs = listCreate();
+ server.io_processing = listCreate();
+ server.io_processed = listCreate();
+ server.io_ready_clients = listCreate();
+ pthread_mutex_init(&server.io_mutex,NULL);
+ pthread_mutex_init(&server.obj_freelist_mutex,NULL);
+ pthread_mutex_init(&server.io_swapfile_mutex,NULL);
+ server.io_active_threads = 0;
+ if (pipe(pipefds) == -1) {
+ redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
+ ,strerror(errno));
+ exit(1);
+ }
+ server.io_ready_pipe_read = pipefds[0];
+ server.io_ready_pipe_write = pipefds[1];
+ redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
+ /* LZF requires a lot of stack */
+ pthread_attr_init(&server.io_threads_attr);
+ pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
+ while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
+ pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
+ /* Listen for events in the threaded I/O pipe */
+ if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
+ vmThreadedIOCompletedJob, NULL) == AE_ERR)
+ oom("creating file event");
+}
+
+/* Mark the page as used */
+static void vmMarkPageUsed(off_t page) {
+ off_t byte = page/8;
+ int bit = page&7;
+ redisAssert(vmFreePage(page) == 1);
+ server.vm_bitmap[byte] |= 1<<bit;
+}
+
+/* Mark N contiguous pages as used, with 'page' being the first. */
+static void vmMarkPagesUsed(off_t page, off_t count) {
+ off_t j;
+
+ for (j = 0; j < count; j++)
+ vmMarkPageUsed(page+j);
+ server.vm_stats_used_pages += count;
+ redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
+ (long long)count, (long long)page);
+}
+
+/* Mark the page as free */
+static void vmMarkPageFree(off_t page) {
+ off_t byte = page/8;
+ int bit = page&7;
+ redisAssert(vmFreePage(page) == 0);
+ server.vm_bitmap[byte] &= ~(1<<bit);
+}
+
+/* Mark N contiguous pages as free, with 'page' being the first. */
+static void vmMarkPagesFree(off_t page, off_t count) {
+ off_t j;
+
+ for (j = 0; j < count; j++)
+ vmMarkPageFree(page+j);
+ server.vm_stats_used_pages -= count;
+ redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
+ (long long)count, (long long)page);
+}
+
+/* Test if the page is free */
+static int vmFreePage(off_t page) {
+ off_t byte = page/8;
+ int bit = page&7;
+ return (server.vm_bitmap[byte] & (1<<bit)) == 0;
+}
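+
+/* As an example of the bit arithmetic used above: page 11 lives in byte
+ * 11/8 = 1 of the bitmap, at bit 11&7 = 3, so vmMarkPageUsed() ORs
+ * server.vm_bitmap[1] with 1<<3 and vmFreePage() tests that same bit. */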
+
+/* Find N contiguous free pages storing the first page of the cluster in *first.
+ * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
+ * REDIS_ERR is returned.
+ *
+ * This function uses a simple algorithm: we try to allocate pages near the
+ * previously allocated ones for up to REDIS_VM_MAX_NEAR_PAGES attempts; when
+ * we reach this limit we start again from the beginning of the swap file
+ * searching for free space.
+ *
+ * If it looks pretty clear that there are no free pages near our offset
+ * we try to find less populated places by doing a random forward jump of
+ * up to REDIS_VM_MAX_RANDOM_JUMP pages, then we scan a few more pages
+ * without hurry, and then we jump again, and so forth...
+ *
+ * This function can be improved by using a free list to avoid guessing
+ * too much, since we could collect data about freed pages.
+ *
+ * note: I implemented this function just after watching an episode of
+ * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
+ */
+static int vmFindContiguousPages(off_t *first, off_t n) {
+ off_t base, offset = 0, since_jump = 0, numfree = 0;
+
+ if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
+ server.vm_near_pages = 0;
+ server.vm_next_page = 0;
+ }
+ server.vm_near_pages++; /* Yet another try for pages near to the old ones */
+ base = server.vm_next_page;
+
+ while(offset < server.vm_pages) {
+ off_t this = base+offset;
+
+ /* If we overflow, restart from page zero */
+ if (this >= server.vm_pages) {
+ this -= server.vm_pages;
+ if (this == 0) {
+ /* Just overflowed: what we found at the tail is no longer
+ * interesting, as it's no longer contiguous. */
+ numfree = 0;
+ }
+ }
+ if (vmFreePage(this)) {
+ /* This is a free page */
+ numfree++;
+ /* Already got N free pages? Return to the caller, with success */
+ if (numfree == n) {
+ *first = this-(n-1);
+ server.vm_next_page = this+1;
+ redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
+ return REDIS_OK;
+ }
+ } else {
+ /* The current one is not a free page */
+ numfree = 0;
+ }
+
+ /* Fast-forward if the current page is not free and we already
+ * searched enough near this place. */
+ since_jump++;
+ if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
+ offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
+ since_jump = 0;
+ /* Note that even if we rewind after the jump, we don't need
+ * to reset numfree to zero, as we only jump *if* it is already
+ * set to zero. */
+ } else {
+ /* Otherwise just check the next page */
+ offset++;
+ }
+ }
+ return REDIS_ERR;
+}
+
+/* Write the specified object at the specified page of the swap file */
+static int vmWriteObjectOnSwap(robj *o, off_t page) {
+ if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
+ if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
+ if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
+ redisLog(REDIS_WARNING,
+ "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
+ strerror(errno));
+ return REDIS_ERR;
+ }
+ rdbSaveObject(server.vm_fp,o);
+ fflush(server.vm_fp);
+ if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
+ return REDIS_OK;
+}
+
+/* Swap the 'val' object relative to 'key' out to disk. Store all the
+ * information needed to later retrieve the object into the key object.
+ * If we can't find enough contiguous empty pages to swap the object to disk,
+ * REDIS_ERR is returned. */
+static int vmSwapObjectBlocking(robj *key, robj *val) {
+ off_t pages = rdbSavedObjectPages(val,NULL);
+ off_t page;
+
+ assert(key->storage == REDIS_VM_MEMORY);
+ assert(key->refcount == 1);
+ if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
+ if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
+ key->vm.page = page;
+ key->vm.usedpages = pages;
+ key->storage = REDIS_VM_SWAPPED;
+ key->vtype = val->type;
+ decrRefCount(val); /* Deallocate the object from memory. */
+ vmMarkPagesUsed(page,pages);
+ redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
+ (unsigned char*) key->ptr,
+ (unsigned long long) page, (unsigned long long) pages);
+ server.vm_stats_swapped_objects++;
+ server.vm_stats_swapouts++;
+ return REDIS_OK;
+}
+
+static robj *vmReadObjectFromSwap(off_t page, int type) {
+ robj *o;
+
+ if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
+ if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
+ redisLog(REDIS_WARNING,
+ "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
+ strerror(errno));
+ _exit(1);
+ }
+ o = rdbLoadObject(type,server.vm_fp);
+ if (o == NULL) {
+ redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
+ _exit(1);
+ }
+ if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
+ return o;
+}
+
+/* Load the value object relative to the 'key' object from swap to memory.
+ * The newly allocated object is returned.
+ *
+ * If preview is true the unserialized object is returned to the caller, but
+ * no changes are made to the key object, nor are the pages marked as free. */
+static robj *vmGenericLoadObject(robj *key, int preview) {
+ robj *val;
+
+ redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
+ val = vmReadObjectFromSwap(key->vm.page,key->vtype);
+ if (!preview) {
+ key->storage = REDIS_VM_MEMORY;
+ key->vm.atime = server.unixtime;
+ vmMarkPagesFree(key->vm.page,key->vm.usedpages);
+ redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
+ (unsigned char*) key->ptr);
+ server.vm_stats_swapped_objects--;
+ } else {
+ redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
+ (unsigned char*) key->ptr);
+ }
+ server.vm_stats_swapins++;
+ return val;
+}
+
+/* Plain object loading, from swap to memory */
+static robj *vmLoadObject(robj *key) {
+ /* If we are loading the object in background, stop it, we
+ * need to load this object synchronously ASAP. */
+ if (key->storage == REDIS_VM_LOADING)
+ vmCancelThreadedIOJob(key);
+ return vmGenericLoadObject(key,0);
+}
+
+/* Just load the value from disk, without modifying the key object.
+ * This is useful when we want to perform some operation on the value
+ * without really bringing it from swap to memory, like while saving the
+ * dataset or rewriting the append only log. */
+static robj *vmPreviewObject(robj *key) {
+ return vmGenericLoadObject(key,1);
+}
+
+/* How good a candidate is this object for swapping?
+ * The better a candidate it is, the greater the returned value.
+ *
+ * Currently we try to perform a fast estimation of the object size in
+ * memory, and combine it with aging information.
+ *
+ * Basically swappability = idle-time * log(estimated size)
+ *
+ * Bigger objects are preferred over smaller objects, but not
+ * proportionally; this is why we use the logarithm. This algorithm is
+ * just a first try and will probably be tuned later. */
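+/* For example (hypothetical sizes): with equal idle times, a value with an
+ * estimated size of one megabyte scores only about five times more than a
+ * sixteen byte value, since log(2^20)/log(16) is roughly 5, even though it
+ * is about 65536 times bigger. */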
+static double computeObjectSwappability(robj *o) {
+ time_t age = server.unixtime - o->vm.atime;
+ long asize = 0;
+ list *l;
+ dict *d;
+ struct dictEntry *de;
+ int z;
+
+ if (age <= 0) return 0;
+ switch(o->type) {
+ case REDIS_STRING:
+ if (o->encoding != REDIS_ENCODING_RAW) {
+ asize = sizeof(*o);
+ } else {
+ asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
+ }
+ break;
+ case REDIS_LIST:
+ l = o->ptr;
+ listNode *ln = listFirst(l);