X-Git-Url: https://git.saurik.com/redis.git/blobdiff_plain/a3e60027e7d4419971406c6da7d6e8accbcaf9b0..02925dd96e3ad5e31a3cdd9abbc2415949de8700:/src/aof.c?ds=sidebyside diff --git a/src/aof.c b/src/aof.c index dffe95ed..45dbc0db 100644 --- a/src/aof.c +++ b/src/aof.c @@ -8,6 +8,8 @@ #include #include +void aofUpdateCurrentSize(void); + /* Called when the user switches from "appendonly yes" to "appendonly no" * at runtime using the CONFIG command. */ void stopAppendOnly(void) { @@ -19,15 +21,15 @@ void stopAppendOnly(void) { server.appendseldb = -1; server.appendonly = 0; /* rewrite operation in progress? kill it, wait child exit */ - if (server.bgsavechildpid != -1) { + if (server.bgrewritechildpid != -1) { int statloc; - if (kill(server.bgsavechildpid,SIGKILL) != -1) + if (kill(server.bgrewritechildpid,SIGKILL) != -1) wait3(&statloc,0,NULL); /* reset the buffer accumulating changes while the child saves */ sdsfree(server.bgrewritebuf); server.bgrewritebuf = sdsempty(); - server.bgsavechildpid = -1; + server.bgrewritechildpid = -1; } } @@ -58,7 +60,6 @@ int startAppendOnly(void) { * buffer and write it on disk using this function just before entering * the event loop again. */ void flushAppendOnlyFile(void) { - time_t now; ssize_t nwritten; if (sdslen(server.aofbuf) == 0) return; @@ -68,50 +69,70 @@ void flushAppendOnlyFile(void) { * While this will save us against the server being killed I don't think * there is much to do about the whole server stopping for power problems * or alike */ - nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); - if (nwritten != (signed)sdslen(server.aofbuf)) { + nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); + if (nwritten != (signed)sdslen(server.aofbuf)) { /* Ooops, we are in troubles. The best thing to do for now is * aborting instead of giving the illusion that everything is * working as expected. */ - if (nwritten == -1) { + if (nwritten == -1) { redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); - } else { + } else { redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno)); - } - exit(1); + } + exit(1); + } + server.appendonly_current_size += nwritten; + + /* Re-use AOF buffer when it is small enough. The maximum comes from the + * arena size of 4k minus some overhead (but is otherwise arbitrary). */ + if ((sdslen(server.aofbuf)+sdsavail(server.aofbuf)) < 4000) { + sdsclear(server.aofbuf); + } else { + sdsfree(server.aofbuf); + server.aofbuf = sdsempty(); } - sdsfree(server.aofbuf); - server.aofbuf = sdsempty(); - /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have - * childs performing heavy I/O on disk. */ + /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are + * children doing I/O in the background. */ if (server.no_appendfsync_on_rewrite && (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1)) return; - /* Fsync if needed */ - now = time(NULL); + + /* Perform the fsync if needed. */ if (server.appendfsync == APPENDFSYNC_ALWAYS || (server.appendfsync == APPENDFSYNC_EVERYSEC && - now-server.lastfsync > 1)) + server.unixtime > server.lastfsync)) { /* aof_fsync is defined as fdatasync() for Linux in order to avoid * flushing metadata. */ aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ - server.lastfsync = now; + server.lastfsync = server.unixtime; } } -sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) { - int j; - buf = sdscatprintf(buf,"*%d\r\n",argc); +sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) { + char buf[32]; + int len, j; + robj *o; + + buf[0] = '*'; + len = 1+ll2string(buf+1,sizeof(buf)-1,argc); + buf[len++] = '\r'; + buf[len++] = '\n'; + dst = sdscatlen(dst,buf,len); + for (j = 0; j < argc; j++) { - robj *o = getDecodedObject(argv[j]); - buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr)); - buf = sdscatlen(buf,o->ptr,sdslen(o->ptr)); - buf = sdscatlen(buf,"\r\n",2); + o = getDecodedObject(argv[j]); + buf[0] = '$'; + len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr)); + buf[len++] = '\r'; + buf[len++] = '\n'; + dst = sdscatlen(dst,buf,len); + dst = sdscatlen(dst,o->ptr,sdslen(o->ptr)); + dst = sdscatlen(dst,"\r\n",2); decrRefCount(o); } - return buf; + return dst; } sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) { @@ -218,9 +239,13 @@ int loadAppendOnlyFile(char *filename) { FILE *fp = fopen(filename,"r"); struct redis_stat sb; int appendonly = server.appendonly; + long loops = 0; - if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) + if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) { + server.appendonly_current_size = 0; + fclose(fp); return REDIS_ERR; + } if (fp == NULL) { redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno)); @@ -232,6 +257,8 @@ int loadAppendOnlyFile(char *filename) { server.appendonly = 0; fakeClient = createFakeClient(); + startLoading(fp); + while(1) { int argc, j; unsigned long len; @@ -239,7 +266,12 @@ int loadAppendOnlyFile(char *filename) { char buf[128]; sds argsds; struct redisCommand *cmd; - int force_swapout; + + /* Serve the clients from time to time */ + if (!(loops++ % 1000)) { + loadingProgress(ftello(fp)); + aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); + } if (fgets(buf,sizeof(buf),fp) == NULL) { if (feof(fp)) @@ -273,21 +305,14 @@ int loadAppendOnlyFile(char *filename) { /* The fake client should not have a reply */ redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0); - - /* Clean up, ready for the next command */ - for (j = 0; j < argc; j++) decrRefCount(argv[j]); - zfree(argv); - - /* Handle swapping while loading big datasets when VM is on */ - force_swapout = 0; - if ((redisEstimateRSS() - server.vm_max_memory) > 1024*1024*32) - force_swapout = 1; - - if (server.vm_enabled && force_swapout) { - while (redisEstimateRSS() > server.vm_max_memory) { - if (vmSwapOneObjectBlocking() == REDIS_ERR) break; - } - } + /* The fake client should never get blocked */ + redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0); + + /* Clean up. Command code may have changed argv/argc so we use the + * argv/argc of the client instead of the local variables. */ + for (j = 0; j < fakeClient->argc; j++) + decrRefCount(fakeClient->argv[j]); + zfree(fakeClient->argv); } /* This point can only be reached when EOF is reached without errors. @@ -297,6 +322,9 @@ int loadAppendOnlyFile(char *filename) { fclose(fp); freeFakeClient(fakeClient); server.appendonly = appendonly; + stopLoading(); + aofUpdateCurrentSize(); + server.auto_aofrewrite_base_size = server.appendonly_current_size; return REDIS_OK; readerr: @@ -334,7 +362,7 @@ int rewriteAppendOnlyFile(char *filename) { redisDb *db = server.db+j; dict *d = db->dict; if (dictSize(d) == 0) continue; - di = dictGetIterator(d); + di = dictGetSafeIterator(d); if (!di) { fclose(fp); return REDIS_ERR; @@ -346,25 +374,14 @@ int rewriteAppendOnlyFile(char *filename) { /* Iterate this DB writing every entry */ while((de = dictNext(di)) != NULL) { - sds keystr = dictGetEntryKey(de); + sds keystr; robj key, *o; time_t expiretime; - int swapped; keystr = dictGetEntryKey(de); o = dictGetEntryVal(de); initStaticStringObject(key,keystr); - /* If the value for this key is swapped, load a preview in memory. - * We use a "swapped" flag to remember if we need to free the - * value object instead to just increment the ref count anyway - * in order to avoid copy-on-write of pages if we are forked() */ - if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY || - o->storage == REDIS_VM_SWAPPING) { - swapped = 0; - } else { - o = vmPreviewObject(o); - swapped = 1; - } + expiretime = getExpire(db,&key); /* Save the key and associated value */ @@ -440,21 +457,55 @@ int rewriteAppendOnlyFile(char *filename) { } } else if (o->type == REDIS_ZSET) { /* Emit the ZADDs needed to rebuild the sorted set */ - zset *zs = o->ptr; - dictIterator *di = dictGetIterator(zs->dict); - dictEntry *de; - - while((de = dictNext(di)) != NULL) { - char cmd[]="*4\r\n$4\r\nZADD\r\n"; - robj *eleobj = dictGetEntryKey(de); - double *score = dictGetEntryVal(de); - - if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; - if (fwriteBulkObject(fp,&key) == 0) goto werr; - if (fwriteBulkDouble(fp,*score) == 0) goto werr; - if (fwriteBulkObject(fp,eleobj) == 0) goto werr; + char cmd[]="*4\r\n$4\r\nZADD\r\n"; + + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + unsigned char *zl = o->ptr; + unsigned char *eptr, *sptr; + unsigned char *vstr; + unsigned int vlen; + long long vll; + double score; + + eptr = ziplistIndex(zl,0); + redisAssert(eptr != NULL); + sptr = ziplistNext(zl,eptr); + redisAssert(sptr != NULL); + + while (eptr != NULL) { + redisAssert(ziplistGet(eptr,&vstr,&vlen,&vll)); + score = zzlGetScore(sptr); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkDouble(fp,score) == 0) goto werr; + if (vstr != NULL) { + if (fwriteBulkString(fp,(char*)vstr,vlen) == 0) + goto werr; + } else { + if (fwriteBulkLongLong(fp,vll) == 0) + goto werr; + } + zzlNext(zl,&eptr,&sptr); + } + } else if (o->encoding == REDIS_ENCODING_SKIPLIST) { + zset *zs = o->ptr; + dictIterator *di = dictGetIterator(zs->dict); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + robj *eleobj = dictGetEntryKey(de); + double *score = dictGetEntryVal(de); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkDouble(fp,*score) == 0) goto werr; + if (fwriteBulkObject(fp,eleobj) == 0) goto werr; + } + dictReleaseIterator(di); + } else { + redisPanic("Unknown sorted set encoding"); } - dictReleaseIterator(di); } else if (o->type == REDIS_HASH) { char cmd[]="*4\r\n$4\r\nHSET\r\n"; @@ -499,7 +550,6 @@ int rewriteAppendOnlyFile(char *filename) { if (fwriteBulkObject(fp,&key) == 0) goto werr; if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr; } - if (swapped) decrRefCount(o); } dictReleaseIterator(di); } @@ -541,15 +591,16 @@ werr: */ int rewriteAppendOnlyFileBackground(void) { pid_t childpid; + long long start; if (server.bgrewritechildpid != -1) return REDIS_ERR; - if (server.vm_enabled) waitEmptyIOJobsQueue(); + start = ustime(); if ((childpid = fork()) == 0) { - /* Child */ char tmpfile[256]; - if (server.vm_enabled) vmReopenSwapFile(); - close(server.fd); + /* Child */ + if (server.ipfd > 0) close(server.ipfd); + if (server.sofd > 0) close(server.sofd); snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) { _exit(0); @@ -558,6 +609,7 @@ int rewriteAppendOnlyFileBackground(void) { } } else { /* Parent */ + server.stat_fork_time = ustime()-start; if (childpid == -1) { redisLog(REDIS_WARNING, "Can't rewrite append only file in background: fork: %s", @@ -581,9 +633,10 @@ int rewriteAppendOnlyFileBackground(void) { void bgrewriteaofCommand(redisClient *c) { if (server.bgrewritechildpid != -1) { addReplyError(c,"Background append only file rewriting already in progress"); - return; - } - if (rewriteAppendOnlyFileBackground() == REDIS_OK) { + } else if (server.bgsavechildpid != -1) { + server.aofrewrite_scheduled = 1; + addReplyStatus(c,"Background append only file rewriting scheduled"); + } else if (rewriteAppendOnlyFileBackground() == REDIS_OK) { addReplyStatus(c,"Background append only file rewriting started"); } else { addReply(c,shared.err); @@ -597,12 +650,24 @@ void aofRemoveTempFile(pid_t childpid) { unlink(tmpfile); } +/* Update the server.appendonly_current_size filed explicitly using stat(2) + * to check the size of the file. This is useful after a rewrite or after + * a restart, normally the size is updated just adding the write length + * to the current lenght, that is much faster. */ +void aofUpdateCurrentSize(void) { + struct redis_stat sb; + + if (redis_fstat(server.appendfd,&sb) == -1) { + redisLog(REDIS_WARNING,"Unable to check the AOF length: %s", + strerror(errno)); + } else { + server.appendonly_current_size = sb.st_size; + } +} + /* A background append only file rewriting (BGREWRITEAOF) terminated its work. * Handle this. */ -void backgroundRewriteDoneHandler(int statloc) { - int exitcode = WEXITSTATUS(statloc); - int bysignal = WIFSIGNALED(statloc); - +void backgroundRewriteDoneHandler(int exitcode, int bysignal) { if (!bysignal && exitcode == 0) { int fd; char tmpfile[256]; @@ -640,6 +705,8 @@ void backgroundRewriteDoneHandler(int statloc) { if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd); server.appendseldb = -1; /* Make sure it will issue SELECT */ redisLog(REDIS_NOTICE,"The new append only file was selected for future appends."); + aofUpdateCurrentSize(); + server.auto_aofrewrite_base_size = server.appendonly_current_size; } else { /* If append only is disabled we just generate a dump in this * format. Why not? */ @@ -650,7 +717,7 @@ void backgroundRewriteDoneHandler(int statloc) { } else { redisLog(REDIS_WARNING, "Background append only file rewriting terminated by signal %d", - WTERMSIG(statloc)); + bysignal); } cleanup: sdsfree(server.bgrewritebuf);