X-Git-Url: https://git.saurik.com/redis.git/blobdiff_plain/bfe85f7ca97259256e8089349e1a462b6c7dbd00..b39a4d0b3941be82629d94dfd06f1ddc13fb260b:/src/aof.c diff --git a/src/aof.c b/src/aof.c index ef72a2b1..b1b0d9dc 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1,4 +1,5 @@ #include "redis.h" +#include "bio.h" #include #include @@ -8,6 +9,8 @@ #include #include +void aofUpdateCurrentSize(void); + /* Called when the user switches from "appendonly yes" to "appendonly no" * at runtime using the CONFIG command. */ void stopAppendOnly(void) { @@ -19,15 +22,15 @@ void stopAppendOnly(void) { server.appendseldb = -1; server.appendonly = 0; /* rewrite operation in progress? kill it, wait child exit */ - if (server.bgsavechildpid != -1) { + if (server.bgrewritechildpid != -1) { int statloc; - if (kill(server.bgsavechildpid,SIGKILL) != -1) + if (kill(server.bgrewritechildpid,SIGKILL) != -1) wait3(&statloc,0,NULL); /* reset the buffer accumulating changes while the child saves */ sdsfree(server.bgrewritebuf); server.bgrewritebuf = sdsempty(); - server.bgsavechildpid = -1; + server.bgrewritechildpid = -1; } } @@ -58,7 +61,6 @@ int startAppendOnly(void) { * buffer and write it on disk using this function just before entering * the event loop again. */ void flushAppendOnlyFile(void) { - time_t now; ssize_t nwritten; if (sdslen(server.aofbuf) == 0) return; @@ -68,50 +70,70 @@ void flushAppendOnlyFile(void) { * While this will save us against the server being killed I don't think * there is much to do about the whole server stopping for power problems * or alike */ - nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); - if (nwritten != (signed)sdslen(server.aofbuf)) { + nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); + if (nwritten != (signed)sdslen(server.aofbuf)) { /* Ooops, we are in troubles. The best thing to do for now is * aborting instead of giving the illusion that everything is * working as expected. */ - if (nwritten == -1) { + if (nwritten == -1) { redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); - } else { + } else { redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno)); - } - exit(1); + } + exit(1); } - sdsfree(server.aofbuf); - server.aofbuf = sdsempty(); + server.appendonly_current_size += nwritten; - /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have - * childs performing heavy I/O on disk. */ + /* Re-use AOF buffer when it is small enough. The maximum comes from the + * arena size of 4k minus some overhead (but is otherwise arbitrary). */ + if ((sdslen(server.aofbuf)+sdsavail(server.aofbuf)) < 4000) { + sdsclear(server.aofbuf); + } else { + sdsfree(server.aofbuf); + server.aofbuf = sdsempty(); + } + + /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are + * children doing I/O in the background. */ if (server.no_appendfsync_on_rewrite && (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1)) return; - /* Fsync if needed */ - now = time(NULL); + + /* Perform the fsync if needed. */ if (server.appendfsync == APPENDFSYNC_ALWAYS || (server.appendfsync == APPENDFSYNC_EVERYSEC && - now-server.lastfsync > 1)) + server.unixtime > server.lastfsync)) { /* aof_fsync is defined as fdatasync() for Linux in order to avoid * flushing metadata. */ aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ - server.lastfsync = now; + server.lastfsync = server.unixtime; } } -sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) { - int j; - buf = sdscatprintf(buf,"*%d\r\n",argc); +sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) { + char buf[32]; + int len, j; + robj *o; + + buf[0] = '*'; + len = 1+ll2string(buf+1,sizeof(buf)-1,argc); + buf[len++] = '\r'; + buf[len++] = '\n'; + dst = sdscatlen(dst,buf,len); + for (j = 0; j < argc; j++) { - robj *o = getDecodedObject(argv[j]); - buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr)); - buf = sdscatlen(buf,o->ptr,sdslen(o->ptr)); - buf = sdscatlen(buf,"\r\n",2); + o = getDecodedObject(argv[j]); + buf[0] = '$'; + len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr)); + buf[len++] = '\r'; + buf[len++] = '\n'; + dst = sdscatlen(dst,buf,len); + dst = sdscatlen(dst,o->ptr,sdslen(o->ptr)); + dst = sdscatlen(dst,"\r\n",2); decrRefCount(o); } - return buf; + return dst; } sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) { @@ -221,6 +243,7 @@ int loadAppendOnlyFile(char *filename) { long loops = 0; if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) { + server.appendonly_current_size = 0; fclose(fp); return REDIS_ERR; } @@ -283,6 +306,8 @@ int loadAppendOnlyFile(char *filename) { /* The fake client should not have a reply */ redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0); + /* The fake client should never get blocked */ + redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0); /* Clean up. Command code may have changed argv/argc so we use the * argv/argc of the client instead of the local variables. */ @@ -299,6 +324,8 @@ int loadAppendOnlyFile(char *filename) { freeFakeClient(fakeClient); server.appendonly = appendonly; stopLoading(); + aofUpdateCurrentSize(); + server.auto_aofrewrite_base_size = server.appendonly_current_size; return REDIS_OK; readerr: @@ -336,7 +363,7 @@ int rewriteAppendOnlyFile(char *filename) { redisDb *db = server.db+j; dict *d = db->dict; if (dictSize(d) == 0) continue; - di = dictGetIterator(d); + di = dictGetSafeIterator(d); if (!di) { fclose(fp); return REDIS_ERR; @@ -565,16 +592,14 @@ werr: */ int rewriteAppendOnlyFileBackground(void) { pid_t childpid; + long long start; if (server.bgrewritechildpid != -1) return REDIS_ERR; - if (server.ds_enabled != 0) { - redisLog(REDIS_WARNING,"BGREWRITEAOF called with diskstore enabled: AOF is not supported when diskstore is enabled. Operation not performed."); - return REDIS_ERR; - } + start = ustime(); if ((childpid = fork()) == 0) { - /* Child */ char tmpfile[256]; + /* Child */ if (server.ipfd > 0) close(server.ipfd); if (server.sofd > 0) close(server.sofd); snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); @@ -585,6 +610,7 @@ int rewriteAppendOnlyFileBackground(void) { } } else { /* Parent */ + server.stat_fork_time = ustime()-start; if (childpid == -1) { redisLog(REDIS_WARNING, "Can't rewrite append only file in background: fork: %s", @@ -608,9 +634,10 @@ int rewriteAppendOnlyFileBackground(void) { void bgrewriteaofCommand(redisClient *c) { if (server.bgrewritechildpid != -1) { addReplyError(c,"Background append only file rewriting already in progress"); - return; - } - if (rewriteAppendOnlyFileBackground() == REDIS_OK) { + } else if (server.bgsavechildpid != -1) { + server.aofrewrite_scheduled = 1; + addReplyStatus(c,"Background append only file rewriting scheduled"); + } else if (rewriteAppendOnlyFileBackground() == REDIS_OK) { addReplyStatus(c,"Background append only file rewriting started"); } else { addReply(c,shared.err); @@ -624,58 +651,138 @@ void aofRemoveTempFile(pid_t childpid) { unlink(tmpfile); } +/* Update the server.appendonly_current_size filed explicitly using stat(2) + * to check the size of the file. This is useful after a rewrite or after + * a restart, normally the size is updated just adding the write length + * to the current lenght, that is much faster. */ +void aofUpdateCurrentSize(void) { + struct redis_stat sb; + + if (redis_fstat(server.appendfd,&sb) == -1) { + redisLog(REDIS_WARNING,"Unable to check the AOF length: %s", + strerror(errno)); + } else { + server.appendonly_current_size = sb.st_size; + } +} + /* A background append only file rewriting (BGREWRITEAOF) terminated its work. * Handle this. */ void backgroundRewriteDoneHandler(int exitcode, int bysignal) { if (!bysignal && exitcode == 0) { - int fd; + int newfd, oldfd; + int nwritten; char tmpfile[256]; + long long now = ustime(); redisLog(REDIS_NOTICE, - "Background append only file rewriting terminated with success"); - /* Now it's time to flush the differences accumulated by the parent */ - snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid); - fd = open(tmpfile,O_WRONLY|O_APPEND); - if (fd == -1) { - redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno)); + "Background AOF rewrite terminated with success"); + + /* Flush the differences accumulated by the parent to the + * rewritten AOF. */ + snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", + (int)server.bgrewritechildpid); + newfd = open(tmpfile,O_WRONLY|O_APPEND); + if (newfd == -1) { + redisLog(REDIS_WARNING, + "Unable to open the temporary AOF produced by the child: %s", strerror(errno)); goto cleanup; } - /* Flush our data... */ - if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) != - (signed) sdslen(server.bgrewritebuf)) { - redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno)); - close(fd); + + nwritten = write(newfd,server.bgrewritebuf,sdslen(server.bgrewritebuf)); + if (nwritten != (signed)sdslen(server.bgrewritebuf)) { + if (nwritten == -1) { + redisLog(REDIS_WARNING, + "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno)); + } else { + redisLog(REDIS_WARNING, + "Short write trying to flush the parent diff to the rewritten AOF: %s", strerror(errno)); + } + close(newfd); goto cleanup; } - redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf)); - /* Now our work is to rename the temp file into the stable file. And - * switch the file descriptor used by the server for append only. */ + + redisLog(REDIS_NOTICE, + "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", nwritten); + + /* The only remaining thing to do is to rename the temporary file to + * the configured file and switch the file descriptor used to do AOF + * writes. We don't want close(2) or rename(2) calls to block the + * server on old file deletion. + * + * There are two possible scenarios: + * + * 1) AOF is DISABLED and this was a one time rewrite. The temporary + * file will be renamed to the configured file. When this file already + * exists, it will be unlinked, which may block the server. + * + * 2) AOF is ENABLED and the rewritten AOF will immediately start + * receiving writes. After the temporary file is renamed to the + * configured file, the original AOF file descriptor will be closed. + * Since this will be the last reference to that file, closing it + * causes the underlying file to be unlinked, which may block the + * server. + * + * To mitigate the blocking effect of the unlink operation (either + * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we + * use a background thread to take care of this. First, we + * make scenario 1 identical to scenario 2 by opening the target file + * when it exists. The unlink operation after the rename(2) will then + * be executed upon calling close(2) for its descriptor. Everything to + * guarantee atomicity for this switch has already happened by then, so + * we don't care what the outcome or duration of that close operation + * is, as long as the file descriptor is released again. */ + if (server.appendfd == -1) { + /* AOF disabled */ + + /* Don't care if this fails: oldfd will be -1 and we handle that. + * One notable case of -1 return is if the old file does + * not exist. */ + oldfd = open(server.appendfilename,O_RDONLY|O_NONBLOCK); + } else { + /* AOF enabled */ + oldfd = -1; /* We'll set this to the current AOF filedes later. */ + } + + /* Rename the temporary file. This will not unlink the target file if + * it exists, because we reference it with "oldfd". */ if (rename(tmpfile,server.appendfilename) == -1) { - redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno)); - close(fd); + redisLog(REDIS_WARNING, + "Error trying to rename the temporary AOF: %s", strerror(errno)); + close(newfd); + if (oldfd != -1) close(oldfd); goto cleanup; } - /* Mission completed... almost */ - redisLog(REDIS_NOTICE,"Append only file successfully rewritten."); - if (server.appendfd != -1) { - /* If append only is actually enabled... */ - close(server.appendfd); - server.appendfd = fd; - if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd); - server.appendseldb = -1; /* Make sure it will issue SELECT */ - redisLog(REDIS_NOTICE,"The new append only file was selected for future appends."); + + if (server.appendfd == -1) { + /* AOF disabled, we don't need to set the AOF file descriptor + * to this new file, so we can close it. */ + close(newfd); } else { - /* If append only is disabled we just generate a dump in this - * format. Why not? */ - close(fd); + /* AOF enabled, replace the old fd with the new one. */ + oldfd = server.appendfd; + server.appendfd = newfd; + if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(newfd); + server.appendseldb = -1; /* Make sure SELECT is re-issued */ + aofUpdateCurrentSize(); + server.auto_aofrewrite_base_size = server.appendonly_current_size; } + + redisLog(REDIS_NOTICE, "Background AOF rewrite successful"); + + /* Asynchronously close the overwritten AOF. */ + if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL); + + redisLog(REDIS_VERBOSE, + "Background AOF rewrite signal handler took %lldus", ustime()-now); } else if (!bysignal && exitcode != 0) { - redisLog(REDIS_WARNING, "Background append only file rewriting error"); + redisLog(REDIS_WARNING, + "Background AOF rewrite terminated with error"); } else { redisLog(REDIS_WARNING, - "Background append only file rewriting terminated by signal %d", - bysignal); + "Background AOF rewrite terminated by signal %d", bysignal); } + cleanup: sdsfree(server.bgrewritebuf); server.bgrewritebuf = sdsempty();