X-Git-Url: https://git.saurik.com/redis.git/blobdiff_plain/0d1650f8a9ee2df13a11685ff0eb739305b4fe1b..986630afad7310f4ad548b355be3df076a50032c:/src/aof.c diff --git a/src/aof.c b/src/aof.c index 03097269..7b15acb8 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1,4 +1,5 @@ #include "redis.h" +#include "bio.h" #include #include @@ -60,7 +61,6 @@ int startAppendOnly(void) { * buffer and write it on disk using this function just before entering * the event loop again. */ void flushAppendOnlyFile(void) { - time_t now; ssize_t nwritten; if (sdslen(server.aofbuf) == 0) return; @@ -70,51 +70,70 @@ void flushAppendOnlyFile(void) { * While this will save us against the server being killed I don't think * there is much to do about the whole server stopping for power problems * or alike */ - nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); - if (nwritten != (signed)sdslen(server.aofbuf)) { + nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); + if (nwritten != (signed)sdslen(server.aofbuf)) { /* Ooops, we are in troubles. The best thing to do for now is * aborting instead of giving the illusion that everything is * working as expected. */ - if (nwritten == -1) { + if (nwritten == -1) { redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); - } else { + } else { redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno)); - } - exit(1); + } + exit(1); } - sdsfree(server.aofbuf); - server.aofbuf = sdsempty(); server.appendonly_current_size += nwritten; - /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have - * childs performing heavy I/O on disk. */ + /* Re-use AOF buffer when it is small enough. The maximum comes from the + * arena size of 4k minus some overhead (but is otherwise arbitrary). */ + if ((sdslen(server.aofbuf)+sdsavail(server.aofbuf)) < 4000) { + sdsclear(server.aofbuf); + } else { + sdsfree(server.aofbuf); + server.aofbuf = sdsempty(); + } + + /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are + * children doing I/O in the background. */ if (server.no_appendfsync_on_rewrite && (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1)) return; - /* Fsync if needed */ - now = time(NULL); + + /* Perform the fsync if needed. */ if (server.appendfsync == APPENDFSYNC_ALWAYS || (server.appendfsync == APPENDFSYNC_EVERYSEC && - now-server.lastfsync > 1)) + server.unixtime > server.lastfsync)) { /* aof_fsync is defined as fdatasync() for Linux in order to avoid * flushing metadata. */ aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ - server.lastfsync = now; + server.lastfsync = server.unixtime; } } -sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) { - int j; - buf = sdscatprintf(buf,"*%d\r\n",argc); +sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) { + char buf[32]; + int len, j; + robj *o; + + buf[0] = '*'; + len = 1+ll2string(buf+1,sizeof(buf)-1,argc); + buf[len++] = '\r'; + buf[len++] = '\n'; + dst = sdscatlen(dst,buf,len); + for (j = 0; j < argc; j++) { - robj *o = getDecodedObject(argv[j]); - buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr)); - buf = sdscatlen(buf,o->ptr,sdslen(o->ptr)); - buf = sdscatlen(buf,"\r\n",2); + o = getDecodedObject(argv[j]); + buf[0] = '$'; + len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr)); + buf[len++] = '\r'; + buf[len++] = '\n'; + dst = sdscatlen(dst,buf,len); + dst = sdscatlen(dst,o->ptr,sdslen(o->ptr)); + dst = sdscatlen(dst,"\r\n",2); decrRefCount(o); } - return buf; + return dst; } sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) { @@ -287,6 +306,8 @@ int loadAppendOnlyFile(char *filename) { /* The fake client should not have a reply */ redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0); + /* The fake client should never get blocked */ + redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0); /* Clean up. Command code may have changed argv/argc so we use the * argv/argc of the client instead of the local variables. */ @@ -342,7 +363,7 @@ int rewriteAppendOnlyFile(char *filename) { redisDb *db = server.db+j; dict *d = db->dict; if (dictSize(d) == 0) continue; - di = dictGetIterator(d); + di = dictGetSafeIterator(d); if (!di) { fclose(fp); return REDIS_ERR; @@ -574,10 +595,6 @@ int rewriteAppendOnlyFileBackground(void) { long long start; if (server.bgrewritechildpid != -1) return REDIS_ERR; - if (server.ds_enabled != 0) { - redisLog(REDIS_WARNING,"BGREWRITEAOF called with diskstore enabled: AOF is not supported when diskstore is enabled. Operation not performed."); - return REDIS_ERR; - } start = ustime(); if ((childpid = fork()) == 0) { char tmpfile[256]; @@ -653,56 +670,119 @@ void aofUpdateCurrentSize(void) { * Handle this. */ void backgroundRewriteDoneHandler(int exitcode, int bysignal) { if (!bysignal && exitcode == 0) { - int fd; + int newfd, oldfd; + int nwritten; char tmpfile[256]; + long long now = ustime(); redisLog(REDIS_NOTICE, - "Background append only file rewriting terminated with success"); - /* Now it's time to flush the differences accumulated by the parent */ - snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid); - fd = open(tmpfile,O_WRONLY|O_APPEND); - if (fd == -1) { - redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno)); + "Background AOF rewrite terminated with success"); + + /* Flush the differences accumulated by the parent to the + * rewritten AOF. */ + snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", + (int)server.bgrewritechildpid); + newfd = open(tmpfile,O_WRONLY|O_APPEND); + if (newfd == -1) { + redisLog(REDIS_WARNING, + "Unable to open the temporary AOF produced by the child: %s", strerror(errno)); goto cleanup; } - /* Flush our data... */ - if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) != - (signed) sdslen(server.bgrewritebuf)) { - redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno)); - close(fd); + + nwritten = write(newfd,server.bgrewritebuf,sdslen(server.bgrewritebuf)); + if (nwritten != (signed)sdslen(server.bgrewritebuf)) { + if (nwritten == -1) { + redisLog(REDIS_WARNING, + "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno)); + } else { + redisLog(REDIS_WARNING, + "Short write trying to flush the parent diff to the rewritten AOF: %s", strerror(errno)); + } + close(newfd); goto cleanup; } - redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf)); - /* Now our work is to rename the temp file into the stable file. And - * switch the file descriptor used by the server for append only. */ + + redisLog(REDIS_NOTICE, + "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", nwritten); + + /* The only remaining thing to do is to rename the temporary file to + * the configured file and switch the file descriptor used to do AOF + * writes. We don't want close(2) or rename(2) calls to block the + * server on old file deletion. + * + * There are two possible scenarios: + * + * 1) AOF is DISABLED and this was a one time rewrite. The temporary + * file will be renamed to the configured file. When this file already + * exists, it will be unlinked, which may block the server. + * + * 2) AOF is ENABLED and the rewritten AOF will immediately start + * receiving writes. After the temporary file is renamed to the + * configured file, the original AOF file descriptor will be closed. + * Since this will be the last reference to that file, closing it + * causes the underlying file to be unlinked, which may block the + * server. + * + * To mitigate the blocking effect of the unlink operation (either + * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we + * use a background thread to take care of this. First, we + * make scenario 1 identical to scenario 2 by opening the target file + * when it exists. The unlink operation after the rename(2) will then + * be executed upon calling close(2) for its descriptor. Everything to + * guarantee atomicity for this switch has already happened by then, so + * we don't care what the outcome or duration of that close operation + * is, as long as the file descriptor is released again. */ + if (server.appendfd == -1) { + /* AOF disabled */ + + /* Don't care if this fails: oldfd will be -1 and we handle that. + * One notable case of -1 return is if the old file does + * not exist. */ + oldfd = open(server.appendfilename,O_RDONLY|O_NONBLOCK); + } else { + /* AOF enabled */ + oldfd = -1; /* We'll set this to the current AOF filedes later. */ + } + + /* Rename the temporary file. This will not unlink the target file if + * it exists, because we reference it with "oldfd". */ if (rename(tmpfile,server.appendfilename) == -1) { - redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno)); - close(fd); + redisLog(REDIS_WARNING, + "Error trying to rename the temporary AOF: %s", strerror(errno)); + close(newfd); + if (oldfd != -1) close(oldfd); goto cleanup; } - /* Mission completed... almost */ - redisLog(REDIS_NOTICE,"Append only file successfully rewritten."); - if (server.appendfd != -1) { - /* If append only is actually enabled... */ - close(server.appendfd); - server.appendfd = fd; - if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd); - server.appendseldb = -1; /* Make sure it will issue SELECT */ - redisLog(REDIS_NOTICE,"The new append only file was selected for future appends."); + + if (server.appendfd == -1) { + /* AOF disabled, we don't need to set the AOF file descriptor + * to this new file, so we can close it. */ + close(newfd); + } else { + /* AOF enabled, replace the old fd with the new one. */ + oldfd = server.appendfd; + server.appendfd = newfd; + if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(newfd); + server.appendseldb = -1; /* Make sure SELECT is re-issued */ aofUpdateCurrentSize(); server.auto_aofrewrite_base_size = server.appendonly_current_size; - } else { - /* If append only is disabled we just generate a dump in this - * format. Why not? */ - close(fd); } + + redisLog(REDIS_NOTICE, "Background AOF rewrite successful"); + + /* Asynchronously close the overwritten AOF. */ + if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd); + + redisLog(REDIS_VERBOSE, + "Background AOF rewrite signal handler took %lldus", ustime()-now); } else if (!bysignal && exitcode != 0) { - redisLog(REDIS_WARNING, "Background append only file rewriting error"); + redisLog(REDIS_WARNING, + "Background AOF rewrite terminated with error"); } else { redisLog(REDIS_WARNING, - "Background append only file rewriting terminated by signal %d", - bysignal); + "Background AOF rewrite terminated by signal %d", bysignal); } + cleanup: sdsfree(server.bgrewritebuf); server.bgrewritebuf = sdsempty();