From db3c2a4fb4b2d4a9dd619151b7efc5e5253a8e3f Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 16 Sep 2011 12:35:12 +0200 Subject: [PATCH 1/1] postpone the AOF fsync if policy is everysec and there is a background fsync already going. --- src/aof.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++------- src/redis.c | 8 +++++++- src/redis.h | 1 + 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/aof.c b/src/aof.c index ffd46869..517a19d3 100644 --- a/src/aof.c +++ b/src/aof.c @@ -18,7 +18,7 @@ void aof_background_fsync(int fd) { /* Called when the user switches from "appendonly yes" to "appendonly no" * at runtime using the CONFIG command. */ void stopAppendOnly(void) { - flushAppendOnlyFile(); + flushAppendOnlyFile(1); aof_fsync(server.appendfd); close(server.appendfd); @@ -63,12 +63,50 @@ int startAppendOnly(void) { * and the only way the client socket can get a write is entering when the * the event loop, we accumulate all the AOF writes in a memory * buffer and write it on disk using this function just before entering - * the event loop again. */ -void flushAppendOnlyFile(void) { + * the event loop again. + * + * About the 'force' argument: + * + * When the fsync policy is set to 'everysec' we may delay the flush if there + * is still an fsync() going on in the background thread, since for instance + * on Linux write(2) will be blocked by the background fsync anyway. + * When this happens we remember that there is some aof buffer to be + * flushed ASAP, and will try to do that in the serverCron() function. + * + * However if force is set to 1 we'll write regardless of the background + * fsync. */ +void flushAppendOnlyFile(int force) { ssize_t nwritten; + int sync_in_progress = 0; if (sdslen(server.aofbuf) == 0) return; + if (server.appendfsync == APPENDFSYNC_EVERYSEC) + sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0; + + if (server.appendfsync == APPENDFSYNC_EVERYSEC && !force) { + /* With this append fsync policy we do background fsyncing. + * If the fsync is still in progress we can try to delay + * the write for a couple of seconds. */ + if (sync_in_progress) { + if (server.aof_flush_postponed_start == 0) { + /* No previous write postponinig, remember that we are + * postponing the flush and return. */ + server.aof_flush_postponed_start = server.unixtime; + return; + } else if (server.unixtime - server.aof_flush_postponed_start < 2) { + /* We were already writing for fsync to finish, but for less + * than two seconds this is still ok. Postpone again. */ + return; + } + /* Otherwise fall trough, and go write since we can't wait + * over two seconds. */ + } + } + /* If you are following this code path, then we are going to write so + * set reset the postponed flush sentinel to zero. */ + server.aof_flush_postponed_start = 0; + /* We want to perform a single write. This should be guaranteed atomic * at least if the filesystem we are writing is a real physical one. * While this will save us against the server being killed I don't think @@ -104,14 +142,15 @@ void flushAppendOnlyFile(void) { return; /* Perform the fsync if needed. */ - if (server.appendfsync == APPENDFSYNC_ALWAYS || - (server.appendfsync == APPENDFSYNC_EVERYSEC && - server.unixtime > server.lastfsync)) - { + if (server.appendfsync == APPENDFSYNC_ALWAYS) { /* aof_fsync is defined as fdatasync() for Linux in order to avoid * flushing metadata. */ aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ server.lastfsync = server.unixtime; + } else if ((server.appendfsync == APPENDFSYNC_EVERYSEC && + server.unixtime > server.lastfsync)) { + if (!sync_in_progress) aof_background_fsync(server.appendfd); + server.lastfsync = server.unixtime; } } diff --git a/src/redis.c b/src/redis.c index 268398f0..af750582 100644 --- a/src/redis.c +++ b/src/redis.c @@ -697,6 +697,11 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { } } + + /* If we postponed an AOF buffer flush, let's try to do it every time the + * cron function is called. */ + if (server.aof_flush_postponed_start) flushAppendOnlyFile(0); + /* Expire a few keys per cycle, only if this is a master. * On slaves we wait for DEL operations synthesized by the master * in order to guarantee a strict consistency. */ @@ -735,7 +740,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { } /* Write the AOF buffer on disk */ - flushAppendOnlyFile(); + flushAppendOnlyFile(0); } /* =========================== Server initialization ======================== */ @@ -822,6 +827,7 @@ void initServerConfig() { server.lastfsync = time(NULL); server.appendfd = -1; server.appendseldb = -1; /* Make sure the first time will not match */ + server.aof_flush_postponed_start = 0; server.pidfile = zstrdup("/var/run/redis.pid"); server.dbfilename = zstrdup("dump.rdb"); server.appendfilename = zstrdup("appendonly.aof"); diff --git a/src/redis.h b/src/redis.h index 768322c3..d431c7e4 100644 --- a/src/redis.h +++ b/src/redis.h @@ -559,6 +559,7 @@ struct redisServer { time_t lastfsync; int appendfd; int appendseldb; + time_t aof_flush_postponed_start; char *pidfile; pid_t bgsavechildpid; pid_t bgrewritechildpid; -- 2.45.2