]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com> | |
3 | * All rights reserved. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions are met: | |
7 | * | |
8 | * * Redistributions of source code must retain the above copyright notice, | |
9 | * this list of conditions and the following disclaimer. | |
10 | * * Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * * Neither the name of Redis nor the names of its contributors may be used | |
14 | * to endorse or promote products derived from this software without | |
15 | * specific prior written permission. | |
16 | * | |
17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
18 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
21 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
22 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
23 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
24 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
25 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
26 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
27 | * POSSIBILITY OF SUCH DAMAGE. | |
28 | */ | |
29 | ||
30 | #include "redis.h" | |
31 | #include "bio.h" | |
32 | #include "rio.h" | |
33 | ||
34 | #include <signal.h> | |
35 | #include <fcntl.h> | |
36 | #include <sys/stat.h> | |
37 | #include <sys/types.h> | |
38 | #include <sys/time.h> | |
39 | #include <sys/resource.h> | |
40 | #include <sys/wait.h> | |
41 | ||
42 | void aofUpdateCurrentSize(void); | |
43 | ||
44 | /* ---------------------------------------------------------------------------- | |
45 | * AOF rewrite buffer implementation. | |
46 | * | |
47 | * The following code implements a simple buffer used to accumulate | |
48 | * changes while the background process is rewriting the AOF file. | |
49 | * | |
50 | * We only need to append, but can't just use realloc with a large block | |
51 | * because 'huge' reallocs are not always handled as one could expect | |
52 | * (via remapping of pages at OS level) but may involve copying data. | |
53 | * | |
54 | * For this reason we use a list of blocks, every block is | |
55 | * AOF_RW_BUF_BLOCK_SIZE bytes. | |
56 | * ------------------------------------------------------------------------- */ | |
57 | ||
/* Payload capacity of a single rewrite-buffer block: 10 MB. */
#define AOF_RW_BUF_BLOCK_SIZE (1024*1024*10)

/* One fixed-size chunk of the AOF rewrite buffer. 'used' and 'free' are
 * byte counts relative to 'buf'; the append code in this file keeps
 * used + free equal to AOF_RW_BUF_BLOCK_SIZE. */
typedef struct aofrwblock {
    unsigned long used;              /* Bytes of 'buf' already filled. */
    unsigned long free;              /* Bytes of 'buf' still available. */
    char buf[AOF_RW_BUF_BLOCK_SIZE]; /* Raw accumulated data. */
} aofrwblock;
64 | ||
65 | /* This function free the old AOF rewrite buffer if needed, and initialize | |
66 | * a fresh new one. It tests for server.aof_rewrite_buf_blocks equal to NULL | |
67 | * so can be used for the first initialization as well. */ | |
68 | void aofRewriteBufferReset(void) { | |
69 | if (server.aof_rewrite_buf_blocks) | |
70 | listRelease(server.aof_rewrite_buf_blocks); | |
71 | ||
72 | server.aof_rewrite_buf_blocks = listCreate(); | |
73 | listSetFreeMethod(server.aof_rewrite_buf_blocks,zfree); | |
74 | } | |
75 | ||
76 | /* Return the current size of the AOF rerwite buffer. */ | |
77 | unsigned long aofRewriteBufferSize(void) { | |
78 | listNode *ln = listLast(server.aof_rewrite_buf_blocks); | |
79 | aofrwblock *block = ln ? ln->value : NULL; | |
80 | ||
81 | if (block == NULL) return 0; | |
82 | unsigned long size = | |
83 | (listLength(server.aof_rewrite_buf_blocks)-1) * AOF_RW_BUF_BLOCK_SIZE; | |
84 | size += block->used; | |
85 | return size; | |
86 | } | |
87 | ||
88 | /* Append data to the AOF rewrite buffer, allocating new blocks if needed. */ | |
89 | void aofRewriteBufferAppend(unsigned char *s, unsigned long len) { | |
90 | listNode *ln = listLast(server.aof_rewrite_buf_blocks); | |
91 | aofrwblock *block = ln ? ln->value : NULL; | |
92 | ||
93 | while(len) { | |
94 | /* If we already got at least an allocated block, try appending | |
95 | * at least some piece into it. */ | |
96 | if (block) { | |
97 | unsigned long thislen = (block->free < len) ? block->free : len; | |
98 | if (thislen) { /* The current block is not already full. */ | |
99 | memcpy(block->buf+block->used, s, thislen); | |
100 | block->used += thislen; | |
101 | block->free -= thislen; | |
102 | s += thislen; | |
103 | len -= thislen; | |
104 | } | |
105 | } | |
106 | ||
107 | if (len) { /* First block to allocate, or need another block. */ | |
108 | int numblocks; | |
109 | ||
110 | block = zmalloc(sizeof(*block)); | |
111 | block->free = AOF_RW_BUF_BLOCK_SIZE; | |
112 | block->used = 0; | |
113 | listAddNodeTail(server.aof_rewrite_buf_blocks,block); | |
114 | ||
115 | /* Log every time we cross more 10 or 100 blocks, respectively | |
116 | * as a notice or warning. */ | |
117 | numblocks = listLength(server.aof_rewrite_buf_blocks); | |
118 | if (((numblocks+1) % 10) == 0) { | |
119 | int level = ((numblocks+1) % 100) == 0 ? REDIS_WARNING : | |
120 | REDIS_NOTICE; | |
121 | redisLog(level,"Background AOF buffer size: %lu MB", | |
122 | aofRewriteBufferSize()/(1024*1024)); | |
123 | } | |
124 | } | |
125 | } | |
126 | } | |
127 | ||
128 | /* Write the buffer (possibly composed of multiple blocks) into the specified | |
129 | * fd. If no short write or any other error happens -1 is returned, | |
130 | * otherwise the number of bytes written is returned. */ | |
131 | ssize_t aofRewriteBufferWrite(int fd) { | |
132 | listNode *ln; | |
133 | listIter li; | |
134 | ssize_t count = 0; | |
135 | ||
136 | listRewind(server.aof_rewrite_buf_blocks,&li); | |
137 | while((ln = listNext(&li))) { | |
138 | aofrwblock *block = listNodeValue(ln); | |
139 | ssize_t nwritten; | |
140 | ||
141 | if (block->used) { | |
142 | nwritten = write(fd,block->buf,block->used); | |
143 | if (nwritten != block->used) { | |
144 | if (nwritten == 0) errno = EIO; | |
145 | return -1; | |
146 | } | |
147 | count += nwritten; | |
148 | } | |
149 | } | |
150 | return count; | |
151 | } | |
152 | ||
153 | /* ---------------------------------------------------------------------------- | |
154 | * AOF file implementation | |
155 | * ------------------------------------------------------------------------- */ | |
156 | ||
157 | /* Starts a background task that performs fsync() against the specified | |
158 | * file descriptor (the one of the AOF file) in another thread. */ | |
159 | void aof_background_fsync(int fd) { | |
160 | bioCreateBackgroundJob(REDIS_BIO_AOF_FSYNC,(void*)(long)fd,NULL,NULL); | |
161 | } | |
162 | ||
163 | /* Called when the user switches from "appendonly yes" to "appendonly no" | |
164 | * at runtime using the CONFIG command. */ | |
165 | void stopAppendOnly(void) { | |
166 | redisAssert(server.aof_state != REDIS_AOF_OFF); | |
167 | flushAppendOnlyFile(1); | |
168 | aof_fsync(server.aof_fd); | |
169 | close(server.aof_fd); | |
170 | ||
171 | server.aof_fd = -1; | |
172 | server.aof_selected_db = -1; | |
173 | server.aof_state = REDIS_AOF_OFF; | |
174 | /* rewrite operation in progress? kill it, wait child exit */ | |
175 | if (server.aof_child_pid != -1) { | |
176 | int statloc; | |
177 | ||
178 | redisLog(REDIS_NOTICE,"Killing running AOF rewrite child: %ld", | |
179 | (long) server.aof_child_pid); | |
180 | if (kill(server.aof_child_pid,SIGKILL) != -1) | |
181 | wait3(&statloc,0,NULL); | |
182 | /* reset the buffer accumulating changes while the child saves */ | |
183 | aofRewriteBufferReset(); | |
184 | aofRemoveTempFile(server.aof_child_pid); | |
185 | server.aof_child_pid = -1; | |
186 | server.aof_rewrite_time_start = -1; | |
187 | } | |
188 | } | |
189 | ||
190 | /* Called when the user switches from "appendonly no" to "appendonly yes" | |
191 | * at runtime using the CONFIG command. */ | |
192 | int startAppendOnly(void) { | |
193 | server.aof_last_fsync = server.unixtime; | |
194 | server.aof_fd = open(server.aof_filename,O_WRONLY|O_APPEND|O_CREAT,0644); | |
195 | redisAssert(server.aof_state == REDIS_AOF_OFF); | |
196 | if (server.aof_fd == -1) { | |
197 | redisLog(REDIS_WARNING,"Redis needs to enable the AOF but can't open the append only file: %s",strerror(errno)); | |
198 | return REDIS_ERR; | |
199 | } | |
200 | if (rewriteAppendOnlyFileBackground() == REDIS_ERR) { | |
201 | close(server.aof_fd); | |
202 | redisLog(REDIS_WARNING,"Redis needs to enable the AOF but can't trigger a background AOF rewrite operation. Check the above logs for more info about the error."); | |
203 | return REDIS_ERR; | |
204 | } | |
205 | /* We correctly switched on AOF, now wait for the rerwite to be complete | |
206 | * in order to append data on disk. */ | |
207 | server.aof_state = REDIS_AOF_WAIT_REWRITE; | |
208 | return REDIS_OK; | |
209 | } | |
210 | ||
211 | /* Write the append only file buffer on disk. | |
212 | * | |
213 | * Since we are required to write the AOF before replying to the client, | |
214 | * and the only way the client socket can get a write is entering when the | |
215 | * the event loop, we accumulate all the AOF writes in a memory | |
216 | * buffer and write it on disk using this function just before entering | |
217 | * the event loop again. | |
218 | * | |
219 | * About the 'force' argument: | |
220 | * | |
221 | * When the fsync policy is set to 'everysec' we may delay the flush if there | |
222 | * is still an fsync() going on in the background thread, since for instance | |
223 | * on Linux write(2) will be blocked by the background fsync anyway. | |
224 | * When this happens we remember that there is some aof buffer to be | |
225 | * flushed ASAP, and will try to do that in the serverCron() function. | |
226 | * | |
227 | * However if force is set to 1 we'll write regardless of the background | |
228 | * fsync. */ | |
229 | void flushAppendOnlyFile(int force) { | |
230 | ssize_t nwritten; | |
231 | int sync_in_progress = 0; | |
232 | ||
233 | if (sdslen(server.aof_buf) == 0) return; | |
234 | ||
235 | if (server.aof_fsync == AOF_FSYNC_EVERYSEC) | |
236 | sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0; | |
237 | ||
238 | if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) { | |
239 | /* With this append fsync policy we do background fsyncing. | |
240 | * If the fsync is still in progress we can try to delay | |
241 | * the write for a couple of seconds. */ | |
242 | if (sync_in_progress) { | |
243 | if (server.aof_flush_postponed_start == 0) { | |
244 | /* No previous write postponinig, remember that we are | |
245 | * postponing the flush and return. */ | |
246 | server.aof_flush_postponed_start = server.unixtime; | |
247 | return; | |
248 | } else if (server.unixtime - server.aof_flush_postponed_start < 2) { | |
249 | /* We were already waiting for fsync to finish, but for less | |
250 | * than two seconds this is still ok. Postpone again. */ | |
251 | return; | |
252 | } | |
253 | /* Otherwise fall trough, and go write since we can't wait | |
254 | * over two seconds. */ | |
255 | server.aof_delayed_fsync++; | |
256 | redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis."); | |
257 | } | |
258 | } | |
259 | /* If you are following this code path, then we are going to write so | |
260 | * set reset the postponed flush sentinel to zero. */ | |
261 | server.aof_flush_postponed_start = 0; | |
262 | ||
263 | /* We want to perform a single write. This should be guaranteed atomic | |
264 | * at least if the filesystem we are writing is a real physical one. | |
265 | * While this will save us against the server being killed I don't think | |
266 | * there is much to do about the whole server stopping for power problems | |
267 | * or alike */ | |
268 | nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf)); | |
269 | if (nwritten != (signed)sdslen(server.aof_buf)) { | |
270 | /* Ooops, we are in troubles. The best thing to do for now is | |
271 | * aborting instead of giving the illusion that everything is | |
272 | * working as expected. */ | |
273 | if (nwritten == -1) { | |
274 | redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); | |
275 | } else { | |
276 | redisLog(REDIS_WARNING,"Exiting on short write while writing to " | |
277 | "the append-only file: %s (nwritten=%ld, " | |
278 | "expected=%ld)", | |
279 | strerror(errno), | |
280 | (long)nwritten, | |
281 | (long)sdslen(server.aof_buf)); | |
282 | ||
283 | if (ftruncate(server.aof_fd, server.aof_current_size) == -1) { | |
284 | redisLog(REDIS_WARNING, "Could not remove short write " | |
285 | "from the append-only file. Redis may refuse " | |
286 | "to load the AOF the next time it starts. " | |
287 | "ftruncate: %s", strerror(errno)); | |
288 | } | |
289 | } | |
290 | exit(1); | |
291 | } | |
292 | server.aof_current_size += nwritten; | |
293 | ||
294 | /* Re-use AOF buffer when it is small enough. The maximum comes from the | |
295 | * arena size of 4k minus some overhead (but is otherwise arbitrary). */ | |
296 | if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) { | |
297 | sdsclear(server.aof_buf); | |
298 | } else { | |
299 | sdsfree(server.aof_buf); | |
300 | server.aof_buf = sdsempty(); | |
301 | } | |
302 | ||
303 | /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are | |
304 | * children doing I/O in the background. */ | |
305 | if (server.aof_no_fsync_on_rewrite && | |
306 | (server.aof_child_pid != -1 || server.rdb_child_pid != -1)) | |
307 | return; | |
308 | ||
309 | /* Perform the fsync if needed. */ | |
310 | if (server.aof_fsync == AOF_FSYNC_ALWAYS) { | |
311 | /* aof_fsync is defined as fdatasync() for Linux in order to avoid | |
312 | * flushing metadata. */ | |
313 | aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */ | |
314 | server.aof_last_fsync = server.unixtime; | |
315 | } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC && | |
316 | server.unixtime > server.aof_last_fsync)) { | |
317 | if (!sync_in_progress) aof_background_fsync(server.aof_fd); | |
318 | server.aof_last_fsync = server.unixtime; | |
319 | } | |
320 | } | |
321 | ||
322 | sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) { | |
323 | char buf[32]; | |
324 | int len, j; | |
325 | robj *o; | |
326 | ||
327 | buf[0] = '*'; | |
328 | len = 1+ll2string(buf+1,sizeof(buf)-1,argc); | |
329 | buf[len++] = '\r'; | |
330 | buf[len++] = '\n'; | |
331 | dst = sdscatlen(dst,buf,len); | |
332 | ||
333 | for (j = 0; j < argc; j++) { | |
334 | o = getDecodedObject(argv[j]); | |
335 | buf[0] = '$'; | |
336 | len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr)); | |
337 | buf[len++] = '\r'; | |
338 | buf[len++] = '\n'; | |
339 | dst = sdscatlen(dst,buf,len); | |
340 | dst = sdscatlen(dst,o->ptr,sdslen(o->ptr)); | |
341 | dst = sdscatlen(dst,"\r\n",2); | |
342 | decrRefCount(o); | |
343 | } | |
344 | return dst; | |
345 | } | |
346 | ||
347 | /* Create the sds representation of an PEXPIREAT command, using | |
348 | * 'seconds' as time to live and 'cmd' to understand what command | |
349 | * we are translating into a PEXPIREAT. | |
350 | * | |
351 | * This command is used in order to translate EXPIRE and PEXPIRE commands | |
352 | * into PEXPIREAT command so that we retain precision in the append only | |
353 | * file, and the time is always absolute and not relative. */ | |
354 | sds catAppendOnlyExpireAtCommand(sds buf, struct redisCommand *cmd, robj *key, robj *seconds) { | |
355 | long long when; | |
356 | robj *argv[3]; | |
357 | ||
358 | /* Make sure we can use strtol */ | |
359 | seconds = getDecodedObject(seconds); | |
360 | when = strtoll(seconds->ptr,NULL,10); | |
361 | /* Convert argument into milliseconds for EXPIRE, SETEX, EXPIREAT */ | |
362 | if (cmd->proc == expireCommand || cmd->proc == setexCommand || | |
363 | cmd->proc == expireatCommand) | |
364 | { | |
365 | when *= 1000; | |
366 | } | |
367 | /* Convert into absolute time for EXPIRE, PEXPIRE, SETEX, PSETEX */ | |
368 | if (cmd->proc == expireCommand || cmd->proc == pexpireCommand || | |
369 | cmd->proc == setexCommand || cmd->proc == psetexCommand) | |
370 | { | |
371 | when += mstime(); | |
372 | } | |
373 | decrRefCount(seconds); | |
374 | ||
375 | argv[0] = createStringObject("PEXPIREAT",9); | |
376 | argv[1] = key; | |
377 | argv[2] = createStringObjectFromLongLong(when); | |
378 | buf = catAppendOnlyGenericCommand(buf, 3, argv); | |
379 | decrRefCount(argv[0]); | |
380 | decrRefCount(argv[2]); | |
381 | return buf; | |
382 | } | |
383 | ||
384 | void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) { | |
385 | sds buf = sdsempty(); | |
386 | robj *tmpargv[3]; | |
387 | ||
388 | /* The DB this command was targetting is not the same as the last command | |
389 | * we appendend. To issue a SELECT command is needed. */ | |
390 | if (dictid != server.aof_selected_db) { | |
391 | char seldb[64]; | |
392 | ||
393 | snprintf(seldb,sizeof(seldb),"%d",dictid); | |
394 | buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n", | |
395 | (unsigned long)strlen(seldb),seldb); | |
396 | server.aof_selected_db = dictid; | |
397 | } | |
398 | ||
399 | if (cmd->proc == expireCommand || cmd->proc == pexpireCommand || | |
400 | cmd->proc == expireatCommand) { | |
401 | /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */ | |
402 | buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]); | |
403 | } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) { | |
404 | /* Translate SETEX/PSETEX to SET and PEXPIREAT */ | |
405 | tmpargv[0] = createStringObject("SET",3); | |
406 | tmpargv[1] = argv[1]; | |
407 | tmpargv[2] = argv[3]; | |
408 | buf = catAppendOnlyGenericCommand(buf,3,tmpargv); | |
409 | decrRefCount(tmpargv[0]); | |
410 | buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]); | |
411 | } else { | |
412 | /* All the other commands don't need translation or need the | |
413 | * same translation already operated in the command vector | |
414 | * for the replication itself. */ | |
415 | buf = catAppendOnlyGenericCommand(buf,argc,argv); | |
416 | } | |
417 | ||
418 | /* Append to the AOF buffer. This will be flushed on disk just before | |
419 | * of re-entering the event loop, so before the client will get a | |
420 | * positive reply about the operation performed. */ | |
421 | if (server.aof_state == REDIS_AOF_ON) | |
422 | server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf)); | |
423 | ||
424 | /* If a background append only file rewriting is in progress we want to | |
425 | * accumulate the differences between the child DB and the current one | |
426 | * in a buffer, so that when the child process will do its work we | |
427 | * can append the differences to the new append only file. */ | |
428 | if (server.aof_child_pid != -1) | |
429 | aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf)); | |
430 | ||
431 | sdsfree(buf); | |
432 | } | |
433 | ||
434 | /* ---------------------------------------------------------------------------- | |
435 | * AOF loading | |
436 | * ------------------------------------------------------------------------- */ | |
437 | ||
438 | /* In Redis commands are always executed in the context of a client, so in | |
439 | * order to load the append only file we need to create a fake client. */ | |
440 | struct redisClient *createFakeClient(void) { | |
441 | struct redisClient *c = zmalloc(sizeof(*c)); | |
442 | ||
443 | selectDb(c,0); | |
444 | c->fd = -1; | |
445 | c->querybuf = sdsempty(); | |
446 | c->querybuf_peak = 0; | |
447 | c->argc = 0; | |
448 | c->argv = NULL; | |
449 | c->bufpos = 0; | |
450 | c->flags = 0; | |
451 | /* We set the fake client as a slave waiting for the synchronization | |
452 | * so that Redis will not try to send replies to this client. */ | |
453 | c->replstate = REDIS_REPL_WAIT_BGSAVE_START; | |
454 | c->reply = listCreate(); | |
455 | c->reply_bytes = 0; | |
456 | c->obuf_soft_limit_reached_time = 0; | |
457 | c->watched_keys = listCreate(); | |
458 | listSetFreeMethod(c->reply,decrRefCount); | |
459 | listSetDupMethod(c->reply,dupClientReplyValue); | |
460 | initClientMultiState(c); | |
461 | return c; | |
462 | } | |
463 | ||
464 | void freeFakeClient(struct redisClient *c) { | |
465 | sdsfree(c->querybuf); | |
466 | listRelease(c->reply); | |
467 | listRelease(c->watched_keys); | |
468 | freeClientMultiState(c); | |
469 | zfree(c); | |
470 | } | |
471 | ||
472 | /* Replay the append log file. On error REDIS_OK is returned. On non fatal | |
473 | * error (the append only file is zero-length) REDIS_ERR is returned. On | |
474 | * fatal error an error message is logged and the program exists. */ | |
475 | int loadAppendOnlyFile(char *filename) { | |
476 | struct redisClient *fakeClient; | |
477 | FILE *fp = fopen(filename,"r"); | |
478 | struct redis_stat sb; | |
479 | int old_aof_state = server.aof_state; | |
480 | long loops = 0; | |
481 | ||
482 | if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) { | |
483 | server.aof_current_size = 0; | |
484 | fclose(fp); | |
485 | return REDIS_ERR; | |
486 | } | |
487 | ||
488 | if (fp == NULL) { | |
489 | redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno)); | |
490 | exit(1); | |
491 | } | |
492 | ||
493 | /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI | |
494 | * to the same file we're about to read. */ | |
495 | server.aof_state = REDIS_AOF_OFF; | |
496 | ||
497 | fakeClient = createFakeClient(); | |
498 | startLoading(fp); | |
499 | ||
500 | while(1) { | |
501 | int argc, j; | |
502 | unsigned long len; | |
503 | robj **argv; | |
504 | char buf[128]; | |
505 | sds argsds; | |
506 | struct redisCommand *cmd; | |
507 | ||
508 | /* Serve the clients from time to time */ | |
509 | if (!(loops++ % 1000)) { | |
510 | loadingProgress(ftello(fp)); | |
511 | aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); | |
512 | } | |
513 | ||
514 | if (fgets(buf,sizeof(buf),fp) == NULL) { | |
515 | if (feof(fp)) | |
516 | break; | |
517 | else | |
518 | goto readerr; | |
519 | } | |
520 | if (buf[0] != '*') goto fmterr; | |
521 | argc = atoi(buf+1); | |
522 | if (argc < 1) goto fmterr; | |
523 | ||
524 | argv = zmalloc(sizeof(robj*)*argc); | |
525 | for (j = 0; j < argc; j++) { | |
526 | if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr; | |
527 | if (buf[0] != '$') goto fmterr; | |
528 | len = strtol(buf+1,NULL,10); | |
529 | argsds = sdsnewlen(NULL,len); | |
530 | if (len && fread(argsds,len,1,fp) == 0) goto fmterr; | |
531 | argv[j] = createObject(REDIS_STRING,argsds); | |
532 | if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */ | |
533 | } | |
534 | ||
535 | /* Command lookup */ | |
536 | cmd = lookupCommand(argv[0]->ptr); | |
537 | if (!cmd) { | |
538 | redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr); | |
539 | exit(1); | |
540 | } | |
541 | /* Run the command in the context of a fake client */ | |
542 | fakeClient->argc = argc; | |
543 | fakeClient->argv = argv; | |
544 | cmd->proc(fakeClient); | |
545 | ||
546 | /* The fake client should not have a reply */ | |
547 | redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0); | |
548 | /* The fake client should never get blocked */ | |
549 | redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0); | |
550 | ||
551 | /* Clean up. Command code may have changed argv/argc so we use the | |
552 | * argv/argc of the client instead of the local variables. */ | |
553 | for (j = 0; j < fakeClient->argc; j++) | |
554 | decrRefCount(fakeClient->argv[j]); | |
555 | zfree(fakeClient->argv); | |
556 | } | |
557 | ||
558 | /* This point can only be reached when EOF is reached without errors. | |
559 | * If the client is in the middle of a MULTI/EXEC, log error and quit. */ | |
560 | if (fakeClient->flags & REDIS_MULTI) goto readerr; | |
561 | ||
562 | fclose(fp); | |
563 | freeFakeClient(fakeClient); | |
564 | server.aof_state = old_aof_state; | |
565 | stopLoading(); | |
566 | aofUpdateCurrentSize(); | |
567 | server.aof_rewrite_base_size = server.aof_current_size; | |
568 | return REDIS_OK; | |
569 | ||
570 | readerr: | |
571 | if (feof(fp)) { | |
572 | redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file"); | |
573 | } else { | |
574 | redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno)); | |
575 | } | |
576 | exit(1); | |
577 | fmterr: | |
578 | redisLog(REDIS_WARNING,"Bad file format reading the append only file: make a backup of your AOF file, then use ./redis-check-aof --fix <filename>"); | |
579 | exit(1); | |
580 | } | |
581 | ||
582 | /* ---------------------------------------------------------------------------- | |
583 | * AOF rewrite | |
584 | * ------------------------------------------------------------------------- */ | |
585 | ||
586 | /* Delegate writing an object to writing a bulk string or bulk long long. | |
587 | * This is not placed in rio.c since that adds the redis.h dependency. */ | |
588 | int rioWriteBulkObject(rio *r, robj *obj) { | |
589 | /* Avoid using getDecodedObject to help copy-on-write (we are often | |
590 | * in a child process when this function is called). */ | |
591 | if (obj->encoding == REDIS_ENCODING_INT) { | |
592 | return rioWriteBulkLongLong(r,(long)obj->ptr); | |
593 | } else if (obj->encoding == REDIS_ENCODING_RAW) { | |
594 | return rioWriteBulkString(r,obj->ptr,sdslen(obj->ptr)); | |
595 | } else { | |
596 | redisPanic("Unknown string encoding"); | |
597 | } | |
598 | } | |
599 | ||
600 | /* Emit the commands needed to rebuild a list object. | |
601 | * The function returns 0 on error, 1 on success. */ | |
602 | int rewriteListObject(rio *r, robj *key, robj *o) { | |
603 | long long count = 0, items = listTypeLength(o); | |
604 | ||
605 | if (o->encoding == REDIS_ENCODING_ZIPLIST) { | |
606 | unsigned char *zl = o->ptr; | |
607 | unsigned char *p = ziplistIndex(zl,0); | |
608 | unsigned char *vstr; | |
609 | unsigned int vlen; | |
610 | long long vlong; | |
611 | ||
612 | while(ziplistGet(p,&vstr,&vlen,&vlong)) { | |
613 | if (count == 0) { | |
614 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? | |
615 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; | |
616 | ||
617 | if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; | |
618 | if (rioWriteBulkString(r,"RPUSH",5) == 0) return 0; | |
619 | if (rioWriteBulkObject(r,key) == 0) return 0; | |
620 | } | |
621 | if (vstr) { | |
622 | if (rioWriteBulkString(r,(char*)vstr,vlen) == 0) return 0; | |
623 | } else { | |
624 | if (rioWriteBulkLongLong(r,vlong) == 0) return 0; | |
625 | } | |
626 | p = ziplistNext(zl,p); | |
627 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; | |
628 | items--; | |
629 | } | |
630 | } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { | |
631 | list *list = o->ptr; | |
632 | listNode *ln; | |
633 | listIter li; | |
634 | ||
635 | listRewind(list,&li); | |
636 | while((ln = listNext(&li))) { | |
637 | robj *eleobj = listNodeValue(ln); | |
638 | ||
639 | if (count == 0) { | |
640 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? | |
641 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; | |
642 | ||
643 | if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; | |
644 | if (rioWriteBulkString(r,"RPUSH",5) == 0) return 0; | |
645 | if (rioWriteBulkObject(r,key) == 0) return 0; | |
646 | } | |
647 | if (rioWriteBulkObject(r,eleobj) == 0) return 0; | |
648 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; | |
649 | items--; | |
650 | } | |
651 | } else { | |
652 | redisPanic("Unknown list encoding"); | |
653 | } | |
654 | return 1; | |
655 | } | |
656 | ||
657 | /* Emit the commands needed to rebuild a set object. | |
658 | * The function returns 0 on error, 1 on success. */ | |
659 | int rewriteSetObject(rio *r, robj *key, robj *o) { | |
660 | long long count = 0, items = setTypeSize(o); | |
661 | ||
662 | if (o->encoding == REDIS_ENCODING_INTSET) { | |
663 | int ii = 0; | |
664 | int64_t llval; | |
665 | ||
666 | while(intsetGet(o->ptr,ii++,&llval)) { | |
667 | if (count == 0) { | |
668 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? | |
669 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; | |
670 | ||
671 | if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; | |
672 | if (rioWriteBulkString(r,"SADD",4) == 0) return 0; | |
673 | if (rioWriteBulkObject(r,key) == 0) return 0; | |
674 | } | |
675 | if (rioWriteBulkLongLong(r,llval) == 0) return 0; | |
676 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; | |
677 | items--; | |
678 | } | |
679 | } else if (o->encoding == REDIS_ENCODING_HT) { | |
680 | dictIterator *di = dictGetIterator(o->ptr); | |
681 | dictEntry *de; | |
682 | ||
683 | while((de = dictNext(di)) != NULL) { | |
684 | robj *eleobj = dictGetKey(de); | |
685 | if (count == 0) { | |
686 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? | |
687 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; | |
688 | ||
689 | if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; | |
690 | if (rioWriteBulkString(r,"SADD",4) == 0) return 0; | |
691 | if (rioWriteBulkObject(r,key) == 0) return 0; | |
692 | } | |
693 | if (rioWriteBulkObject(r,eleobj) == 0) return 0; | |
694 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; | |
695 | items--; | |
696 | } | |
697 | dictReleaseIterator(di); | |
698 | } else { | |
699 | redisPanic("Unknown set encoding"); | |
700 | } | |
701 | return 1; | |
702 | } | |
703 | ||
704 | /* Emit the commands needed to rebuild a sorted set object. | |
705 | * The function returns 0 on error, 1 on success. */ | |
706 | int rewriteSortedSetObject(rio *r, robj *key, robj *o) { | |
707 | long long count = 0, items = zsetLength(o); | |
708 | ||
709 | if (o->encoding == REDIS_ENCODING_ZIPLIST) { | |
710 | unsigned char *zl = o->ptr; | |
711 | unsigned char *eptr, *sptr; | |
712 | unsigned char *vstr; | |
713 | unsigned int vlen; | |
714 | long long vll; | |
715 | double score; | |
716 | ||
717 | eptr = ziplistIndex(zl,0); | |
718 | redisAssert(eptr != NULL); | |
719 | sptr = ziplistNext(zl,eptr); | |
720 | redisAssert(sptr != NULL); | |
721 | ||
722 | while (eptr != NULL) { | |
723 | redisAssert(ziplistGet(eptr,&vstr,&vlen,&vll)); | |
724 | score = zzlGetScore(sptr); | |
725 | ||
726 | if (count == 0) { | |
727 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? | |
728 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; | |
729 | ||
730 | if (rioWriteBulkCount(r,'*',2+cmd_items*2) == 0) return 0; | |
731 | if (rioWriteBulkString(r,"ZADD",4) == 0) return 0; | |
732 | if (rioWriteBulkObject(r,key) == 0) return 0; | |
733 | } | |
734 | if (rioWriteBulkDouble(r,score) == 0) return 0; | |
735 | if (vstr != NULL) { | |
736 | if (rioWriteBulkString(r,(char*)vstr,vlen) == 0) return 0; | |
737 | } else { | |
738 | if (rioWriteBulkLongLong(r,vll) == 0) return 0; | |
739 | } | |
740 | zzlNext(zl,&eptr,&sptr); | |
741 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; | |
742 | items--; | |
743 | } | |
744 | } else if (o->encoding == REDIS_ENCODING_SKIPLIST) { | |
745 | zset *zs = o->ptr; | |
746 | dictIterator *di = dictGetIterator(zs->dict); | |
747 | dictEntry *de; | |
748 | ||
749 | while((de = dictNext(di)) != NULL) { | |
750 | robj *eleobj = dictGetKey(de); | |
751 | double *score = dictGetVal(de); | |
752 | ||
753 | if (count == 0) { | |
754 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? | |
755 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; | |
756 | ||
757 | if (rioWriteBulkCount(r,'*',2+cmd_items*2) == 0) return 0; | |
758 | if (rioWriteBulkString(r,"ZADD",4) == 0) return 0; | |
759 | if (rioWriteBulkObject(r,key) == 0) return 0; | |
760 | } | |
761 | if (rioWriteBulkDouble(r,*score) == 0) return 0; | |
762 | if (rioWriteBulkObject(r,eleobj) == 0) return 0; | |
763 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; | |
764 | items--; | |
765 | } | |
766 | dictReleaseIterator(di); | |
767 | } else { | |
768 | redisPanic("Unknown sorted zset encoding"); | |
769 | } | |
770 | return 1; | |
771 | } | |
772 | ||
773 | /* Write either the key or the value of the currently selected item of an hash. | |
774 | * The 'hi' argument passes a valid Redis hash iterator. | |
775 | * The 'what' filed specifies if to write a key or a value and can be | |
776 | * either REDIS_HASH_KEY or REDIS_HASH_VALUE. | |
777 | * | |
778 | * The function returns 0 on error, non-zero on success. */ | |
779 | static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { | |
780 | if (hi->encoding == REDIS_ENCODING_ZIPLIST) { | |
781 | unsigned char *vstr = NULL; | |
782 | unsigned int vlen = UINT_MAX; | |
783 | long long vll = LLONG_MAX; | |
784 | ||
785 | hashTypeCurrentFromZiplist(hi, what, &vstr, &vlen, &vll); | |
786 | if (vstr) { | |
787 | return rioWriteBulkString(r, (char*)vstr, vlen); | |
788 | } else { | |
789 | return rioWriteBulkLongLong(r, vll); | |
790 | } | |
791 | ||
792 | } else if (hi->encoding == REDIS_ENCODING_HT) { | |
793 | robj *value; | |
794 | ||
795 | hashTypeCurrentFromHashTable(hi, what, &value); | |
796 | return rioWriteBulkObject(r, value); | |
797 | } | |
798 | ||
799 | redisPanic("Unknown hash encoding"); | |
800 | return 0; | |
801 | } | |
802 | ||
803 | /* Emit the commands needed to rebuild a hash object. | |
804 | * The function returns 0 on error, 1 on success. */ | |
805 | int rewriteHashObject(rio *r, robj *key, robj *o) { | |
806 | hashTypeIterator *hi; | |
807 | long long count = 0, items = hashTypeLength(o); | |
808 | ||
809 | hi = hashTypeInitIterator(o); | |
810 | while (hashTypeNext(hi) != REDIS_ERR) { | |
811 | if (count == 0) { | |
812 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? | |
813 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; | |
814 | ||
815 | if (rioWriteBulkCount(r,'*',2+cmd_items*2) == 0) return 0; | |
816 | if (rioWriteBulkString(r,"HMSET",5) == 0) return 0; | |
817 | if (rioWriteBulkObject(r,key) == 0) return 0; | |
818 | } | |
819 | ||
820 | if (rioWriteHashIteratorCursor(r, hi, REDIS_HASH_KEY) == 0) return 0; | |
821 | if (rioWriteHashIteratorCursor(r, hi, REDIS_HASH_VALUE) == 0) return 0; | |
822 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; | |
823 | items--; | |
824 | } | |
825 | ||
826 | hashTypeReleaseIterator(hi); | |
827 | ||
828 | return 1; | |
829 | } | |
830 | ||
831 | /* Write a sequence of commands able to fully rebuild the dataset into | |
832 | * "filename". Used both by REWRITEAOF and BGREWRITEAOF. | |
833 | * | |
834 | * In order to minimize the number of commands needed in the rewritten | |
835 | * log Redis uses variadic commands when possible, such as RPUSH, SADD | |
836 | * and ZADD. However at max REDIS_AOF_REWRITE_ITEMS_PER_CMD items per time | |
837 | * are inserted using a single command. */ | |
838 | int rewriteAppendOnlyFile(char *filename) { | |
839 | dictIterator *di = NULL; | |
840 | dictEntry *de; | |
841 | rio aof; | |
842 | FILE *fp; | |
843 | char tmpfile[256]; | |
844 | int j; | |
845 | long long now = mstime(); | |
846 | ||
847 | /* Note that we have to use a different temp name here compared to the | |
848 | * one used by rewriteAppendOnlyFileBackground() function. */ | |
849 | snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); | |
850 | fp = fopen(tmpfile,"w"); | |
851 | if (!fp) { | |
852 | redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno)); | |
853 | return REDIS_ERR; | |
854 | } | |
855 | ||
856 | rioInitWithFile(&aof,fp); | |
857 | for (j = 0; j < server.dbnum; j++) { | |
858 | char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; | |
859 | redisDb *db = server.db+j; | |
860 | dict *d = db->dict; | |
861 | if (dictSize(d) == 0) continue; | |
862 | di = dictGetSafeIterator(d); | |
863 | if (!di) { | |
864 | fclose(fp); | |
865 | return REDIS_ERR; | |
866 | } | |
867 | ||
868 | /* SELECT the new DB */ | |
869 | if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr; | |
870 | if (rioWriteBulkLongLong(&aof,j) == 0) goto werr; | |
871 | ||
872 | /* Iterate this DB writing every entry */ | |
873 | while((de = dictNext(di)) != NULL) { | |
874 | sds keystr; | |
875 | robj key, *o; | |
876 | long long expiretime; | |
877 | ||
878 | keystr = dictGetKey(de); | |
879 | o = dictGetVal(de); | |
880 | initStaticStringObject(key,keystr); | |
881 | ||
882 | expiretime = getExpire(db,&key); | |
883 | ||
884 | /* Save the key and associated value */ | |
885 | if (o->type == REDIS_STRING) { | |
886 | /* Emit a SET command */ | |
887 | char cmd[]="*3\r\n$3\r\nSET\r\n"; | |
888 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; | |
889 | /* Key and value */ | |
890 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
891 | if (rioWriteBulkObject(&aof,o) == 0) goto werr; | |
892 | } else if (o->type == REDIS_LIST) { | |
893 | if (rewriteListObject(&aof,&key,o) == 0) goto werr; | |
894 | } else if (o->type == REDIS_SET) { | |
895 | if (rewriteSetObject(&aof,&key,o) == 0) goto werr; | |
896 | } else if (o->type == REDIS_ZSET) { | |
897 | if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr; | |
898 | } else if (o->type == REDIS_HASH) { | |
899 | if (rewriteHashObject(&aof,&key,o) == 0) goto werr; | |
900 | } else { | |
901 | redisPanic("Unknown object type"); | |
902 | } | |
903 | /* Save the expire time */ | |
904 | if (expiretime != -1) { | |
905 | char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n"; | |
906 | /* If this key is already expired skip it */ | |
907 | if (expiretime < now) continue; | |
908 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; | |
909 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
910 | if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr; | |
911 | } | |
912 | } | |
913 | dictReleaseIterator(di); | |
914 | } | |
915 | ||
916 | /* Make sure data will not remain on the OS's output buffers */ | |
917 | fflush(fp); | |
918 | aof_fsync(fileno(fp)); | |
919 | fclose(fp); | |
920 | ||
921 | /* Use RENAME to make sure the DB file is changed atomically only | |
922 | * if the generate DB file is ok. */ | |
923 | if (rename(tmpfile,filename) == -1) { | |
924 | redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno)); | |
925 | unlink(tmpfile); | |
926 | return REDIS_ERR; | |
927 | } | |
928 | redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed"); | |
929 | return REDIS_OK; | |
930 | ||
931 | werr: | |
932 | fclose(fp); | |
933 | unlink(tmpfile); | |
934 | redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno)); | |
935 | if (di) dictReleaseIterator(di); | |
936 | return REDIS_ERR; | |
937 | } | |
938 | ||
939 | /* This is how rewriting of the append only file in background works: | |
940 | * | |
941 | * 1) The user calls BGREWRITEAOF | |
942 | * 2) Redis calls this function, that forks(): | |
943 | * 2a) the child rewrite the append only file in a temp file. | |
944 | * 2b) the parent accumulates differences in server.aof_rewrite_buf. | |
945 | * 3) When the child finished '2a' exists. | |
946 | * 4) The parent will trap the exit code, if it's OK, will append the | |
947 | * data accumulated into server.aof_rewrite_buf into the temp file, and | |
948 | * finally will rename(2) the temp file in the actual file name. | |
949 | * The the new file is reopened as the new append only file. Profit! | |
950 | */ | |
951 | int rewriteAppendOnlyFileBackground(void) { | |
952 | pid_t childpid; | |
953 | long long start; | |
954 | ||
955 | if (server.aof_child_pid != -1) return REDIS_ERR; | |
956 | start = ustime(); | |
957 | if ((childpid = fork()) == 0) { | |
958 | char tmpfile[256]; | |
959 | ||
960 | /* Child */ | |
961 | if (server.ipfd > 0) close(server.ipfd); | |
962 | if (server.sofd > 0) close(server.sofd); | |
963 | snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); | |
964 | if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) { | |
965 | exitFromChild(0); | |
966 | } else { | |
967 | exitFromChild(1); | |
968 | } | |
969 | } else { | |
970 | /* Parent */ | |
971 | server.stat_fork_time = ustime()-start; | |
972 | if (childpid == -1) { | |
973 | redisLog(REDIS_WARNING, | |
974 | "Can't rewrite append only file in background: fork: %s", | |
975 | strerror(errno)); | |
976 | return REDIS_ERR; | |
977 | } | |
978 | redisLog(REDIS_NOTICE, | |
979 | "Background append only file rewriting started by pid %d",childpid); | |
980 | server.aof_rewrite_scheduled = 0; | |
981 | server.aof_rewrite_time_start = time(NULL); | |
982 | server.aof_child_pid = childpid; | |
983 | updateDictResizePolicy(); | |
984 | /* We set appendseldb to -1 in order to force the next call to the | |
985 | * feedAppendOnlyFile() to issue a SELECT command, so the differences | |
986 | * accumulated by the parent into server.aof_rewrite_buf will start | |
987 | * with a SELECT statement and it will be safe to merge. */ | |
988 | server.aof_selected_db = -1; | |
989 | return REDIS_OK; | |
990 | } | |
991 | return REDIS_OK; /* unreached */ | |
992 | } | |
993 | ||
994 | void bgrewriteaofCommand(redisClient *c) { | |
995 | if (server.aof_child_pid != -1) { | |
996 | addReplyError(c,"Background append only file rewriting already in progress"); | |
997 | } else if (server.rdb_child_pid != -1) { | |
998 | server.aof_rewrite_scheduled = 1; | |
999 | addReplyStatus(c,"Background append only file rewriting scheduled"); | |
1000 | } else if (rewriteAppendOnlyFileBackground() == REDIS_OK) { | |
1001 | addReplyStatus(c,"Background append only file rewriting started"); | |
1002 | } else { | |
1003 | addReply(c,shared.err); | |
1004 | } | |
1005 | } | |
1006 | ||
/* Remove the temporary file named after the background rewrite child
 * having the specified pid. The result of unlink(2) is ignored. */
void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
1013 | ||
1014 | /* Update the server.aof_current_size filed explicitly using stat(2) | |
1015 | * to check the size of the file. This is useful after a rewrite or after | |
1016 | * a restart, normally the size is updated just adding the write length | |
1017 | * to the current length, that is much faster. */ | |
1018 | void aofUpdateCurrentSize(void) { | |
1019 | struct redis_stat sb; | |
1020 | ||
1021 | if (redis_fstat(server.aof_fd,&sb) == -1) { | |
1022 | redisLog(REDIS_WARNING,"Unable to obtain the AOF file length. stat: %s", | |
1023 | strerror(errno)); | |
1024 | } else { | |
1025 | server.aof_current_size = sb.st_size; | |
1026 | } | |
1027 | } | |
1028 | ||
1029 | /* A background append only file rewriting (BGREWRITEAOF) terminated its work. | |
1030 | * Handle this. */ | |
1031 | void backgroundRewriteDoneHandler(int exitcode, int bysignal) { | |
1032 | if (!bysignal && exitcode == 0) { | |
1033 | int newfd, oldfd; | |
1034 | char tmpfile[256]; | |
1035 | long long now = ustime(); | |
1036 | ||
1037 | redisLog(REDIS_NOTICE, | |
1038 | "Background AOF rewrite terminated with success"); | |
1039 | ||
1040 | /* Flush the differences accumulated by the parent to the | |
1041 | * rewritten AOF. */ | |
1042 | snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", | |
1043 | (int)server.aof_child_pid); | |
1044 | newfd = open(tmpfile,O_WRONLY|O_APPEND); | |
1045 | if (newfd == -1) { | |
1046 | redisLog(REDIS_WARNING, | |
1047 | "Unable to open the temporary AOF produced by the child: %s", strerror(errno)); | |
1048 | goto cleanup; | |
1049 | } | |
1050 | ||
1051 | if (aofRewriteBufferWrite(newfd) == -1) { | |
1052 | redisLog(REDIS_WARNING, | |
1053 | "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno)); | |
1054 | close(newfd); | |
1055 | goto cleanup; | |
1056 | } | |
1057 | ||
1058 | redisLog(REDIS_NOTICE, | |
1059 | "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", aofRewriteBufferSize()); | |
1060 | ||
1061 | /* The only remaining thing to do is to rename the temporary file to | |
1062 | * the configured file and switch the file descriptor used to do AOF | |
1063 | * writes. We don't want close(2) or rename(2) calls to block the | |
1064 | * server on old file deletion. | |
1065 | * | |
1066 | * There are two possible scenarios: | |
1067 | * | |
1068 | * 1) AOF is DISABLED and this was a one time rewrite. The temporary | |
1069 | * file will be renamed to the configured file. When this file already | |
1070 | * exists, it will be unlinked, which may block the server. | |
1071 | * | |
1072 | * 2) AOF is ENABLED and the rewritten AOF will immediately start | |
1073 | * receiving writes. After the temporary file is renamed to the | |
1074 | * configured file, the original AOF file descriptor will be closed. | |
1075 | * Since this will be the last reference to that file, closing it | |
1076 | * causes the underlying file to be unlinked, which may block the | |
1077 | * server. | |
1078 | * | |
1079 | * To mitigate the blocking effect of the unlink operation (either | |
1080 | * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we | |
1081 | * use a background thread to take care of this. First, we | |
1082 | * make scenario 1 identical to scenario 2 by opening the target file | |
1083 | * when it exists. The unlink operation after the rename(2) will then | |
1084 | * be executed upon calling close(2) for its descriptor. Everything to | |
1085 | * guarantee atomicity for this switch has already happened by then, so | |
1086 | * we don't care what the outcome or duration of that close operation | |
1087 | * is, as long as the file descriptor is released again. */ | |
1088 | if (server.aof_fd == -1) { | |
1089 | /* AOF disabled */ | |
1090 | ||
1091 | /* Don't care if this fails: oldfd will be -1 and we handle that. | |
1092 | * One notable case of -1 return is if the old file does | |
1093 | * not exist. */ | |
1094 | oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK); | |
1095 | } else { | |
1096 | /* AOF enabled */ | |
1097 | oldfd = -1; /* We'll set this to the current AOF filedes later. */ | |
1098 | } | |
1099 | ||
1100 | /* Rename the temporary file. This will not unlink the target file if | |
1101 | * it exists, because we reference it with "oldfd". */ | |
1102 | if (rename(tmpfile,server.aof_filename) == -1) { | |
1103 | redisLog(REDIS_WARNING, | |
1104 | "Error trying to rename the temporary AOF file: %s", strerror(errno)); | |
1105 | close(newfd); | |
1106 | if (oldfd != -1) close(oldfd); | |
1107 | goto cleanup; | |
1108 | } | |
1109 | ||
1110 | if (server.aof_fd == -1) { | |
1111 | /* AOF disabled, we don't need to set the AOF file descriptor | |
1112 | * to this new file, so we can close it. */ | |
1113 | close(newfd); | |
1114 | } else { | |
1115 | /* AOF enabled, replace the old fd with the new one. */ | |
1116 | oldfd = server.aof_fd; | |
1117 | server.aof_fd = newfd; | |
1118 | if (server.aof_fsync == AOF_FSYNC_ALWAYS) | |
1119 | aof_fsync(newfd); | |
1120 | else if (server.aof_fsync == AOF_FSYNC_EVERYSEC) | |
1121 | aof_background_fsync(newfd); | |
1122 | server.aof_selected_db = -1; /* Make sure SELECT is re-issued */ | |
1123 | aofUpdateCurrentSize(); | |
1124 | server.aof_rewrite_base_size = server.aof_current_size; | |
1125 | ||
1126 | /* Clear regular AOF buffer since its contents was just written to | |
1127 | * the new AOF from the background rewrite buffer. */ | |
1128 | sdsfree(server.aof_buf); | |
1129 | server.aof_buf = sdsempty(); | |
1130 | } | |
1131 | ||
1132 | server.aof_lastbgrewrite_status = REDIS_OK; | |
1133 | ||
1134 | redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully"); | |
1135 | /* Change state from WAIT_REWRITE to ON if needed */ | |
1136 | if (server.aof_state == REDIS_AOF_WAIT_REWRITE) | |
1137 | server.aof_state = REDIS_AOF_ON; | |
1138 | ||
1139 | /* Asynchronously close the overwritten AOF. */ | |
1140 | if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL); | |
1141 | ||
1142 | redisLog(REDIS_VERBOSE, | |
1143 | "Background AOF rewrite signal handler took %lldus", ustime()-now); | |
1144 | } else if (!bysignal && exitcode != 0) { | |
1145 | server.aof_lastbgrewrite_status = REDIS_ERR; | |
1146 | ||
1147 | redisLog(REDIS_WARNING, | |
1148 | "Background AOF rewrite terminated with error"); | |
1149 | } else { | |
1150 | server.aof_lastbgrewrite_status = REDIS_ERR; | |
1151 | ||
1152 | redisLog(REDIS_WARNING, | |
1153 | "Background AOF rewrite terminated by signal %d", bysignal); | |
1154 | } | |
1155 | ||
1156 | cleanup: | |
1157 | aofRewriteBufferReset(); | |
1158 | aofRemoveTempFile(server.aof_child_pid); | |
1159 | server.aof_child_pid = -1; | |
1160 | server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start; | |
1161 | server.aof_rewrite_time_start = -1; | |
1162 | /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */ | |
1163 | if (server.aof_state == REDIS_AOF_WAIT_REWRITE) | |
1164 | server.aof_rewrite_scheduled = 1; | |
1165 | } |