]> git.saurik.com Git - redis.git/blob - src/aof.c
Fixed typo preventing compilation
[redis.git] / src / aof.c
1 #include "redis.h"
2 #include "bio.h"
3
4 #include <signal.h>
5 #include <fcntl.h>
6 #include <sys/stat.h>
7 #include <sys/types.h>
8 #include <sys/time.h>
9 #include <sys/resource.h>
10 #include <sys/wait.h>
11
12 void aofUpdateCurrentSize(void);
13
14 void aof_background_fsync(int fd) {
15 bioCreateBackgroundJob(REDIS_BIO_AOF_FSYNC,(void*)(long)fd,NULL,NULL);
16 }
17
18 /* Called when the user switches from "appendonly yes" to "appendonly no"
19 * at runtime using the CONFIG command. */
20 void stopAppendOnly(void) {
21 flushAppendOnlyFile();
22 aof_fsync(server.appendfd);
23 close(server.appendfd);
24
25 server.appendfd = -1;
26 server.appendseldb = -1;
27 server.appendonly = 0;
28 /* rewrite operation in progress? kill it, wait child exit */
29 if (server.bgrewritechildpid != -1) {
30 int statloc;
31
32 if (kill(server.bgrewritechildpid,SIGKILL) != -1)
33 wait3(&statloc,0,NULL);
34 /* reset the buffer accumulating changes while the child saves */
35 sdsfree(server.bgrewritebuf);
36 server.bgrewritebuf = sdsempty();
37 server.bgrewritechildpid = -1;
38 }
39 }
40
41 /* Called when the user switches from "appendonly no" to "appendonly yes"
42 * at runtime using the CONFIG command. */
43 int startAppendOnly(void) {
44 server.appendonly = 1;
45 server.lastfsync = time(NULL);
46 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
47 if (server.appendfd == -1) {
48 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
49 return REDIS_ERR;
50 }
51 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
52 server.appendonly = 0;
53 close(server.appendfd);
54 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
55 return REDIS_ERR;
56 }
57 return REDIS_OK;
58 }
59
60 /* Write the append only file buffer on disk.
61 *
62 * Since we are required to write the AOF before replying to the client,
63 * and the only way the client socket can get a write is entering when the
64 * the event loop, we accumulate all the AOF writes in a memory
65 * buffer and write it on disk using this function just before entering
66 * the event loop again. */
67 void flushAppendOnlyFile(void) {
68 ssize_t nwritten;
69
70 if (sdslen(server.aofbuf) == 0) return;
71
72 /* We want to perform a single write. This should be guaranteed atomic
73 * at least if the filesystem we are writing is a real physical one.
74 * While this will save us against the server being killed I don't think
75 * there is much to do about the whole server stopping for power problems
76 * or alike */
77 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
78 if (nwritten != (signed)sdslen(server.aofbuf)) {
79 /* Ooops, we are in troubles. The best thing to do for now is
80 * aborting instead of giving the illusion that everything is
81 * working as expected. */
82 if (nwritten == -1) {
83 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
84 } else {
85 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
86 }
87 exit(1);
88 }
89 server.appendonly_current_size += nwritten;
90
91 /* Re-use AOF buffer when it is small enough. The maximum comes from the
92 * arena size of 4k minus some overhead (but is otherwise arbitrary). */
93 if ((sdslen(server.aofbuf)+sdsavail(server.aofbuf)) < 4000) {
94 sdsclear(server.aofbuf);
95 } else {
96 sdsfree(server.aofbuf);
97 server.aofbuf = sdsempty();
98 }
99
100 /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
101 * children doing I/O in the background. */
102 if (server.no_appendfsync_on_rewrite &&
103 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
104 return;
105
106 /* Perform the fsync if needed. */
107 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
108 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
109 server.unixtime > server.lastfsync))
110 {
111 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
112 * flushing metadata. */
113 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
114 server.lastfsync = server.unixtime;
115 }
116 }
117
118 sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) {
119 char buf[32];
120 int len, j;
121 robj *o;
122
123 buf[0] = '*';
124 len = 1+ll2string(buf+1,sizeof(buf)-1,argc);
125 buf[len++] = '\r';
126 buf[len++] = '\n';
127 dst = sdscatlen(dst,buf,len);
128
129 for (j = 0; j < argc; j++) {
130 o = getDecodedObject(argv[j]);
131 buf[0] = '$';
132 len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr));
133 buf[len++] = '\r';
134 buf[len++] = '\n';
135 dst = sdscatlen(dst,buf,len);
136 dst = sdscatlen(dst,o->ptr,sdslen(o->ptr));
137 dst = sdscatlen(dst,"\r\n",2);
138 decrRefCount(o);
139 }
140 return dst;
141 }
142
143 sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
144 int argc = 3;
145 long when;
146 robj *argv[3];
147
148 /* Make sure we can use strtol */
149 seconds = getDecodedObject(seconds);
150 when = time(NULL)+strtol(seconds->ptr,NULL,10);
151 decrRefCount(seconds);
152
153 argv[0] = createStringObject("EXPIREAT",8);
154 argv[1] = key;
155 argv[2] = createObject(REDIS_STRING,
156 sdscatprintf(sdsempty(),"%ld",when));
157 buf = catAppendOnlyGenericCommand(buf, argc, argv);
158 decrRefCount(argv[0]);
159 decrRefCount(argv[2]);
160 return buf;
161 }
162
163 void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
164 sds buf = sdsempty();
165 robj *tmpargv[3];
166
167 /* The DB this command was targetting is not the same as the last command
168 * we appendend. To issue a SELECT command is needed. */
169 if (dictid != server.appendseldb) {
170 char seldb[64];
171
172 snprintf(seldb,sizeof(seldb),"%d",dictid);
173 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
174 (unsigned long)strlen(seldb),seldb);
175 server.appendseldb = dictid;
176 }
177
178 if (cmd->proc == expireCommand) {
179 /* Translate EXPIRE into EXPIREAT */
180 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
181 } else if (cmd->proc == setexCommand) {
182 /* Translate SETEX to SET and EXPIREAT */
183 tmpargv[0] = createStringObject("SET",3);
184 tmpargv[1] = argv[1];
185 tmpargv[2] = argv[3];
186 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
187 decrRefCount(tmpargv[0]);
188 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
189 } else {
190 buf = catAppendOnlyGenericCommand(buf,argc,argv);
191 }
192
193 /* Append to the AOF buffer. This will be flushed on disk just before
194 * of re-entering the event loop, so before the client will get a
195 * positive reply about the operation performed. */
196 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
197
198 /* If a background append only file rewriting is in progress we want to
199 * accumulate the differences between the child DB and the current one
200 * in a buffer, so that when the child process will do its work we
201 * can append the differences to the new append only file. */
202 if (server.bgrewritechildpid != -1)
203 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
204
205 sdsfree(buf);
206 }
207
208 /* In Redis commands are always executed in the context of a client, so in
209 * order to load the append only file we need to create a fake client. */
210 struct redisClient *createFakeClient(void) {
211 struct redisClient *c = zmalloc(sizeof(*c));
212
213 selectDb(c,0);
214 c->fd = -1;
215 c->querybuf = sdsempty();
216 c->argc = 0;
217 c->argv = NULL;
218 c->bufpos = 0;
219 c->flags = 0;
220 /* We set the fake client as a slave waiting for the synchronization
221 * so that Redis will not try to send replies to this client. */
222 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
223 c->reply = listCreate();
224 c->watched_keys = listCreate();
225 listSetFreeMethod(c->reply,decrRefCount);
226 listSetDupMethod(c->reply,dupClientReplyValue);
227 initClientMultiState(c);
228 return c;
229 }
230
231 void freeFakeClient(struct redisClient *c) {
232 sdsfree(c->querybuf);
233 listRelease(c->reply);
234 listRelease(c->watched_keys);
235 freeClientMultiState(c);
236 zfree(c);
237 }
238
239 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
240 * error (the append only file is zero-length) REDIS_ERR is returned. On
241 * fatal error an error message is logged and the program exists. */
242 int loadAppendOnlyFile(char *filename) {
243 struct redisClient *fakeClient;
244 FILE *fp = fopen(filename,"r");
245 struct redis_stat sb;
246 int appendonly = server.appendonly;
247 long loops = 0;
248
249 if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
250 server.appendonly_current_size = 0;
251 fclose(fp);
252 return REDIS_ERR;
253 }
254
255 if (fp == NULL) {
256 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
257 exit(1);
258 }
259
260 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
261 * to the same file we're about to read. */
262 server.appendonly = 0;
263
264 fakeClient = createFakeClient();
265 startLoading(fp);
266
267 while(1) {
268 int argc, j;
269 unsigned long len;
270 robj **argv;
271 char buf[128];
272 sds argsds;
273 struct redisCommand *cmd;
274
275 /* Serve the clients from time to time */
276 if (!(loops++ % 1000)) {
277 loadingProgress(ftello(fp));
278 aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT);
279 }
280
281 if (fgets(buf,sizeof(buf),fp) == NULL) {
282 if (feof(fp))
283 break;
284 else
285 goto readerr;
286 }
287 if (buf[0] != '*') goto fmterr;
288 argc = atoi(buf+1);
289 argv = zmalloc(sizeof(robj*)*argc);
290 for (j = 0; j < argc; j++) {
291 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
292 if (buf[0] != '$') goto fmterr;
293 len = strtol(buf+1,NULL,10);
294 argsds = sdsnewlen(NULL,len);
295 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
296 argv[j] = createObject(REDIS_STRING,argsds);
297 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
298 }
299
300 /* Command lookup */
301 cmd = lookupCommand(argv[0]->ptr);
302 if (!cmd) {
303 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
304 exit(1);
305 }
306 /* Run the command in the context of a fake client */
307 fakeClient->argc = argc;
308 fakeClient->argv = argv;
309 cmd->proc(fakeClient);
310
311 /* The fake client should not have a reply */
312 redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0);
313 /* The fake client should never get blocked */
314 redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0);
315
316 /* Clean up. Command code may have changed argv/argc so we use the
317 * argv/argc of the client instead of the local variables. */
318 for (j = 0; j < fakeClient->argc; j++)
319 decrRefCount(fakeClient->argv[j]);
320 zfree(fakeClient->argv);
321 }
322
323 /* This point can only be reached when EOF is reached without errors.
324 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
325 if (fakeClient->flags & REDIS_MULTI) goto readerr;
326
327 fclose(fp);
328 freeFakeClient(fakeClient);
329 server.appendonly = appendonly;
330 stopLoading();
331 aofUpdateCurrentSize();
332 server.auto_aofrewrite_base_size = server.appendonly_current_size;
333 return REDIS_OK;
334
335 readerr:
336 if (feof(fp)) {
337 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
338 } else {
339 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
340 }
341 exit(1);
342 fmterr:
343 redisLog(REDIS_WARNING,"Bad file format reading the append only file: make a backup of your AOF file, then use ./redis-check-aof --fix <filename>");
344 exit(1);
345 }
346
347 /* Write a sequence of commands able to fully rebuild the dataset into
348 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
349 int rewriteAppendOnlyFile(char *filename) {
350 dictIterator *di = NULL;
351 dictEntry *de;
352 FILE *fp;
353 char tmpfile[256];
354 int j;
355 time_t now = time(NULL);
356
357 /* Note that we have to use a different temp name here compared to the
358 * one used by rewriteAppendOnlyFileBackground() function. */
359 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
360 fp = fopen(tmpfile,"w");
361 if (!fp) {
362 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
363 return REDIS_ERR;
364 }
365 for (j = 0; j < server.dbnum; j++) {
366 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
367 redisDb *db = server.db+j;
368 dict *d = db->dict;
369 if (dictSize(d) == 0) continue;
370 di = dictGetSafeIterator(d);
371 if (!di) {
372 fclose(fp);
373 return REDIS_ERR;
374 }
375
376 /* SELECT the new DB */
377 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
378 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
379
380 /* Iterate this DB writing every entry */
381 while((de = dictNext(di)) != NULL) {
382 sds keystr;
383 robj key, *o;
384 time_t expiretime;
385
386 keystr = dictGetEntryKey(de);
387 o = dictGetEntryVal(de);
388 initStaticStringObject(key,keystr);
389
390 expiretime = getExpire(db,&key);
391
392 /* Save the key and associated value */
393 if (o->type == REDIS_STRING) {
394 /* Emit a SET command */
395 char cmd[]="*3\r\n$3\r\nSET\r\n";
396 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
397 /* Key and value */
398 if (fwriteBulkObject(fp,&key) == 0) goto werr;
399 if (fwriteBulkObject(fp,o) == 0) goto werr;
400 } else if (o->type == REDIS_LIST) {
401 /* Emit the RPUSHes needed to rebuild the list */
402 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
403 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
404 unsigned char *zl = o->ptr;
405 unsigned char *p = ziplistIndex(zl,0);
406 unsigned char *vstr;
407 unsigned int vlen;
408 long long vlong;
409
410 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
411 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
412 if (fwriteBulkObject(fp,&key) == 0) goto werr;
413 if (vstr) {
414 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
415 goto werr;
416 } else {
417 if (fwriteBulkLongLong(fp,vlong) == 0)
418 goto werr;
419 }
420 p = ziplistNext(zl,p);
421 }
422 } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) {
423 list *list = o->ptr;
424 listNode *ln;
425 listIter li;
426
427 listRewind(list,&li);
428 while((ln = listNext(&li))) {
429 robj *eleobj = listNodeValue(ln);
430
431 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
432 if (fwriteBulkObject(fp,&key) == 0) goto werr;
433 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
434 }
435 } else {
436 redisPanic("Unknown list encoding");
437 }
438 } else if (o->type == REDIS_SET) {
439 char cmd[]="*3\r\n$4\r\nSADD\r\n";
440
441 /* Emit the SADDs needed to rebuild the set */
442 if (o->encoding == REDIS_ENCODING_INTSET) {
443 int ii = 0;
444 int64_t llval;
445 while(intsetGet(o->ptr,ii++,&llval)) {
446 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
447 if (fwriteBulkObject(fp,&key) == 0) goto werr;
448 if (fwriteBulkLongLong(fp,llval) == 0) goto werr;
449 }
450 } else if (o->encoding == REDIS_ENCODING_HT) {
451 dictIterator *di = dictGetIterator(o->ptr);
452 dictEntry *de;
453 while((de = dictNext(di)) != NULL) {
454 robj *eleobj = dictGetEntryKey(de);
455 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
456 if (fwriteBulkObject(fp,&key) == 0) goto werr;
457 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
458 }
459 dictReleaseIterator(di);
460 } else {
461 redisPanic("Unknown set encoding");
462 }
463 } else if (o->type == REDIS_ZSET) {
464 /* Emit the ZADDs needed to rebuild the sorted set */
465 char cmd[]="*4\r\n$4\r\nZADD\r\n";
466
467 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
468 unsigned char *zl = o->ptr;
469 unsigned char *eptr, *sptr;
470 unsigned char *vstr;
471 unsigned int vlen;
472 long long vll;
473 double score;
474
475 eptr = ziplistIndex(zl,0);
476 redisAssert(eptr != NULL);
477 sptr = ziplistNext(zl,eptr);
478 redisAssert(sptr != NULL);
479
480 while (eptr != NULL) {
481 redisAssert(ziplistGet(eptr,&vstr,&vlen,&vll));
482 score = zzlGetScore(sptr);
483
484 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
485 if (fwriteBulkObject(fp,&key) == 0) goto werr;
486 if (fwriteBulkDouble(fp,score) == 0) goto werr;
487 if (vstr != NULL) {
488 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
489 goto werr;
490 } else {
491 if (fwriteBulkLongLong(fp,vll) == 0)
492 goto werr;
493 }
494 zzlNext(zl,&eptr,&sptr);
495 }
496 } else if (o->encoding == REDIS_ENCODING_SKIPLIST) {
497 zset *zs = o->ptr;
498 dictIterator *di = dictGetIterator(zs->dict);
499 dictEntry *de;
500
501 while((de = dictNext(di)) != NULL) {
502 robj *eleobj = dictGetEntryKey(de);
503 double *score = dictGetEntryVal(de);
504
505 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
506 if (fwriteBulkObject(fp,&key) == 0) goto werr;
507 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
508 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
509 }
510 dictReleaseIterator(di);
511 } else {
512 redisPanic("Unknown sorted set encoding");
513 }
514 } else if (o->type == REDIS_HASH) {
515 char cmd[]="*4\r\n$4\r\nHSET\r\n";
516
517 /* Emit the HSETs needed to rebuild the hash */
518 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
519 unsigned char *p = zipmapRewind(o->ptr);
520 unsigned char *field, *val;
521 unsigned int flen, vlen;
522
523 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
524 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
525 if (fwriteBulkObject(fp,&key) == 0) goto werr;
526 if (fwriteBulkString(fp,(char*)field,flen) == 0)
527 goto werr;
528 if (fwriteBulkString(fp,(char*)val,vlen) == 0)
529 goto werr;
530 }
531 } else {
532 dictIterator *di = dictGetIterator(o->ptr);
533 dictEntry *de;
534
535 while((de = dictNext(di)) != NULL) {
536 robj *field = dictGetEntryKey(de);
537 robj *val = dictGetEntryVal(de);
538
539 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
540 if (fwriteBulkObject(fp,&key) == 0) goto werr;
541 if (fwriteBulkObject(fp,field) == 0) goto werr;
542 if (fwriteBulkObject(fp,val) == 0) goto werr;
543 }
544 dictReleaseIterator(di);
545 }
546 } else {
547 redisPanic("Unknown object type");
548 }
549 /* Save the expire time */
550 if (expiretime != -1) {
551 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
552 /* If this key is already expired skip it */
553 if (expiretime < now) continue;
554 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
555 if (fwriteBulkObject(fp,&key) == 0) goto werr;
556 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
557 }
558 }
559 dictReleaseIterator(di);
560 }
561
562 /* Make sure data will not remain on the OS's output buffers */
563 fflush(fp);
564 aof_fsync(fileno(fp));
565 fclose(fp);
566
567 /* Use RENAME to make sure the DB file is changed atomically only
568 * if the generate DB file is ok. */
569 if (rename(tmpfile,filename) == -1) {
570 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
571 unlink(tmpfile);
572 return REDIS_ERR;
573 }
574 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
575 return REDIS_OK;
576
577 werr:
578 fclose(fp);
579 unlink(tmpfile);
580 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
581 if (di) dictReleaseIterator(di);
582 return REDIS_ERR;
583 }
584
585 /* This is how rewriting of the append only file in background works:
586 *
587 * 1) The user calls BGREWRITEAOF
588 * 2) Redis calls this function, that forks():
589 * 2a) the child rewrite the append only file in a temp file.
590 * 2b) the parent accumulates differences in server.bgrewritebuf.
591 * 3) When the child finished '2a' exists.
592 * 4) The parent will trap the exit code, if it's OK, will append the
593 * data accumulated into server.bgrewritebuf into the temp file, and
594 * finally will rename(2) the temp file in the actual file name.
595 * The the new file is reopened as the new append only file. Profit!
596 */
597 int rewriteAppendOnlyFileBackground(void) {
598 pid_t childpid;
599 long long start;
600
601 if (server.bgrewritechildpid != -1) return REDIS_ERR;
602 start = ustime();
603 if ((childpid = fork()) == 0) {
604 char tmpfile[256];
605
606 /* Child */
607 if (server.ipfd > 0) close(server.ipfd);
608 if (server.sofd > 0) close(server.sofd);
609 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
610 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
611 _exit(0);
612 } else {
613 _exit(1);
614 }
615 } else {
616 /* Parent */
617 server.stat_fork_time = ustime()-start;
618 if (childpid == -1) {
619 redisLog(REDIS_WARNING,
620 "Can't rewrite append only file in background: fork: %s",
621 strerror(errno));
622 return REDIS_ERR;
623 }
624 redisLog(REDIS_NOTICE,
625 "Background append only file rewriting started by pid %d",childpid);
626 server.bgrewritechildpid = childpid;
627 updateDictResizePolicy();
628 /* We set appendseldb to -1 in order to force the next call to the
629 * feedAppendOnlyFile() to issue a SELECT command, so the differences
630 * accumulated by the parent into server.bgrewritebuf will start
631 * with a SELECT statement and it will be safe to merge. */
632 server.appendseldb = -1;
633 return REDIS_OK;
634 }
635 return REDIS_OK; /* unreached */
636 }
637
638 void bgrewriteaofCommand(redisClient *c) {
639 if (server.bgrewritechildpid != -1) {
640 addReplyError(c,"Background append only file rewriting already in progress");
641 } else if (server.bgsavechildpid != -1) {
642 server.aofrewrite_scheduled = 1;
643 addReplyStatus(c,"Background append only file rewriting scheduled");
644 } else if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
645 addReplyStatus(c,"Background append only file rewriting started");
646 } else {
647 addReply(c,shared.err);
648 }
649 }
650
/* Unlink the temporary file named "temp-rewriteaof-bg-<childpid>.aof" in
 * the current working directory. The result of unlink(2) is ignored. */
void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
657
658 /* Update the server.appendonly_current_size filed explicitly using stat(2)
659 * to check the size of the file. This is useful after a rewrite or after
660 * a restart, normally the size is updated just adding the write length
661 * to the current lenght, that is much faster. */
662 void aofUpdateCurrentSize(void) {
663 struct redis_stat sb;
664
665 if (redis_fstat(server.appendfd,&sb) == -1) {
666 redisLog(REDIS_WARNING,"Unable to check the AOF length: %s",
667 strerror(errno));
668 } else {
669 server.appendonly_current_size = sb.st_size;
670 }
671 }
672
673 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
674 * Handle this. */
675 void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
676 if (!bysignal && exitcode == 0) {
677 int newfd, oldfd;
678 int nwritten;
679 char tmpfile[256];
680 long long now = ustime();
681
682 redisLog(REDIS_NOTICE,
683 "Background AOF rewrite terminated with success");
684
685 /* Flush the differences accumulated by the parent to the
686 * rewritten AOF. */
687 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
688 (int)server.bgrewritechildpid);
689 newfd = open(tmpfile,O_WRONLY|O_APPEND);
690 if (newfd == -1) {
691 redisLog(REDIS_WARNING,
692 "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
693 goto cleanup;
694 }
695
696 nwritten = write(newfd,server.bgrewritebuf,sdslen(server.bgrewritebuf));
697 if (nwritten != (signed)sdslen(server.bgrewritebuf)) {
698 if (nwritten == -1) {
699 redisLog(REDIS_WARNING,
700 "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
701 } else {
702 redisLog(REDIS_WARNING,
703 "Short write trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
704 }
705 close(newfd);
706 goto cleanup;
707 }
708
709 redisLog(REDIS_NOTICE,
710 "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", nwritten);
711
712 /* The only remaining thing to do is to rename the temporary file to
713 * the configured file and switch the file descriptor used to do AOF
714 * writes. We don't want close(2) or rename(2) calls to block the
715 * server on old file deletion.
716 *
717 * There are two possible scenarios:
718 *
719 * 1) AOF is DISABLED and this was a one time rewrite. The temporary
720 * file will be renamed to the configured file. When this file already
721 * exists, it will be unlinked, which may block the server.
722 *
723 * 2) AOF is ENABLED and the rewritten AOF will immediately start
724 * receiving writes. After the temporary file is renamed to the
725 * configured file, the original AOF file descriptor will be closed.
726 * Since this will be the last reference to that file, closing it
727 * causes the underlying file to be unlinked, which may block the
728 * server.
729 *
730 * To mitigate the blocking effect of the unlink operation (either
731 * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we
732 * use a background thread to take care of this. First, we
733 * make scenario 1 identical to scenario 2 by opening the target file
734 * when it exists. The unlink operation after the rename(2) will then
735 * be executed upon calling close(2) for its descriptor. Everything to
736 * guarantee atomicity for this switch has already happened by then, so
737 * we don't care what the outcome or duration of that close operation
738 * is, as long as the file descriptor is released again. */
739 if (server.appendfd == -1) {
740 /* AOF disabled */
741
742 /* Don't care if this fails: oldfd will be -1 and we handle that.
743 * One notable case of -1 return is if the old file does
744 * not exist. */
745 oldfd = open(server.appendfilename,O_RDONLY|O_NONBLOCK);
746 } else {
747 /* AOF enabled */
748 oldfd = -1; /* We'll set this to the current AOF filedes later. */
749 }
750
751 /* Rename the temporary file. This will not unlink the target file if
752 * it exists, because we reference it with "oldfd". */
753 if (rename(tmpfile,server.appendfilename) == -1) {
754 redisLog(REDIS_WARNING,
755 "Error trying to rename the temporary AOF: %s", strerror(errno));
756 close(newfd);
757 if (oldfd != -1) close(oldfd);
758 goto cleanup;
759 }
760
761 if (server.appendfd == -1) {
762 /* AOF disabled, we don't need to set the AOF file descriptor
763 * to this new file, so we can close it. */
764 close(newfd);
765 } else {
766 /* AOF enabled, replace the old fd with the new one. */
767 oldfd = server.appendfd;
768 server.appendfd = newfd;
769 if (server.appendfsync == APPENDFSYNC_ALWAYS)
770 aof_fsync(newfd);
771 else if (server.appendfsync == APPENDFSYNC_EVERYSEC)
772 aof_background_fsync(newfd);
773 server.appendseldb = -1; /* Make sure SELECT is re-issued */
774 aofUpdateCurrentSize();
775 server.auto_aofrewrite_base_size = server.appendonly_current_size;
776 }
777
778 redisLog(REDIS_NOTICE, "Background AOF rewrite successful");
779
780 /* Asynchronously close the overwritten AOF. */
781 if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);
782
783 redisLog(REDIS_VERBOSE,
784 "Background AOF rewrite signal handler took %lldus", ustime()-now);
785 } else if (!bysignal && exitcode != 0) {
786 redisLog(REDIS_WARNING,
787 "Background AOF rewrite terminated with error");
788 } else {
789 redisLog(REDIS_WARNING,
790 "Background AOF rewrite terminated by signal %d", bysignal);
791 }
792
793 cleanup:
794 sdsfree(server.bgrewritebuf);
795 server.bgrewritebuf = sdsempty();
796 aofRemoveTempFile(server.bgrewritechildpid);
797 server.bgrewritechildpid = -1;
798 }