]>
Commit | Line | Data |
---|---|---|
e2641e09 | 1 | #include "redis.h" |
986630af | 2 | #include "bio.h" |
f9c6f39b | 3 | #include "rio.h" |
e2641e09 | 4 | |
5 | #include <signal.h> | |
6 | #include <fcntl.h> | |
7 | #include <sys/stat.h> | |
3688d7f3 | 8 | #include <sys/types.h> |
9 | #include <sys/time.h> | |
10 | #include <sys/resource.h> | |
11 | #include <sys/wait.h> | |
e2641e09 | 12 | |
b333e239 | 13 | void aofUpdateCurrentSize(void); |
14 | ||
4b77700a | 15 | void aof_background_fsync(int fd) { |
9a35eb22 | 16 | bioCreateBackgroundJob(REDIS_BIO_AOF_FSYNC,(void*)(long)fd,NULL,NULL); |
4b77700a | 17 | } |
18 | ||
e2641e09 | 19 | /* Called when the user switches from "appendonly yes" to "appendonly no" |
20 | * at runtime using the CONFIG command. */ | |
21 | void stopAppendOnly(void) { | |
db3c2a4f | 22 | flushAppendOnlyFile(1); |
e2641e09 | 23 | aof_fsync(server.appendfd); |
24 | close(server.appendfd); | |
25 | ||
26 | server.appendfd = -1; | |
27 | server.appendseldb = -1; | |
28 | server.appendonly = 0; | |
29 | /* rewrite operation in progress? kill it, wait child exit */ | |
b333e239 | 30 | if (server.bgrewritechildpid != -1) { |
e2641e09 | 31 | int statloc; |
32 | ||
b333e239 | 33 | if (kill(server.bgrewritechildpid,SIGKILL) != -1) |
e2641e09 | 34 | wait3(&statloc,0,NULL); |
35 | /* reset the buffer accumulating changes while the child saves */ | |
36 | sdsfree(server.bgrewritebuf); | |
37 | server.bgrewritebuf = sdsempty(); | |
b333e239 | 38 | server.bgrewritechildpid = -1; |
e2641e09 | 39 | } |
40 | } | |
41 | ||
42 | /* Called when the user switches from "appendonly no" to "appendonly yes" | |
43 | * at runtime using the CONFIG command. */ | |
44 | int startAppendOnly(void) { | |
45 | server.appendonly = 1; | |
46 | server.lastfsync = time(NULL); | |
47 | server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644); | |
48 | if (server.appendfd == -1) { | |
49 | redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno)); | |
50 | return REDIS_ERR; | |
51 | } | |
52 | if (rewriteAppendOnlyFileBackground() == REDIS_ERR) { | |
53 | server.appendonly = 0; | |
54 | close(server.appendfd); | |
ff15dba0 | 55 | redisLog(REDIS_WARNING,"User tried turning on AOF with CONFIG SET but I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error."); |
e2641e09 | 56 | return REDIS_ERR; |
57 | } | |
58 | return REDIS_OK; | |
59 | } | |
60 | ||
61 | /* Write the append only file buffer on disk. | |
62 | * | |
63 | * Since we are required to write the AOF before replying to the client, | |
64 | * and the only way the client socket can get a write is entering when the | |
65 | * the event loop, we accumulate all the AOF writes in a memory | |
66 | * buffer and write it on disk using this function just before entering | |
db3c2a4f | 67 | * the event loop again. |
68 | * | |
69 | * About the 'force' argument: | |
70 | * | |
71 | * When the fsync policy is set to 'everysec' we may delay the flush if there | |
72 | * is still an fsync() going on in the background thread, since for instance | |
73 | * on Linux write(2) will be blocked by the background fsync anyway. | |
74 | * When this happens we remember that there is some aof buffer to be | |
75 | * flushed ASAP, and will try to do that in the serverCron() function. | |
76 | * | |
77 | * However if force is set to 1 we'll write regardless of the background | |
78 | * fsync. */ | |
79 | void flushAppendOnlyFile(int force) { | |
e2641e09 | 80 | ssize_t nwritten; |
db3c2a4f | 81 | int sync_in_progress = 0; |
e2641e09 | 82 | |
83 | if (sdslen(server.aofbuf) == 0) return; | |
84 | ||
db3c2a4f | 85 | if (server.appendfsync == APPENDFSYNC_EVERYSEC) |
86 | sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0; | |
87 | ||
88 | if (server.appendfsync == APPENDFSYNC_EVERYSEC && !force) { | |
89 | /* With this append fsync policy we do background fsyncing. | |
90 | * If the fsync is still in progress we can try to delay | |
91 | * the write for a couple of seconds. */ | |
92 | if (sync_in_progress) { | |
93 | if (server.aof_flush_postponed_start == 0) { | |
94 | /* No previous write postponinig, remember that we are | |
95 | * postponing the flush and return. */ | |
96 | server.aof_flush_postponed_start = server.unixtime; | |
97 | return; | |
98 | } else if (server.unixtime - server.aof_flush_postponed_start < 2) { | |
e7aec180 | 99 | /* We were already waiting for fsync to finish, but for less |
db3c2a4f | 100 | * than two seconds this is still ok. Postpone again. */ |
101 | return; | |
102 | } | |
103 | /* Otherwise fall trough, and go write since we can't wait | |
104 | * over two seconds. */ | |
77ca5fcb | 105 | redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis."); |
db3c2a4f | 106 | } |
107 | } | |
108 | /* If you are following this code path, then we are going to write so | |
109 | * set reset the postponed flush sentinel to zero. */ | |
110 | server.aof_flush_postponed_start = 0; | |
111 | ||
e2641e09 | 112 | /* We want to perform a single write. This should be guaranteed atomic |
113 | * at least if the filesystem we are writing is a real physical one. | |
114 | * While this will save us against the server being killed I don't think | |
115 | * there is much to do about the whole server stopping for power problems | |
116 | * or alike */ | |
a57225c2 PN |
117 | nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); |
118 | if (nwritten != (signed)sdslen(server.aofbuf)) { | |
e2641e09 | 119 | /* Ooops, we are in troubles. The best thing to do for now is |
120 | * aborting instead of giving the illusion that everything is | |
121 | * working as expected. */ | |
a57225c2 | 122 | if (nwritten == -1) { |
e2641e09 | 123 | redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); |
a57225c2 | 124 | } else { |
e2641e09 | 125 | redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno)); |
a57225c2 PN |
126 | } |
127 | exit(1); | |
e2641e09 | 128 | } |
b333e239 | 129 | server.appendonly_current_size += nwritten; |
e2641e09 | 130 | |
f990782f PN |
131 | /* Re-use AOF buffer when it is small enough. The maximum comes from the |
132 | * arena size of 4k minus some overhead (but is otherwise arbitrary). */ | |
133 | if ((sdslen(server.aofbuf)+sdsavail(server.aofbuf)) < 4000) { | |
134 | sdsclear(server.aofbuf); | |
135 | } else { | |
136 | sdsfree(server.aofbuf); | |
137 | server.aofbuf = sdsempty(); | |
138 | } | |
139 | ||
29732248 PN |
140 | /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are |
141 | * children doing I/O in the background. */ | |
e2641e09 | 142 | if (server.no_appendfsync_on_rewrite && |
143 | (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1)) | |
144 | return; | |
29732248 PN |
145 | |
146 | /* Perform the fsync if needed. */ | |
db3c2a4f | 147 | if (server.appendfsync == APPENDFSYNC_ALWAYS) { |
e2641e09 | 148 | /* aof_fsync is defined as fdatasync() for Linux in order to avoid |
149 | * flushing metadata. */ | |
150 | aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ | |
29732248 | 151 | server.lastfsync = server.unixtime; |
db3c2a4f | 152 | } else if ((server.appendfsync == APPENDFSYNC_EVERYSEC && |
153 | server.unixtime > server.lastfsync)) { | |
154 | if (!sync_in_progress) aof_background_fsync(server.appendfd); | |
155 | server.lastfsync = server.unixtime; | |
e2641e09 | 156 | } |
157 | } | |
158 | ||
d1ec6c8b PN |
159 | sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) { |
160 | char buf[32]; | |
161 | int len, j; | |
162 | robj *o; | |
163 | ||
164 | buf[0] = '*'; | |
165 | len = 1+ll2string(buf+1,sizeof(buf)-1,argc); | |
166 | buf[len++] = '\r'; | |
167 | buf[len++] = '\n'; | |
168 | dst = sdscatlen(dst,buf,len); | |
169 | ||
e2641e09 | 170 | for (j = 0; j < argc; j++) { |
d1ec6c8b PN |
171 | o = getDecodedObject(argv[j]); |
172 | buf[0] = '$'; | |
173 | len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr)); | |
174 | buf[len++] = '\r'; | |
175 | buf[len++] = '\n'; | |
176 | dst = sdscatlen(dst,buf,len); | |
177 | dst = sdscatlen(dst,o->ptr,sdslen(o->ptr)); | |
178 | dst = sdscatlen(dst,"\r\n",2); | |
e2641e09 | 179 | decrRefCount(o); |
180 | } | |
d1ec6c8b | 181 | return dst; |
e2641e09 | 182 | } |
183 | ||
184 | sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) { | |
185 | int argc = 3; | |
186 | long when; | |
187 | robj *argv[3]; | |
188 | ||
189 | /* Make sure we can use strtol */ | |
190 | seconds = getDecodedObject(seconds); | |
191 | when = time(NULL)+strtol(seconds->ptr,NULL,10); | |
192 | decrRefCount(seconds); | |
193 | ||
194 | argv[0] = createStringObject("EXPIREAT",8); | |
195 | argv[1] = key; | |
196 | argv[2] = createObject(REDIS_STRING, | |
197 | sdscatprintf(sdsempty(),"%ld",when)); | |
198 | buf = catAppendOnlyGenericCommand(buf, argc, argv); | |
199 | decrRefCount(argv[0]); | |
200 | decrRefCount(argv[2]); | |
201 | return buf; | |
202 | } | |
203 | ||
204 | void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) { | |
205 | sds buf = sdsempty(); | |
206 | robj *tmpargv[3]; | |
207 | ||
208 | /* The DB this command was targetting is not the same as the last command | |
209 | * we appendend. To issue a SELECT command is needed. */ | |
210 | if (dictid != server.appendseldb) { | |
211 | char seldb[64]; | |
212 | ||
213 | snprintf(seldb,sizeof(seldb),"%d",dictid); | |
214 | buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n", | |
215 | (unsigned long)strlen(seldb),seldb); | |
216 | server.appendseldb = dictid; | |
217 | } | |
218 | ||
219 | if (cmd->proc == expireCommand) { | |
220 | /* Translate EXPIRE into EXPIREAT */ | |
221 | buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); | |
222 | } else if (cmd->proc == setexCommand) { | |
223 | /* Translate SETEX to SET and EXPIREAT */ | |
224 | tmpargv[0] = createStringObject("SET",3); | |
225 | tmpargv[1] = argv[1]; | |
226 | tmpargv[2] = argv[3]; | |
227 | buf = catAppendOnlyGenericCommand(buf,3,tmpargv); | |
228 | decrRefCount(tmpargv[0]); | |
229 | buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); | |
230 | } else { | |
231 | buf = catAppendOnlyGenericCommand(buf,argc,argv); | |
232 | } | |
233 | ||
234 | /* Append to the AOF buffer. This will be flushed on disk just before | |
235 | * of re-entering the event loop, so before the client will get a | |
236 | * positive reply about the operation performed. */ | |
237 | server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf)); | |
238 | ||
239 | /* If a background append only file rewriting is in progress we want to | |
240 | * accumulate the differences between the child DB and the current one | |
241 | * in a buffer, so that when the child process will do its work we | |
242 | * can append the differences to the new append only file. */ | |
243 | if (server.bgrewritechildpid != -1) | |
244 | server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf)); | |
245 | ||
246 | sdsfree(buf); | |
247 | } | |
248 | ||
249 | /* In Redis commands are always executed in the context of a client, so in | |
250 | * order to load the append only file we need to create a fake client. */ | |
251 | struct redisClient *createFakeClient(void) { | |
252 | struct redisClient *c = zmalloc(sizeof(*c)); | |
253 | ||
254 | selectDb(c,0); | |
255 | c->fd = -1; | |
256 | c->querybuf = sdsempty(); | |
257 | c->argc = 0; | |
258 | c->argv = NULL; | |
2403fc9f | 259 | c->bufpos = 0; |
e2641e09 | 260 | c->flags = 0; |
261 | /* We set the fake client as a slave waiting for the synchronization | |
262 | * so that Redis will not try to send replies to this client. */ | |
263 | c->replstate = REDIS_REPL_WAIT_BGSAVE_START; | |
264 | c->reply = listCreate(); | |
b67d2345 | 265 | c->watched_keys = listCreate(); |
e2641e09 | 266 | listSetFreeMethod(c->reply,decrRefCount); |
267 | listSetDupMethod(c->reply,dupClientReplyValue); | |
268 | initClientMultiState(c); | |
269 | return c; | |
270 | } | |
271 | ||
272 | void freeFakeClient(struct redisClient *c) { | |
273 | sdsfree(c->querybuf); | |
274 | listRelease(c->reply); | |
b67d2345 | 275 | listRelease(c->watched_keys); |
e2641e09 | 276 | freeClientMultiState(c); |
277 | zfree(c); | |
278 | } | |
279 | ||
280 | /* Replay the append log file. On error REDIS_OK is returned. On non fatal | |
281 | * error (the append only file is zero-length) REDIS_ERR is returned. On | |
282 | * fatal error an error message is logged and the program exists. */ | |
283 | int loadAppendOnlyFile(char *filename) { | |
284 | struct redisClient *fakeClient; | |
285 | FILE *fp = fopen(filename,"r"); | |
286 | struct redis_stat sb; | |
287 | int appendonly = server.appendonly; | |
97e7f8ae | 288 | long loops = 0; |
e2641e09 | 289 | |
4aec2ec8 | 290 | if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) { |
b333e239 | 291 | server.appendonly_current_size = 0; |
4aec2ec8 | 292 | fclose(fp); |
e2641e09 | 293 | return REDIS_ERR; |
4aec2ec8 | 294 | } |
e2641e09 | 295 | |
296 | if (fp == NULL) { | |
297 | redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno)); | |
298 | exit(1); | |
299 | } | |
300 | ||
301 | /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI | |
302 | * to the same file we're about to read. */ | |
303 | server.appendonly = 0; | |
304 | ||
305 | fakeClient = createFakeClient(); | |
97e7f8ae | 306 | startLoading(fp); |
307 | ||
e2641e09 | 308 | while(1) { |
309 | int argc, j; | |
310 | unsigned long len; | |
311 | robj **argv; | |
312 | char buf[128]; | |
313 | sds argsds; | |
314 | struct redisCommand *cmd; | |
e2641e09 | 315 | |
97e7f8ae | 316 | /* Serve the clients from time to time */ |
317 | if (!(loops++ % 1000)) { | |
318 | loadingProgress(ftello(fp)); | |
319 | aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); | |
320 | } | |
321 | ||
e2641e09 | 322 | if (fgets(buf,sizeof(buf),fp) == NULL) { |
323 | if (feof(fp)) | |
324 | break; | |
325 | else | |
326 | goto readerr; | |
327 | } | |
328 | if (buf[0] != '*') goto fmterr; | |
329 | argc = atoi(buf+1); | |
be6f6395 KM |
330 | if (argc < 1) goto fmterr; |
331 | ||
e2641e09 | 332 | argv = zmalloc(sizeof(robj*)*argc); |
333 | for (j = 0; j < argc; j++) { | |
334 | if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr; | |
335 | if (buf[0] != '$') goto fmterr; | |
336 | len = strtol(buf+1,NULL,10); | |
337 | argsds = sdsnewlen(NULL,len); | |
338 | if (len && fread(argsds,len,1,fp) == 0) goto fmterr; | |
339 | argv[j] = createObject(REDIS_STRING,argsds); | |
340 | if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */ | |
341 | } | |
342 | ||
343 | /* Command lookup */ | |
344 | cmd = lookupCommand(argv[0]->ptr); | |
345 | if (!cmd) { | |
346 | redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr); | |
347 | exit(1); | |
348 | } | |
e2641e09 | 349 | /* Run the command in the context of a fake client */ |
350 | fakeClient->argc = argc; | |
351 | fakeClient->argv = argv; | |
352 | cmd->proc(fakeClient); | |
57b07380 PN |
353 | |
354 | /* The fake client should not have a reply */ | |
355 | redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0); | |
ef67a2fc | 356 | /* The fake client should never get blocked */ |
357 | redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0); | |
57b07380 | 358 | |
45b0f6fb PN |
359 | /* Clean up. Command code may have changed argv/argc so we use the |
360 | * argv/argc of the client instead of the local variables. */ | |
361 | for (j = 0; j < fakeClient->argc; j++) | |
362 | decrRefCount(fakeClient->argv[j]); | |
363 | zfree(fakeClient->argv); | |
e2641e09 | 364 | } |
365 | ||
366 | /* This point can only be reached when EOF is reached without errors. | |
367 | * If the client is in the middle of a MULTI/EXEC, log error and quit. */ | |
368 | if (fakeClient->flags & REDIS_MULTI) goto readerr; | |
369 | ||
370 | fclose(fp); | |
371 | freeFakeClient(fakeClient); | |
372 | server.appendonly = appendonly; | |
97e7f8ae | 373 | stopLoading(); |
b333e239 | 374 | aofUpdateCurrentSize(); |
c66bf1fa | 375 | server.auto_aofrewrite_base_size = server.appendonly_current_size; |
e2641e09 | 376 | return REDIS_OK; |
377 | ||
378 | readerr: | |
379 | if (feof(fp)) { | |
380 | redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file"); | |
381 | } else { | |
382 | redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno)); | |
383 | } | |
384 | exit(1); | |
385 | fmterr: | |
412e457c | 386 | redisLog(REDIS_WARNING,"Bad file format reading the append only file: make a backup of your AOF file, then use ./redis-check-aof --fix <filename>"); |
e2641e09 | 387 | exit(1); |
388 | } | |
389 | ||
7271198c PN |
390 | /* Delegate writing an object to writing a bulk string or bulk long long. |
391 | * This is not placed in rio.c since that adds the redis.h dependency. */ | |
392 | int rioWriteBulkObject(rio *r, robj *obj) { | |
393 | /* Avoid using getDecodedObject to help copy-on-write (we are often | |
394 | * in a child process when this function is called). */ | |
395 | if (obj->encoding == REDIS_ENCODING_INT) { | |
396 | return rioWriteBulkLongLong(r,(long)obj->ptr); | |
397 | } else if (obj->encoding == REDIS_ENCODING_RAW) { | |
398 | return rioWriteBulkString(r,obj->ptr,sdslen(obj->ptr)); | |
399 | } else { | |
400 | redisPanic("Unknown string encoding"); | |
401 | } | |
402 | } | |
403 | ||
e2641e09 | 404 | /* Write a sequence of commands able to fully rebuild the dataset into |
405 | * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */ | |
406 | int rewriteAppendOnlyFile(char *filename) { | |
407 | dictIterator *di = NULL; | |
408 | dictEntry *de; | |
7271198c | 409 | rio aof; |
e2641e09 | 410 | FILE *fp; |
411 | char tmpfile[256]; | |
412 | int j; | |
413 | time_t now = time(NULL); | |
414 | ||
415 | /* Note that we have to use a different temp name here compared to the | |
416 | * one used by rewriteAppendOnlyFileBackground() function. */ | |
417 | snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); | |
418 | fp = fopen(tmpfile,"w"); | |
419 | if (!fp) { | |
420 | redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno)); | |
421 | return REDIS_ERR; | |
422 | } | |
7271198c | 423 | |
f96a8a80 | 424 | rioInitWithFile(&aof,fp); |
e2641e09 | 425 | for (j = 0; j < server.dbnum; j++) { |
426 | char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; | |
427 | redisDb *db = server.db+j; | |
428 | dict *d = db->dict; | |
429 | if (dictSize(d) == 0) continue; | |
591f29e0 | 430 | di = dictGetSafeIterator(d); |
e2641e09 | 431 | if (!di) { |
432 | fclose(fp); | |
433 | return REDIS_ERR; | |
434 | } | |
435 | ||
436 | /* SELECT the new DB */ | |
7271198c PN |
437 | if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr; |
438 | if (rioWriteBulkLongLong(&aof,j) == 0) goto werr; | |
e2641e09 | 439 | |
440 | /* Iterate this DB writing every entry */ | |
441 | while((de = dictNext(di)) != NULL) { | |
6901fe77 | 442 | sds keystr; |
e2641e09 | 443 | robj key, *o; |
444 | time_t expiretime; | |
e2641e09 | 445 | |
446 | keystr = dictGetEntryKey(de); | |
447 | o = dictGetEntryVal(de); | |
448 | initStaticStringObject(key,keystr); | |
16d77878 | 449 | |
e2641e09 | 450 | expiretime = getExpire(db,&key); |
451 | ||
452 | /* Save the key and associated value */ | |
453 | if (o->type == REDIS_STRING) { | |
454 | /* Emit a SET command */ | |
455 | char cmd[]="*3\r\n$3\r\nSET\r\n"; | |
7271198c | 456 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
e2641e09 | 457 | /* Key and value */ |
7271198c PN |
458 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; |
459 | if (rioWriteBulkObject(&aof,o) == 0) goto werr; | |
e2641e09 | 460 | } else if (o->type == REDIS_LIST) { |
461 | /* Emit the RPUSHes needed to rebuild the list */ | |
462 | char cmd[]="*3\r\n$5\r\nRPUSH\r\n"; | |
463 | if (o->encoding == REDIS_ENCODING_ZIPLIST) { | |
464 | unsigned char *zl = o->ptr; | |
465 | unsigned char *p = ziplistIndex(zl,0); | |
466 | unsigned char *vstr; | |
467 | unsigned int vlen; | |
468 | long long vlong; | |
469 | ||
470 | while(ziplistGet(p,&vstr,&vlen,&vlong)) { | |
7271198c PN |
471 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
472 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
e2641e09 | 473 | if (vstr) { |
7271198c | 474 | if (rioWriteBulkString(&aof,(char*)vstr,vlen) == 0) |
e2641e09 | 475 | goto werr; |
476 | } else { | |
7271198c | 477 | if (rioWriteBulkLongLong(&aof,vlong) == 0) |
e2641e09 | 478 | goto werr; |
479 | } | |
480 | p = ziplistNext(zl,p); | |
481 | } | |
482 | } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { | |
483 | list *list = o->ptr; | |
484 | listNode *ln; | |
485 | listIter li; | |
486 | ||
487 | listRewind(list,&li); | |
488 | while((ln = listNext(&li))) { | |
489 | robj *eleobj = listNodeValue(ln); | |
490 | ||
7271198c PN |
491 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
492 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
493 | if (rioWriteBulkObject(&aof,eleobj) == 0) goto werr; | |
e2641e09 | 494 | } |
495 | } else { | |
496 | redisPanic("Unknown list encoding"); | |
497 | } | |
498 | } else if (o->type == REDIS_SET) { | |
2767f1c0 | 499 | char cmd[]="*3\r\n$4\r\nSADD\r\n"; |
e2641e09 | 500 | |
2767f1c0 PN |
501 | /* Emit the SADDs needed to rebuild the set */ |
502 | if (o->encoding == REDIS_ENCODING_INTSET) { | |
503 | int ii = 0; | |
23c64fe5 | 504 | int64_t llval; |
2767f1c0 | 505 | while(intsetGet(o->ptr,ii++,&llval)) { |
7271198c PN |
506 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
507 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
508 | if (rioWriteBulkLongLong(&aof,llval) == 0) goto werr; | |
2767f1c0 PN |
509 | } |
510 | } else if (o->encoding == REDIS_ENCODING_HT) { | |
511 | dictIterator *di = dictGetIterator(o->ptr); | |
512 | dictEntry *de; | |
513 | while((de = dictNext(di)) != NULL) { | |
514 | robj *eleobj = dictGetEntryKey(de); | |
7271198c PN |
515 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
516 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
517 | if (rioWriteBulkObject(&aof,eleobj) == 0) goto werr; | |
2767f1c0 PN |
518 | } |
519 | dictReleaseIterator(di); | |
520 | } else { | |
521 | redisPanic("Unknown set encoding"); | |
e2641e09 | 522 | } |
e2641e09 | 523 | } else if (o->type == REDIS_ZSET) { |
524 | /* Emit the ZADDs needed to rebuild the sorted set */ | |
dddf5335 PN |
525 | char cmd[]="*4\r\n$4\r\nZADD\r\n"; |
526 | ||
527 | if (o->encoding == REDIS_ENCODING_ZIPLIST) { | |
528 | unsigned char *zl = o->ptr; | |
529 | unsigned char *eptr, *sptr; | |
530 | unsigned char *vstr; | |
531 | unsigned int vlen; | |
532 | long long vll; | |
533 | double score; | |
534 | ||
535 | eptr = ziplistIndex(zl,0); | |
536 | redisAssert(eptr != NULL); | |
537 | sptr = ziplistNext(zl,eptr); | |
538 | redisAssert(sptr != NULL); | |
539 | ||
540 | while (eptr != NULL) { | |
541 | redisAssert(ziplistGet(eptr,&vstr,&vlen,&vll)); | |
542 | score = zzlGetScore(sptr); | |
543 | ||
7271198c PN |
544 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
545 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
546 | if (rioWriteBulkDouble(&aof,score) == 0) goto werr; | |
dddf5335 | 547 | if (vstr != NULL) { |
7271198c | 548 | if (rioWriteBulkString(&aof,(char*)vstr,vlen) == 0) |
dddf5335 PN |
549 | goto werr; |
550 | } else { | |
7271198c | 551 | if (rioWriteBulkLongLong(&aof,vll) == 0) |
dddf5335 PN |
552 | goto werr; |
553 | } | |
554 | zzlNext(zl,&eptr,&sptr); | |
555 | } | |
100ed062 | 556 | } else if (o->encoding == REDIS_ENCODING_SKIPLIST) { |
dddf5335 PN |
557 | zset *zs = o->ptr; |
558 | dictIterator *di = dictGetIterator(zs->dict); | |
559 | dictEntry *de; | |
560 | ||
561 | while((de = dictNext(di)) != NULL) { | |
562 | robj *eleobj = dictGetEntryKey(de); | |
563 | double *score = dictGetEntryVal(de); | |
564 | ||
7271198c PN |
565 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
566 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
567 | if (rioWriteBulkDouble(&aof,*score) == 0) goto werr; | |
568 | if (rioWriteBulkObject(&aof,eleobj) == 0) goto werr; | |
dddf5335 PN |
569 | } |
570 | dictReleaseIterator(di); | |
571 | } else { | |
572 | redisPanic("Unknown sorted set encoding"); | |
e2641e09 | 573 | } |
e2641e09 | 574 | } else if (o->type == REDIS_HASH) { |
575 | char cmd[]="*4\r\n$4\r\nHSET\r\n"; | |
576 | ||
577 | /* Emit the HSETs needed to rebuild the hash */ | |
578 | if (o->encoding == REDIS_ENCODING_ZIPMAP) { | |
579 | unsigned char *p = zipmapRewind(o->ptr); | |
580 | unsigned char *field, *val; | |
581 | unsigned int flen, vlen; | |
582 | ||
583 | while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) { | |
7271198c PN |
584 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
585 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
586 | if (rioWriteBulkString(&aof,(char*)field,flen) == 0) | |
5bd09cd4 | 587 | goto werr; |
7271198c | 588 | if (rioWriteBulkString(&aof,(char*)val,vlen) == 0) |
5bd09cd4 | 589 | goto werr; |
e2641e09 | 590 | } |
591 | } else { | |
592 | dictIterator *di = dictGetIterator(o->ptr); | |
593 | dictEntry *de; | |
594 | ||
595 | while((de = dictNext(di)) != NULL) { | |
596 | robj *field = dictGetEntryKey(de); | |
597 | robj *val = dictGetEntryVal(de); | |
598 | ||
7271198c PN |
599 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
600 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
601 | if (rioWriteBulkObject(&aof,field) == 0) goto werr; | |
602 | if (rioWriteBulkObject(&aof,val) == 0) goto werr; | |
e2641e09 | 603 | } |
604 | dictReleaseIterator(di); | |
605 | } | |
606 | } else { | |
607 | redisPanic("Unknown object type"); | |
608 | } | |
609 | /* Save the expire time */ | |
610 | if (expiretime != -1) { | |
611 | char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n"; | |
612 | /* If this key is already expired skip it */ | |
613 | if (expiretime < now) continue; | |
7271198c PN |
614 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
615 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; | |
616 | if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr; | |
e2641e09 | 617 | } |
e2641e09 | 618 | } |
619 | dictReleaseIterator(di); | |
620 | } | |
621 | ||
622 | /* Make sure data will not remain on the OS's output buffers */ | |
623 | fflush(fp); | |
624 | aof_fsync(fileno(fp)); | |
625 | fclose(fp); | |
626 | ||
627 | /* Use RENAME to make sure the DB file is changed atomically only | |
628 | * if the generate DB file is ok. */ | |
629 | if (rename(tmpfile,filename) == -1) { | |
630 | redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno)); | |
631 | unlink(tmpfile); | |
632 | return REDIS_ERR; | |
633 | } | |
634 | redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed"); | |
635 | return REDIS_OK; | |
636 | ||
637 | werr: | |
638 | fclose(fp); | |
639 | unlink(tmpfile); | |
640 | redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno)); | |
641 | if (di) dictReleaseIterator(di); | |
642 | return REDIS_ERR; | |
643 | } | |
644 | ||
645 | /* This is how rewriting of the append only file in background works: | |
646 | * | |
647 | * 1) The user calls BGREWRITEAOF | |
648 | * 2) Redis calls this function, that forks(): | |
649 | * 2a) the child rewrite the append only file in a temp file. | |
650 | * 2b) the parent accumulates differences in server.bgrewritebuf. | |
651 | * 3) When the child finished '2a' exists. | |
652 | * 4) The parent will trap the exit code, if it's OK, will append the | |
653 | * data accumulated into server.bgrewritebuf into the temp file, and | |
654 | * finally will rename(2) the temp file in the actual file name. | |
655 | * The the new file is reopened as the new append only file. Profit! | |
656 | */ | |
657 | int rewriteAppendOnlyFileBackground(void) { | |
658 | pid_t childpid; | |
615e414c | 659 | long long start; |
e2641e09 | 660 | |
661 | if (server.bgrewritechildpid != -1) return REDIS_ERR; | |
615e414c | 662 | start = ustime(); |
e2641e09 | 663 | if ((childpid = fork()) == 0) { |
e2641e09 | 664 | char tmpfile[256]; |
665 | ||
615e414c | 666 | /* Child */ |
a5639e7d PN |
667 | if (server.ipfd > 0) close(server.ipfd); |
668 | if (server.sofd > 0) close(server.sofd); | |
e2641e09 | 669 | snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); |
670 | if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) { | |
671 | _exit(0); | |
672 | } else { | |
673 | _exit(1); | |
674 | } | |
675 | } else { | |
676 | /* Parent */ | |
615e414c | 677 | server.stat_fork_time = ustime()-start; |
e2641e09 | 678 | if (childpid == -1) { |
679 | redisLog(REDIS_WARNING, | |
680 | "Can't rewrite append only file in background: fork: %s", | |
681 | strerror(errno)); | |
682 | return REDIS_ERR; | |
683 | } | |
684 | redisLog(REDIS_NOTICE, | |
685 | "Background append only file rewriting started by pid %d",childpid); | |
b508aeb9 | 686 | server.aofrewrite_scheduled = 0; |
e2641e09 | 687 | server.bgrewritechildpid = childpid; |
688 | updateDictResizePolicy(); | |
689 | /* We set appendseldb to -1 in order to force the next call to the | |
690 | * feedAppendOnlyFile() to issue a SELECT command, so the differences | |
691 | * accumulated by the parent into server.bgrewritebuf will start | |
692 | * with a SELECT statement and it will be safe to merge. */ | |
693 | server.appendseldb = -1; | |
694 | return REDIS_OK; | |
695 | } | |
696 | return REDIS_OK; /* unreached */ | |
697 | } | |
698 | ||
699 | void bgrewriteaofCommand(redisClient *c) { | |
700 | if (server.bgrewritechildpid != -1) { | |
3ab20376 | 701 | addReplyError(c,"Background append only file rewriting already in progress"); |
b333e239 | 702 | } else if (server.bgsavechildpid != -1) { |
703 | server.aofrewrite_scheduled = 1; | |
9e40bce3 | 704 | addReplyStatus(c,"Background append only file rewriting scheduled"); |
b333e239 | 705 | } else if (rewriteAppendOnlyFileBackground() == REDIS_OK) { |
3ab20376 | 706 | addReplyStatus(c,"Background append only file rewriting started"); |
e2641e09 | 707 | } else { |
708 | addReply(c,shared.err); | |
709 | } | |
710 | } | |
711 | ||
/* Unlink the temporary file "temp-rewriteaof-bg-<childpid>.aof". The result
 * of unlink(2) is ignored. */
void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
718 | ||
b333e239 | 719 | /* Update the server.appendonly_current_size filed explicitly using stat(2) |
720 | * to check the size of the file. This is useful after a rewrite or after | |
721 | * a restart, normally the size is updated just adding the write length | |
722 | * to the current lenght, that is much faster. */ | |
723 | void aofUpdateCurrentSize(void) { | |
724 | struct redis_stat sb; | |
725 | ||
726 | if (redis_fstat(server.appendfd,&sb) == -1) { | |
727 | redisLog(REDIS_WARNING,"Unable to check the AOF length: %s", | |
728 | strerror(errno)); | |
729 | } else { | |
730 | server.appendonly_current_size = sb.st_size; | |
731 | } | |
732 | } | |
733 | ||
e2641e09 | 734 | /* A background append only file rewriting (BGREWRITEAOF) terminated its work. |
735 | * Handle this. */ | |
36c17a53 | 736 | void backgroundRewriteDoneHandler(int exitcode, int bysignal) { |
e2641e09 | 737 | if (!bysignal && exitcode == 0) { |
b454056d PN |
738 | int newfd, oldfd; |
739 | int nwritten; | |
e2641e09 | 740 | char tmpfile[256]; |
b454056d | 741 | long long now = ustime(); |
e2641e09 | 742 | |
743 | redisLog(REDIS_NOTICE, | |
b454056d PN |
744 | "Background AOF rewrite terminated with success"); |
745 | ||
986630af | 746 | /* Flush the differences accumulated by the parent to the |
747 | * rewritten AOF. */ | |
748 | snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", | |
749 | (int)server.bgrewritechildpid); | |
b454056d PN |
750 | newfd = open(tmpfile,O_WRONLY|O_APPEND); |
751 | if (newfd == -1) { | |
752 | redisLog(REDIS_WARNING, | |
753 | "Unable to open the temporary AOF produced by the child: %s", strerror(errno)); | |
e2641e09 | 754 | goto cleanup; |
755 | } | |
b454056d PN |
756 | |
757 | nwritten = write(newfd,server.bgrewritebuf,sdslen(server.bgrewritebuf)); | |
758 | if (nwritten != (signed)sdslen(server.bgrewritebuf)) { | |
759 | if (nwritten == -1) { | |
760 | redisLog(REDIS_WARNING, | |
761 | "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno)); | |
762 | } else { | |
763 | redisLog(REDIS_WARNING, | |
764 | "Short write trying to flush the parent diff to the rewritten AOF: %s", strerror(errno)); | |
765 | } | |
766 | close(newfd); | |
e2641e09 | 767 | goto cleanup; |
768 | } | |
b454056d PN |
769 | |
770 | redisLog(REDIS_NOTICE, | |
771 | "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", nwritten); | |
772 | ||
773 | /* The only remaining thing to do is to rename the temporary file to | |
774 | * the configured file and switch the file descriptor used to do AOF | |
986630af | 775 | * writes. We don't want close(2) or rename(2) calls to block the |
776 | * server on old file deletion. | |
777 | * | |
778 | * There are two possible scenarios: | |
b454056d PN |
779 | * |
780 | * 1) AOF is DISABLED and this was a one time rewrite. The temporary | |
781 | * file will be renamed to the configured file. When this file already | |
782 | * exists, it will be unlinked, which may block the server. | |
783 | * | |
784 | * 2) AOF is ENABLED and the rewritten AOF will immediately start | |
785 | * receiving writes. After the temporary file is renamed to the | |
786 | * configured file, the original AOF file descriptor will be closed. | |
787 | * Since this will be the last reference to that file, closing it | |
788 | * causes the underlying file to be unlinked, which may block the | |
789 | * server. | |
790 | * | |
791 | * To mitigate the blocking effect of the unlink operation (either | |
792 | * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we | |
986630af | 793 | * use a background thread to take care of this. First, we |
b454056d PN |
794 | * make scenario 1 identical to scenario 2 by opening the target file |
795 | * when it exists. The unlink operation after the rename(2) will then | |
796 | * be executed upon calling close(2) for its descriptor. Everything to | |
797 | * guarantee atomicity for this switch has already happened by then, so | |
798 | * we don't care what the outcome or duration of that close operation | |
799 | * is, as long as the file descriptor is released again. */ | |
800 | if (server.appendfd == -1) { | |
801 | /* AOF disabled */ | |
b454056d | 802 | |
986630af | 803 | /* Don't care if this fails: oldfd will be -1 and we handle that. |
804 | * One notable case of -1 return is if the old file does | |
805 | * not exist. */ | |
806 | oldfd = open(server.appendfilename,O_RDONLY|O_NONBLOCK); | |
b454056d PN |
807 | } else { |
808 | /* AOF enabled */ | |
986630af | 809 | oldfd = -1; /* We'll set this to the current AOF filedes later. */ |
b454056d PN |
810 | } |
811 | ||
812 | /* Rename the temporary file. This will not unlink the target file if | |
813 | * it exists, because we reference it with "oldfd". */ | |
e2641e09 | 814 | if (rename(tmpfile,server.appendfilename) == -1) { |
b454056d PN |
815 | redisLog(REDIS_WARNING, |
816 | "Error trying to rename the temporary AOF: %s", strerror(errno)); | |
817 | close(newfd); | |
986630af | 818 | if (oldfd != -1) close(oldfd); |
e2641e09 | 819 | goto cleanup; |
820 | } | |
b454056d PN |
821 | |
822 | if (server.appendfd == -1) { | |
986630af | 823 | /* AOF disabled, we don't need to set the AOF file descriptor |
824 | * to this new file, so we can close it. */ | |
b454056d PN |
825 | close(newfd); |
826 | } else { | |
986630af | 827 | /* AOF enabled, replace the old fd with the new one. */ |
b454056d PN |
828 | oldfd = server.appendfd; |
829 | server.appendfd = newfd; | |
4b77700a | 830 | if (server.appendfsync == APPENDFSYNC_ALWAYS) |
831 | aof_fsync(newfd); | |
832 | else if (server.appendfsync == APPENDFSYNC_EVERYSEC) | |
833 | aof_background_fsync(newfd); | |
b454056d | 834 | server.appendseldb = -1; /* Make sure SELECT is re-issued */ |
b333e239 | 835 | aofUpdateCurrentSize(); |
c66bf1fa | 836 | server.auto_aofrewrite_base_size = server.appendonly_current_size; |
5f54a5e6 PN |
837 | |
838 | /* Clear regular AOF buffer since its contents was just written to | |
839 | * the new AOF from the background rewrite buffer. */ | |
840 | sdsfree(server.aofbuf); | |
841 | server.aofbuf = sdsempty(); | |
e2641e09 | 842 | } |
b454056d PN |
843 | |
844 | redisLog(REDIS_NOTICE, "Background AOF rewrite successful"); | |
845 | ||
846 | /* Asynchronously close the overwritten AOF. */ | |
50be9b97 | 847 | if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL); |
b454056d PN |
848 | |
849 | redisLog(REDIS_VERBOSE, | |
850 | "Background AOF rewrite signal handler took %lldus", ustime()-now); | |
e2641e09 | 851 | } else if (!bysignal && exitcode != 0) { |
b454056d PN |
852 | redisLog(REDIS_WARNING, |
853 | "Background AOF rewrite terminated with error"); | |
e2641e09 | 854 | } else { |
855 | redisLog(REDIS_WARNING, | |
b454056d | 856 | "Background AOF rewrite terminated by signal %d", bysignal); |
e2641e09 | 857 | } |
b454056d | 858 | |
e2641e09 | 859 | cleanup: |
860 | sdsfree(server.bgrewritebuf); | |
861 | server.bgrewritebuf = sdsempty(); | |
862 | aofRemoveTempFile(server.bgrewritechildpid); | |
863 | server.bgrewritechildpid = -1; | |
864 | } |