]>
Commit | Line | Data |
---|---|---|
e2641e09 | 1 | #include "redis.h" |
2 | ||
3 | #include <signal.h> | |
4 | #include <fcntl.h> | |
5 | #include <sys/stat.h> | |
3688d7f3 | 6 | #include <sys/types.h> |
7 | #include <sys/time.h> | |
8 | #include <sys/resource.h> | |
9 | #include <sys/wait.h> | |
e2641e09 | 10 | |
11 | /* Called when the user switches from "appendonly yes" to "appendonly no" | |
12 | * at runtime using the CONFIG command. */ | |
13 | void stopAppendOnly(void) { | |
14 | flushAppendOnlyFile(); | |
15 | aof_fsync(server.appendfd); | |
16 | close(server.appendfd); | |
17 | ||
18 | server.appendfd = -1; | |
19 | server.appendseldb = -1; | |
20 | server.appendonly = 0; | |
21 | /* rewrite operation in progress? kill it, wait child exit */ | |
22 | if (server.bgsavechildpid != -1) { | |
23 | int statloc; | |
24 | ||
25 | if (kill(server.bgsavechildpid,SIGKILL) != -1) | |
26 | wait3(&statloc,0,NULL); | |
27 | /* reset the buffer accumulating changes while the child saves */ | |
28 | sdsfree(server.bgrewritebuf); | |
29 | server.bgrewritebuf = sdsempty(); | |
30 | server.bgsavechildpid = -1; | |
31 | } | |
32 | } | |
33 | ||
34 | /* Called when the user switches from "appendonly no" to "appendonly yes" | |
35 | * at runtime using the CONFIG command. */ | |
36 | int startAppendOnly(void) { | |
37 | server.appendonly = 1; | |
38 | server.lastfsync = time(NULL); | |
39 | server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644); | |
40 | if (server.appendfd == -1) { | |
41 | redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno)); | |
42 | return REDIS_ERR; | |
43 | } | |
44 | if (rewriteAppendOnlyFileBackground() == REDIS_ERR) { | |
45 | server.appendonly = 0; | |
46 | close(server.appendfd); | |
47 | redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno)); | |
48 | return REDIS_ERR; | |
49 | } | |
50 | return REDIS_OK; | |
51 | } | |
52 | ||
53 | /* Write the append only file buffer on disk. | |
54 | * | |
55 | * Since we are required to write the AOF before replying to the client, | |
56 | * and the only way the client socket can get a write is entering when the | |
57 | * the event loop, we accumulate all the AOF writes in a memory | |
58 | * buffer and write it on disk using this function just before entering | |
59 | * the event loop again. */ | |
60 | void flushAppendOnlyFile(void) { | |
61 | time_t now; | |
62 | ssize_t nwritten; | |
63 | ||
64 | if (sdslen(server.aofbuf) == 0) return; | |
65 | ||
66 | /* We want to perform a single write. This should be guaranteed atomic | |
67 | * at least if the filesystem we are writing is a real physical one. | |
68 | * While this will save us against the server being killed I don't think | |
69 | * there is much to do about the whole server stopping for power problems | |
70 | * or alike */ | |
71 | nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); | |
72 | if (nwritten != (signed)sdslen(server.aofbuf)) { | |
73 | /* Ooops, we are in troubles. The best thing to do for now is | |
74 | * aborting instead of giving the illusion that everything is | |
75 | * working as expected. */ | |
76 | if (nwritten == -1) { | |
77 | redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); | |
78 | } else { | |
79 | redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno)); | |
80 | } | |
81 | exit(1); | |
82 | } | |
83 | sdsfree(server.aofbuf); | |
84 | server.aofbuf = sdsempty(); | |
85 | ||
86 | /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have | |
87 | * childs performing heavy I/O on disk. */ | |
88 | if (server.no_appendfsync_on_rewrite && | |
89 | (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1)) | |
90 | return; | |
91 | /* Fsync if needed */ | |
92 | now = time(NULL); | |
93 | if (server.appendfsync == APPENDFSYNC_ALWAYS || | |
94 | (server.appendfsync == APPENDFSYNC_EVERYSEC && | |
95 | now-server.lastfsync > 1)) | |
96 | { | |
97 | /* aof_fsync is defined as fdatasync() for Linux in order to avoid | |
98 | * flushing metadata. */ | |
99 | aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ | |
100 | server.lastfsync = now; | |
101 | } | |
102 | } | |
103 | ||
104 | sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) { | |
105 | int j; | |
106 | buf = sdscatprintf(buf,"*%d\r\n",argc); | |
107 | for (j = 0; j < argc; j++) { | |
108 | robj *o = getDecodedObject(argv[j]); | |
109 | buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr)); | |
110 | buf = sdscatlen(buf,o->ptr,sdslen(o->ptr)); | |
111 | buf = sdscatlen(buf,"\r\n",2); | |
112 | decrRefCount(o); | |
113 | } | |
114 | return buf; | |
115 | } | |
116 | ||
117 | sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) { | |
118 | int argc = 3; | |
119 | long when; | |
120 | robj *argv[3]; | |
121 | ||
122 | /* Make sure we can use strtol */ | |
123 | seconds = getDecodedObject(seconds); | |
124 | when = time(NULL)+strtol(seconds->ptr,NULL,10); | |
125 | decrRefCount(seconds); | |
126 | ||
127 | argv[0] = createStringObject("EXPIREAT",8); | |
128 | argv[1] = key; | |
129 | argv[2] = createObject(REDIS_STRING, | |
130 | sdscatprintf(sdsempty(),"%ld",when)); | |
131 | buf = catAppendOnlyGenericCommand(buf, argc, argv); | |
132 | decrRefCount(argv[0]); | |
133 | decrRefCount(argv[2]); | |
134 | return buf; | |
135 | } | |
136 | ||
137 | void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) { | |
138 | sds buf = sdsempty(); | |
139 | robj *tmpargv[3]; | |
140 | ||
141 | /* The DB this command was targetting is not the same as the last command | |
142 | * we appendend. To issue a SELECT command is needed. */ | |
143 | if (dictid != server.appendseldb) { | |
144 | char seldb[64]; | |
145 | ||
146 | snprintf(seldb,sizeof(seldb),"%d",dictid); | |
147 | buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n", | |
148 | (unsigned long)strlen(seldb),seldb); | |
149 | server.appendseldb = dictid; | |
150 | } | |
151 | ||
152 | if (cmd->proc == expireCommand) { | |
153 | /* Translate EXPIRE into EXPIREAT */ | |
154 | buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); | |
155 | } else if (cmd->proc == setexCommand) { | |
156 | /* Translate SETEX to SET and EXPIREAT */ | |
157 | tmpargv[0] = createStringObject("SET",3); | |
158 | tmpargv[1] = argv[1]; | |
159 | tmpargv[2] = argv[3]; | |
160 | buf = catAppendOnlyGenericCommand(buf,3,tmpargv); | |
161 | decrRefCount(tmpargv[0]); | |
162 | buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); | |
163 | } else { | |
164 | buf = catAppendOnlyGenericCommand(buf,argc,argv); | |
165 | } | |
166 | ||
167 | /* Append to the AOF buffer. This will be flushed on disk just before | |
168 | * of re-entering the event loop, so before the client will get a | |
169 | * positive reply about the operation performed. */ | |
170 | server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf)); | |
171 | ||
172 | /* If a background append only file rewriting is in progress we want to | |
173 | * accumulate the differences between the child DB and the current one | |
174 | * in a buffer, so that when the child process will do its work we | |
175 | * can append the differences to the new append only file. */ | |
176 | if (server.bgrewritechildpid != -1) | |
177 | server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf)); | |
178 | ||
179 | sdsfree(buf); | |
180 | } | |
181 | ||
182 | /* In Redis commands are always executed in the context of a client, so in | |
183 | * order to load the append only file we need to create a fake client. */ | |
184 | struct redisClient *createFakeClient(void) { | |
185 | struct redisClient *c = zmalloc(sizeof(*c)); | |
186 | ||
187 | selectDb(c,0); | |
188 | c->fd = -1; | |
189 | c->querybuf = sdsempty(); | |
190 | c->argc = 0; | |
191 | c->argv = NULL; | |
192 | c->flags = 0; | |
193 | /* We set the fake client as a slave waiting for the synchronization | |
194 | * so that Redis will not try to send replies to this client. */ | |
195 | c->replstate = REDIS_REPL_WAIT_BGSAVE_START; | |
196 | c->reply = listCreate(); | |
b67d2345 | 197 | c->watched_keys = listCreate(); |
e2641e09 | 198 | listSetFreeMethod(c->reply,decrRefCount); |
199 | listSetDupMethod(c->reply,dupClientReplyValue); | |
200 | initClientMultiState(c); | |
201 | return c; | |
202 | } | |
203 | ||
204 | void freeFakeClient(struct redisClient *c) { | |
205 | sdsfree(c->querybuf); | |
206 | listRelease(c->reply); | |
b67d2345 | 207 | listRelease(c->watched_keys); |
e2641e09 | 208 | freeClientMultiState(c); |
209 | zfree(c); | |
210 | } | |
211 | ||
212 | /* Replay the append log file. On error REDIS_OK is returned. On non fatal | |
213 | * error (the append only file is zero-length) REDIS_ERR is returned. On | |
214 | * fatal error an error message is logged and the program exists. */ | |
215 | int loadAppendOnlyFile(char *filename) { | |
216 | struct redisClient *fakeClient; | |
217 | FILE *fp = fopen(filename,"r"); | |
218 | struct redis_stat sb; | |
219 | int appendonly = server.appendonly; | |
220 | ||
221 | if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) | |
222 | return REDIS_ERR; | |
223 | ||
224 | if (fp == NULL) { | |
225 | redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno)); | |
226 | exit(1); | |
227 | } | |
228 | ||
229 | /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI | |
230 | * to the same file we're about to read. */ | |
231 | server.appendonly = 0; | |
232 | ||
233 | fakeClient = createFakeClient(); | |
234 | while(1) { | |
235 | int argc, j; | |
236 | unsigned long len; | |
237 | robj **argv; | |
238 | char buf[128]; | |
239 | sds argsds; | |
240 | struct redisCommand *cmd; | |
241 | int force_swapout; | |
242 | ||
243 | if (fgets(buf,sizeof(buf),fp) == NULL) { | |
244 | if (feof(fp)) | |
245 | break; | |
246 | else | |
247 | goto readerr; | |
248 | } | |
249 | if (buf[0] != '*') goto fmterr; | |
250 | argc = atoi(buf+1); | |
251 | argv = zmalloc(sizeof(robj*)*argc); | |
252 | for (j = 0; j < argc; j++) { | |
253 | if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr; | |
254 | if (buf[0] != '$') goto fmterr; | |
255 | len = strtol(buf+1,NULL,10); | |
256 | argsds = sdsnewlen(NULL,len); | |
257 | if (len && fread(argsds,len,1,fp) == 0) goto fmterr; | |
258 | argv[j] = createObject(REDIS_STRING,argsds); | |
259 | if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */ | |
260 | } | |
261 | ||
262 | /* Command lookup */ | |
263 | cmd = lookupCommand(argv[0]->ptr); | |
264 | if (!cmd) { | |
265 | redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr); | |
266 | exit(1); | |
267 | } | |
268 | /* Try object encoding */ | |
269 | if (cmd->flags & REDIS_CMD_BULK) | |
270 | argv[argc-1] = tryObjectEncoding(argv[argc-1]); | |
271 | /* Run the command in the context of a fake client */ | |
272 | fakeClient->argc = argc; | |
273 | fakeClient->argv = argv; | |
274 | cmd->proc(fakeClient); | |
57b07380 PN |
275 | |
276 | /* The fake client should not have a reply */ | |
277 | redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0); | |
278 | ||
e2641e09 | 279 | /* Clean up, ready for the next command */ |
280 | for (j = 0; j < argc; j++) decrRefCount(argv[j]); | |
281 | zfree(argv); | |
57b07380 | 282 | |
e2641e09 | 283 | /* Handle swapping while loading big datasets when VM is on */ |
284 | force_swapout = 0; | |
285 | if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32) | |
286 | force_swapout = 1; | |
287 | ||
288 | if (server.vm_enabled && force_swapout) { | |
289 | while (zmalloc_used_memory() > server.vm_max_memory) { | |
290 | if (vmSwapOneObjectBlocking() == REDIS_ERR) break; | |
291 | } | |
292 | } | |
293 | } | |
294 | ||
295 | /* This point can only be reached when EOF is reached without errors. | |
296 | * If the client is in the middle of a MULTI/EXEC, log error and quit. */ | |
297 | if (fakeClient->flags & REDIS_MULTI) goto readerr; | |
298 | ||
299 | fclose(fp); | |
300 | freeFakeClient(fakeClient); | |
301 | server.appendonly = appendonly; | |
302 | return REDIS_OK; | |
303 | ||
304 | readerr: | |
305 | if (feof(fp)) { | |
306 | redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file"); | |
307 | } else { | |
308 | redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno)); | |
309 | } | |
310 | exit(1); | |
311 | fmterr: | |
312 | redisLog(REDIS_WARNING,"Bad file format reading the append only file"); | |
313 | exit(1); | |
314 | } | |
315 | ||
/* Write the binary-safe string 's' of 'len' bytes into 'fp' using the bulk
 * format: $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if any of the writes failed. */
int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen;

    /* Build the "$<count>\r\n" prefix. */
    header[0] = '$';
    hlen = 1+ll2string(header+1,sizeof(header)-1,len);
    header[hlen] = '\r';
    header[hlen+1] = '\n';
    hlen += 2;

    if (fwrite(header,hlen,1,fp) == 0) return 0;
    /* An empty payload is legal: nothing to write between the CRLFs. */
    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
    return fwrite("\r\n",2,1,fp) != 0;
}
330 | ||
/* Write the double 'd', formatted with "%.17g", into 'fp' using the bulk
 * format: $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if any of the writes failed. */
int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen, hdrlen;

    /* The payload buffer already carries its trailing CRLF, which must
     * not be counted in the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    hdrlen = strlen(header);

    if (fwrite(header,hdrlen,1,fp) == 0) return 0;
    return fwrite(payload,paylen,1,fp) != 0;
}
341 | ||
/* Write the integer 'l' into 'fp' using the bulk format:
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if the write failed. */
int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], line[128];
    unsigned int ndigits, total;

    /* Render the number first so that its length can go in the header,
     * then emit header and payload with a single fwrite. */
    ndigits = ll2string(digits,32,l);
    total = snprintf(line,sizeof(line),"$%u\r\n%s\r\n",ndigits,digits);
    return (fwrite(line,total,1,fp) == 0) ? 0 : 1;
}
351 | ||
352 | /* Delegate writing an object to writing a bulk string or bulk long long. */ | |
353 | int fwriteBulkObject(FILE *fp, robj *obj) { | |
354 | /* Avoid using getDecodedObject to help copy-on-write (we are often | |
355 | * in a child process when this function is called). */ | |
356 | if (obj->encoding == REDIS_ENCODING_INT) { | |
357 | return fwriteBulkLongLong(fp,(long)obj->ptr); | |
358 | } else if (obj->encoding == REDIS_ENCODING_RAW) { | |
359 | return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr)); | |
360 | } else { | |
361 | redisPanic("Unknown string encoding"); | |
362 | } | |
363 | } | |
364 | ||
365 | /* Write a sequence of commands able to fully rebuild the dataset into | |
366 | * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */ | |
367 | int rewriteAppendOnlyFile(char *filename) { | |
368 | dictIterator *di = NULL; | |
369 | dictEntry *de; | |
370 | FILE *fp; | |
371 | char tmpfile[256]; | |
372 | int j; | |
373 | time_t now = time(NULL); | |
374 | ||
375 | /* Note that we have to use a different temp name here compared to the | |
376 | * one used by rewriteAppendOnlyFileBackground() function. */ | |
377 | snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); | |
378 | fp = fopen(tmpfile,"w"); | |
379 | if (!fp) { | |
380 | redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno)); | |
381 | return REDIS_ERR; | |
382 | } | |
383 | for (j = 0; j < server.dbnum; j++) { | |
384 | char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; | |
385 | redisDb *db = server.db+j; | |
386 | dict *d = db->dict; | |
387 | if (dictSize(d) == 0) continue; | |
388 | di = dictGetIterator(d); | |
389 | if (!di) { | |
390 | fclose(fp); | |
391 | return REDIS_ERR; | |
392 | } | |
393 | ||
394 | /* SELECT the new DB */ | |
395 | if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr; | |
396 | if (fwriteBulkLongLong(fp,j) == 0) goto werr; | |
397 | ||
398 | /* Iterate this DB writing every entry */ | |
399 | while((de = dictNext(di)) != NULL) { | |
400 | sds keystr = dictGetEntryKey(de); | |
401 | robj key, *o; | |
402 | time_t expiretime; | |
403 | int swapped; | |
404 | ||
405 | keystr = dictGetEntryKey(de); | |
406 | o = dictGetEntryVal(de); | |
407 | initStaticStringObject(key,keystr); | |
408 | /* If the value for this key is swapped, load a preview in memory. | |
409 | * We use a "swapped" flag to remember if we need to free the | |
410 | * value object instead to just increment the ref count anyway | |
411 | * in order to avoid copy-on-write of pages if we are forked() */ | |
412 | if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY || | |
413 | o->storage == REDIS_VM_SWAPPING) { | |
414 | swapped = 0; | |
415 | } else { | |
416 | o = vmPreviewObject(o); | |
417 | swapped = 1; | |
418 | } | |
419 | expiretime = getExpire(db,&key); | |
420 | ||
421 | /* Save the key and associated value */ | |
422 | if (o->type == REDIS_STRING) { | |
423 | /* Emit a SET command */ | |
424 | char cmd[]="*3\r\n$3\r\nSET\r\n"; | |
425 | if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; | |
426 | /* Key and value */ | |
427 | if (fwriteBulkObject(fp,&key) == 0) goto werr; | |
428 | if (fwriteBulkObject(fp,o) == 0) goto werr; | |
429 | } else if (o->type == REDIS_LIST) { | |
430 | /* Emit the RPUSHes needed to rebuild the list */ | |
431 | char cmd[]="*3\r\n$5\r\nRPUSH\r\n"; | |
432 | if (o->encoding == REDIS_ENCODING_ZIPLIST) { | |
433 | unsigned char *zl = o->ptr; | |
434 | unsigned char *p = ziplistIndex(zl,0); | |
435 | unsigned char *vstr; | |
436 | unsigned int vlen; | |
437 | long long vlong; | |
438 | ||
439 | while(ziplistGet(p,&vstr,&vlen,&vlong)) { | |
440 | if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; | |
441 | if (fwriteBulkObject(fp,&key) == 0) goto werr; | |
442 | if (vstr) { | |
443 | if (fwriteBulkString(fp,(char*)vstr,vlen) == 0) | |
444 | goto werr; | |
445 | } else { | |
446 | if (fwriteBulkLongLong(fp,vlong) == 0) | |
447 | goto werr; | |
448 | } | |
449 | p = ziplistNext(zl,p); | |
450 | } | |
451 | } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { | |
452 | list *list = o->ptr; | |
453 | listNode *ln; | |
454 | listIter li; | |
455 | ||
456 | listRewind(list,&li); | |
457 | while((ln = listNext(&li))) { | |
458 | robj *eleobj = listNodeValue(ln); | |
459 | ||
460 | if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; | |
461 | if (fwriteBulkObject(fp,&key) == 0) goto werr; | |
462 | if (fwriteBulkObject(fp,eleobj) == 0) goto werr; | |
463 | } | |
464 | } else { | |
465 | redisPanic("Unknown list encoding"); | |
466 | } | |
467 | } else if (o->type == REDIS_SET) { | |
2767f1c0 | 468 | char cmd[]="*3\r\n$4\r\nSADD\r\n"; |
e2641e09 | 469 | |
2767f1c0 PN |
470 | /* Emit the SADDs needed to rebuild the set */ |
471 | if (o->encoding == REDIS_ENCODING_INTSET) { | |
472 | int ii = 0; | |
23c64fe5 | 473 | int64_t llval; |
2767f1c0 PN |
474 | while(intsetGet(o->ptr,ii++,&llval)) { |
475 | if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; | |
476 | if (fwriteBulkObject(fp,&key) == 0) goto werr; | |
477 | if (fwriteBulkLongLong(fp,llval) == 0) goto werr; | |
478 | } | |
479 | } else if (o->encoding == REDIS_ENCODING_HT) { | |
480 | dictIterator *di = dictGetIterator(o->ptr); | |
481 | dictEntry *de; | |
482 | while((de = dictNext(di)) != NULL) { | |
483 | robj *eleobj = dictGetEntryKey(de); | |
484 | if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; | |
485 | if (fwriteBulkObject(fp,&key) == 0) goto werr; | |
486 | if (fwriteBulkObject(fp,eleobj) == 0) goto werr; | |
487 | } | |
488 | dictReleaseIterator(di); | |
489 | } else { | |
490 | redisPanic("Unknown set encoding"); | |
e2641e09 | 491 | } |
e2641e09 | 492 | } else if (o->type == REDIS_ZSET) { |
493 | /* Emit the ZADDs needed to rebuild the sorted set */ | |
494 | zset *zs = o->ptr; | |
495 | dictIterator *di = dictGetIterator(zs->dict); | |
496 | dictEntry *de; | |
497 | ||
498 | while((de = dictNext(di)) != NULL) { | |
499 | char cmd[]="*4\r\n$4\r\nZADD\r\n"; | |
500 | robj *eleobj = dictGetEntryKey(de); | |
501 | double *score = dictGetEntryVal(de); | |
502 | ||
503 | if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; | |
504 | if (fwriteBulkObject(fp,&key) == 0) goto werr; | |
505 | if (fwriteBulkDouble(fp,*score) == 0) goto werr; | |
506 | if (fwriteBulkObject(fp,eleobj) == 0) goto werr; | |
507 | } | |
508 | dictReleaseIterator(di); | |
509 | } else if (o->type == REDIS_HASH) { | |
510 | char cmd[]="*4\r\n$4\r\nHSET\r\n"; | |
511 | ||
512 | /* Emit the HSETs needed to rebuild the hash */ | |
513 | if (o->encoding == REDIS_ENCODING_ZIPMAP) { | |
514 | unsigned char *p = zipmapRewind(o->ptr); | |
515 | unsigned char *field, *val; | |
516 | unsigned int flen, vlen; | |
517 | ||
518 | while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) { | |
519 | if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; | |
520 | if (fwriteBulkObject(fp,&key) == 0) goto werr; | |
daf2049d | 521 | if (fwriteBulkString(fp,(char*)field,flen) == 0) |
5bd09cd4 | 522 | goto werr; |
daf2049d | 523 | if (fwriteBulkString(fp,(char*)val,vlen) == 0) |
5bd09cd4 | 524 | goto werr; |
e2641e09 | 525 | } |
526 | } else { | |
527 | dictIterator *di = dictGetIterator(o->ptr); | |
528 | dictEntry *de; | |
529 | ||
530 | while((de = dictNext(di)) != NULL) { | |
531 | robj *field = dictGetEntryKey(de); | |
532 | robj *val = dictGetEntryVal(de); | |
533 | ||
534 | if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; | |
535 | if (fwriteBulkObject(fp,&key) == 0) goto werr; | |
5bd09cd4 | 536 | if (fwriteBulkObject(fp,field) == 0) goto werr; |
537 | if (fwriteBulkObject(fp,val) == 0) goto werr; | |
e2641e09 | 538 | } |
539 | dictReleaseIterator(di); | |
540 | } | |
541 | } else { | |
542 | redisPanic("Unknown object type"); | |
543 | } | |
544 | /* Save the expire time */ | |
545 | if (expiretime != -1) { | |
546 | char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n"; | |
547 | /* If this key is already expired skip it */ | |
548 | if (expiretime < now) continue; | |
549 | if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; | |
550 | if (fwriteBulkObject(fp,&key) == 0) goto werr; | |
551 | if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr; | |
552 | } | |
553 | if (swapped) decrRefCount(o); | |
554 | } | |
555 | dictReleaseIterator(di); | |
556 | } | |
557 | ||
558 | /* Make sure data will not remain on the OS's output buffers */ | |
559 | fflush(fp); | |
560 | aof_fsync(fileno(fp)); | |
561 | fclose(fp); | |
562 | ||
563 | /* Use RENAME to make sure the DB file is changed atomically only | |
564 | * if the generate DB file is ok. */ | |
565 | if (rename(tmpfile,filename) == -1) { | |
566 | redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno)); | |
567 | unlink(tmpfile); | |
568 | return REDIS_ERR; | |
569 | } | |
570 | redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed"); | |
571 | return REDIS_OK; | |
572 | ||
573 | werr: | |
574 | fclose(fp); | |
575 | unlink(tmpfile); | |
576 | redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno)); | |
577 | if (di) dictReleaseIterator(di); | |
578 | return REDIS_ERR; | |
579 | } | |
580 | ||
581 | /* This is how rewriting of the append only file in background works: | |
582 | * | |
583 | * 1) The user calls BGREWRITEAOF | |
584 | * 2) Redis calls this function, that forks(): | |
585 | * 2a) the child rewrite the append only file in a temp file. | |
586 | * 2b) the parent accumulates differences in server.bgrewritebuf. | |
587 | * 3) When the child finished '2a' exists. | |
588 | * 4) The parent will trap the exit code, if it's OK, will append the | |
589 | * data accumulated into server.bgrewritebuf into the temp file, and | |
590 | * finally will rename(2) the temp file in the actual file name. | |
591 | * The the new file is reopened as the new append only file. Profit! | |
592 | */ | |
593 | int rewriteAppendOnlyFileBackground(void) { | |
594 | pid_t childpid; | |
595 | ||
596 | if (server.bgrewritechildpid != -1) return REDIS_ERR; | |
597 | if (server.vm_enabled) waitEmptyIOJobsQueue(); | |
598 | if ((childpid = fork()) == 0) { | |
599 | /* Child */ | |
600 | char tmpfile[256]; | |
601 | ||
602 | if (server.vm_enabled) vmReopenSwapFile(); | |
603 | close(server.fd); | |
604 | snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); | |
605 | if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) { | |
606 | _exit(0); | |
607 | } else { | |
608 | _exit(1); | |
609 | } | |
610 | } else { | |
611 | /* Parent */ | |
612 | if (childpid == -1) { | |
613 | redisLog(REDIS_WARNING, | |
614 | "Can't rewrite append only file in background: fork: %s", | |
615 | strerror(errno)); | |
616 | return REDIS_ERR; | |
617 | } | |
618 | redisLog(REDIS_NOTICE, | |
619 | "Background append only file rewriting started by pid %d",childpid); | |
620 | server.bgrewritechildpid = childpid; | |
621 | updateDictResizePolicy(); | |
622 | /* We set appendseldb to -1 in order to force the next call to the | |
623 | * feedAppendOnlyFile() to issue a SELECT command, so the differences | |
624 | * accumulated by the parent into server.bgrewritebuf will start | |
625 | * with a SELECT statement and it will be safe to merge. */ | |
626 | server.appendseldb = -1; | |
627 | return REDIS_OK; | |
628 | } | |
629 | return REDIS_OK; /* unreached */ | |
630 | } | |
631 | ||
632 | void bgrewriteaofCommand(redisClient *c) { | |
633 | if (server.bgrewritechildpid != -1) { | |
634 | addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n")); | |
635 | return; | |
636 | } | |
637 | if (rewriteAppendOnlyFileBackground() == REDIS_OK) { | |
638 | char *status = "+Background append only file rewriting started\r\n"; | |
639 | addReplySds(c,sdsnew(status)); | |
640 | } else { | |
641 | addReply(c,shared.err); | |
642 | } | |
643 | } | |
644 | ||
/* Remove the temp file used by the background rewrite child having pid
 * 'childpid'. The result of unlink(2) is not checked. */
void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
651 | ||
652 | /* A background append only file rewriting (BGREWRITEAOF) terminated its work. | |
653 | * Handle this. */ | |
654 | void backgroundRewriteDoneHandler(int statloc) { | |
655 | int exitcode = WEXITSTATUS(statloc); | |
656 | int bysignal = WIFSIGNALED(statloc); | |
657 | ||
658 | if (!bysignal && exitcode == 0) { | |
659 | int fd; | |
660 | char tmpfile[256]; | |
661 | ||
662 | redisLog(REDIS_NOTICE, | |
663 | "Background append only file rewriting terminated with success"); | |
664 | /* Now it's time to flush the differences accumulated by the parent */ | |
665 | snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid); | |
666 | fd = open(tmpfile,O_WRONLY|O_APPEND); | |
667 | if (fd == -1) { | |
668 | redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno)); | |
669 | goto cleanup; | |
670 | } | |
671 | /* Flush our data... */ | |
672 | if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) != | |
673 | (signed) sdslen(server.bgrewritebuf)) { | |
674 | redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno)); | |
675 | close(fd); | |
676 | goto cleanup; | |
677 | } | |
678 | redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf)); | |
679 | /* Now our work is to rename the temp file into the stable file. And | |
680 | * switch the file descriptor used by the server for append only. */ | |
681 | if (rename(tmpfile,server.appendfilename) == -1) { | |
682 | redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno)); | |
683 | close(fd); | |
684 | goto cleanup; | |
685 | } | |
686 | /* Mission completed... almost */ | |
687 | redisLog(REDIS_NOTICE,"Append only file successfully rewritten."); | |
688 | if (server.appendfd != -1) { | |
689 | /* If append only is actually enabled... */ | |
690 | close(server.appendfd); | |
691 | server.appendfd = fd; | |
692 | if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd); | |
693 | server.appendseldb = -1; /* Make sure it will issue SELECT */ | |
694 | redisLog(REDIS_NOTICE,"The new append only file was selected for future appends."); | |
695 | } else { | |
696 | /* If append only is disabled we just generate a dump in this | |
697 | * format. Why not? */ | |
698 | close(fd); | |
699 | } | |
700 | } else if (!bysignal && exitcode != 0) { | |
701 | redisLog(REDIS_WARNING, "Background append only file rewriting error"); | |
702 | } else { | |
703 | redisLog(REDIS_WARNING, | |
704 | "Background append only file rewriting terminated by signal %d", | |
705 | WTERMSIG(statloc)); | |
706 | } | |
707 | cleanup: | |
708 | sdsfree(server.bgrewritebuf); | |
709 | server.bgrewritebuf = sdsempty(); | |
710 | aofRemoveTempFile(server.bgrewritechildpid); | |
711 | server.bgrewritechildpid = -1; | |
712 | } |