| 1 | #include "redis.h" |
| 2 | #include "bio.h" |
| 3 | #include "rio.h" |
| 4 | |
| 5 | #include <signal.h> |
| 6 | #include <fcntl.h> |
| 7 | #include <sys/stat.h> |
| 8 | #include <sys/types.h> |
| 9 | #include <sys/time.h> |
| 10 | #include <sys/resource.h> |
| 11 | #include <sys/wait.h> |
| 12 | |
| 13 | void aofUpdateCurrentSize(void); |
| 14 | |
| 15 | /* ---------------------------------------------------------------------------- |
| 16 | * AOF rewrite buffer implementation. |
| 17 | * |
| 18 | * The following code implements a simple buffer used in order to accumulate |
| 19 | * changes while the background process is rewriting the AOF file. |
| 20 | * |
| 21 | * We only need to append, but can't just use realloc with a large block |
| 22 | * because 'huge' reallocs are not always handled as one could expect |
| 23 | * (via remapping of pages at OS level) but may involve copying data. |
| 24 | * |
| 25 | * For this reason we use a list of blocks, every block is |
| 26 | * AOF_RW_BUF_BLOCK_SIZE bytes. |
| 27 | * ------------------------------------------------------------------------- */ |
| 28 | |
| 29 | #define AOF_RW_BUF_BLOCK_SIZE (1024*1024*10) /* 10 MB per block */ |
| 30 | |
| 31 | typedef struct aofrwblock { |
| 32 | unsigned long used, free; |
| 33 | char buf[AOF_RW_BUF_BLOCK_SIZE]; |
| 34 | } aofrwblock; |
| 35 | |
| 36 | /* This function free the old AOF rewrite buffer if needed, and initialize |
| 37 | * a fresh new one. It tests for server.aof_rewrite_buf_blocks equal to NULL |
| 38 | * so can be used for the first initialization as well. */ |
| 39 | void aofRewriteBufferReset(void) { |
| 40 | if (server.aof_rewrite_buf_blocks) |
| 41 | listRelease(server.aof_rewrite_buf_blocks); |
| 42 | |
| 43 | server.aof_rewrite_buf_blocks = listCreate(); |
| 44 | listSetFreeMethod(server.aof_rewrite_buf_blocks,zfree); |
| 45 | } |
| 46 | |
| 47 | /* Return the current size of the AOF rerwite buffer. */ |
| 48 | unsigned long aofRewriteBufferSize(void) { |
| 49 | listNode *ln = listLast(server.aof_rewrite_buf_blocks); |
| 50 | aofrwblock *block = ln ? ln->value : NULL; |
| 51 | |
| 52 | if (block == NULL) return 0; |
| 53 | unsigned long size = |
| 54 | (listLength(server.aof_rewrite_buf_blocks)-1) * AOF_RW_BUF_BLOCK_SIZE; |
| 55 | size += block->used; |
| 56 | return size; |
| 57 | } |
| 58 | |
| 59 | /* Append data to the AOF rewrite buffer, allocating new blocks if needed. */ |
| 60 | void aofRewriteBufferAppend(unsigned char *s, unsigned long len) { |
| 61 | listNode *ln = listLast(server.aof_rewrite_buf_blocks); |
| 62 | aofrwblock *block = ln ? ln->value : NULL; |
| 63 | |
| 64 | while(len) { |
| 65 | /* If we already got at least an allocated block, try appending |
| 66 | * at least some piece into it. */ |
| 67 | if (block) { |
| 68 | unsigned long thislen = (block->free < len) ? block->free : len; |
| 69 | if (thislen) { /* The current block is not already full. */ |
| 70 | memcpy(block->buf+block->used, s, thislen); |
| 71 | block->used += thislen; |
| 72 | block->free -= thislen; |
| 73 | s += thislen; |
| 74 | len -= thislen; |
| 75 | } |
| 76 | } |
| 77 | |
| 78 | if (len) { /* First block to allocate, or need another block. */ |
| 79 | int numblocks; |
| 80 | |
| 81 | block = zmalloc(sizeof(*block)); |
| 82 | block->free = AOF_RW_BUF_BLOCK_SIZE; |
| 83 | block->used = 0; |
| 84 | listAddNodeTail(server.aof_rewrite_buf_blocks,block); |
| 85 | |
| 86 | /* Log every time we cross more 10 or 100 blocks, respectively |
| 87 | * as a notice or warning. */ |
| 88 | numblocks = listLength(server.aof_rewrite_buf_blocks); |
| 89 | if (((numblocks+1) % 10) == 0) { |
| 90 | int level = ((numblocks+1) % 100) == 0 ? REDIS_WARNING : |
| 91 | REDIS_NOTICE; |
| 92 | redisLog(level,"Background AOF buffer size: %lu MB", |
| 93 | aofRewriteBufferSize()/(1024*1024)); |
| 94 | } |
| 95 | } |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | /* Write the buffer (possibly composed of multiple blocks) into the specified |
| 100 | * fd. If no short write or any other error happens -1 is returned, |
| 101 | * otherwise the number of bytes written is returned. */ |
| 102 | ssize_t aofRewriteBufferWrite(int fd) { |
| 103 | listNode *ln; |
| 104 | listIter li; |
| 105 | ssize_t count = 0; |
| 106 | |
| 107 | listRewind(server.aof_rewrite_buf_blocks,&li); |
| 108 | while((ln = listNext(&li))) { |
| 109 | aofrwblock *block = listNodeValue(ln); |
| 110 | ssize_t nwritten; |
| 111 | |
| 112 | if (block->used) { |
| 113 | nwritten = write(fd,block->buf,block->used); |
| 114 | if (nwritten != block->used) { |
| 115 | if (nwritten == 0) errno = EIO; |
| 116 | return -1; |
| 117 | } |
| 118 | count += nwritten; |
| 119 | } |
| 120 | } |
| 121 | return count; |
| 122 | } |
| 123 | |
| 124 | /* ---------------------------------------------------------------------------- |
| 125 | * AOF file implementation |
| 126 | * ------------------------------------------------------------------------- */ |
| 127 | |
| 128 | /* Starts a background task that performs fsync() against the specified |
| 129 | * file descriptor (the one of the AOF file) in another thread. */ |
| 130 | void aof_background_fsync(int fd) { |
| 131 | bioCreateBackgroundJob(REDIS_BIO_AOF_FSYNC,(void*)(long)fd,NULL,NULL); |
| 132 | } |
| 133 | |
| 134 | /* Called when the user switches from "appendonly yes" to "appendonly no" |
| 135 | * at runtime using the CONFIG command. */ |
| 136 | void stopAppendOnly(void) { |
| 137 | redisAssert(server.aof_state != REDIS_AOF_OFF); |
| 138 | flushAppendOnlyFile(1); |
| 139 | aof_fsync(server.aof_fd); |
| 140 | close(server.aof_fd); |
| 141 | |
| 142 | server.aof_fd = -1; |
| 143 | server.aof_selected_db = -1; |
| 144 | server.aof_state = REDIS_AOF_OFF; |
| 145 | /* rewrite operation in progress? kill it, wait child exit */ |
| 146 | if (server.aof_child_pid != -1) { |
| 147 | int statloc; |
| 148 | |
| 149 | redisLog(REDIS_NOTICE,"Killing running AOF rewrite child: %ld", |
| 150 | (long) server.aof_child_pid); |
| 151 | if (kill(server.aof_child_pid,SIGKILL) != -1) |
| 152 | wait3(&statloc,0,NULL); |
| 153 | /* reset the buffer accumulating changes while the child saves */ |
| 154 | aofRewriteBufferReset(); |
| 155 | aofRemoveTempFile(server.aof_child_pid); |
| 156 | server.aof_child_pid = -1; |
| 157 | server.aof_rewrite_time_start = -1; |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | /* Called when the user switches from "appendonly no" to "appendonly yes" |
| 162 | * at runtime using the CONFIG command. */ |
| 163 | int startAppendOnly(void) { |
| 164 | server.aof_last_fsync = server.unixtime; |
| 165 | server.aof_fd = open(server.aof_filename,O_WRONLY|O_APPEND|O_CREAT,0644); |
| 166 | redisAssert(server.aof_state == REDIS_AOF_OFF); |
| 167 | if (server.aof_fd == -1) { |
| 168 | redisLog(REDIS_WARNING,"Redis needs to enable the AOF but can't open the append only file: %s",strerror(errno)); |
| 169 | return REDIS_ERR; |
| 170 | } |
| 171 | if (rewriteAppendOnlyFileBackground() == REDIS_ERR) { |
| 172 | close(server.aof_fd); |
| 173 | redisLog(REDIS_WARNING,"Redis needs to enable the AOF but can't trigger a background AOF rewrite operation. Check the above logs for more info about the error."); |
| 174 | return REDIS_ERR; |
| 175 | } |
| 176 | /* We correctly switched on AOF, now wait for the rerwite to be complete |
| 177 | * in order to append data on disk. */ |
| 178 | server.aof_state = REDIS_AOF_WAIT_REWRITE; |
| 179 | return REDIS_OK; |
| 180 | } |
| 181 | |
| 182 | /* Write the append only file buffer on disk. |
| 183 | * |
| 184 | * Since we are required to write the AOF before replying to the client, |
| 185 | * and the only way the client socket can get a write is entering when the |
| 186 | * the event loop, we accumulate all the AOF writes in a memory |
| 187 | * buffer and write it on disk using this function just before entering |
| 188 | * the event loop again. |
| 189 | * |
| 190 | * About the 'force' argument: |
| 191 | * |
| 192 | * When the fsync policy is set to 'everysec' we may delay the flush if there |
| 193 | * is still an fsync() going on in the background thread, since for instance |
| 194 | * on Linux write(2) will be blocked by the background fsync anyway. |
| 195 | * When this happens we remember that there is some aof buffer to be |
| 196 | * flushed ASAP, and will try to do that in the serverCron() function. |
| 197 | * |
| 198 | * However if force is set to 1 we'll write regardless of the background |
| 199 | * fsync. */ |
| 200 | void flushAppendOnlyFile(int force) { |
| 201 | ssize_t nwritten; |
| 202 | int sync_in_progress = 0; |
| 203 | |
| 204 | if (sdslen(server.aof_buf) == 0) return; |
| 205 | |
| 206 | if (server.aof_fsync == AOF_FSYNC_EVERYSEC) |
| 207 | sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0; |
| 208 | |
| 209 | if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) { |
| 210 | /* With this append fsync policy we do background fsyncing. |
| 211 | * If the fsync is still in progress we can try to delay |
| 212 | * the write for a couple of seconds. */ |
| 213 | if (sync_in_progress) { |
| 214 | if (server.aof_flush_postponed_start == 0) { |
| 215 | /* No previous write postponinig, remember that we are |
| 216 | * postponing the flush and return. */ |
| 217 | server.aof_flush_postponed_start = server.unixtime; |
| 218 | return; |
| 219 | } else if (server.unixtime - server.aof_flush_postponed_start < 2) { |
| 220 | /* We were already waiting for fsync to finish, but for less |
| 221 | * than two seconds this is still ok. Postpone again. */ |
| 222 | return; |
| 223 | } |
| 224 | /* Otherwise fall trough, and go write since we can't wait |
| 225 | * over two seconds. */ |
| 226 | server.aof_delayed_fsync++; |
| 227 | redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis."); |
| 228 | } |
| 229 | } |
| 230 | /* If you are following this code path, then we are going to write so |
| 231 | * set reset the postponed flush sentinel to zero. */ |
| 232 | server.aof_flush_postponed_start = 0; |
| 233 | |
| 234 | /* We want to perform a single write. This should be guaranteed atomic |
| 235 | * at least if the filesystem we are writing is a real physical one. |
| 236 | * While this will save us against the server being killed I don't think |
| 237 | * there is much to do about the whole server stopping for power problems |
| 238 | * or alike */ |
| 239 | nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf)); |
| 240 | if (nwritten != (signed)sdslen(server.aof_buf)) { |
| 241 | /* Ooops, we are in troubles. The best thing to do for now is |
| 242 | * aborting instead of giving the illusion that everything is |
| 243 | * working as expected. */ |
| 244 | if (nwritten == -1) { |
| 245 | redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); |
| 246 | } else { |
| 247 | redisLog(REDIS_WARNING,"Exiting on short write while writing to " |
| 248 | "the append-only file: %s (nwritten=%ld, " |
| 249 | "expected=%ld)", |
| 250 | strerror(errno), |
| 251 | (long)nwritten, |
| 252 | (long)sdslen(server.aof_buf)); |
| 253 | |
| 254 | if (ftruncate(server.aof_fd, server.aof_current_size) == -1) { |
| 255 | redisLog(REDIS_WARNING, "Could not remove short write " |
| 256 | "from the append-only file. Redis may refuse " |
| 257 | "to load the AOF the next time it starts. " |
| 258 | "ftruncate: %s", strerror(errno)); |
| 259 | } |
| 260 | } |
| 261 | exit(1); |
| 262 | } |
| 263 | server.aof_current_size += nwritten; |
| 264 | |
| 265 | /* Re-use AOF buffer when it is small enough. The maximum comes from the |
| 266 | * arena size of 4k minus some overhead (but is otherwise arbitrary). */ |
| 267 | if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) { |
| 268 | sdsclear(server.aof_buf); |
| 269 | } else { |
| 270 | sdsfree(server.aof_buf); |
| 271 | server.aof_buf = sdsempty(); |
| 272 | } |
| 273 | |
| 274 | /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are |
| 275 | * children doing I/O in the background. */ |
| 276 | if (server.aof_no_fsync_on_rewrite && |
| 277 | (server.aof_child_pid != -1 || server.rdb_child_pid != -1)) |
| 278 | return; |
| 279 | |
| 280 | /* Perform the fsync if needed. */ |
| 281 | if (server.aof_fsync == AOF_FSYNC_ALWAYS) { |
| 282 | /* aof_fsync is defined as fdatasync() for Linux in order to avoid |
| 283 | * flushing metadata. */ |
| 284 | aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */ |
| 285 | server.aof_last_fsync = server.unixtime; |
| 286 | } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC && |
| 287 | server.unixtime > server.aof_last_fsync)) { |
| 288 | if (!sync_in_progress) aof_background_fsync(server.aof_fd); |
| 289 | server.aof_last_fsync = server.unixtime; |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) { |
| 294 | char buf[32]; |
| 295 | int len, j; |
| 296 | robj *o; |
| 297 | |
| 298 | buf[0] = '*'; |
| 299 | len = 1+ll2string(buf+1,sizeof(buf)-1,argc); |
| 300 | buf[len++] = '\r'; |
| 301 | buf[len++] = '\n'; |
| 302 | dst = sdscatlen(dst,buf,len); |
| 303 | |
| 304 | for (j = 0; j < argc; j++) { |
| 305 | o = getDecodedObject(argv[j]); |
| 306 | buf[0] = '$'; |
| 307 | len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr)); |
| 308 | buf[len++] = '\r'; |
| 309 | buf[len++] = '\n'; |
| 310 | dst = sdscatlen(dst,buf,len); |
| 311 | dst = sdscatlen(dst,o->ptr,sdslen(o->ptr)); |
| 312 | dst = sdscatlen(dst,"\r\n",2); |
| 313 | decrRefCount(o); |
| 314 | } |
| 315 | return dst; |
| 316 | } |
| 317 | |
| 318 | /* Create the sds representation of an PEXPIREAT command, using |
| 319 | * 'seconds' as time to live and 'cmd' to understand what command |
| 320 | * we are translating into a PEXPIREAT. |
| 321 | * |
| 322 | * This command is used in order to translate EXPIRE and PEXPIRE commands |
| 323 | * into PEXPIREAT command so that we retain precision in the append only |
| 324 | * file, and the time is always absolute and not relative. */ |
| 325 | sds catAppendOnlyExpireAtCommand(sds buf, struct redisCommand *cmd, robj *key, robj *seconds) { |
| 326 | long long when; |
| 327 | robj *argv[3]; |
| 328 | |
| 329 | /* Make sure we can use strtol */ |
| 330 | seconds = getDecodedObject(seconds); |
| 331 | when = strtoll(seconds->ptr,NULL,10); |
| 332 | /* Convert argument into milliseconds for EXPIRE, SETEX, EXPIREAT */ |
| 333 | if (cmd->proc == expireCommand || cmd->proc == setexCommand || |
| 334 | cmd->proc == expireatCommand) |
| 335 | { |
| 336 | when *= 1000; |
| 337 | } |
| 338 | /* Convert into absolute time for EXPIRE, PEXPIRE, SETEX, PSETEX */ |
| 339 | if (cmd->proc == expireCommand || cmd->proc == pexpireCommand || |
| 340 | cmd->proc == setexCommand || cmd->proc == psetexCommand) |
| 341 | { |
| 342 | when += mstime(); |
| 343 | } |
| 344 | decrRefCount(seconds); |
| 345 | |
| 346 | argv[0] = createStringObject("PEXPIREAT",9); |
| 347 | argv[1] = key; |
| 348 | argv[2] = createStringObjectFromLongLong(when); |
| 349 | buf = catAppendOnlyGenericCommand(buf, 3, argv); |
| 350 | decrRefCount(argv[0]); |
| 351 | decrRefCount(argv[2]); |
| 352 | return buf; |
| 353 | } |
| 354 | |
| 355 | void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) { |
| 356 | sds buf = sdsempty(); |
| 357 | robj *tmpargv[3]; |
| 358 | |
| 359 | /* The DB this command was targetting is not the same as the last command |
| 360 | * we appendend. To issue a SELECT command is needed. */ |
| 361 | if (dictid != server.aof_selected_db) { |
| 362 | char seldb[64]; |
| 363 | |
| 364 | snprintf(seldb,sizeof(seldb),"%d",dictid); |
| 365 | buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n", |
| 366 | (unsigned long)strlen(seldb),seldb); |
| 367 | server.aof_selected_db = dictid; |
| 368 | } |
| 369 | |
| 370 | if (cmd->proc == expireCommand || cmd->proc == pexpireCommand || |
| 371 | cmd->proc == expireatCommand) { |
| 372 | /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */ |
| 373 | buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]); |
| 374 | } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) { |
| 375 | /* Translate SETEX/PSETEX to SET and PEXPIREAT */ |
| 376 | tmpargv[0] = createStringObject("SET",3); |
| 377 | tmpargv[1] = argv[1]; |
| 378 | tmpargv[2] = argv[3]; |
| 379 | buf = catAppendOnlyGenericCommand(buf,3,tmpargv); |
| 380 | decrRefCount(tmpargv[0]); |
| 381 | buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]); |
| 382 | } else { |
| 383 | /* All the other commands don't need translation or need the |
| 384 | * same translation already operated in the command vector |
| 385 | * for the replication itself. */ |
| 386 | buf = catAppendOnlyGenericCommand(buf,argc,argv); |
| 387 | } |
| 388 | |
| 389 | /* Append to the AOF buffer. This will be flushed on disk just before |
| 390 | * of re-entering the event loop, so before the client will get a |
| 391 | * positive reply about the operation performed. */ |
| 392 | if (server.aof_state == REDIS_AOF_ON) |
| 393 | server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf)); |
| 394 | |
| 395 | /* If a background append only file rewriting is in progress we want to |
| 396 | * accumulate the differences between the child DB and the current one |
| 397 | * in a buffer, so that when the child process will do its work we |
| 398 | * can append the differences to the new append only file. */ |
| 399 | if (server.aof_child_pid != -1) |
| 400 | aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf)); |
| 401 | |
| 402 | sdsfree(buf); |
| 403 | } |
| 404 | |
| 405 | /* ---------------------------------------------------------------------------- |
| 406 | * AOF loading |
| 407 | * ------------------------------------------------------------------------- */ |
| 408 | |
| 409 | /* In Redis commands are always executed in the context of a client, so in |
| 410 | * order to load the append only file we need to create a fake client. */ |
| 411 | struct redisClient *createFakeClient(void) { |
| 412 | struct redisClient *c = zmalloc(sizeof(*c)); |
| 413 | |
| 414 | selectDb(c,0); |
| 415 | c->fd = -1; |
| 416 | c->querybuf = sdsempty(); |
| 417 | c->querybuf_peak = 0; |
| 418 | c->argc = 0; |
| 419 | c->argv = NULL; |
| 420 | c->bufpos = 0; |
| 421 | c->flags = 0; |
| 422 | /* We set the fake client as a slave waiting for the synchronization |
| 423 | * so that Redis will not try to send replies to this client. */ |
| 424 | c->replstate = REDIS_REPL_WAIT_BGSAVE_START; |
| 425 | c->reply = listCreate(); |
| 426 | c->reply_bytes = 0; |
| 427 | c->obuf_soft_limit_reached_time = 0; |
| 428 | c->watched_keys = listCreate(); |
| 429 | listSetFreeMethod(c->reply,decrRefCount); |
| 430 | listSetDupMethod(c->reply,dupClientReplyValue); |
| 431 | initClientMultiState(c); |
| 432 | return c; |
| 433 | } |
| 434 | |
| 435 | void freeFakeClient(struct redisClient *c) { |
| 436 | sdsfree(c->querybuf); |
| 437 | listRelease(c->reply); |
| 438 | listRelease(c->watched_keys); |
| 439 | freeClientMultiState(c); |
| 440 | zfree(c); |
| 441 | } |
| 442 | |
| 443 | /* Replay the append log file. On error REDIS_OK is returned. On non fatal |
| 444 | * error (the append only file is zero-length) REDIS_ERR is returned. On |
| 445 | * fatal error an error message is logged and the program exists. */ |
| 446 | int loadAppendOnlyFile(char *filename) { |
| 447 | struct redisClient *fakeClient; |
| 448 | FILE *fp = fopen(filename,"r"); |
| 449 | struct redis_stat sb; |
| 450 | int old_aof_state = server.aof_state; |
| 451 | long loops = 0; |
| 452 | |
| 453 | if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) { |
| 454 | server.aof_current_size = 0; |
| 455 | fclose(fp); |
| 456 | return REDIS_ERR; |
| 457 | } |
| 458 | |
| 459 | if (fp == NULL) { |
| 460 | redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno)); |
| 461 | exit(1); |
| 462 | } |
| 463 | |
| 464 | /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI |
| 465 | * to the same file we're about to read. */ |
| 466 | server.aof_state = REDIS_AOF_OFF; |
| 467 | |
| 468 | fakeClient = createFakeClient(); |
| 469 | startLoading(fp); |
| 470 | |
| 471 | while(1) { |
| 472 | int argc, j; |
| 473 | unsigned long len; |
| 474 | robj **argv; |
| 475 | char buf[128]; |
| 476 | sds argsds; |
| 477 | struct redisCommand *cmd; |
| 478 | |
| 479 | /* Serve the clients from time to time */ |
| 480 | if (!(loops++ % 1000)) { |
| 481 | loadingProgress(ftello(fp)); |
| 482 | aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); |
| 483 | } |
| 484 | |
| 485 | if (fgets(buf,sizeof(buf),fp) == NULL) { |
| 486 | if (feof(fp)) |
| 487 | break; |
| 488 | else |
| 489 | goto readerr; |
| 490 | } |
| 491 | if (buf[0] != '*') goto fmterr; |
| 492 | argc = atoi(buf+1); |
| 493 | if (argc < 1) goto fmterr; |
| 494 | |
| 495 | argv = zmalloc(sizeof(robj*)*argc); |
| 496 | for (j = 0; j < argc; j++) { |
| 497 | if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr; |
| 498 | if (buf[0] != '$') goto fmterr; |
| 499 | len = strtol(buf+1,NULL,10); |
| 500 | argsds = sdsnewlen(NULL,len); |
| 501 | if (len && fread(argsds,len,1,fp) == 0) goto fmterr; |
| 502 | argv[j] = createObject(REDIS_STRING,argsds); |
| 503 | if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */ |
| 504 | } |
| 505 | |
| 506 | /* Command lookup */ |
| 507 | cmd = lookupCommand(argv[0]->ptr); |
| 508 | if (!cmd) { |
| 509 | redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr); |
| 510 | exit(1); |
| 511 | } |
| 512 | /* Run the command in the context of a fake client */ |
| 513 | fakeClient->argc = argc; |
| 514 | fakeClient->argv = argv; |
| 515 | cmd->proc(fakeClient); |
| 516 | |
| 517 | /* The fake client should not have a reply */ |
| 518 | redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0); |
| 519 | /* The fake client should never get blocked */ |
| 520 | redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0); |
| 521 | |
| 522 | /* Clean up. Command code may have changed argv/argc so we use the |
| 523 | * argv/argc of the client instead of the local variables. */ |
| 524 | for (j = 0; j < fakeClient->argc; j++) |
| 525 | decrRefCount(fakeClient->argv[j]); |
| 526 | zfree(fakeClient->argv); |
| 527 | } |
| 528 | |
| 529 | /* This point can only be reached when EOF is reached without errors. |
| 530 | * If the client is in the middle of a MULTI/EXEC, log error and quit. */ |
| 531 | if (fakeClient->flags & REDIS_MULTI) goto readerr; |
| 532 | |
| 533 | fclose(fp); |
| 534 | freeFakeClient(fakeClient); |
| 535 | server.aof_state = old_aof_state; |
| 536 | stopLoading(); |
| 537 | aofUpdateCurrentSize(); |
| 538 | server.aof_rewrite_base_size = server.aof_current_size; |
| 539 | return REDIS_OK; |
| 540 | |
| 541 | readerr: |
| 542 | if (feof(fp)) { |
| 543 | redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file"); |
| 544 | } else { |
| 545 | redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno)); |
| 546 | } |
| 547 | exit(1); |
| 548 | fmterr: |
| 549 | redisLog(REDIS_WARNING,"Bad file format reading the append only file: make a backup of your AOF file, then use ./redis-check-aof --fix <filename>"); |
| 550 | exit(1); |
| 551 | } |
| 552 | |
| 553 | /* ---------------------------------------------------------------------------- |
| 554 | * AOF rewrite |
| 555 | * ------------------------------------------------------------------------- */ |
| 556 | |
| 557 | /* Delegate writing an object to writing a bulk string or bulk long long. |
| 558 | * This is not placed in rio.c since that adds the redis.h dependency. */ |
| 559 | int rioWriteBulkObject(rio *r, robj *obj) { |
| 560 | /* Avoid using getDecodedObject to help copy-on-write (we are often |
| 561 | * in a child process when this function is called). */ |
| 562 | if (obj->encoding == REDIS_ENCODING_INT) { |
| 563 | return rioWriteBulkLongLong(r,(long)obj->ptr); |
| 564 | } else if (obj->encoding == REDIS_ENCODING_RAW) { |
| 565 | return rioWriteBulkString(r,obj->ptr,sdslen(obj->ptr)); |
| 566 | } else { |
| 567 | redisPanic("Unknown string encoding"); |
| 568 | } |
| 569 | } |
| 570 | |
| 571 | /* Emit the commands needed to rebuild a list object. |
| 572 | * The function returns 0 on error, 1 on success. */ |
| 573 | int rewriteListObject(rio *r, robj *key, robj *o) { |
| 574 | long long count = 0, items = listTypeLength(o); |
| 575 | |
| 576 | if (o->encoding == REDIS_ENCODING_ZIPLIST) { |
| 577 | unsigned char *zl = o->ptr; |
| 578 | unsigned char *p = ziplistIndex(zl,0); |
| 579 | unsigned char *vstr; |
| 580 | unsigned int vlen; |
| 581 | long long vlong; |
| 582 | |
| 583 | while(ziplistGet(p,&vstr,&vlen,&vlong)) { |
| 584 | if (count == 0) { |
| 585 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? |
| 586 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; |
| 587 | |
| 588 | if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; |
| 589 | if (rioWriteBulkString(r,"RPUSH",5) == 0) return 0; |
| 590 | if (rioWriteBulkObject(r,key) == 0) return 0; |
| 591 | } |
| 592 | if (vstr) { |
| 593 | if (rioWriteBulkString(r,(char*)vstr,vlen) == 0) return 0; |
| 594 | } else { |
| 595 | if (rioWriteBulkLongLong(r,vlong) == 0) return 0; |
| 596 | } |
| 597 | p = ziplistNext(zl,p); |
| 598 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; |
| 599 | items--; |
| 600 | } |
| 601 | } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { |
| 602 | list *list = o->ptr; |
| 603 | listNode *ln; |
| 604 | listIter li; |
| 605 | |
| 606 | listRewind(list,&li); |
| 607 | while((ln = listNext(&li))) { |
| 608 | robj *eleobj = listNodeValue(ln); |
| 609 | |
| 610 | if (count == 0) { |
| 611 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? |
| 612 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; |
| 613 | |
| 614 | if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; |
| 615 | if (rioWriteBulkString(r,"RPUSH",5) == 0) return 0; |
| 616 | if (rioWriteBulkObject(r,key) == 0) return 0; |
| 617 | } |
| 618 | if (rioWriteBulkObject(r,eleobj) == 0) return 0; |
| 619 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; |
| 620 | items--; |
| 621 | } |
| 622 | } else { |
| 623 | redisPanic("Unknown list encoding"); |
| 624 | } |
| 625 | return 1; |
| 626 | } |
| 627 | |
| 628 | /* Emit the commands needed to rebuild a set object. |
| 629 | * The function returns 0 on error, 1 on success. */ |
| 630 | int rewriteSetObject(rio *r, robj *key, robj *o) { |
| 631 | long long count = 0, items = setTypeSize(o); |
| 632 | |
| 633 | if (o->encoding == REDIS_ENCODING_INTSET) { |
| 634 | int ii = 0; |
| 635 | int64_t llval; |
| 636 | |
| 637 | while(intsetGet(o->ptr,ii++,&llval)) { |
| 638 | if (count == 0) { |
| 639 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? |
| 640 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; |
| 641 | |
| 642 | if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; |
| 643 | if (rioWriteBulkString(r,"SADD",4) == 0) return 0; |
| 644 | if (rioWriteBulkObject(r,key) == 0) return 0; |
| 645 | } |
| 646 | if (rioWriteBulkLongLong(r,llval) == 0) return 0; |
| 647 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; |
| 648 | items--; |
| 649 | } |
| 650 | } else if (o->encoding == REDIS_ENCODING_HT) { |
| 651 | dictIterator *di = dictGetIterator(o->ptr); |
| 652 | dictEntry *de; |
| 653 | |
| 654 | while((de = dictNext(di)) != NULL) { |
| 655 | robj *eleobj = dictGetKey(de); |
| 656 | if (count == 0) { |
| 657 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? |
| 658 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; |
| 659 | |
| 660 | if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; |
| 661 | if (rioWriteBulkString(r,"SADD",4) == 0) return 0; |
| 662 | if (rioWriteBulkObject(r,key) == 0) return 0; |
| 663 | } |
| 664 | if (rioWriteBulkObject(r,eleobj) == 0) return 0; |
| 665 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; |
| 666 | items--; |
| 667 | } |
| 668 | dictReleaseIterator(di); |
| 669 | } else { |
| 670 | redisPanic("Unknown set encoding"); |
| 671 | } |
| 672 | return 1; |
| 673 | } |
| 674 | |
| 675 | /* Emit the commands needed to rebuild a sorted set object. |
| 676 | * The function returns 0 on error, 1 on success. */ |
| 677 | int rewriteSortedSetObject(rio *r, robj *key, robj *o) { |
| 678 | long long count = 0, items = zsetLength(o); |
| 679 | |
| 680 | if (o->encoding == REDIS_ENCODING_ZIPLIST) { |
| 681 | unsigned char *zl = o->ptr; |
| 682 | unsigned char *eptr, *sptr; |
| 683 | unsigned char *vstr; |
| 684 | unsigned int vlen; |
| 685 | long long vll; |
| 686 | double score; |
| 687 | |
| 688 | eptr = ziplistIndex(zl,0); |
| 689 | redisAssert(eptr != NULL); |
| 690 | sptr = ziplistNext(zl,eptr); |
| 691 | redisAssert(sptr != NULL); |
| 692 | |
| 693 | while (eptr != NULL) { |
| 694 | redisAssert(ziplistGet(eptr,&vstr,&vlen,&vll)); |
| 695 | score = zzlGetScore(sptr); |
| 696 | |
| 697 | if (count == 0) { |
| 698 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? |
| 699 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; |
| 700 | |
| 701 | if (rioWriteBulkCount(r,'*',2+cmd_items*2) == 0) return 0; |
| 702 | if (rioWriteBulkString(r,"ZADD",4) == 0) return 0; |
| 703 | if (rioWriteBulkObject(r,key) == 0) return 0; |
| 704 | } |
| 705 | if (rioWriteBulkDouble(r,score) == 0) return 0; |
| 706 | if (vstr != NULL) { |
| 707 | if (rioWriteBulkString(r,(char*)vstr,vlen) == 0) return 0; |
| 708 | } else { |
| 709 | if (rioWriteBulkLongLong(r,vll) == 0) return 0; |
| 710 | } |
| 711 | zzlNext(zl,&eptr,&sptr); |
| 712 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; |
| 713 | items--; |
| 714 | } |
| 715 | } else if (o->encoding == REDIS_ENCODING_SKIPLIST) { |
| 716 | zset *zs = o->ptr; |
| 717 | dictIterator *di = dictGetIterator(zs->dict); |
| 718 | dictEntry *de; |
| 719 | |
| 720 | while((de = dictNext(di)) != NULL) { |
| 721 | robj *eleobj = dictGetKey(de); |
| 722 | double *score = dictGetVal(de); |
| 723 | |
| 724 | if (count == 0) { |
| 725 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? |
| 726 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; |
| 727 | |
| 728 | if (rioWriteBulkCount(r,'*',2+cmd_items*2) == 0) return 0; |
| 729 | if (rioWriteBulkString(r,"ZADD",4) == 0) return 0; |
| 730 | if (rioWriteBulkObject(r,key) == 0) return 0; |
| 731 | } |
| 732 | if (rioWriteBulkDouble(r,*score) == 0) return 0; |
| 733 | if (rioWriteBulkObject(r,eleobj) == 0) return 0; |
| 734 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; |
| 735 | items--; |
| 736 | } |
| 737 | dictReleaseIterator(di); |
| 738 | } else { |
| 739 | redisPanic("Unknown sorted zset encoding"); |
| 740 | } |
| 741 | return 1; |
| 742 | } |
| 743 | |
| 744 | /* Write either the key or the value of the currently selected item of an hash. |
| 745 | * The 'hi' argument passes a valid Redis hash iterator. |
| 746 | * The 'what' filed specifies if to write a key or a value and can be |
| 747 | * either REDIS_HASH_KEY or REDIS_HASH_VALUE. |
| 748 | * |
| 749 | * The function returns 0 on error, non-zero on success. */ |
| 750 | static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { |
| 751 | if (hi->encoding == REDIS_ENCODING_ZIPLIST) { |
| 752 | unsigned char *vstr = NULL; |
| 753 | unsigned int vlen = UINT_MAX; |
| 754 | long long vll = LLONG_MAX; |
| 755 | |
| 756 | hashTypeCurrentFromZiplist(hi, what, &vstr, &vlen, &vll); |
| 757 | if (vstr) { |
| 758 | return rioWriteBulkString(r, (char*)vstr, vlen); |
| 759 | } else { |
| 760 | return rioWriteBulkLongLong(r, vll); |
| 761 | } |
| 762 | |
| 763 | } else if (hi->encoding == REDIS_ENCODING_HT) { |
| 764 | robj *value; |
| 765 | |
| 766 | hashTypeCurrentFromHashTable(hi, what, &value); |
| 767 | return rioWriteBulkObject(r, value); |
| 768 | } |
| 769 | |
| 770 | redisPanic("Unknown hash encoding"); |
| 771 | return 0; |
| 772 | } |
| 773 | |
| 774 | /* Emit the commands needed to rebuild a hash object. |
| 775 | * The function returns 0 on error, 1 on success. */ |
| 776 | int rewriteHashObject(rio *r, robj *key, robj *o) { |
| 777 | hashTypeIterator *hi; |
| 778 | long long count = 0, items = hashTypeLength(o); |
| 779 | |
| 780 | hi = hashTypeInitIterator(o); |
| 781 | while (hashTypeNext(hi) != REDIS_ERR) { |
| 782 | if (count == 0) { |
| 783 | int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? |
| 784 | REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; |
| 785 | |
| 786 | if (rioWriteBulkCount(r,'*',2+cmd_items*2) == 0) return 0; |
| 787 | if (rioWriteBulkString(r,"HMSET",5) == 0) return 0; |
| 788 | if (rioWriteBulkObject(r,key) == 0) return 0; |
| 789 | } |
| 790 | |
| 791 | if (rioWriteHashIteratorCursor(r, hi, REDIS_HASH_KEY) == 0) return 0; |
| 792 | if (rioWriteHashIteratorCursor(r, hi, REDIS_HASH_VALUE) == 0) return 0; |
| 793 | if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; |
| 794 | items--; |
| 795 | } |
| 796 | |
| 797 | hashTypeReleaseIterator(hi); |
| 798 | |
| 799 | return 1; |
| 800 | } |
| 801 | |
| 802 | /* Write a sequence of commands able to fully rebuild the dataset into |
| 803 | * "filename". Used both by REWRITEAOF and BGREWRITEAOF. |
| 804 | * |
| 805 | * In order to minimize the number of commands needed in the rewritten |
| 806 | * log Redis uses variadic commands when possible, such as RPUSH, SADD |
| 807 | * and ZADD. However at max REDIS_AOF_REWRITE_ITEMS_PER_CMD items per time |
| 808 | * are inserted using a single command. */ |
| 809 | int rewriteAppendOnlyFile(char *filename) { |
| 810 | dictIterator *di = NULL; |
| 811 | dictEntry *de; |
| 812 | rio aof; |
| 813 | FILE *fp; |
| 814 | char tmpfile[256]; |
| 815 | int j; |
| 816 | long long now = mstime(); |
| 817 | |
| 818 | /* Note that we have to use a different temp name here compared to the |
| 819 | * one used by rewriteAppendOnlyFileBackground() function. */ |
| 820 | snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); |
| 821 | fp = fopen(tmpfile,"w"); |
| 822 | if (!fp) { |
| 823 | redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno)); |
| 824 | return REDIS_ERR; |
| 825 | } |
| 826 | |
| 827 | rioInitWithFile(&aof,fp); |
| 828 | for (j = 0; j < server.dbnum; j++) { |
| 829 | char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; |
| 830 | redisDb *db = server.db+j; |
| 831 | dict *d = db->dict; |
| 832 | if (dictSize(d) == 0) continue; |
| 833 | di = dictGetSafeIterator(d); |
| 834 | if (!di) { |
| 835 | fclose(fp); |
| 836 | return REDIS_ERR; |
| 837 | } |
| 838 | |
| 839 | /* SELECT the new DB */ |
| 840 | if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr; |
| 841 | if (rioWriteBulkLongLong(&aof,j) == 0) goto werr; |
| 842 | |
| 843 | /* Iterate this DB writing every entry */ |
| 844 | while((de = dictNext(di)) != NULL) { |
| 845 | sds keystr; |
| 846 | robj key, *o; |
| 847 | long long expiretime; |
| 848 | |
| 849 | keystr = dictGetKey(de); |
| 850 | o = dictGetVal(de); |
| 851 | initStaticStringObject(key,keystr); |
| 852 | |
| 853 | expiretime = getExpire(db,&key); |
| 854 | |
| 855 | /* Save the key and associated value */ |
| 856 | if (o->type == REDIS_STRING) { |
| 857 | /* Emit a SET command */ |
| 858 | char cmd[]="*3\r\n$3\r\nSET\r\n"; |
| 859 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
| 860 | /* Key and value */ |
| 861 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; |
| 862 | if (rioWriteBulkObject(&aof,o) == 0) goto werr; |
| 863 | } else if (o->type == REDIS_LIST) { |
| 864 | if (rewriteListObject(&aof,&key,o) == 0) goto werr; |
| 865 | } else if (o->type == REDIS_SET) { |
| 866 | if (rewriteSetObject(&aof,&key,o) == 0) goto werr; |
| 867 | } else if (o->type == REDIS_ZSET) { |
| 868 | if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr; |
| 869 | } else if (o->type == REDIS_HASH) { |
| 870 | if (rewriteHashObject(&aof,&key,o) == 0) goto werr; |
| 871 | } else { |
| 872 | redisPanic("Unknown object type"); |
| 873 | } |
| 874 | /* Save the expire time */ |
| 875 | if (expiretime != -1) { |
| 876 | char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n"; |
| 877 | /* If this key is already expired skip it */ |
| 878 | if (expiretime < now) continue; |
| 879 | if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr; |
| 880 | if (rioWriteBulkObject(&aof,&key) == 0) goto werr; |
| 881 | if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr; |
| 882 | } |
| 883 | } |
| 884 | dictReleaseIterator(di); |
| 885 | } |
| 886 | |
| 887 | /* Make sure data will not remain on the OS's output buffers */ |
| 888 | fflush(fp); |
| 889 | aof_fsync(fileno(fp)); |
| 890 | fclose(fp); |
| 891 | |
| 892 | /* Use RENAME to make sure the DB file is changed atomically only |
| 893 | * if the generate DB file is ok. */ |
| 894 | if (rename(tmpfile,filename) == -1) { |
| 895 | redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno)); |
| 896 | unlink(tmpfile); |
| 897 | return REDIS_ERR; |
| 898 | } |
| 899 | redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed"); |
| 900 | return REDIS_OK; |
| 901 | |
| 902 | werr: |
| 903 | fclose(fp); |
| 904 | unlink(tmpfile); |
| 905 | redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno)); |
| 906 | if (di) dictReleaseIterator(di); |
| 907 | return REDIS_ERR; |
| 908 | } |
| 909 | |
| 910 | /* This is how rewriting of the append only file in background works: |
| 911 | * |
| 912 | * 1) The user calls BGREWRITEAOF |
| 913 | * 2) Redis calls this function, that forks(): |
| 914 | * 2a) the child rewrite the append only file in a temp file. |
| 915 | * 2b) the parent accumulates differences in server.aof_rewrite_buf. |
| 916 | * 3) When the child finished '2a' exists. |
| 917 | * 4) The parent will trap the exit code, if it's OK, will append the |
| 918 | * data accumulated into server.aof_rewrite_buf into the temp file, and |
| 919 | * finally will rename(2) the temp file in the actual file name. |
| 920 | * The the new file is reopened as the new append only file. Profit! |
| 921 | */ |
| 922 | int rewriteAppendOnlyFileBackground(void) { |
| 923 | pid_t childpid; |
| 924 | long long start; |
| 925 | |
| 926 | if (server.aof_child_pid != -1) return REDIS_ERR; |
| 927 | start = ustime(); |
| 928 | if ((childpid = fork()) == 0) { |
| 929 | char tmpfile[256]; |
| 930 | |
| 931 | /* Child */ |
| 932 | if (server.ipfd > 0) close(server.ipfd); |
| 933 | if (server.sofd > 0) close(server.sofd); |
| 934 | snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); |
| 935 | if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) { |
| 936 | exitFromChild(0); |
| 937 | } else { |
| 938 | exitFromChild(1); |
| 939 | } |
| 940 | } else { |
| 941 | /* Parent */ |
| 942 | server.stat_fork_time = ustime()-start; |
| 943 | if (childpid == -1) { |
| 944 | redisLog(REDIS_WARNING, |
| 945 | "Can't rewrite append only file in background: fork: %s", |
| 946 | strerror(errno)); |
| 947 | return REDIS_ERR; |
| 948 | } |
| 949 | redisLog(REDIS_NOTICE, |
| 950 | "Background append only file rewriting started by pid %d",childpid); |
| 951 | server.aof_rewrite_scheduled = 0; |
| 952 | server.aof_rewrite_time_start = time(NULL); |
| 953 | server.aof_child_pid = childpid; |
| 954 | updateDictResizePolicy(); |
| 955 | /* We set appendseldb to -1 in order to force the next call to the |
| 956 | * feedAppendOnlyFile() to issue a SELECT command, so the differences |
| 957 | * accumulated by the parent into server.aof_rewrite_buf will start |
| 958 | * with a SELECT statement and it will be safe to merge. */ |
| 959 | server.aof_selected_db = -1; |
| 960 | return REDIS_OK; |
| 961 | } |
| 962 | return REDIS_OK; /* unreached */ |
| 963 | } |
| 964 | |
| 965 | void bgrewriteaofCommand(redisClient *c) { |
| 966 | if (server.aof_child_pid != -1) { |
| 967 | addReplyError(c,"Background append only file rewriting already in progress"); |
| 968 | } else if (server.rdb_child_pid != -1) { |
| 969 | server.aof_rewrite_scheduled = 1; |
| 970 | addReplyStatus(c,"Background append only file rewriting scheduled"); |
| 971 | } else if (rewriteAppendOnlyFileBackground() == REDIS_OK) { |
| 972 | addReplyStatus(c,"Background append only file rewriting started"); |
| 973 | } else { |
| 974 | addReply(c,shared.err); |
| 975 | } |
| 976 | } |
| 977 | |
/* Remove the temporary file "temp-rewriteaof-bg-<pid>.aof" associated with
 * the background rewrite child 'childpid'. The path is relative to the
 * current working directory and the result of unlink(2) is ignored. */
void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
| 984 | |
| 985 | /* Update the server.aof_current_size filed explicitly using stat(2) |
| 986 | * to check the size of the file. This is useful after a rewrite or after |
| 987 | * a restart, normally the size is updated just adding the write length |
| 988 | * to the current length, that is much faster. */ |
| 989 | void aofUpdateCurrentSize(void) { |
| 990 | struct redis_stat sb; |
| 991 | |
| 992 | if (redis_fstat(server.aof_fd,&sb) == -1) { |
| 993 | redisLog(REDIS_WARNING,"Unable to obtain the AOF file length. stat: %s", |
| 994 | strerror(errno)); |
| 995 | } else { |
| 996 | server.aof_current_size = sb.st_size; |
| 997 | } |
| 998 | } |
| 999 | |
| 1000 | /* A background append only file rewriting (BGREWRITEAOF) terminated its work. |
| 1001 | * Handle this. */ |
| 1002 | void backgroundRewriteDoneHandler(int exitcode, int bysignal) { |
| 1003 | if (!bysignal && exitcode == 0) { |
| 1004 | int newfd, oldfd; |
| 1005 | char tmpfile[256]; |
| 1006 | long long now = ustime(); |
| 1007 | |
| 1008 | redisLog(REDIS_NOTICE, |
| 1009 | "Background AOF rewrite terminated with success"); |
| 1010 | |
| 1011 | /* Flush the differences accumulated by the parent to the |
| 1012 | * rewritten AOF. */ |
| 1013 | snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", |
| 1014 | (int)server.aof_child_pid); |
| 1015 | newfd = open(tmpfile,O_WRONLY|O_APPEND); |
| 1016 | if (newfd == -1) { |
| 1017 | redisLog(REDIS_WARNING, |
| 1018 | "Unable to open the temporary AOF produced by the child: %s", strerror(errno)); |
| 1019 | goto cleanup; |
| 1020 | } |
| 1021 | |
| 1022 | if (aofRewriteBufferWrite(newfd) == -1) { |
| 1023 | redisLog(REDIS_WARNING, |
| 1024 | "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno)); |
| 1025 | close(newfd); |
| 1026 | goto cleanup; |
| 1027 | } |
| 1028 | |
| 1029 | redisLog(REDIS_NOTICE, |
| 1030 | "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", aofRewriteBufferSize()); |
| 1031 | |
| 1032 | /* The only remaining thing to do is to rename the temporary file to |
| 1033 | * the configured file and switch the file descriptor used to do AOF |
| 1034 | * writes. We don't want close(2) or rename(2) calls to block the |
| 1035 | * server on old file deletion. |
| 1036 | * |
| 1037 | * There are two possible scenarios: |
| 1038 | * |
| 1039 | * 1) AOF is DISABLED and this was a one time rewrite. The temporary |
| 1040 | * file will be renamed to the configured file. When this file already |
| 1041 | * exists, it will be unlinked, which may block the server. |
| 1042 | * |
| 1043 | * 2) AOF is ENABLED and the rewritten AOF will immediately start |
| 1044 | * receiving writes. After the temporary file is renamed to the |
| 1045 | * configured file, the original AOF file descriptor will be closed. |
| 1046 | * Since this will be the last reference to that file, closing it |
| 1047 | * causes the underlying file to be unlinked, which may block the |
| 1048 | * server. |
| 1049 | * |
| 1050 | * To mitigate the blocking effect of the unlink operation (either |
| 1051 | * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we |
| 1052 | * use a background thread to take care of this. First, we |
| 1053 | * make scenario 1 identical to scenario 2 by opening the target file |
| 1054 | * when it exists. The unlink operation after the rename(2) will then |
| 1055 | * be executed upon calling close(2) for its descriptor. Everything to |
| 1056 | * guarantee atomicity for this switch has already happened by then, so |
| 1057 | * we don't care what the outcome or duration of that close operation |
| 1058 | * is, as long as the file descriptor is released again. */ |
| 1059 | if (server.aof_fd == -1) { |
| 1060 | /* AOF disabled */ |
| 1061 | |
| 1062 | /* Don't care if this fails: oldfd will be -1 and we handle that. |
| 1063 | * One notable case of -1 return is if the old file does |
| 1064 | * not exist. */ |
| 1065 | oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK); |
| 1066 | } else { |
| 1067 | /* AOF enabled */ |
| 1068 | oldfd = -1; /* We'll set this to the current AOF filedes later. */ |
| 1069 | } |
| 1070 | |
| 1071 | /* Rename the temporary file. This will not unlink the target file if |
| 1072 | * it exists, because we reference it with "oldfd". */ |
| 1073 | if (rename(tmpfile,server.aof_filename) == -1) { |
| 1074 | redisLog(REDIS_WARNING, |
| 1075 | "Error trying to rename the temporary AOF file: %s", strerror(errno)); |
| 1076 | close(newfd); |
| 1077 | if (oldfd != -1) close(oldfd); |
| 1078 | goto cleanup; |
| 1079 | } |
| 1080 | |
| 1081 | if (server.aof_fd == -1) { |
| 1082 | /* AOF disabled, we don't need to set the AOF file descriptor |
| 1083 | * to this new file, so we can close it. */ |
| 1084 | close(newfd); |
| 1085 | } else { |
| 1086 | /* AOF enabled, replace the old fd with the new one. */ |
| 1087 | oldfd = server.aof_fd; |
| 1088 | server.aof_fd = newfd; |
| 1089 | if (server.aof_fsync == AOF_FSYNC_ALWAYS) |
| 1090 | aof_fsync(newfd); |
| 1091 | else if (server.aof_fsync == AOF_FSYNC_EVERYSEC) |
| 1092 | aof_background_fsync(newfd); |
| 1093 | server.aof_selected_db = -1; /* Make sure SELECT is re-issued */ |
| 1094 | aofUpdateCurrentSize(); |
| 1095 | server.aof_rewrite_base_size = server.aof_current_size; |
| 1096 | |
| 1097 | /* Clear regular AOF buffer since its contents was just written to |
| 1098 | * the new AOF from the background rewrite buffer. */ |
| 1099 | sdsfree(server.aof_buf); |
| 1100 | server.aof_buf = sdsempty(); |
| 1101 | } |
| 1102 | |
| 1103 | server.aof_lastbgrewrite_status = REDIS_OK; |
| 1104 | |
| 1105 | redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully"); |
| 1106 | /* Change state from WAIT_REWRITE to ON if needed */ |
| 1107 | if (server.aof_state == REDIS_AOF_WAIT_REWRITE) |
| 1108 | server.aof_state = REDIS_AOF_ON; |
| 1109 | |
| 1110 | /* Asynchronously close the overwritten AOF. */ |
| 1111 | if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL); |
| 1112 | |
| 1113 | redisLog(REDIS_VERBOSE, |
| 1114 | "Background AOF rewrite signal handler took %lldus", ustime()-now); |
| 1115 | } else if (!bysignal && exitcode != 0) { |
| 1116 | server.aof_lastbgrewrite_status = REDIS_ERR; |
| 1117 | |
| 1118 | redisLog(REDIS_WARNING, |
| 1119 | "Background AOF rewrite terminated with error"); |
| 1120 | } else { |
| 1121 | server.aof_lastbgrewrite_status = REDIS_ERR; |
| 1122 | |
| 1123 | redisLog(REDIS_WARNING, |
| 1124 | "Background AOF rewrite terminated by signal %d", bysignal); |
| 1125 | } |
| 1126 | |
| 1127 | cleanup: |
| 1128 | aofRewriteBufferReset(); |
| 1129 | aofRemoveTempFile(server.aof_child_pid); |
| 1130 | server.aof_child_pid = -1; |
| 1131 | server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start; |
| 1132 | server.aof_rewrite_time_start = -1; |
| 1133 | /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */ |
| 1134 | if (server.aof_state == REDIS_AOF_WAIT_REWRITE) |
| 1135 | server.aof_rewrite_scheduled = 1; |
| 1136 | } |