]> git.saurik.com Git - redis.git/blob - src/replication.c
Configurable synchronous I/O timeout
[redis.git] / src / replication.c
1 #include "redis.h"
2
3 #include <sys/time.h>
4 #include <unistd.h>
5 #include <fcntl.h>
6 #include <sys/stat.h>
7
8 /* ---------------------------------- MASTER -------------------------------- */
9
10 void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
11 listNode *ln;
12 listIter li;
13 int outc = 0, j;
14 robj **outv;
15 /* We need 1+(ARGS*3) objects since commands are using the new protocol
16 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
17 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
18 robj *static_outv[REDIS_STATIC_ARGS*3+1];
19 robj *lenobj;
20
21 if (argc <= REDIS_STATIC_ARGS) {
22 outv = static_outv;
23 } else {
24 outv = zmalloc(sizeof(robj*)*(argc*3+1));
25 }
26
27 lenobj = createObject(REDIS_STRING,
28 sdscatprintf(sdsempty(), "*%d\r\n", argc));
29 lenobj->refcount = 0;
30 outv[outc++] = lenobj;
31 for (j = 0; j < argc; j++) {
32 lenobj = createObject(REDIS_STRING,
33 sdscatprintf(sdsempty(),"$%lu\r\n",
34 (unsigned long) stringObjectLen(argv[j])));
35 lenobj->refcount = 0;
36 outv[outc++] = lenobj;
37 outv[outc++] = argv[j];
38 outv[outc++] = shared.crlf;
39 }
40
41 /* Increment all the refcounts at start and decrement at end in order to
42 * be sure to free objects if there is no slave in a replication state
43 * able to be feed with commands */
44 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
45 listRewind(slaves,&li);
46 while((ln = listNext(&li))) {
47 redisClient *slave = ln->value;
48
49 /* Don't feed slaves that are still waiting for BGSAVE to start */
50 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
51
52 /* Feed all the other slaves, MONITORs and so on */
53 if (slave->slaveseldb != dictid) {
54 robj *selectcmd;
55
56 switch(dictid) {
57 case 0: selectcmd = shared.select0; break;
58 case 1: selectcmd = shared.select1; break;
59 case 2: selectcmd = shared.select2; break;
60 case 3: selectcmd = shared.select3; break;
61 case 4: selectcmd = shared.select4; break;
62 case 5: selectcmd = shared.select5; break;
63 case 6: selectcmd = shared.select6; break;
64 case 7: selectcmd = shared.select7; break;
65 case 8: selectcmd = shared.select8; break;
66 case 9: selectcmd = shared.select9; break;
67 default:
68 selectcmd = createObject(REDIS_STRING,
69 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
70 selectcmd->refcount = 0;
71 break;
72 }
73 addReply(slave,selectcmd);
74 slave->slaveseldb = dictid;
75 }
76 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
77 }
78 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
79 if (outv != static_outv) zfree(outv);
80 }
81
82 void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
83 listNode *ln;
84 listIter li;
85 int j;
86 sds cmdrepr = sdsnew("+");
87 robj *cmdobj;
88 struct timeval tv;
89
90 gettimeofday(&tv,NULL);
91 cmdrepr = sdscatprintf(cmdrepr,"%ld.%06ld ",(long)tv.tv_sec,(long)tv.tv_usec);
92 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
93
94 for (j = 0; j < argc; j++) {
95 if (argv[j]->encoding == REDIS_ENCODING_INT) {
96 cmdrepr = sdscatprintf(cmdrepr, "\"%ld\"", (long)argv[j]->ptr);
97 } else {
98 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
99 sdslen(argv[j]->ptr));
100 }
101 if (j != argc-1)
102 cmdrepr = sdscatlen(cmdrepr," ",1);
103 }
104 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
105 cmdobj = createObject(REDIS_STRING,cmdrepr);
106
107 listRewind(monitors,&li);
108 while((ln = listNext(&li))) {
109 redisClient *monitor = ln->value;
110 addReply(monitor,cmdobj);
111 }
112 decrRefCount(cmdobj);
113 }
114
115 void syncCommand(redisClient *c) {
116 /* ignore SYNC if aleady slave or in monitor mode */
117 if (c->flags & REDIS_SLAVE) return;
118
119 /* Refuse SYNC requests if we are a slave but the link with our master
120 * is not ok... */
121 if (server.masterhost && server.replstate != REDIS_REPL_CONNECTED) {
122 addReplyError(c,"Can't SYNC while not connected with my master");
123 return;
124 }
125
126 /* SYNC can't be issued when the server has pending data to send to
127 * the client about already issued commands. We need a fresh reply
128 * buffer registering the differences between the BGSAVE and the current
129 * dataset, so that we can copy to other slaves if needed. */
130 if (listLength(c->reply) != 0) {
131 addReplyError(c,"SYNC is invalid with pending input");
132 return;
133 }
134
135 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
136 /* Here we need to check if there is a background saving operation
137 * in progress, or if it is required to start one */
138 if (server.bgsavechildpid != -1) {
139 /* Ok a background save is in progress. Let's check if it is a good
140 * one for replication, i.e. if there is another slave that is
141 * registering differences since the server forked to save */
142 redisClient *slave;
143 listNode *ln;
144 listIter li;
145
146 listRewind(server.slaves,&li);
147 while((ln = listNext(&li))) {
148 slave = ln->value;
149 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
150 }
151 if (ln) {
152 /* Perfect, the server is already registering differences for
153 * another slave. Set the right state, and copy the buffer. */
154 listRelease(c->reply);
155 c->reply = listDup(slave->reply);
156 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
157 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
158 } else {
159 /* No way, we need to wait for the next BGSAVE in order to
160 * register differences */
161 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
162 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
163 }
164 } else {
165 /* Ok we don't have a BGSAVE in progress, let's start one */
166 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
167 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
168 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
169 addReplyError(c,"Unable to perform background save");
170 return;
171 }
172 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
173 }
174 c->repldbfd = -1;
175 c->flags |= REDIS_SLAVE;
176 c->slaveseldb = 0;
177 listAddNodeTail(server.slaves,c);
178 return;
179 }
180
181 void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
182 redisClient *slave = privdata;
183 REDIS_NOTUSED(el);
184 REDIS_NOTUSED(mask);
185 char buf[REDIS_IOBUF_LEN];
186 ssize_t nwritten, buflen;
187
188 if (slave->repldboff == 0) {
189 /* Write the bulk write count before to transfer the DB. In theory here
190 * we don't know how much room there is in the output buffer of the
191 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
192 * operations) will never be smaller than the few bytes we need. */
193 sds bulkcount;
194
195 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
196 slave->repldbsize);
197 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
198 {
199 sdsfree(bulkcount);
200 freeClient(slave);
201 return;
202 }
203 sdsfree(bulkcount);
204 }
205 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
206 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
207 if (buflen <= 0) {
208 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
209 (buflen == 0) ? "premature EOF" : strerror(errno));
210 freeClient(slave);
211 return;
212 }
213 if ((nwritten = write(fd,buf,buflen)) == -1) {
214 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
215 strerror(errno));
216 freeClient(slave);
217 return;
218 }
219 slave->repldboff += nwritten;
220 if (slave->repldboff == slave->repldbsize) {
221 close(slave->repldbfd);
222 slave->repldbfd = -1;
223 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
224 slave->replstate = REDIS_REPL_ONLINE;
225 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
226 sendReplyToClient, slave) == AE_ERR) {
227 freeClient(slave);
228 return;
229 }
230 addReplySds(slave,sdsempty());
231 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
232 }
233 }
234
235 /* This function is called at the end of every backgrond saving.
236 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
237 * otherwise REDIS_ERR is passed to the function.
238 *
239 * The goal of this function is to handle slaves waiting for a successful
240 * background saving in order to perform non-blocking synchronization. */
241 void updateSlavesWaitingBgsave(int bgsaveerr) {
242 listNode *ln;
243 int startbgsave = 0;
244 listIter li;
245
246 listRewind(server.slaves,&li);
247 while((ln = listNext(&li))) {
248 redisClient *slave = ln->value;
249
250 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
251 startbgsave = 1;
252 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
253 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
254 struct redis_stat buf;
255
256 if (bgsaveerr != REDIS_OK) {
257 freeClient(slave);
258 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
259 continue;
260 }
261 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
262 redis_fstat(slave->repldbfd,&buf) == -1) {
263 freeClient(slave);
264 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
265 continue;
266 }
267 slave->repldboff = 0;
268 slave->repldbsize = buf.st_size;
269 slave->replstate = REDIS_REPL_SEND_BULK;
270 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
271 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
272 freeClient(slave);
273 continue;
274 }
275 }
276 }
277 if (startbgsave) {
278 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
279 listIter li;
280
281 listRewind(server.slaves,&li);
282 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
283 while((ln = listNext(&li))) {
284 redisClient *slave = ln->value;
285
286 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
287 freeClient(slave);
288 }
289 }
290 }
291 }
292
293 /* ----------------------------------- SLAVE -------------------------------- */
294
295 /* Abort the async download of the bulk dataset while SYNC-ing with master */
296 void replicationAbortSyncTransfer(void) {
297 redisAssert(server.replstate == REDIS_REPL_TRANSFER);
298
299 aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE);
300 close(server.repl_transfer_s);
301 close(server.repl_transfer_fd);
302 unlink(server.repl_transfer_tmpfile);
303 zfree(server.repl_transfer_tmpfile);
304 server.replstate = REDIS_REPL_CONNECT;
305 }
306
307 /* Asynchronously read the SYNC payload we receive from a master */
308 void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
309 char buf[4096];
310 ssize_t nread, readlen;
311 REDIS_NOTUSED(el);
312 REDIS_NOTUSED(privdata);
313 REDIS_NOTUSED(mask);
314
315 /* If repl_transfer_left == -1 we still have to read the bulk length
316 * from the master reply. */
317 if (server.repl_transfer_left == -1) {
318 if (syncReadLine(fd,buf,1024,server.repl_syncio_timeout) == -1) {
319 redisLog(REDIS_WARNING,
320 "I/O error reading bulk count from MASTER: %s",
321 strerror(errno));
322 goto error;
323 }
324
325 if (buf[0] == '-') {
326 redisLog(REDIS_WARNING,
327 "MASTER aborted replication with an error: %s",
328 buf+1);
329 goto error;
330 } else if (buf[0] == '\0') {
331 /* At this stage just a newline works as a PING in order to take
332 * the connection live. So we refresh our last interaction
333 * timestamp. */
334 server.repl_transfer_lastio = time(NULL);
335 return;
336 } else if (buf[0] != '$') {
337 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
338 goto error;
339 }
340 server.repl_transfer_left = strtol(buf+1,NULL,10);
341 redisLog(REDIS_NOTICE,
342 "MASTER <-> SLAVE sync: receiving %ld bytes from master",
343 server.repl_transfer_left);
344 return;
345 }
346
347 /* Read bulk data */
348 readlen = (server.repl_transfer_left < (signed)sizeof(buf)) ?
349 server.repl_transfer_left : (signed)sizeof(buf);
350 nread = read(fd,buf,readlen);
351 if (nread <= 0) {
352 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
353 (nread == -1) ? strerror(errno) : "connection lost");
354 replicationAbortSyncTransfer();
355 return;
356 }
357 server.repl_transfer_lastio = time(NULL);
358 if (write(server.repl_transfer_fd,buf,nread) != nread) {
359 redisLog(REDIS_WARNING,"Write error or short write writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
360 goto error;
361 }
362 server.repl_transfer_left -= nread;
363 /* Check if the transfer is now complete */
364 if (server.repl_transfer_left == 0) {
365 if (rename(server.repl_transfer_tmpfile,server.dbfilename) == -1) {
366 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
367 replicationAbortSyncTransfer();
368 return;
369 }
370 redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Loading DB in memory");
371 emptyDb();
372 /* Before loading the DB into memory we need to delete the readable
373 * handler, otherwise it will get called recursively since
374 * rdbLoad() will call the event loop to process events from time to
375 * time for non blocking loading. */
376 aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE);
377 if (rdbLoad(server.dbfilename) != REDIS_OK) {
378 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
379 replicationAbortSyncTransfer();
380 return;
381 }
382 /* Final setup of the connected slave <- master link */
383 zfree(server.repl_transfer_tmpfile);
384 close(server.repl_transfer_fd);
385 server.master = createClient(server.repl_transfer_s);
386 server.master->flags |= REDIS_MASTER;
387 server.master->authenticated = 1;
388 server.replstate = REDIS_REPL_CONNECTED;
389 redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Finished with success");
390 }
391
392 return;
393
394 error:
395 replicationAbortSyncTransfer();
396 return;
397 }
398
399 void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
400 char buf[1024], tmpfile[256];
401 int dfd, maxtries = 5;
402 REDIS_NOTUSED(el);
403 REDIS_NOTUSED(privdata);
404 REDIS_NOTUSED(mask);
405
406 /* This event should only be triggered once since it is used to have a
407 * non-blocking connect(2) to the master. It has been triggered when this
408 * function is called, so we can delete it. */
409 aeDeleteFileEvent(server.el,fd,AE_WRITABLE);
410
411 /* AUTH with the master if required. */
412 if(server.masterauth) {
413 char authcmd[1024];
414 size_t authlen;
415
416 authlen = snprintf(authcmd,sizeof(authcmd),"AUTH %s\r\n",server.masterauth);
417 if (syncWrite(fd,authcmd,authlen,server.repl_syncio_timeout) == -1) {
418 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
419 strerror(errno));
420 goto error;
421 }
422 /* Read the AUTH result. */
423 if (syncReadLine(fd,buf,1024,server.repl_syncio_timeout) == -1) {
424 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
425 strerror(errno));
426 goto error;
427 }
428 if (buf[0] != '+') {
429 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
430 goto error;
431 }
432 }
433
434 /* Issue the SYNC command */
435 if (syncWrite(fd,"SYNC \r\n",7,server.repl_syncio_timeout) == -1) {
436 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
437 strerror(errno));
438 goto error;
439 }
440
441 /* Prepare a suitable temp file for bulk transfer */
442 while(maxtries--) {
443 snprintf(tmpfile,256,
444 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
445 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
446 if (dfd != -1) break;
447 sleep(1);
448 }
449 if (dfd == -1) {
450 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
451 goto error;
452 }
453
454 /* Setup the non blocking download of the bulk file. */
455 if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)
456 == AE_ERR)
457 {
458 redisLog(REDIS_WARNING,"Can't create readable event for SYNC");
459 goto error;
460 }
461
462 server.replstate = REDIS_REPL_TRANSFER;
463 server.repl_transfer_left = -1;
464 server.repl_transfer_fd = dfd;
465 server.repl_transfer_lastio = time(NULL);
466 server.repl_transfer_tmpfile = zstrdup(tmpfile);
467 return;
468
469 error:
470 server.replstate = REDIS_REPL_CONNECT;
471 close(fd);
472 return;
473 }
474
475 int connectWithMaster(void) {
476 int fd;
477
478 fd = anetTcpNonBlockConnect(NULL,server.masterhost,server.masterport);
479 if (fd == -1) {
480 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
481 strerror(errno));
482 return REDIS_ERR;
483 }
484
485 if (aeCreateFileEvent(server.el,fd,AE_WRITABLE,syncWithMaster,NULL) ==
486 AE_ERR)
487 {
488 close(fd);
489 redisLog(REDIS_WARNING,"Can't create readable event for SYNC");
490 return REDIS_ERR;
491 }
492
493 server.repl_transfer_s = fd;
494 server.replstate = REDIS_REPL_CONNECTING;
495 return REDIS_OK;
496 }
497
498 void slaveofCommand(redisClient *c) {
499 if (!strcasecmp(c->argv[1]->ptr,"no") &&
500 !strcasecmp(c->argv[2]->ptr,"one")) {
501 if (server.masterhost) {
502 sdsfree(server.masterhost);
503 server.masterhost = NULL;
504 if (server.master) freeClient(server.master);
505 if (server.replstate == REDIS_REPL_TRANSFER)
506 replicationAbortSyncTransfer();
507 server.replstate = REDIS_REPL_NONE;
508 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
509 }
510 } else {
511 sdsfree(server.masterhost);
512 server.masterhost = sdsdup(c->argv[1]->ptr);
513 server.masterport = atoi(c->argv[2]->ptr);
514 if (server.master) freeClient(server.master);
515 if (server.replstate == REDIS_REPL_TRANSFER)
516 replicationAbortSyncTransfer();
517 server.replstate = REDIS_REPL_CONNECT;
518 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
519 server.masterhost, server.masterport);
520 }
521 addReply(c,shared.ok);
522 }
523
524 /* --------------------------- REPLICATION CRON ---------------------------- */
525
526 #define REDIS_REPL_TIMEOUT 60
527 #define REDIS_REPL_PING_SLAVE_PERIOD 10
528
529 void replicationCron(void) {
530 /* Bulk transfer I/O timeout? */
531 if (server.masterhost && server.replstate == REDIS_REPL_TRANSFER &&
532 (time(NULL)-server.repl_transfer_lastio) > REDIS_REPL_TIMEOUT)
533 {
534 redisLog(REDIS_WARNING,"Timeout receiving bulk data from MASTER...");
535 replicationAbortSyncTransfer();
536 }
537
538 /* Timed out master when we are an already connected slave? */
539 if (server.masterhost && server.replstate == REDIS_REPL_CONNECTED &&
540 (time(NULL)-server.master->lastinteraction) > REDIS_REPL_TIMEOUT)
541 {
542 redisLog(REDIS_WARNING,"MASTER time out: no data nor PING received...");
543 freeClient(server.master);
544 }
545
546 /* Check if we should connect to a MASTER */
547 if (server.replstate == REDIS_REPL_CONNECT) {
548 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
549 if (connectWithMaster() == REDIS_OK) {
550 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync started");
551 if (server.appendonly) rewriteAppendOnlyFileBackground();
552 }
553 }
554
555 /* If we have attached slaves, PING them from time to time.
556 * So slaves can implement an explicit timeout to masters, and will
557 * be able to detect a link disconnection even if the TCP connection
558 * will not actually go down. */
559 if (!(server.cronloops % (REDIS_REPL_PING_SLAVE_PERIOD*10))) {
560 listIter li;
561 listNode *ln;
562
563 listRewind(server.slaves,&li);
564 while((ln = listNext(&li))) {
565 redisClient *slave = ln->value;
566
567 /* Don't ping slaves that are in the middle of a bulk transfer
568 * with the master for first synchronization. */
569 if (slave->replstate == REDIS_REPL_SEND_BULK) continue;
570 if (slave->replstate == REDIS_REPL_ONLINE) {
571 /* If the slave is online send a normal ping */
572 addReplySds(slave,sdsnew("PING\r\n"));
573 } else {
574 /* Otherwise we are in the pre-synchronization stage.
575 * Just a newline will do the work of refreshing the
576 * connection last interaction time, and at the same time
577 * we'll be sure that being a single char there are no
578 * short-write problems. */
579 if (write(slave->fd, "\n", 1) == -1) {
580 /* Don't worry, it's just a ping. */
581 }
582 }
583 }
584 }
585 }