]> git.saurik.com Git - redis.git/blame - src/sentinel.c
Sentinel: added documentation about slave-priority in redis.conf
[redis.git] / src / sentinel.c
CommitLineData
6b5daa2d 1/* Redis Sentinel implementation
2 * -----------------------------
3 *
4 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * * Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 * * Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Redis nor the names of its contributors may be used
16 * to endorse or promote products derived from this software without
17 * specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "redis.h"
33#include "hiredis.h"
34#include "async.h"
35
36#include <ctype.h>
37#include <arpa/inet.h>
38#include <sys/socket.h>
c6c19c83 39#include <sys/wait.h>
6b5daa2d 40
baace5fc 41extern char **environ;
42
6b5daa2d 43#define REDIS_SENTINEL_PORT 26379
44
45/* ======================== Sentinel global state =========================== */
46
typedef long long mstime_t; /* Time value expressed in milliseconds. */

/* Network endpoint: an ip:port pair. */
typedef struct sentinelAddr {
    char *ip;   /* IP address as a string. */
    int port;   /* TCP port. */
} sentinelAddr;
54
/* Flags describing a sentinelRedisInstance (stored in its 'flags' field). */
#define SRI_MASTER  (1<<0)
#define SRI_SLAVE   (1<<1)
#define SRI_SENTINEL (1<<2)
#define SRI_DISCONNECTED (1<<3)
#define SRI_S_DOWN (1<<4)   /* Subjectively down (no quorum). */
#define SRI_O_DOWN (1<<5)   /* Objectively down (quorum reached). */
#define SRI_MASTER_DOWN (1<<6) /* A Sentinel with this flag set thinks that
                                  its master is down. */
/* SRI_CAN_FAILOVER, when set in an SRI_MASTER instance, means that we are
 * allowed to perform the failover for this master.
 * When set in an SRI_SENTINEL instance it means that sentinel is allowed to
 * perform the failover on its master. */
#define SRI_CAN_FAILOVER (1<<7)
#define SRI_FAILOVER_IN_PROGRESS (1<<8) /* Failover is in progress for
                                           this master. */
#define SRI_I_AM_THE_LEADER (1<<9)     /* We are the leader for this master. */
#define SRI_PROMOTED (1<<10)            /* Slave selected for promotion. */
#define SRI_RECONF_SENT (1<<11)     /* SLAVEOF <newmaster> sent. */
#define SRI_RECONF_INPROG (1<<12)   /* Slave synchronization in progress. */
#define SRI_RECONF_DONE (1<<13)     /* Slave synchronized with new master. */
#define SRI_FORCE_FAILOVER (1<<14)  /* Force failover with master up. */
#define SRI_SCRIPT_KILL_SENT (1<<15) /* SCRIPT KILL already sent on -BUSY */
6b5daa2d 78
/* Periods, timeouts and other tunables. */
#define SENTINEL_INFO_PERIOD 10000
#define SENTINEL_PING_PERIOD 1000
#define SENTINEL_ASK_PERIOD 1000
#define SENTINEL_PUBLISH_PERIOD 5000
#define SENTINEL_DOWN_AFTER_PERIOD 30000
#define SENTINEL_HELLO_CHANNEL "__sentinel__:hello"
#define SENTINEL_TILT_TRIGGER 2000
#define SENTINEL_TILT_PERIOD (SENTINEL_PING_PERIOD*30)
#define SENTINEL_DEFAULT_SLAVE_PRIORITY 100
#define SENTINEL_PROMOTION_RETRY_PERIOD 30000
#define SENTINEL_SLAVE_RECONF_RETRY_PERIOD 10000
#define SENTINEL_DEFAULT_PARALLEL_SYNCS 1
#define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000
#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*15*1000)
#define SENTINEL_MAX_PENDING_COMMANDS 100
#define SENTINEL_EXTENDED_SDOWN_MULTIPLIER 10

/* For how many milliseconds is a piece of information considered valid?
 * This applies, for instance, to replies to the
 * SENTINEL IS-MASTER-DOWN-BY-ADDR command. */
#define SENTINEL_INFO_VALIDITY_TIME 5000
#define SENTINEL_FAILOVER_FIXED_DELAY 5000
#define SENTINEL_FAILOVER_MAX_RANDOM_DELAY 10000
101
/* States of the failover state machine. */
#define SENTINEL_FAILOVER_STATE_NONE 0  /* No failover in progress. */
#define SENTINEL_FAILOVER_STATE_WAIT_START 1  /* Wait for failover_start_time*/
#define SENTINEL_FAILOVER_STATE_SELECT_SLAVE 2 /* Select slave to promote */
#define SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE 3 /* Slave -> Master */
#define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 4 /* Wait slave to change role */
#define SENTINEL_FAILOVER_STATE_RECONF_SLAVES 5 /* SLAVEOF newmaster */
#define SENTINEL_FAILOVER_STATE_WAIT_NEXT_SLAVE 6 /* wait replication */
#define SENTINEL_FAILOVER_STATE_ALERT_CLIENTS 7 /* Run user script. */
#define SENTINEL_FAILOVER_STATE_WAIT_ALERT_SCRIPT 8 /* Wait script exec. */
#define SENTINEL_FAILOVER_STATE_DETECT_END 9 /* Check for failover end. */
#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 10 /* Monitor promoted slave. */

/* Replication link status of a slave towards its master. */
#define SENTINEL_MASTER_LINK_STATUS_UP 0
#define SENTINEL_MASTER_LINK_STATUS_DOWN 1

/* Generic flags that can be used with different functions. */
#define SENTINEL_NO_FLAGS 0
#define SENTINEL_GENERATE_EVENT 1
#define SENTINEL_LEADER 2
#define SENTINEL_OBSERVER 4

/* Script execution flags and limits. */
#define SENTINEL_SCRIPT_NONE 0
#define SENTINEL_SCRIPT_RUNNING 1
#define SENTINEL_SCRIPT_MAX_QUEUE 256
#define SENTINEL_SCRIPT_MAX_RUNNING 16
#define SENTINEL_SCRIPT_MAX_RUNTIME 60000 /* 60 seconds max exec time. */
#define SENTINEL_SCRIPT_MAX_RETRY 10
#define SENTINEL_SCRIPT_RETRY_DELAY 30000 /* 30 seconds between retries. */
132
6b5daa2d 133typedef struct sentinelRedisInstance {
134 int flags; /* See SRI_... defines */
135 char *name; /* Master name from the point of view of this sentinel. */
136 char *runid; /* run ID of this instance. */
137 sentinelAddr *addr; /* Master host. */
138 redisAsyncContext *cc; /* Hiredis context for commands. */
139 redisAsyncContext *pc; /* Hiredis context for Pub / Sub. */
140 int pending_commands; /* Number of commands sent waiting for a reply. */
141 mstime_t cc_conn_time; /* cc connection time. */
142 mstime_t pc_conn_time; /* pc connection time. */
143 mstime_t pc_last_activity; /* Last time we received any message. */
144 mstime_t last_avail_time; /* Last time the instance replied to ping with
145 a reply we consider valid. */
146 mstime_t last_pong_time; /* Last time the instance replied to ping,
147 whatever the reply was. That's used to check
148 if the link is idle and must be reconnected. */
149 mstime_t last_pub_time; /* Last time we sent hello via Pub/Sub. */
150 mstime_t last_hello_time; /* Only used if SRI_SENTINEL is set. Last time
151 we received an hello from this Sentinel
152 via Pub/Sub. */
153 mstime_t last_master_down_reply_time; /* Time of last reply to
154 SENTINEL is-master-down command. */
155 mstime_t s_down_since_time; /* Subjectively down since time. */
156 mstime_t o_down_since_time; /* Objectively down since time. */
157 mstime_t down_after_period; /* Consider it down after that period. */
158 mstime_t info_refresh; /* Time at which we received INFO output from it. */
159
160 /* Master specific. */
161 dict *sentinels; /* Other sentinels monitoring the same master. */
162 dict *slaves; /* Slaves for this master instance. */
163 int quorum; /* Number of sentinels that need to agree on failure. */
164 int parallel_syncs; /* How many slaves to reconfigure at same time. */
165
166 /* Slave specific. */
167 mstime_t master_link_down_time; /* Slave replication link down time. */
168 int slave_priority; /* Slave priority according to its INFO output. */
169 mstime_t slave_reconf_sent_time; /* Time at which we sent SLAVE OF <new> */
170 struct sentinelRedisInstance *master; /* Master instance if SRI_SLAVE is set. */
171 char *slave_master_host; /* Master host as reported by INFO */
172 int slave_master_port; /* Master port as reported by INFO */
173 int slave_master_link_status; /* Master link status as reported by INFO */
174 /* Failover */
175 char *leader; /* If this is a master instance, this is the runid of
176 the Sentinel that should perform the failover. If
177 this is a Sentinel, this is the runid of the Sentinel
178 that this other Sentinel is voting as leader.
179 This field is valid only if SRI_MASTER_DOWN is
180 set on the Sentinel instance. */
181 int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */
182 mstime_t failover_state_change_time;
183 mstime_t failover_start_time; /* When to start to failover if leader. */
184 mstime_t failover_timeout; /* Max time to refresh failover state. */
185 struct sentinelRedisInstance *promoted_slave; /* Promoted slave instance. */
186 /* Scripts executed to notify admin or reconfigure clients: when they
187 * are set to NULL no script is executed. */
baace5fc 188 char *notification_script;
6b5daa2d 189 char *client_reconfig_script;
190} sentinelRedisInstance;
191
192/* Main state. */
193struct sentinelState {
194 dict *masters; /* Dictionary of master sentinelRedisInstances.
195 Key is the instance name, value is the
196 sentinelRedisInstance structure pointer. */
197 int tilt; /* Are we in TILT mode? */
3f194a9d 198 int running_scripts; /* Number of scripts in execution right now. */
6b5daa2d 199 mstime_t tilt_start_time; /* When TITL started. */
200 mstime_t previous_time; /* Time last time we ran the time handler. */
3f194a9d 201 list *scripts_queue; /* Queue of user scripts to execute. */
6b5daa2d 202} sentinel;
203
3f194a9d 204/* A script execution job. */
205typedef struct sentinelScriptJob {
206 int flags; /* Script job flags: SENTINEL_SCRIPT_* */
207 int retry_num; /* Number of times we tried to execute it. */
208 char **argv; /* Arguments to call the script. */
209 mstime_t start_time; /* Script execution time if the script is running,
210 otherwise 0 if we are allowed to retry the
211 execution at any time. If the script is not
212 running and it's not 0, it means: do not run
213 before the specified time. */
214 pid_t pid; /* Script execution pid. */
215} sentinelScriptJob;
216
6b5daa2d 217/* ======================= hiredis ae.c adapters =============================
218 * Note: this implementation is taken from hiredis/adapters/ae.h, however
219 * we have our modified copy for Sentinel in order to use our allocator
220 * and to have full control over how the adapter works. */
221
222typedef struct redisAeEvents {
223 redisAsyncContext *context;
224 aeEventLoop *loop;
225 int fd;
226 int reading, writing;
227} redisAeEvents;
228
229static void redisAeReadEvent(aeEventLoop *el, int fd, void *privdata, int mask) {
230 ((void)el); ((void)fd); ((void)mask);
231
232 redisAeEvents *e = (redisAeEvents*)privdata;
233 redisAsyncHandleRead(e->context);
234}
235
236static void redisAeWriteEvent(aeEventLoop *el, int fd, void *privdata, int mask) {
237 ((void)el); ((void)fd); ((void)mask);
238
239 redisAeEvents *e = (redisAeEvents*)privdata;
240 redisAsyncHandleWrite(e->context);
241}
242
243static void redisAeAddRead(void *privdata) {
244 redisAeEvents *e = (redisAeEvents*)privdata;
245 aeEventLoop *loop = e->loop;
246 if (!e->reading) {
247 e->reading = 1;
248 aeCreateFileEvent(loop,e->fd,AE_READABLE,redisAeReadEvent,e);
249 }
250}
251
252static void redisAeDelRead(void *privdata) {
253 redisAeEvents *e = (redisAeEvents*)privdata;
254 aeEventLoop *loop = e->loop;
255 if (e->reading) {
256 e->reading = 0;
257 aeDeleteFileEvent(loop,e->fd,AE_READABLE);
258 }
259}
260
261static void redisAeAddWrite(void *privdata) {
262 redisAeEvents *e = (redisAeEvents*)privdata;
263 aeEventLoop *loop = e->loop;
264 if (!e->writing) {
265 e->writing = 1;
266 aeCreateFileEvent(loop,e->fd,AE_WRITABLE,redisAeWriteEvent,e);
267 }
268}
269
270static void redisAeDelWrite(void *privdata) {
271 redisAeEvents *e = (redisAeEvents*)privdata;
272 aeEventLoop *loop = e->loop;
273 if (e->writing) {
274 e->writing = 0;
275 aeDeleteFileEvent(loop,e->fd,AE_WRITABLE);
276 }
277}
278
/* Cleanup hook: unregister both file events, then free the redisAeEvents
 * object passed as 'privdata'. */
static void redisAeCleanup(void *privdata) {
    redisAeDelRead(privdata);
    redisAeDelWrite(privdata);
    zfree(privdata);
}
285
286static int redisAeAttach(aeEventLoop *loop, redisAsyncContext *ac) {
287 redisContext *c = &(ac->c);
288 redisAeEvents *e;
289
290 /* Nothing should be attached when something is already attached */
291 if (ac->ev.data != NULL)
292 return REDIS_ERR;
293
294 /* Create container for context and r/w events */
295 e = (redisAeEvents*)zmalloc(sizeof(*e));
296 e->context = ac;
297 e->loop = loop;
298 e->fd = c->fd;
299 e->reading = e->writing = 0;
300
301 /* Register functions to start/stop listening for events */
302 ac->ev.addRead = redisAeAddRead;
303 ac->ev.delRead = redisAeDelRead;
304 ac->ev.addWrite = redisAeAddWrite;
305 ac->ev.delWrite = redisAeDelWrite;
306 ac->ev.cleanup = redisAeCleanup;
307 ac->ev.data = e;
308
309 return REDIS_OK;
310}
311
312/* ============================= Prototypes ================================= */
313
314void sentinelLinkEstablishedCallback(const redisAsyncContext *c, int status);
315void sentinelDisconnectCallback(const redisAsyncContext *c, int status);
316void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privdata);
317sentinelRedisInstance *sentinelGetMasterByName(char *name);
318char *sentinelGetSubjectiveLeader(sentinelRedisInstance *master);
319char *sentinelGetObjectiveLeader(sentinelRedisInstance *master);
320int yesnotoi(char *s);
321void sentinelDisconnectInstanceFromContext(const redisAsyncContext *c);
75fb6e5b 322void sentinelKillLink(sentinelRedisInstance *ri, redisAsyncContext *c);
6b5daa2d 323const char *sentinelRedisInstanceTypeStr(sentinelRedisInstance *ri);
672102c2 324void sentinelAbortFailover(sentinelRedisInstance *ri);
baace5fc 325void sentinelEvent(int level, char *type, sentinelRedisInstance *ri, const char *fmt, ...);
ce7b838f 326sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master);
3f194a9d 327void sentinelScheduleScriptExecution(char *path, ...);
cada7f96 328void sentinelStartFailover(sentinelRedisInstance *master, int state);
6b5daa2d 329
330/* ========================= Dictionary types =============================== */
331
332unsigned int dictSdsHash(const void *key);
333int dictSdsKeyCompare(void *privdata, const void *key1, const void *key2);
334void releaseSentinelRedisInstance(sentinelRedisInstance *ri);
335
336void dictInstancesValDestructor (void *privdata, void *obj) {
337 releaseSentinelRedisInstance(obj);
338}
339
340/* Instance name (sds) -> instance (sentinelRedisInstance pointer)
341 *
342 * also used for: sentinelRedisInstance->sentinels dictionary that maps
343 * sentinels ip:port to last seen time in Pub/Sub hello message. */
344dictType instancesDictType = {
345 dictSdsHash, /* hash function */
346 NULL, /* key dup */
347 NULL, /* val dup */
348 dictSdsKeyCompare, /* key compare */
349 NULL, /* key destructor */
350 dictInstancesValDestructor /* val destructor */
351};
352
353/* Instance runid (sds) -> votes (long casted to void*)
354 *
355 * This is useful into sentinelGetObjectiveLeader() function in order to
356 * count the votes and understand who is the leader. */
357dictType leaderVotesDictType = {
358 dictSdsHash, /* hash function */
359 NULL, /* key dup */
360 NULL, /* val dup */
361 dictSdsKeyCompare, /* key compare */
362 NULL, /* key destructor */
363 NULL /* val destructor */
364};
365
366/* =========================== Initialization =============================== */
367
368void sentinelCommand(redisClient *c);
369
370struct redisCommand sentinelcmds[] = {
371 {"ping",pingCommand,1,"",0,NULL,0,0,0,0,0},
372 {"sentinel",sentinelCommand,-2,"",0,NULL,0,0,0,0,0},
373 {"subscribe",subscribeCommand,-2,"",0,NULL,0,0,0,0,0},
374 {"unsubscribe",unsubscribeCommand,-1,"",0,NULL,0,0,0,0,0},
375 {"psubscribe",psubscribeCommand,-2,"",0,NULL,0,0,0,0,0},
376 {"punsubscribe",punsubscribeCommand,-1,"",0,NULL,0,0,0,0,0}
377};
378
379/* This function overwrites a few normal Redis config default with Sentinel
380 * specific defaults. */
381void initSentinelConfig(void) {
382 server.port = REDIS_SENTINEL_PORT;
383}
384
385/* Perform the Sentinel mode initialization. */
386void initSentinel(void) {
387 int j;
388
389 /* Remove usual Redis commands from the command table, then just add
390 * the SENTINEL command. */
391 dictEmpty(server.commands);
392 for (j = 0; j < sizeof(sentinelcmds)/sizeof(sentinelcmds[0]); j++) {
393 int retval;
394 struct redisCommand *cmd = sentinelcmds+j;
395
396 retval = dictAdd(server.commands, sdsnew(cmd->name), cmd);
397 redisAssert(retval == DICT_OK);
398 }
399
400 /* Initialize various data structures. */
401 sentinel.masters = dictCreate(&instancesDictType,NULL);
402 sentinel.tilt = 0;
403 sentinel.tilt_start_time = mstime();
404 sentinel.previous_time = mstime();
3f194a9d 405 sentinel.running_scripts = 0;
406 sentinel.scripts_queue = listCreate();
6b5daa2d 407}
408
409/* ============================== sentinelAddr ============================== */
410
411/* Create a sentinelAddr object and return it on success.
412 * On error NULL is returned and errno is set to:
413 * ENOENT: Can't resolve the hostname.
414 * EINVAL: Invalid port number.
415 */
416sentinelAddr *createSentinelAddr(char *hostname, int port) {
417 char buf[32];
418 sentinelAddr *sa;
419
420 if (port <= 0 || port > 65535) {
421 errno = EINVAL;
422 return NULL;
423 }
424 if (anetResolve(NULL,hostname,buf) == ANET_ERR) {
425 errno = ENOENT;
426 return NULL;
427 }
428 sa = zmalloc(sizeof(*sa));
429 sa->ip = sdsnew(buf);
430 sa->port = port;
431 return sa;
432}
433
434/* Free a Sentinel address. Can't fail. */
435void releaseSentinelAddr(sentinelAddr *sa) {
436 sdsfree(sa->ip);
437 zfree(sa);
438}
439
440/* =========================== Events notification ========================== */
441
6b5daa2d 442/* Send an event to log, pub/sub, user notification script.
443 *
444 * 'level' is the log level for logging. Only REDIS_WARNING events will trigger
445 * the execution of the user notification script.
446 *
447 * 'type' is the message type, also used as a pub/sub channel name.
448 *
449 * 'ri', is the redis instance target of this event if applicable, and is
450 * used to obtain the path of the notification script to execute.
451 *
452 * The remaining arguments are printf-alike.
453 * If the format specifier starts with the two characters "%@" then ri is
454 * not NULL, and the message is prefixed with an instance identifier in the
455 * following format:
456 *
457 * <instance type> <instance name> <ip> <port>
458 *
459 * If the instance type is not master, than the additional string is
460 * added to specify the originating master:
461 *
462 * @ <master name> <master ip> <master port>
463 *
464 * Any other specifier after "%@" is processed by printf itself.
465 */
466void sentinelEvent(int level, char *type, sentinelRedisInstance *ri,
467 const char *fmt, ...) {
468 va_list ap;
469 char msg[REDIS_MAX_LOGMSG_LEN];
470 robj *channel, *payload;
471
472 /* Handle %@ */
473 if (fmt[0] == '%' && fmt[1] == '@') {
474 sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ?
475 NULL : ri->master;
476
477 if (master) {
478 snprintf(msg, sizeof(msg), "%s %s %s %d @ %s %s %d",
479 sentinelRedisInstanceTypeStr(ri),
480 ri->name, ri->addr->ip, ri->addr->port,
481 master->name, master->addr->ip, master->addr->port);
482 } else {
483 snprintf(msg, sizeof(msg), "%s %s %s %d",
484 sentinelRedisInstanceTypeStr(ri),
485 ri->name, ri->addr->ip, ri->addr->port);
486 }
487 fmt += 2;
488 } else {
489 msg[0] = '\0';
490 }
491
492 /* Use vsprintf for the rest of the formatting if any. */
493 if (fmt[0] != '\0') {
494 va_start(ap, fmt);
495 vsnprintf(msg+strlen(msg), sizeof(msg)-strlen(msg), fmt, ap);
496 va_end(ap);
497 }
498
499 /* Log the message if the log level allows it to be logged. */
500 if (level >= server.verbosity)
501 redisLog(level,"%s %s",type,msg);
502
503 /* Publish the message via Pub/Sub if it's not a debugging one. */
504 if (level != REDIS_DEBUG) {
505 channel = createStringObject(type,strlen(type));
506 payload = createStringObject(msg,strlen(msg));
507 pubsubPublishMessage(channel,payload);
508 decrRefCount(channel);
509 decrRefCount(payload);
510 }
511
512 /* Call the notification script if applicable. */
513 if (level == REDIS_WARNING && ri != NULL) {
514 sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ?
515 ri : ri->master;
baace5fc 516 if (master->notification_script) {
3f194a9d 517 sentinelScheduleScriptExecution(master->notification_script,
518 type,msg,NULL);
519 }
520 }
521}
522
523/* ============================ script execution ============================ */
524
525/* Release a script job structure and all the associated data. */
526void sentinelReleaseScriptJob(sentinelScriptJob *sj) {
527 int j = 0;
528
529 while(sj->argv[j]) sdsfree(sj->argv[j++]);
530 zfree(sj->argv);
531 zfree(sj);
532}
533
534#define SENTINEL_SCRIPT_MAX_ARGS 16
535void sentinelScheduleScriptExecution(char *path, ...) {
536 va_list ap;
537 char *argv[SENTINEL_SCRIPT_MAX_ARGS+1];
538 int argc = 1;
539 sentinelScriptJob *sj;
540
541 va_start(ap, path);
542 while(argc < SENTINEL_SCRIPT_MAX_ARGS) {
543 argv[argc] = va_arg(ap,char*);
544 if (!argv[argc]) break;
545 argv[argc] = sdsnew(argv[argc]); /* Copy the string. */
546 argc++;
547 }
548 va_end(ap);
549 argv[0] = sdsnew(path);
550
551 sj = zmalloc(sizeof(*sj));
552 sj->flags = SENTINEL_SCRIPT_NONE;
553 sj->retry_num = 0;
554 sj->argv = zmalloc(sizeof(char*)*(argc+1));
555 sj->start_time = 0;
556 sj->pid = 0;
557 memcpy(sj->argv,argv,sizeof(char*)*(argc+1));
558
559 listAddNodeTail(sentinel.scripts_queue,sj);
560
561 /* Remove the oldest non running script if we already hit the limit. */
562 if (listLength(sentinel.scripts_queue) > SENTINEL_SCRIPT_MAX_QUEUE) {
563 listNode *ln;
564 listIter li;
565
566 listRewind(sentinel.scripts_queue,&li);
567 while ((ln = listNext(&li)) != NULL) {
568 sj = ln->value;
569
570 if (sj->flags & SENTINEL_SCRIPT_RUNNING) continue;
571 /* The first node is the oldest as we add on tail. */
572 listDelNode(sentinel.scripts_queue,ln);
573 sentinelReleaseScriptJob(sj);
574 break;
6b5daa2d 575 }
3f194a9d 576 redisAssert(listLength(sentinel.scripts_queue) <=
577 SENTINEL_SCRIPT_MAX_QUEUE);
578 }
579}
580
581/* Lookup a script in the scripts queue via pid, and returns the list node
582 * (so that we can easily remove it from the queue if needed). */
583listNode *sentinelGetScriptListNodeByPid(pid_t pid) {
584 listNode *ln;
585 listIter li;
586
587 listRewind(sentinel.scripts_queue,&li);
588 while ((ln = listNext(&li)) != NULL) {
589 sentinelScriptJob *sj = ln->value;
590
591 if ((sj->flags & SENTINEL_SCRIPT_RUNNING) && sj->pid == pid)
592 return ln;
593 }
594 return NULL;
595}
596
597/* Run pending scripts if we are not already at max number of running
598 * scripts. */
599void sentinelRunPendingScripts(void) {
600 listNode *ln;
601 listIter li;
602 mstime_t now = mstime();
603
604 /* Find jobs that are not running and run them, from the top to the
605 * tail of the queue, so we run older jobs first. */
606 listRewind(sentinel.scripts_queue,&li);
607 while (sentinel.running_scripts < SENTINEL_SCRIPT_MAX_RUNNING &&
608 (ln = listNext(&li)) != NULL)
609 {
610 sentinelScriptJob *sj = ln->value;
611 pid_t pid;
612
613 /* Skip if already running. */
614 if (sj->flags & SENTINEL_SCRIPT_RUNNING) continue;
615
616 /* Skip if it's a retry, but not enough time has elapsed. */
617 if (sj->start_time && sj->start_time > now) continue;
618
619 sj->flags |= SENTINEL_SCRIPT_RUNNING;
620 sj->start_time = mstime();
621 sj->retry_num++;
622 pid = fork();
623
624 if (pid == -1) {
625 /* Parent (fork error).
626 * We report fork errors as signal 99, in order to unify the
627 * reporting with other kind of errors. */
628 sentinelEvent(REDIS_WARNING,"-script-error",NULL,
629 "%s %d %d", sj->argv[0], 99, 0);
630 sj->flags &= ~SENTINEL_SCRIPT_RUNNING;
631 sj->pid = 0;
632 } else if (pid == 0) {
633 /* Child */
634 execve(sj->argv[0],sj->argv,environ);
635 /* If we are here an error occurred. */
636 _exit(2); /* Don't retry execution. */
637 } else {
638 sentinel.running_scripts++;
639 sj->pid = pid;
640 sentinelEvent(REDIS_DEBUG,"+script-child",NULL,"%ld",(long)pid);
641 }
642 }
643}
644
645/* How much to delay the execution of a script that we need to retry after
646 * an error?
647 *
648 * We double the retry delay for every further retry we do. So for instance
649 * if RETRY_DELAY is set to 30 seconds and the max number of retries is 10
650 * starting from the second attempt to execute the script the delays are:
651 * 30 sec, 60 sec, 2 min, 4 min, 8 min, 16 min, 32 min, 64 min, 128 min. */
652mstime_t sentinelScriptRetryDelay(int retry_num) {
653 mstime_t delay = SENTINEL_SCRIPT_RETRY_DELAY;
654
655 while (retry_num-- > 1) delay *= 2;
656 return delay;
657}
658
659/* Check for scripts that terminated, and remove them from the queue if the
660 * script terminated successfully. If instead the script was terminated by
661 * a signal, or returned exit code "1", it is scheduled to run again if
662 * the max number of retries did not already elapsed. */
663void sentinelCollectTerminatedScripts(void) {
664 int statloc;
665 pid_t pid;
666
667 while ((pid = wait3(&statloc,WNOHANG,NULL)) > 0) {
668 int exitcode = WEXITSTATUS(statloc);
669 int bysignal = 0;
670 listNode *ln;
671 sentinelScriptJob *sj;
672
673 if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);
674 sentinelEvent(REDIS_DEBUG,"-script-child",NULL,"%ld %d %d",
675 (long)pid, exitcode, bysignal);
676
677 ln = sentinelGetScriptListNodeByPid(pid);
678 if (ln == NULL) {
679 redisLog(REDIS_WARNING,"wait3() returned a pid (%ld) we can't find in our scripts execution queue!", (long)pid);
680 continue;
681 }
682 sj = ln->value;
683
684 /* If the script was terminated by a signal or returns an
685 * exit code of "1" (that means: please retry), we reschedule it
686 * if the max number of retries is not already reached. */
687 if ((bysignal || exitcode == 1) &&
688 sj->retry_num != SENTINEL_SCRIPT_MAX_RETRY)
689 {
690 sj->flags &= ~SENTINEL_SCRIPT_RUNNING;
691 sj->pid = 0;
692 sj->start_time = mstime() +
693 sentinelScriptRetryDelay(sj->retry_num);
694 } else {
695 /* Otherwise let's remove the script, but log the event if the
696 * execution did not terminated in the best of the ways. */
697 if (bysignal || exitcode != 0) {
698 sentinelEvent(REDIS_WARNING,"-script-error",NULL,
699 "%s %d %d", sj->argv[0], bysignal, exitcode);
700 }
701 listDelNode(sentinel.scripts_queue,ln);
702 sentinelReleaseScriptJob(sj);
703 sentinel.running_scripts--;
704 }
705 }
706}
707
708/* Kill scripts in timeout, they'll be collected by the
709 * sentinelCollectTerminatedScripts() function. */
710void sentinelKillTimedoutScripts(void) {
711 listNode *ln;
712 listIter li;
713 mstime_t now = mstime();
714
715 listRewind(sentinel.scripts_queue,&li);
716 while ((ln = listNext(&li)) != NULL) {
717 sentinelScriptJob *sj = ln->value;
718
719 if (sj->flags & SENTINEL_SCRIPT_RUNNING &&
720 (now - sj->start_time) > SENTINEL_SCRIPT_MAX_RUNTIME)
721 {
722 sentinelEvent(REDIS_WARNING,"-script-timeout",NULL,"%s %ld",
723 sj->argv[0], (long)sj->pid);
724 kill(sj->pid,SIGKILL);
725 }
726 }
727}
728
729/* Implements SENTINEL PENDING-SCRIPTS command. */
730void sentinelPendingScriptsCommand(redisClient *c) {
731 listNode *ln;
732 listIter li;
733
734 addReplyMultiBulkLen(c,listLength(sentinel.scripts_queue));
735 listRewind(sentinel.scripts_queue,&li);
736 while ((ln = listNext(&li)) != NULL) {
737 sentinelScriptJob *sj = ln->value;
738 int j = 0;
739
740 addReplyMultiBulkLen(c,10);
741
742 addReplyBulkCString(c,"argv");
743 while (sj->argv[j]) j++;
744 addReplyMultiBulkLen(c,j);
745 j = 0;
746 while (sj->argv[j]) addReplyBulkCString(c,sj->argv[j++]);
747
748 addReplyBulkCString(c,"flags");
749 addReplyBulkCString(c,
750 (sj->flags & SENTINEL_SCRIPT_RUNNING) ? "running" : "scheduled");
751
752 addReplyBulkCString(c,"pid");
753 addReplyBulkLongLong(c,sj->pid);
754
755 if (sj->flags & SENTINEL_SCRIPT_RUNNING) {
756 addReplyBulkCString(c,"run-time");
757 addReplyBulkLongLong(c,mstime() - sj->start_time);
758 } else {
759 mstime_t delay = sj->start_time ? (sj->start_time-mstime()) : 0;
760 if (delay < 0) delay = 0;
761 addReplyBulkCString(c,"run-delay");
762 addReplyBulkLongLong(c,delay);
763 }
764
765 addReplyBulkCString(c,"retry-num");
766 addReplyBulkLongLong(c,sj->retry_num);
6b5daa2d 767 }
768}
769
6275004c 770/* This function calls, if any, the client reconfiguration script with the
771 * following parameters:
772 *
773 * <master-name> <role> <state> <from-ip> <from-port> <to-ip> <to-port>
774 *
775 * It is called every time a failover starts, ends, or is aborted.
776 *
777 * <state> is "start", "end" or "abort".
778 * <role> is either "leader" or "observer".
779 *
780 * from/to fields are respectively master -> promoted slave addresses for
781 * "start" and "end", or the reverse (promoted slave -> master) in case of
782 * "abort".
783 */
784void sentinelCallClientReconfScript(sentinelRedisInstance *master, int role, char *state, sentinelAddr *from, sentinelAddr *to) {
785 char fromport[32], toport[32];
786
787 if (master->client_reconfig_script == NULL) return;
788 ll2string(fromport,sizeof(fromport),from->port);
789 ll2string(toport,sizeof(toport),to->port);
790 sentinelScheduleScriptExecution(master->client_reconfig_script,
791 master->name,
792 (role == SENTINEL_LEADER) ? "leader" : "observer",
01477753 793 state, from->ip, fromport, to->ip, toport, NULL);
6275004c 794}
795
6b5daa2d 796/* ========================== sentinelRedisInstance ========================= */
797
798/* Create a redis instance, the following fields must be populated by the
799 * caller if needed:
800 * runid: set to NULL but will be populated once INFO output is received.
801 * info_refresh: is set to 0 to mean that we never received INFO so far.
802 *
803 * If SRI_MASTER is set into initial flags the instance is added to
804 * sentinel.masters table.
805 *
806 * if SRI_SLAVE or SRI_SENTINEL is set then 'master' must be not NULL and the
807 * instance is added into master->slaves or master->sentinels table.
808 *
809 * If the instance is a slave or sentinel, the name parameter is ignored and
810 * is created automatically as hostname:port.
811 *
812 * The function fails if hostname can't be resolved or port is out of range.
813 * When this happens NULL is returned and errno is set accordingly to the
814 * createSentinelAddr() function.
815 *
816 * The function may also fail and return NULL with errno set to EBUSY if
817 * a master or slave with the same name already exists. */
818sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char *hostname, int port, int quorum, sentinelRedisInstance *master) {
819 sentinelRedisInstance *ri;
820 sentinelAddr *addr;
c14e0eca 821 dict *table = NULL;
6b5daa2d 822 char slavename[128], *sdsname;
823
824 redisAssert(flags & (SRI_MASTER|SRI_SLAVE|SRI_SENTINEL));
825 redisAssert((flags & SRI_MASTER) || master != NULL);
826
827 /* Check address validity. */
828 addr = createSentinelAddr(hostname,port);
829 if (addr == NULL) return NULL;
830
831 /* For slaves and sentinel we use ip:port as name. */
832 if (flags & (SRI_SLAVE|SRI_SENTINEL)) {
833 snprintf(slavename,sizeof(slavename),"%s:%d",hostname,port);
834 name = slavename;
835 }
836
837 /* Make sure the entry is not duplicated. This may happen when the same
838 * name for a master is used multiple times inside the configuration or
839 * if we try to add multiple times a slave or sentinel with same ip/port
840 * to a master. */
841 if (flags & SRI_MASTER) table = sentinel.masters;
842 else if (flags & SRI_SLAVE) table = master->slaves;
843 else if (flags & SRI_SENTINEL) table = master->sentinels;
844 sdsname = sdsnew(name);
845 if (dictFind(table,sdsname)) {
846 sdsfree(sdsname);
847 errno = EBUSY;
848 return NULL;
849 }
850
851 /* Create the instance object. */
852 ri = zmalloc(sizeof(*ri));
853 /* Note that all the instances are started in the disconnected state,
854 * the event loop will take care of connecting them. */
855 ri->flags = flags | SRI_DISCONNECTED;
856 ri->name = sdsname;
857 ri->runid = NULL;
858 ri->addr = addr;
859 ri->cc = NULL;
860 ri->pc = NULL;
861 ri->pending_commands = 0;
862 ri->cc_conn_time = 0;
863 ri->pc_conn_time = 0;
864 ri->pc_last_activity = 0;
865 ri->last_avail_time = mstime();
866 ri->last_pong_time = mstime();
867 ri->last_pub_time = mstime();
868 ri->last_hello_time = mstime();
869 ri->last_master_down_reply_time = mstime();
870 ri->s_down_since_time = 0;
871 ri->o_down_since_time = 0;
872 ri->down_after_period = master ? master->down_after_period :
873 SENTINEL_DOWN_AFTER_PERIOD;
874 ri->master_link_down_time = 0;
875 ri->slave_priority = SENTINEL_DEFAULT_SLAVE_PRIORITY;
876 ri->slave_reconf_sent_time = 0;
877 ri->slave_master_host = NULL;
878 ri->slave_master_port = 0;
879 ri->slave_master_link_status = SENTINEL_MASTER_LINK_STATUS_DOWN;
880 ri->sentinels = dictCreate(&instancesDictType,NULL);
881 ri->quorum = quorum;
882 ri->parallel_syncs = SENTINEL_DEFAULT_PARALLEL_SYNCS;
883 ri->master = master;
884 ri->slaves = dictCreate(&instancesDictType,NULL);
885 ri->info_refresh = 0;
886
887 /* Failover state. */
888 ri->leader = NULL;
889 ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
890 ri->failover_state_change_time = 0;
891 ri->failover_start_time = 0;
892 ri->failover_timeout = SENTINEL_DEFAULT_FAILOVER_TIMEOUT;
893 ri->promoted_slave = NULL;
baace5fc 894 ri->notification_script = NULL;
6b5daa2d 895 ri->client_reconfig_script = NULL;
896
897 /* Add into the right table. */
898 dictAdd(table, ri->name, ri);
899 return ri;
900}
901
902/* Release this instance and all its slaves, sentinels, hiredis connections.
903 * This function also takes care of unlinking the instance from the main
904 * masters table (if it is a master) or from its master sentinels/slaves table
905 * if it is a slave or sentinel. */
906void releaseSentinelRedisInstance(sentinelRedisInstance *ri) {
907 /* Release all its slaves or sentinels if any. */
908 dictRelease(ri->sentinels);
909 dictRelease(ri->slaves);
910
75fb6e5b 911 /* Release hiredis connections. */
912 if (ri->cc) sentinelKillLink(ri,ri->cc);
913 if (ri->pc) sentinelKillLink(ri,ri->pc);
6b5daa2d 914
915 /* Free other resources. */
916 sdsfree(ri->name);
917 sdsfree(ri->runid);
baace5fc 918 sdsfree(ri->notification_script);
6b5daa2d 919 sdsfree(ri->client_reconfig_script);
920 sdsfree(ri->slave_master_host);
921 sdsfree(ri->leader);
922 releaseSentinelAddr(ri->addr);
923
924 /* Clear state into the master if needed. */
925 if ((ri->flags & SRI_SLAVE) && (ri->flags & SRI_PROMOTED) && ri->master)
926 ri->master->promoted_slave = NULL;
927
928 zfree(ri);
929}
930
931/* Lookup a slave in a master Redis instance, by ip and port. */
932sentinelRedisInstance *sentinelRedisInstanceLookupSlave(
933 sentinelRedisInstance *ri, char *ip, int port)
934{
935 sds key;
936 sentinelRedisInstance *slave;
937
938 redisAssert(ri->flags & SRI_MASTER);
939 key = sdscatprintf(sdsempty(),"%s:%d",ip,port);
940 slave = dictFetchValue(ri->slaves,key);
941 sdsfree(key);
942 return slave;
943}
944
945/* Return the name of the type of the instance as a string. */
946const char *sentinelRedisInstanceTypeStr(sentinelRedisInstance *ri) {
947 if (ri->flags & SRI_MASTER) return "master";
948 else if (ri->flags & SRI_SLAVE) return "slave";
949 else if (ri->flags & SRI_SENTINEL) return "sentinel";
950 else return "unknown";
951}
952
953/* This function removes all the instances found in the dictionary of instances
954 * 'd', having either:
955 *
956 * 1) The same ip/port as specified.
957 * 2) The same runid.
958 *
959 * "1" and "2" don't need to verify at the same time, just one is enough.
960 * If "runid" is NULL it is not checked.
961 * Similarly if "ip" is NULL it is not checked.
962 *
963 * This function is useful because every time we add a new Sentinel into
964 * a master's Sentinels dictionary, we want to be very sure about not
965 * having duplicated instances for any reason. This is so important because
966 * we use those other sentinels in order to run our quorum protocol to
967 * understand if it's time to proceeed with the fail over.
968 *
969 * Making sure no duplication is possible we greately improve the robustness
970 * of the quorum (otherwise we may end counting the same instance multiple
971 * times for some reason).
972 *
973 * The function returns the number of Sentinels removed. */
974int removeMatchingSentinelsFromMaster(sentinelRedisInstance *master, char *ip, int port, char *runid) {
975 dictIterator *di;
976 dictEntry *de;
977 int removed = 0;
978
979 di = dictGetSafeIterator(master->sentinels);
980 while((de = dictNext(di)) != NULL) {
981 sentinelRedisInstance *ri = dictGetVal(de);
982
983 if ((ri->runid && runid && strcmp(ri->runid,runid) == 0) ||
984 (ip && strcmp(ri->addr->ip,ip) == 0 && port == ri->addr->port))
985 {
986 dictDelete(master->sentinels,ri->name);
987 removed++;
988 }
989 }
990 dictReleaseIterator(di);
991 return removed;
992}
993
994/* Search an instance with the same runid, ip and port into a dictionary
995 * of instances. Return NULL if not found, otherwise return the instance
996 * pointer.
997 *
998 * runid or ip can be NULL. In such a case the search is performed only
999 * by the non-NULL field. */
1000sentinelRedisInstance *getSentinelRedisInstanceByAddrAndRunID(dict *instances, char *ip, int port, char *runid) {
1001 dictIterator *di;
1002 dictEntry *de;
1003 sentinelRedisInstance *instance = NULL;
1004
1005 redisAssert(ip || runid); /* User must pass at least one search param. */
1006 di = dictGetIterator(instances);
1007 while((de = dictNext(di)) != NULL) {
1008 sentinelRedisInstance *ri = dictGetVal(de);
1009
1010 if (runid && !ri->runid) continue;
1011 if ((runid == NULL || strcmp(ri->runid, runid) == 0) &&
1012 (ip == NULL || (strcmp(ri->addr->ip, ip) == 0 &&
1013 ri->addr->port == port)))
1014 {
1015 instance = ri;
1016 break;
1017 }
1018 }
1019 dictReleaseIterator(di);
1020 return instance;
1021}
1022
1023/* Simple master lookup by name */
1024sentinelRedisInstance *sentinelGetMasterByName(char *name) {
1025 sentinelRedisInstance *ri;
1026 sds sdsname = sdsnew(name);
1027
1028 ri = dictFetchValue(sentinel.masters,sdsname);
1029 sdsfree(sdsname);
1030 return ri;
1031}
1032
1033/* Add the specified flags to all the instances in the specified dictionary. */
1034void sentinelAddFlagsToDictOfRedisInstances(dict *instances, int flags) {
1035 dictIterator *di;
1036 dictEntry *de;
1037
1038 di = dictGetIterator(instances);
1039 while((de = dictNext(di)) != NULL) {
1040 sentinelRedisInstance *ri = dictGetVal(de);
1041 ri->flags |= flags;
1042 }
1043 dictReleaseIterator(di);
1044}
1045
1046/* Remove the specified flags to all the instances in the specified
1047 * dictionary. */
1048void sentinelDelFlagsToDictOfRedisInstances(dict *instances, int flags) {
1049 dictIterator *di;
1050 dictEntry *de;
1051
1052 di = dictGetIterator(instances);
1053 while((de = dictNext(di)) != NULL) {
1054 sentinelRedisInstance *ri = dictGetVal(de);
1055 ri->flags &= ~flags;
1056 }
1057 dictReleaseIterator(di);
1058}
1059
1060/* Reset the state of a monitored master:
1061 * 1) Remove all slaves.
1062 * 2) Remove all sentinels.
1063 * 3) Remove most of the flags resulting from runtime operations.
1064 * 4) Reset timers to their default value.
1065 * 5) In the process of doing this undo the failover if in progress.
1066 * 6) Disconnect the connections with the master (will reconnect automatically).
1067 */
75fb6e5b 1068void sentinelResetMaster(sentinelRedisInstance *ri, int flags) {
6b5daa2d 1069 redisAssert(ri->flags & SRI_MASTER);
1070 dictRelease(ri->slaves);
1071 dictRelease(ri->sentinels);
1072 ri->slaves = dictCreate(&instancesDictType,NULL);
1073 ri->sentinels = dictCreate(&instancesDictType,NULL);
75fb6e5b 1074 if (ri->cc) sentinelKillLink(ri,ri->cc);
1075 if (ri->pc) sentinelKillLink(ri,ri->pc);
6b5daa2d 1076 ri->flags &= SRI_MASTER|SRI_CAN_FAILOVER|SRI_DISCONNECTED;
1077 if (ri->leader) {
1078 sdsfree(ri->leader);
1079 ri->leader = NULL;
1080 }
1081 ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
1082 ri->failover_state_change_time = 0;
1083 ri->failover_start_time = 0;
1084 ri->promoted_slave = NULL;
75fb6e5b 1085 sdsfree(ri->runid);
1086 sdsfree(ri->slave_master_host);
1087 ri->runid = NULL;
1088 ri->slave_master_host = NULL;
a23a5b6c 1089 ri->last_avail_time = mstime();
1090 ri->last_pong_time = mstime();
75fb6e5b 1091 if (flags & SENTINEL_GENERATE_EVENT)
1092 sentinelEvent(REDIS_WARNING,"+reset-master",ri,"%@");
6b5daa2d 1093}
1094
1095/* Call sentinelResetMaster() on every master with a name matching the specified
1096 * pattern. */
75fb6e5b 1097int sentinelResetMastersByPattern(char *pattern, int flags) {
6b5daa2d 1098 dictIterator *di;
1099 dictEntry *de;
1100 int reset = 0;
1101
1102 di = dictGetIterator(sentinel.masters);
1103 while((de = dictNext(di)) != NULL) {
1104 sentinelRedisInstance *ri = dictGetVal(de);
1105
1106 if (ri->name) {
1107 if (stringmatch(pattern,ri->name,0)) {
75fb6e5b 1108 sentinelResetMaster(ri,flags);
6b5daa2d 1109 reset++;
1110 }
1111 }
1112 }
1113 dictReleaseIterator(di);
1114 return reset;
1115}
1116
75fb6e5b 1117/* Reset the specified master with sentinelResetMaster(), and also change
1118 * the ip:port address, but take the name of the instance unmodified.
1119 *
1120 * This is used to handle the +switch-master and +redirect-to-master events.
1121 *
1122 * The function returns REDIS_ERR if the address can't be resolved for some
1123 * reason. Otherwise REDIS_OK is returned.
1124 *
1125 * TODO: make this reset so that original sentinels are re-added with
1126 * same ip / port / runid.
1127 */
1128
1129int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, int port) {
1130 sentinelAddr *oldaddr, *newaddr;
1131
1132 newaddr = createSentinelAddr(ip,port);
1133 if (newaddr == NULL) return REDIS_ERR;
1134 sentinelResetMaster(master,SENTINEL_NO_FLAGS);
1135 oldaddr = master->addr;
1136 master->addr = newaddr;
1137 /* Release the old address at the end so we are safe even if the function
1138 * gets the master->addr->ip and master->addr->port as arguments. */
1139 releaseSentinelAddr(oldaddr);
1140 return REDIS_OK;
1141}
1142
6b5daa2d 1143/* ============================ Config handling ============================= */
1144char *sentinelHandleConfiguration(char **argv, int argc) {
1145 sentinelRedisInstance *ri;
1146
1147 if (!strcasecmp(argv[0],"monitor") && argc == 5) {
1148 /* monitor <name> <host> <port> <quorum> */
1149 int quorum = atoi(argv[4]);
1150
1151 if (quorum <= 0) return "Quorum must be 1 or greater.";
1152 if (createSentinelRedisInstance(argv[1],SRI_MASTER,argv[2],
1153 atoi(argv[3]),quorum,NULL) == NULL)
1154 {
1155 switch(errno) {
1156 case EBUSY: return "Duplicated master name.";
1157 case ENOENT: return "Can't resolve master instance hostname.";
1158 case EINVAL: return "Invalid port number";
1159 }
1160 }
1161 } else if (!strcasecmp(argv[0],"down-after-milliseconds") && argc == 3) {
1162 /* down-after-milliseconds <name> <milliseconds> */
1163 ri = sentinelGetMasterByName(argv[1]);
1164 if (!ri) return "No such master with specified name.";
1165 ri->down_after_period = atoi(argv[2]);
1166 if (ri->down_after_period <= 0)
1167 return "negative or zero time parameter.";
1168 } else if (!strcasecmp(argv[0],"failover-timeout") && argc == 3) {
1169 /* failover-timeout <name> <milliseconds> */
1170 ri = sentinelGetMasterByName(argv[1]);
1171 if (!ri) return "No such master with specified name.";
1172 ri->failover_timeout = atoi(argv[2]);
1173 if (ri->failover_timeout <= 0)
1174 return "negative or zero time parameter.";
1175 } else if (!strcasecmp(argv[0],"can-failover") && argc == 3) {
1176 /* can-failover <name> <yes/no> */
1177 int yesno = yesnotoi(argv[2]);
1178
1179 ri = sentinelGetMasterByName(argv[1]);
1180 if (!ri) return "No such master with specified name.";
1181 if (yesno == -1) return "Argument must be either yes or no.";
1182 if (yesno)
1183 ri->flags |= SRI_CAN_FAILOVER;
1184 else
1185 ri->flags &= ~SRI_CAN_FAILOVER;
1186 } else if (!strcasecmp(argv[0],"parallel-syncs") && argc == 3) {
1187 /* parallel-syncs <name> <milliseconds> */
1188 ri = sentinelGetMasterByName(argv[1]);
1189 if (!ri) return "No such master with specified name.";
1190 ri->parallel_syncs = atoi(argv[2]);
baace5fc 1191 } else if (!strcasecmp(argv[0],"notification-script") && argc == 3) {
1192 /* notification-script <name> <path> */
1193 ri = sentinelGetMasterByName(argv[1]);
1194 if (!ri) return "No such master with specified name.";
1195 if (access(argv[2],X_OK) == -1)
1196 return "Notification script seems non existing or non executable.";
1197 ri->notification_script = sdsnew(argv[2]);
1198 } else if (!strcasecmp(argv[0],"client-reconfig-script") && argc == 3) {
1199 /* client-reconfig-script <name> <path> */
1200 ri = sentinelGetMasterByName(argv[1]);
1201 if (!ri) return "No such master with specified name.";
1202 if (access(argv[2],X_OK) == -1)
1203 return "Client reconfiguration script seems non existing or "
1204 "non executable.";
1205 ri->client_reconfig_script = sdsnew(argv[2]);
6b5daa2d 1206 } else {
1207 return "Unrecognized sentinel configuration statement.";
1208 }
1209 return NULL;
1210}
1211
1212/* ====================== hiredis connection handling ======================= */
1213
75fb6e5b 1214/* Completely disconnect an hiredis link from an instance. */
1215void sentinelKillLink(sentinelRedisInstance *ri, redisAsyncContext *c) {
9e5bef38 1216 if (ri->cc == c) {
1217 ri->cc = NULL;
1218 ri->pending_commands = 0;
1219 }
75fb6e5b 1220 if (ri->pc == c) ri->pc = NULL;
1221 c->data = NULL;
1222 ri->flags |= SRI_DISCONNECTED;
1223 redisAsyncFree(c);
1224}
1225
6b5daa2d 1226/* This function takes an hiredis context that is in an error condition
1227 * and make sure to mark the instance as disconnected performing the
1228 * cleanup needed.
1229 *
1230 * Note: we don't free the hiredis context as hiredis will do it for us
1231 * for async conenctions. */
1232void sentinelDisconnectInstanceFromContext(const redisAsyncContext *c) {
1233 sentinelRedisInstance *ri = c->data;
d876d6fe 1234 int pubsub;
6b5daa2d 1235
d876d6fe 1236 if (ri == NULL) return; /* The instance no longer exists. */
1237
1238 pubsub = (ri->pc == c);
6b5daa2d 1239 sentinelEvent(REDIS_DEBUG, pubsub ? "-pubsub-link" : "-cmd-link", ri,
1240 "%@ #%s", c->errstr);
1241 if (pubsub)
1242 ri->pc = NULL;
1243 else
1244 ri->cc = NULL;
1245 ri->flags |= SRI_DISCONNECTED;
1246}
1247
1248void sentinelLinkEstablishedCallback(const redisAsyncContext *c, int status) {
1249 if (status != REDIS_OK) {
1250 sentinelDisconnectInstanceFromContext(c);
1251 } else {
1252 sentinelRedisInstance *ri = c->data;
1253 int pubsub = (ri->pc == c);
1254
1255 sentinelEvent(REDIS_DEBUG, pubsub ? "+pubsub-link" : "+cmd-link", ri,
1256 "%@");
1257 }
1258}
1259
1260void sentinelDisconnectCallback(const redisAsyncContext *c, int status) {
1261 sentinelDisconnectInstanceFromContext(c);
1262}
1263
1264/* Create the async connections for the specified instance if the instance
1265 * is disconnected. Note that the SRI_DISCONNECTED flag is set even if just
1266 * one of the two links (commands and pub/sub) is missing. */
1267void sentinelReconnectInstance(sentinelRedisInstance *ri) {
1268 if (!(ri->flags & SRI_DISCONNECTED)) return;
1269
1270 /* Commands connection. */
1271 if (ri->cc == NULL) {
1272 ri->cc = redisAsyncConnect(ri->addr->ip,ri->addr->port);
1273 if (ri->cc->err) {
1274 sentinelEvent(REDIS_DEBUG,"-cmd-link-reconnection",ri,"%@ #%s",
1275 ri->cc->errstr);
75fb6e5b 1276 sentinelKillLink(ri,ri->cc);
6b5daa2d 1277 } else {
1278 ri->cc_conn_time = mstime();
1279 ri->cc->data = ri;
1280 redisAeAttach(server.el,ri->cc);
1281 redisAsyncSetConnectCallback(ri->cc,
1282 sentinelLinkEstablishedCallback);
1283 redisAsyncSetDisconnectCallback(ri->cc,
1284 sentinelDisconnectCallback);
1285 }
1286 }
1287 /* Pub / Sub */
1288 if ((ri->flags & SRI_MASTER) && ri->pc == NULL) {
1289 ri->pc = redisAsyncConnect(ri->addr->ip,ri->addr->port);
1290 if (ri->pc->err) {
1291 sentinelEvent(REDIS_DEBUG,"-pubsub-link-reconnection",ri,"%@ #%s",
1292 ri->pc->errstr);
75fb6e5b 1293 sentinelKillLink(ri,ri->pc);
6b5daa2d 1294 } else {
1295 int retval;
1296
1297 ri->pc_conn_time = mstime();
1298 ri->pc->data = ri;
1299 redisAeAttach(server.el,ri->pc);
1300 redisAsyncSetConnectCallback(ri->pc,
1301 sentinelLinkEstablishedCallback);
1302 redisAsyncSetDisconnectCallback(ri->pc,
1303 sentinelDisconnectCallback);
1304 /* Now we subscribe to the Sentinels "Hello" channel. */
1305 retval = redisAsyncCommand(ri->pc,
1306 sentinelReceiveHelloMessages, NULL, "SUBSCRIBE %s",
1307 SENTINEL_HELLO_CHANNEL);
1308 if (retval != REDIS_OK) {
1309 /* If we can't subscribe, the Pub/Sub connection is useless
1310 * and we can simply disconnect it and try again. */
75fb6e5b 1311 sentinelKillLink(ri,ri->pc);
6b5daa2d 1312 return;
1313 }
1314 }
1315 }
1316 /* Clear the DISCONNECTED flags only if we have both the connections
1317 * (or just the commands connection if this is a slave or a
1318 * sentinel instance). */
1319 if (ri->cc && (ri->flags & (SRI_SLAVE|SRI_SENTINEL) || ri->pc))
1320 ri->flags &= ~SRI_DISCONNECTED;
1321}
1322
1323/* ======================== Redis instances pinging ======================== */
1324
1325/* Process the INFO output from masters. */
1326void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
1327 sds *lines;
1328 int numlines, j;
1329 int role = 0;
d876d6fe 1330 int runid_changed = 0; /* true if runid changed. */
1331 int first_runid = 0; /* true if this is the first runid we receive. */
6b5daa2d 1332
1333 /* The following fields must be reset to a given value in the case they
1334 * are not found at all in the INFO output. */
1335 ri->master_link_down_time = 0;
1336
1337 /* Process line by line. */
1338 lines = sdssplitlen(info,strlen(info),"\r\n",2,&numlines);
1339 for (j = 0; j < numlines; j++) {
1340 sentinelRedisInstance *slave;
1341 sds l = lines[j];
1342
1343 /* run_id:<40 hex chars>*/
1344 if (sdslen(l) >= 47 && !memcmp(l,"run_id:",7)) {
1345 if (ri->runid == NULL) {
1346 ri->runid = sdsnewlen(l+7,40);
d876d6fe 1347 first_runid = 1;
6b5daa2d 1348 } else {
d876d6fe 1349 if (strncmp(ri->runid,l+7,40) != 0) {
1350 runid_changed = 1;
1351 sentinelEvent(REDIS_NOTICE,"+reboot",ri,"%@");
1352 sdsfree(ri->runid);
1353 ri->runid = sdsnewlen(l+7,40);
1354 }
6b5daa2d 1355 }
1356 }
1357
1358 /* slave0:<ip>,<port>,<state> */
1359 if ((ri->flags & SRI_MASTER) &&
1360 sdslen(l) >= 7 &&
1361 !memcmp(l,"slave",5) && isdigit(l[5]))
1362 {
1363 char *ip, *port, *end;
1364
1365 ip = strchr(l,':'); if (!ip) continue;
1366 ip++; /* Now ip points to start of ip address. */
1367 port = strchr(ip,','); if (!port) continue;
1368 *port = '\0'; /* nul term for easy access. */
1369 port++; /* Now port points to start of port number. */
1370 end = strchr(port,','); if (!end) continue;
1371 *end = '\0'; /* nul term for easy access. */
1372
1373 /* Check if we already have this slave into our table,
1374 * otherwise add it. */
1375 if (sentinelRedisInstanceLookupSlave(ri,ip,atoi(port)) == NULL) {
1376 if ((slave = createSentinelRedisInstance(NULL,SRI_SLAVE,ip,
1377 atoi(port), ri->quorum,ri)) != NULL)
1378 {
1379 sentinelEvent(REDIS_NOTICE,"+slave",slave,"%@");
1380 }
1381 }
1382 }
1383
1384 /* master_link_down_since_seconds:<seconds> */
1385 if (sdslen(l) >= 32 &&
1386 !memcmp(l,"master_link_down_since_seconds",30))
1387 {
1388 ri->master_link_down_time = strtoll(l+31,NULL,10)*1000;
1389 }
1390
1391 /* role:<role> */
1392 if (!memcmp(l,"role:master",11)) role = SRI_MASTER;
1393 else if (!memcmp(l,"role:slave",10)) role = SRI_SLAVE;
1394
1395 if (role == SRI_SLAVE) {
1396 /* master_host:<host> */
1397 if (sdslen(l) >= 12 && !memcmp(l,"master_host:",12)) {
1398 sdsfree(ri->slave_master_host);
1399 ri->slave_master_host = sdsnew(l+12);
1400 }
1401
1402 /* master_port:<port> */
1403 if (sdslen(l) >= 12 && !memcmp(l,"master_port:",12))
1404 ri->slave_master_port = atoi(l+12);
1405
1406 /* master_link_status:<status> */
1407 if (sdslen(l) >= 19 && !memcmp(l,"master_link_status:",19)) {
1408 ri->slave_master_link_status =
1409 (strcasecmp(l+19,"up") == 0) ?
1410 SENTINEL_MASTER_LINK_STATUS_UP :
1411 SENTINEL_MASTER_LINK_STATUS_DOWN;
1412 }
3ec701e0 1413
1414 /* slave_priority:<priority> */
1415 if (sdslen(l) >= 15 && !memcmp(l,"slave_priority:",15))
1416 ri->slave_priority = atoi(l+15);
6b5daa2d 1417 }
1418 }
1419 ri->info_refresh = mstime();
1420 sdsfreesplitres(lines,numlines);
1421
91c15ed1 1422 /* ---------------------------- Acting half ----------------------------- */
6b5daa2d 1423 if (sentinel.tilt) return;
1424
75fb6e5b 1425 /* Act if a master turned into a slave. */
1426 if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) {
1427 if (first_runid && ri->slave_master_host) {
1428 /* If it is the first time we receive INFO from it, but it's
1429 * a slave while it was configured as a master, we want to monitor
1430 * its master instead. */
1431 sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri,
1432 "%s %s %d %s %d",
1433 ri->name, ri->addr->ip, ri->addr->port,
1434 ri->slave_master_host, ri->slave_master_port);
1435 sentinelResetMasterAndChangeAddress(ri,ri->slave_master_host,
1436 ri->slave_master_port);
1437 return;
1438 }
1439 }
1440
6b5daa2d 1441 /* Act if a slave turned into a master. */
1442 if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) {
d876d6fe 1443 if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
1444 (runid_changed || first_runid))
1445 {
91c15ed1 1446 /* If a slave turned into maser but:
d876d6fe 1447 *
91c15ed1 1448 * 1) Failover not in progress.
1449 * 2) RunID hs changed, or its the first time we see an INFO output.
1450 *
1451 * We assume this is a reboot with a wrong configuration.
d876d6fe 1452 * Log the event and remove the slave. */
75fb6e5b 1453 int retval;
1454
d876d6fe 1455 sentinelEvent(REDIS_WARNING,"-slave-restart-as-master",ri,"%@ #removing it from the attached slaves");
1456 retval = dictDelete(ri->master->slaves,ri->name);
1457 redisAssert(retval == REDIS_OK);
1458 return;
1459 } else if (ri->flags & SRI_PROMOTED) {
6b5daa2d 1460 /* If this is a promoted slave we can change state to the
1461 * failover state machine. */
fd92b366 1462 if ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
6b5daa2d 1463 (ri->master->flags & SRI_I_AM_THE_LEADER) &&
1464 (ri->master->failover_state ==
1465 SENTINEL_FAILOVER_STATE_WAIT_PROMOTION))
1466 {
1467 ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES;
1468 ri->master->failover_state_change_time = mstime();
1469 sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@");
1470 sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves",
1471 ri->master,"%@");
6275004c 1472 sentinelCallClientReconfScript(ri->master,SENTINEL_LEADER,
1473 "start",ri->master->addr,ri->addr);
6b5daa2d 1474 }
fd92b366 1475 } else if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) ||
1476 ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
1477 (ri->master->flags & SRI_I_AM_THE_LEADER) &&
1478 ri->master->failover_state ==
1479 SENTINEL_FAILOVER_STATE_WAIT_START))
1480 {
1481 /* No failover in progress? Then it is the start of a failover
1482 * and we are an observer.
1483 *
1484 * We also do that if we are a leader doing a failover, in wait
1485 * start, but well, somebody else started before us. */
1486
1487 if (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) {
1488 sentinelEvent(REDIS_WARNING,"-failover-abort-race",
1489 ri->master, "%@");
1490 sentinelAbortFailover(ri->master);
6b5daa2d 1491 }
fd92b366 1492
1493 ri->master->flags |= SRI_FAILOVER_IN_PROGRESS;
1494 sentinelEvent(REDIS_WARNING,"+failover-detected",ri->master,"%@");
1495 ri->master->failover_state = SENTINEL_FAILOVER_STATE_DETECT_END;
1496 ri->master->failover_state_change_time = mstime();
1497 ri->master->promoted_slave = ri;
1498 ri->flags |= SRI_PROMOTED;
6275004c 1499 sentinelCallClientReconfScript(ri->master,SENTINEL_OBSERVER,
1500 "start", ri->master->addr,ri->addr);
fd92b366 1501 /* We are an observer, so we can only assume that the leader
1502 * is reconfiguring the slave instances. For this reason we
1503 * set all the instances as RECONF_SENT waiting for progresses
1504 * on this side. */
1505 sentinelAddFlagsToDictOfRedisInstances(ri->master->slaves,
1506 SRI_RECONF_SENT);
6b5daa2d 1507 }
1508 }
1509
1510 /* Detect if the slave that is in the process of being reconfigured
1511 * changed state. */
1512 if ((ri->flags & SRI_SLAVE) && role == SRI_SLAVE &&
1513 (ri->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG)))
1514 {
1515 /* SRI_RECONF_SENT -> SRI_RECONF_INPROG. */
1516 if ((ri->flags & SRI_RECONF_SENT) &&
1517 ri->slave_master_host &&
1518 strcmp(ri->slave_master_host,
1519 ri->master->promoted_slave->addr->ip) == 0 &&
1520 ri->slave_master_port == ri->master->promoted_slave->addr->port)
1521 {
1522 ri->flags &= ~SRI_RECONF_SENT;
1523 ri->flags |= SRI_RECONF_INPROG;
1524 sentinelEvent(REDIS_NOTICE,"+slave-reconf-inprog",ri,"%@");
1525 }
1526
1527 /* SRI_RECONF_INPROG -> SRI_RECONF_DONE */
1528 if ((ri->flags & SRI_RECONF_INPROG) &&
1529 ri->slave_master_link_status == SENTINEL_MASTER_LINK_STATUS_UP)
1530 {
1531 ri->flags &= ~SRI_RECONF_INPROG;
1532 ri->flags |= SRI_RECONF_DONE;
1533 sentinelEvent(REDIS_NOTICE,"+slave-reconf-done",ri,"%@");
1534 /* If we are moving forward (a new slave is now configured)
1535 * we update the change_time as we are conceptually passing
1536 * to the next slave. */
1537 ri->failover_state_change_time = mstime();
1538 }
1539 }
1540}
1541
1542void sentinelInfoReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
1543 sentinelRedisInstance *ri = c->data;
1544 redisReply *r;
1545
2179c269 1546 if (ri) ri->pending_commands--;
1547 if (!reply || !ri) return;
6b5daa2d 1548 r = reply;
1549
1550 if (r->type == REDIS_REPLY_STRING) {
1551 sentinelRefreshInstanceInfo(ri,r->str);
1552 }
1553}
1554
1555/* Just discard the reply. We use this when we are not monitoring the return
1556 * value of the command but its effects directly. */
1557void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
1558 sentinelRedisInstance *ri = c->data;
1559
2179c269 1560 if (ri) ri->pending_commands--;
6b5daa2d 1561}
1562
1563void sentinelPingReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
1564 sentinelRedisInstance *ri = c->data;
1565 redisReply *r;
1566
2179c269 1567 if (ri) ri->pending_commands--;
1568 if (!reply || !ri) return;
6b5daa2d 1569 r = reply;
1570
1571 if (r->type == REDIS_REPLY_STATUS ||
1572 r->type == REDIS_REPLY_ERROR) {
1573 /* Update the "instance available" field only if this is an
1574 * acceptable reply. */
1575 if (strncmp(r->str,"PONG",4) == 0 ||
1576 strncmp(r->str,"LOADING",7) == 0 ||
1577 strncmp(r->str,"MASTERDOWN",10) == 0)
1578 {
1579 ri->last_avail_time = mstime();
850789ce 1580 } else {
1581 /* Send a SCRIPT KILL command if the instance appears to be
1582 * down because of a busy script. */
1583 if (strncmp(r->str,"BUSY",4) == 0 &&
1584 (ri->flags & SRI_S_DOWN) &&
1585 !(ri->flags & SRI_SCRIPT_KILL_SENT))
1586 {
1587 redisAsyncCommand(ri->cc,
1588 sentinelDiscardReplyCallback, NULL, "SCRIPT KILL");
1589 ri->flags |= SRI_SCRIPT_KILL_SENT;
1590 }
6b5daa2d 1591 }
1592 }
1593 ri->last_pong_time = mstime();
1594}
1595
1596/* This is called when we get the reply about the PUBLISH command we send
1597 * to the master to advertise this sentinel. */
1598void sentinelPublishReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
1599 sentinelRedisInstance *ri = c->data;
1600 redisReply *r;
1601
2179c269 1602 if (ri) ri->pending_commands--;
1603 if (!reply || !ri) return;
6b5daa2d 1604 r = reply;
1605
1606 /* Only update pub_time if we actually published our message. Otherwise
1607 * we'll retry against in 100 milliseconds. */
1608 if (r->type != REDIS_REPLY_ERROR)
1609 ri->last_pub_time = mstime();
1610}
1611
1612/* This is our Pub/Sub callback for the Hello channel. It's useful in order
1613 * to discover other sentinels attached at the same master. */
1614void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privdata) {
1615 sentinelRedisInstance *ri = c->data;
1616 redisReply *r;
1617
2179c269 1618 if (!reply || !ri) return;
6b5daa2d 1619 r = reply;
1620
1621 /* Update the last activity in the pubsub channel. Note that since we
1622 * receive our messages as well this timestamp can be used to detect
1623 * if the link is probably diconnected even if it seems otherwise. */
1624 ri->pc_last_activity = mstime();
1625
1626 /* Sanity check in the reply we expect, so that the code that follows
1627 * can avoid to check for details. */
1628 if (r->type != REDIS_REPLY_ARRAY ||
1629 r->elements != 3 ||
1630 r->element[0]->type != REDIS_REPLY_STRING ||
1631 r->element[1]->type != REDIS_REPLY_STRING ||
1632 r->element[2]->type != REDIS_REPLY_STRING ||
1633 strcmp(r->element[0]->str,"message") != 0) return;
1634
1635 /* We are not interested in meeting ourselves */
1636 if (strstr(r->element[2]->str,server.runid) != NULL) return;
1637
1638 {
1639 int numtokens, port, removed, canfailover;
1640 char **token = sdssplitlen(r->element[2]->str,
1641 r->element[2]->len,
1642 ":",1,&numtokens);
1643 sentinelRedisInstance *sentinel;
1644
1645 if (numtokens == 4) {
1646 /* First, try to see if we already have this sentinel. */
1647 port = atoi(token[1]);
1648 canfailover = atoi(token[3]);
1649 sentinel = getSentinelRedisInstanceByAddrAndRunID(
1650 ri->sentinels,token[0],port,token[2]);
1651
1652 if (!sentinel) {
1653 /* If not, remove all the sentinels that have the same runid
1654 * OR the same ip/port, because it's either a restart or a
1655 * network topology change. */
1656 removed = removeMatchingSentinelsFromMaster(ri,token[0],port,
1657 token[2]);
1658 if (removed) {
1659 sentinelEvent(REDIS_NOTICE,"-dup-sentinel",ri,
1660 "%@ #duplicate of %s:%d or %s",
1661 token[0],port,token[2]);
1662 }
1663
1664 /* Add the new sentinel. */
1665 sentinel = createSentinelRedisInstance(NULL,SRI_SENTINEL,
1666 token[0],port,ri->quorum,ri);
1667 if (sentinel) {
1668 sentinelEvent(REDIS_NOTICE,"+sentinel",sentinel,"%@");
1669 /* The runid is NULL after a new instance creation and
1670 * for Sentinels we don't have a later chance to fill it,
1671 * so do it now. */
1672 sentinel->runid = sdsnew(token[2]);
1673 }
1674 }
1675
1676 /* Update the state of the Sentinel. */
1677 if (sentinel) {
1678 sentinel->last_hello_time = mstime();
1679 if (canfailover)
1680 sentinel->flags |= SRI_CAN_FAILOVER;
1681 else
1682 sentinel->flags &= ~SRI_CAN_FAILOVER;
1683 }
1684 }
1685 sdsfreesplitres(token,numtokens);
1686 }
1687}
1688
1689void sentinelPingInstance(sentinelRedisInstance *ri) {
1690 mstime_t now = mstime();
1691 mstime_t info_period;
1692 int retval;
1693
1694 /* Return ASAP if we have already a PING or INFO already pending, or
1695 * in the case the instance is not properly connected. */
1696 if (ri->flags & SRI_DISCONNECTED) return;
1697
1698 /* For INFO, PING, PUBLISH that are not critical commands to send we
1699 * also have a limit of SENTINEL_MAX_PENDING_COMMANDS. We don't
1700 * want to use a lot of memory just because a link is not working
1701 * properly (note that anyway there is a redundant protection about this,
1702 * that is, the link will be disconnected and reconnected if a long
1703 * timeout condition is detected. */
1704 if (ri->pending_commands >= SENTINEL_MAX_PENDING_COMMANDS) return;
1705
1706 /* If this is a slave of a master in O_DOWN condition we start sending
1707 * it INFO every second, instead of the usual SENTINEL_INFO_PERIOD
1708 * period. In this state we want to closely monitor slaves in case they
1709 * are turned into masters by another Sentinel, or by the sysadmin. */
1710 if ((ri->flags & SRI_SLAVE) &&
1711 (ri->master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS))) {
1712 info_period = 1000;
1713 } else {
1714 info_period = SENTINEL_INFO_PERIOD;
1715 }
1716
1717 if ((ri->flags & SRI_SENTINEL) == 0 &&
1718 (ri->info_refresh == 0 ||
1719 (now - ri->info_refresh) > info_period))
1720 {
1721 /* Send INFO to masters and slaves, not sentinels. */
1722 retval = redisAsyncCommand(ri->cc,
1723 sentinelInfoReplyCallback, NULL, "INFO");
1724 if (retval != REDIS_OK) return;
1725 ri->pending_commands++;
1726 } else if ((now - ri->last_pong_time) > SENTINEL_PING_PERIOD) {
1727 /* Send PING to all the three kinds of instances. */
1728 retval = redisAsyncCommand(ri->cc,
1729 sentinelPingReplyCallback, NULL, "PING");
1730 if (retval != REDIS_OK) return;
1731 ri->pending_commands++;
1732 } else if ((ri->flags & SRI_MASTER) &&
1733 (now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD)
1734 {
1735 /* PUBLISH hello messages only to masters. */
1736 struct sockaddr_in sa;
1737 socklen_t salen = sizeof(sa);
1738
1739 if (getsockname(ri->cc->c.fd,(struct sockaddr*)&sa,&salen) != -1) {
1740 char myaddr[128];
1741
1742 snprintf(myaddr,sizeof(myaddr),"%s:%d:%s:%d",
1743 inet_ntoa(sa.sin_addr), server.port, server.runid,
1744 (ri->flags & SRI_CAN_FAILOVER) != 0);
1745 retval = redisAsyncCommand(ri->cc,
1746 sentinelPublishReplyCallback, NULL, "PUBLISH %s %s",
1747 SENTINEL_HELLO_CHANNEL,myaddr);
1748 if (retval != REDIS_OK) return;
1749 ri->pending_commands++;
1750 }
1751 }
1752}
1753
1754/* =========================== SENTINEL command ============================= */
1755
1756const char *sentinelFailoverStateStr(int state) {
1757 switch(state) {
1758 case SENTINEL_FAILOVER_STATE_NONE: return "none";
1759 case SENTINEL_FAILOVER_STATE_WAIT_START: return "wait_start";
1760 case SENTINEL_FAILOVER_STATE_SELECT_SLAVE: return "select_slave";
1761 case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE: return "send_slaveof_noone";
1762 case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: return "wait_promotion";
1763 case SENTINEL_FAILOVER_STATE_RECONF_SLAVES: return "reconf_slaves";
1764 case SENTINEL_FAILOVER_STATE_ALERT_CLIENTS: return "alert_clients";
1765 case SENTINEL_FAILOVER_STATE_DETECT_END: return "detect_end";
1766 case SENTINEL_FAILOVER_STATE_UPDATE_CONFIG: return "update_config";
1767 default: return "unknown";
1768 }
1769}
1770
1771/* Redis instance to Redis protocol representation. */
1772void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) {
1773 char *flags = sdsempty();
1774 void *mbl;
1775 int fields = 0;
1776
1777 mbl = addDeferredMultiBulkLength(c);
1778
1779 addReplyBulkCString(c,"name");
1780 addReplyBulkCString(c,ri->name);
1781 fields++;
1782
1783 addReplyBulkCString(c,"ip");
1784 addReplyBulkCString(c,ri->addr->ip);
1785 fields++;
1786
1787 addReplyBulkCString(c,"port");
1788 addReplyBulkLongLong(c,ri->addr->port);
1789 fields++;
1790
1791 addReplyBulkCString(c,"runid");
1792 addReplyBulkCString(c,ri->runid ? ri->runid : "");
1793 fields++;
1794
1795 addReplyBulkCString(c,"flags");
1796 if (ri->flags & SRI_S_DOWN) flags = sdscat(flags,"s_down,");
1797 if (ri->flags & SRI_O_DOWN) flags = sdscat(flags,"o_down,");
1798 if (ri->flags & SRI_MASTER) flags = sdscat(flags,"master,");
1799 if (ri->flags & SRI_SLAVE) flags = sdscat(flags,"slave,");
1800 if (ri->flags & SRI_SENTINEL) flags = sdscat(flags,"sentinel,");
1801 if (ri->flags & SRI_DISCONNECTED) flags = sdscat(flags,"disconnected,");
1802 if (ri->flags & SRI_MASTER_DOWN) flags = sdscat(flags,"master_down,");
1803 if (ri->flags & SRI_FAILOVER_IN_PROGRESS)
1804 flags = sdscat(flags,"failover_in_progress,");
1805 if (ri->flags & SRI_I_AM_THE_LEADER)
1806 flags = sdscat(flags,"i_am_the_leader,");
1807 if (ri->flags & SRI_PROMOTED) flags = sdscat(flags,"promoted,");
1808 if (ri->flags & SRI_RECONF_SENT) flags = sdscat(flags,"reconf_sent,");
1809 if (ri->flags & SRI_RECONF_INPROG) flags = sdscat(flags,"reconf_inprog,");
1810 if (ri->flags & SRI_RECONF_DONE) flags = sdscat(flags,"reconf_done,");
1811
1812 if (sdslen(flags) != 0) flags = sdsrange(flags,0,-2); /* remove last "," */
1813 addReplyBulkCString(c,flags);
1814 sdsfree(flags);
1815 fields++;
1816
1817 addReplyBulkCString(c,"pending-commands");
1818 addReplyBulkLongLong(c,ri->pending_commands);
1819 fields++;
1820
1821 if (ri->flags & SRI_FAILOVER_IN_PROGRESS) {
1822 addReplyBulkCString(c,"failover-state");
1823 addReplyBulkCString(c,(char*)sentinelFailoverStateStr(ri->failover_state));
1824 fields++;
1825 }
1826
1827 addReplyBulkCString(c,"last-ok-ping-reply");
1828 addReplyBulkLongLong(c,mstime() - ri->last_avail_time);
1829 fields++;
1830
1831 addReplyBulkCString(c,"last-ping-reply");
1832 addReplyBulkLongLong(c,mstime() - ri->last_pong_time);
1833 fields++;
1834
1835 if (ri->flags & SRI_S_DOWN) {
1836 addReplyBulkCString(c,"s-down-time");
1837 addReplyBulkLongLong(c,mstime()-ri->s_down_since_time);
1838 fields++;
1839 }
1840
1841 if (ri->flags & SRI_O_DOWN) {
1842 addReplyBulkCString(c,"o-down-time");
1843 addReplyBulkLongLong(c,mstime()-ri->o_down_since_time);
1844 fields++;
1845 }
1846
1847 /* Masters and Slaves */
1848 if (ri->flags & (SRI_MASTER|SRI_SLAVE)) {
1849 addReplyBulkCString(c,"info-refresh");
1850 addReplyBulkLongLong(c,mstime() - ri->info_refresh);
1851 fields++;
1852 }
1853
1854 /* Only masters */
1855 if (ri->flags & SRI_MASTER) {
1856 addReplyBulkCString(c,"num-slaves");
1857 addReplyBulkLongLong(c,dictSize(ri->slaves));
1858 fields++;
1859
1860 addReplyBulkCString(c,"num-other-sentinels");
1861 addReplyBulkLongLong(c,dictSize(ri->sentinels));
1862 fields++;
1863
1864 addReplyBulkCString(c,"quorum");
1865 addReplyBulkLongLong(c,ri->quorum);
1866 fields++;
1867 }
1868
1869 /* Only slaves */
1870 if (ri->flags & SRI_SLAVE) {
1871 addReplyBulkCString(c,"master-link-down-time");
1872 addReplyBulkLongLong(c,ri->master_link_down_time);
1873 fields++;
1874
1875 addReplyBulkCString(c,"master-link-status");
1876 addReplyBulkCString(c,
1877 (ri->slave_master_link_status == SENTINEL_MASTER_LINK_STATUS_UP) ?
1878 "ok" : "err");
1879 fields++;
1880
1881 addReplyBulkCString(c,"master-host");
1882 addReplyBulkCString(c,
1883 ri->slave_master_host ? ri->slave_master_host : "?");
1884 fields++;
1885
1886 addReplyBulkCString(c,"master-port");
1887 addReplyBulkLongLong(c,ri->slave_master_port);
1888 fields++;
3ec701e0 1889
1890 addReplyBulkCString(c,"slave-priority");
1891 addReplyBulkLongLong(c,ri->slave_priority);
1892 fields++;
6b5daa2d 1893 }
1894
1895 /* Only sentinels */
1896 if (ri->flags & SRI_SENTINEL) {
1897 addReplyBulkCString(c,"last-hello-message");
1898 addReplyBulkLongLong(c,mstime() - ri->last_hello_time);
1899 fields++;
1900
1901 addReplyBulkCString(c,"can-failover-its-master");
1902 addReplyBulkLongLong(c,(ri->flags & SRI_CAN_FAILOVER) != 0);
1903 fields++;
1904
1905 if (ri->flags & SRI_MASTER_DOWN) {
1906 addReplyBulkCString(c,"subjective-leader");
1907 addReplyBulkCString(c,ri->leader ? ri->leader : "?");
1908 fields++;
1909 }
1910 }
1911
1912 setDeferredMultiBulkLength(c,mbl,fields*2);
1913}
1914
1915/* Output a number of instances contanined inside a dictionary as
1916 * Redis protocol. */
1917void addReplyDictOfRedisInstances(redisClient *c, dict *instances) {
1918 dictIterator *di;
1919 dictEntry *de;
1920
1921 di = dictGetIterator(instances);
1922 addReplyMultiBulkLen(c,dictSize(instances));
1923 while((de = dictNext(di)) != NULL) {
1924 sentinelRedisInstance *ri = dictGetVal(de);
1925
1926 addReplySentinelRedisInstance(c,ri);
1927 }
1928 dictReleaseIterator(di);
1929}
1930
1931/* Lookup the named master into sentinel.masters.
1932 * If the master is not found reply to the client with an error and returns
1933 * NULL. */
1934sentinelRedisInstance *sentinelGetMasterByNameOrReplyError(redisClient *c,
1935 robj *name)
1936{
1937 sentinelRedisInstance *ri;
1938
1939 ri = dictFetchValue(sentinel.masters,c->argv[2]->ptr);
1940 if (!ri) {
1941 addReplyError(c,"No such master with that name");
1942 return NULL;
1943 }
1944 return ri;
1945}
1946
1947void sentinelCommand(redisClient *c) {
1948 if (!strcasecmp(c->argv[1]->ptr,"masters")) {
1949 /* SENTINEL MASTERS */
1950 if (c->argc != 2) goto numargserr;
1951
1952 addReplyDictOfRedisInstances(c,sentinel.masters);
1953 } else if (!strcasecmp(c->argv[1]->ptr,"slaves")) {
1954 /* SENTINEL SLAVES <master-name> */
1955 sentinelRedisInstance *ri;
1956
1957 if (c->argc != 3) goto numargserr;
1958 if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) == NULL)
1959 return;
1960 addReplyDictOfRedisInstances(c,ri->slaves);
1961 } else if (!strcasecmp(c->argv[1]->ptr,"sentinels")) {
1962 /* SENTINEL SENTINELS <master-name> */
1963 sentinelRedisInstance *ri;
1964
1965 if (c->argc != 3) goto numargserr;
1966 if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) == NULL)
1967 return;
1968 addReplyDictOfRedisInstances(c,ri->sentinels);
1969 } else if (!strcasecmp(c->argv[1]->ptr,"is-master-down-by-addr")) {
1970 /* SENTINEL IS-MASTER-DOWN-BY-ADDR <ip> <port> */
1971 sentinelRedisInstance *ri;
1972 char *leader = NULL;
1973 long port;
1974 int isdown = 0;
1975
1976 if (c->argc != 4) goto numargserr;
1977 if (getLongFromObjectOrReply(c,c->argv[3],&port,NULL) != REDIS_OK)
1978 return;
1979 ri = getSentinelRedisInstanceByAddrAndRunID(sentinel.masters,
1980 c->argv[2]->ptr,port,NULL);
1981
1982 /* It exists? Is actually a master? Is subjectively down? It's down.
1983 * Note: if we are in tilt mode we always reply with "0". */
1984 if (!sentinel.tilt && ri && (ri->flags & SRI_S_DOWN) &&
1985 (ri->flags & SRI_MASTER))
1986 isdown = 1;
1987 if (ri) leader = sentinelGetSubjectiveLeader(ri);
1988
1989 /* Reply with a two-elements multi-bulk reply: down state, leader. */
1990 addReplyMultiBulkLen(c,2);
1991 addReply(c, isdown ? shared.cone : shared.czero);
1992 addReplyBulkCString(c, leader ? leader : "?");
1993 if (leader) sdsfree(leader);
1994 } else if (!strcasecmp(c->argv[1]->ptr,"reset")) {
1995 /* SENTINEL RESET <pattern> */
1996 if (c->argc != 3) goto numargserr;
75fb6e5b 1997 addReplyLongLong(c,sentinelResetMastersByPattern(c->argv[2]->ptr,SENTINEL_GENERATE_EVENT));
6b5daa2d 1998 } else if (!strcasecmp(c->argv[1]->ptr,"get-master-addr-by-name")) {
1999 /* SENTINEL GET-MASTER-ADDR-BY-NAME <master-name> */
2000 sentinelRedisInstance *ri;
2001
2002 if (c->argc != 3) goto numargserr;
2003 ri = sentinelGetMasterByName(c->argv[2]->ptr);
2004 if (ri == NULL) {
2005 addReply(c,shared.nullmultibulk);
2006 } else {
2007 sentinelAddr *addr = ri->addr;
2008
2009 if ((ri->flags & SRI_FAILOVER_IN_PROGRESS) && ri->promoted_slave)
2010 addr = ri->promoted_slave->addr;
2011 addReplyMultiBulkLen(c,2);
2012 addReplyBulkCString(c,addr->ip);
2013 addReplyBulkLongLong(c,addr->port);
2014 }
cada7f96 2015 } else if (!strcasecmp(c->argv[1]->ptr,"failover")) {
2016 /* SENTINEL FAILOVER <master-name> */
2017 sentinelRedisInstance *ri;
2018
2019 if (c->argc != 3) goto numargserr;
2020 if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) == NULL)
2021 return;
2022 if (ri->flags & SRI_FAILOVER_IN_PROGRESS) {
2023 addReplySds(c,sdsnew("-INPROG Failover already in progress\r\n"));
2024 return;
2025 }
2026 if (sentinelSelectSlave(ri) == NULL) {
2027 addReplySds(c,sdsnew("-NOGOODSLAVE No suitable slave to promote\r\n"));
2028 return;
2029 }
2030 sentinelStartFailover(ri,SENTINEL_FAILOVER_STATE_WAIT_START);
2031 ri->flags |= SRI_FORCE_FAILOVER;
2032 addReply(c,shared.ok);
3f194a9d 2033 } else if (!strcasecmp(c->argv[1]->ptr,"pending-scripts")) {
2034 /* SENTINEL PENDING-SCRIPTS */
2035
2036 if (c->argc != 2) goto numargserr;
2037 sentinelPendingScriptsCommand(c);
6b5daa2d 2038 } else {
2039 addReplyErrorFormat(c,"Unknown sentinel subcommand '%s'",
2040 (char*)c->argv[1]->ptr);
2041 }
2042 return;
2043
2044numargserr:
2045 addReplyErrorFormat(c,"Wrong number of commands for 'sentinel %s'",
2046 (char*)c->argv[1]->ptr);
2047}
2048
2049/* ===================== SENTINEL availability checks ======================= */
2050
2051/* Is this instance down from our point of view? */
2052void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
2053 mstime_t elapsed = mstime() - ri->last_avail_time;
2054
2055 /* Check if we are in need for a reconnection of one of the
2056 * links, because we are detecting low activity.
2057 *
2058 * 1) Check if the command link seems connected, was connected not less
2059 * than SENTINEL_MIN_LINK_RECONNECT_PERIOD, but still we have an
2060 * idle time that is greater than down_after_period / 2 seconds. */
2061 if (ri->cc &&
2062 (mstime() - ri->cc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
2063 (mstime() - ri->last_pong_time) > (ri->down_after_period/2))
2064 {
75fb6e5b 2065 sentinelKillLink(ri,ri->cc);
6b5daa2d 2066 }
2067
2068 /* 2) Check if the pubsub link seems connected, was connected not less
2069 * than SENTINEL_MIN_LINK_RECONNECT_PERIOD, but still we have no
2070 * activity in the Pub/Sub channel for more than
2071 * SENTINEL_PUBLISH_PERIOD * 3.
2072 */
2073 if (ri->pc &&
2074 (mstime() - ri->pc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
2075 (mstime() - ri->pc_last_activity) > (SENTINEL_PUBLISH_PERIOD*3))
2076 {
75fb6e5b 2077 sentinelKillLink(ri,ri->pc);
6b5daa2d 2078 }
2079
2080 /* Update the subjectively down flag. */
2081 if (elapsed > ri->down_after_period) {
2082 /* Is subjectively down */
2083 if ((ri->flags & SRI_S_DOWN) == 0) {
2084 sentinelEvent(REDIS_WARNING,"+sdown",ri,"%@");
2085 ri->s_down_since_time = mstime();
2086 ri->flags |= SRI_S_DOWN;
2087 }
2088 } else {
2089 /* Is subjectively up */
2090 if (ri->flags & SRI_S_DOWN) {
2091 sentinelEvent(REDIS_WARNING,"-sdown",ri,"%@");
850789ce 2092 ri->flags &= ~(SRI_S_DOWN|SRI_SCRIPT_KILL_SENT);
6b5daa2d 2093 }
2094 }
2095}
2096
2097/* Is this instance down accordingly to the configured quorum? */
2098void sentinelCheckObjectivelyDown(sentinelRedisInstance *master) {
2099 dictIterator *di;
2100 dictEntry *de;
2101 int quorum = 0, odown = 0;
2102
2103 if (master->flags & SRI_S_DOWN) {
2104 /* Is down for enough sentinels? */
2105 quorum = 1; /* the current sentinel. */
2106 /* Count all the other sentinels. */
2107 di = dictGetIterator(master->sentinels);
2108 while((de = dictNext(di)) != NULL) {
2109 sentinelRedisInstance *ri = dictGetVal(de);
2110
2111 if (ri->flags & SRI_MASTER_DOWN) quorum++;
2112 }
2113 dictReleaseIterator(di);
2114 if (quorum >= master->quorum) odown = 1;
2115 }
2116
2117 /* Set the flag accordingly to the outcome. */
2118 if (odown) {
2119 if ((master->flags & SRI_O_DOWN) == 0) {
2120 sentinelEvent(REDIS_WARNING,"+odown",master,"%@ #quorum %d/%d",
2121 quorum, master->quorum);
2122 master->flags |= SRI_O_DOWN;
2123 master->o_down_since_time = mstime();
2124 }
2125 } else {
2126 if (master->flags & SRI_O_DOWN) {
2127 sentinelEvent(REDIS_WARNING,"-odown",master,"%@");
2128 master->flags &= ~SRI_O_DOWN;
2129 }
2130 }
2131}
2132
2133/* Receive the SENTINEL is-master-down-by-addr reply, see the
2134 * sentinelAskMasterStateToOtherSentinels() function for more information. */
2135void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *privdata) {
2136 sentinelRedisInstance *ri = c->data;
2137 redisReply *r;
2138
2179c269 2139 if (ri) ri->pending_commands--;
2140 if (!reply || !ri) return;
6b5daa2d 2141 r = reply;
2142
2143 /* Ignore every error or unexpected reply.
2144 * Note that if the command returns an error for any reason we'll
2145 * end clearing the SRI_MASTER_DOWN flag for timeout anyway. */
2146 if (r->type == REDIS_REPLY_ARRAY && r->elements == 2 &&
2147 r->element[0]->type == REDIS_REPLY_INTEGER &&
2148 r->element[1]->type == REDIS_REPLY_STRING)
2149 {
2150 ri->last_master_down_reply_time = mstime();
2151 if (r->element[0]->integer == 1) {
2152 ri->flags |= SRI_MASTER_DOWN;
2153 } else {
2154 ri->flags &= ~SRI_MASTER_DOWN;
2155 }
2156 sdsfree(ri->leader);
2157 ri->leader = sdsnew(r->element[1]->str);
2158 }
2159}
2160
2161/* If we think (subjectively) the master is down, we start sending
2162 * SENTINEL IS-MASTER-DOWN-BY-ADDR requests to other sentinels
2163 * in order to get the replies that allow to reach the quorum and
2164 * possibly also mark the master as objectively down. */
2165void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) {
2166 dictIterator *di;
2167 dictEntry *de;
2168
2169 di = dictGetIterator(master->sentinels);
2170 while((de = dictNext(di)) != NULL) {
2171 sentinelRedisInstance *ri = dictGetVal(de);
2172 mstime_t elapsed = mstime() - ri->last_master_down_reply_time;
2173 char port[32];
2174 int retval;
2175
2176 /* If the master state from other sentinel is too old, we clear it. */
2177 if (elapsed > SENTINEL_INFO_VALIDITY_TIME) {
2178 ri->flags &= ~SRI_MASTER_DOWN;
2179 sdsfree(ri->leader);
2180 ri->leader = NULL;
2181 }
2182
2183 /* Only ask if master is down to other sentinels if:
2184 *
2185 * 1) We believe it is down, or there is a failover in progress.
2186 * 2) Sentinel is connected.
2187 * 3) We did not received the info within SENTINEL_ASK_PERIOD ms. */
2188 if ((master->flags & (SRI_S_DOWN|SRI_FAILOVER_IN_PROGRESS)) == 0)
2189 continue;
2190 if (ri->flags & SRI_DISCONNECTED) continue;
2191 if (mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD)
2192 continue;
2193
2194 /* Ask */
2195 ll2string(port,sizeof(port),master->addr->port);
2196 retval = redisAsyncCommand(ri->cc,
2197 sentinelReceiveIsMasterDownReply, NULL,
2198 "SENTINEL is-master-down-by-addr %s %s",
2199 master->addr->ip, port);
2200 if (retval == REDIS_OK) ri->pending_commands++;
2201 }
2202 dictReleaseIterator(di);
2203}
2204
2205/* =============================== FAILOVER ================================= */
2206
2207/* Given a master get the "subjective leader", that is, among all the sentinels
2208 * with given characteristics, the one with the lexicographically smaller
2209 * runid. The characteristics required are:
2210 *
2211 * 1) Has SRI_CAN_FAILOVER flag.
2212 * 2) Is not disconnected.
2213 * 3) Recently answered to our ping (no longer than
2214 * SENTINEL_INFO_VALIDITY_TIME milliseconds ago).
2215 *
2216 * The function returns a pointer to an sds string representing the runid of the
2217 * leader sentinel instance (from our point of view). Otherwise NULL is
2218 * returned if there are no suitable sentinels.
2219 */
2220
/* qsort() comparison function for arrays of C strings holding run IDs.
 * 'a' and 'b' point to the array elements (that is, to char pointers), and
 * the strings are compared ignoring case. */
int compareRunID(const void *a, const void *b) {
    const char *left = *(char * const *)a;
    const char *right = *(char * const *)b;

    return strcasecmp(left,right);
}
2225
2226char *sentinelGetSubjectiveLeader(sentinelRedisInstance *master) {
2227 dictIterator *di;
2228 dictEntry *de;
2229 char **instance =
2230 zmalloc(sizeof(char*)*(dictSize(master->sentinels)+1));
2231 int instances = 0;
2232 char *leader = NULL;
2233
2234 if (master->flags & SRI_CAN_FAILOVER) {
2235 /* Add myself if I'm a Sentinel that can failover this master. */
2236 instance[instances++] = server.runid;
2237 }
2238
2239 di = dictGetIterator(master->sentinels);
2240 while((de = dictNext(di)) != NULL) {
2241 sentinelRedisInstance *ri = dictGetVal(de);
2242 mstime_t lag = mstime() - ri->last_avail_time;
2243
2244 if (lag > SENTINEL_INFO_VALIDITY_TIME ||
2245 !(ri->flags & SRI_CAN_FAILOVER) ||
2246 (ri->flags & SRI_DISCONNECTED) ||
2247 ri->runid == NULL)
2248 continue;
2249 instance[instances++] = ri->runid;
2250 }
2251 dictReleaseIterator(di);
2252
2253 /* If we have at least one instance passing our checks, order the array
2254 * by runid. */
2255 if (instances) {
2256 qsort(instance,instances,sizeof(char*),compareRunID);
2257 leader = sdsnew(instance[0]);
2258 }
2259 zfree(instance);
2260 return leader;
2261}
2262
/* A run ID paired with a vote counter. */
struct sentinelLeader {
    char *runid;            /* Run ID the votes refer to. */
    unsigned long votes;    /* Number of votes. */
};
2267
2268/* Helper function for sentinelGetObjectiveLeader, increment the counter
2269 * relative to the specified runid. */
2270void sentinelObjectiveLeaderIncr(dict *counters, char *runid) {
2271 dictEntry *de = dictFind(counters,runid);
2272 uint64_t oldval;
2273
2274 if (de) {
2275 oldval = dictGetUnsignedIntegerVal(de);
2276 dictSetUnsignedIntegerVal(de,oldval+1);
2277 } else {
2278 de = dictAddRaw(counters,runid);
2279 redisAssert(de != NULL);
2280 dictSetUnsignedIntegerVal(de,1);
2281 }
2282}
2283
2284/* Scan all the Sentinels attached to this master to check what is the
2285 * most voted leader among Sentinels. */
2286char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) {
2287 dict *counters;
2288 dictIterator *di;
2289 dictEntry *de;
2290 unsigned int voters = 0, voters_quorum;
2291 char *myvote;
2292 char *winner = NULL;
2293
2294 redisAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS));
2295 counters = dictCreate(&leaderVotesDictType,NULL);
2296
2297 /* Count my vote. */
2298 myvote = sentinelGetSubjectiveLeader(master);
2299 if (myvote) {
2300 sentinelObjectiveLeaderIncr(counters,myvote);
2301 voters++;
2302 }
2303
2304 /* Count other sentinels votes */
2305 di = dictGetIterator(master->sentinels);
2306 while((de = dictNext(di)) != NULL) {
2307 sentinelRedisInstance *ri = dictGetVal(de);
2308 if (ri->leader == NULL) continue;
2309 /* If the failover is not already in progress we are only interested
2310 * in Sentinels that believe the master is down. Otherwise the leader
2311 * selection is useful for the "failover-takedown" when the original
2312 * leader fails. In that case we consider all the voters. */
2313 if (!(master->flags & SRI_FAILOVER_IN_PROGRESS) &&
2314 !(ri->flags & SRI_MASTER_DOWN)) continue;
2315 sentinelObjectiveLeaderIncr(counters,ri->leader);
2316 voters++;
2317 }
2318 dictReleaseIterator(di);
2319 voters_quorum = voters/2+1;
2320
2321 /* Check what's the winner. For the winner to win, it needs two conditions:
2322 * 1) Absolute majority between voters (50% + 1).
2323 * 2) And anyway at least master->quorum votes. */
2324 {
2325 uint64_t max_votes = 0; /* Max votes so far. */
2326
2327 di = dictGetIterator(counters);
2328 while((de = dictNext(di)) != NULL) {
2329 uint64_t votes = dictGetUnsignedIntegerVal(de);
2330
2331 if (max_votes < votes) {
2332 max_votes = votes;
2333 winner = dictGetKey(de);
2334 }
2335 }
2336 dictReleaseIterator(di);
2337 if (winner && (max_votes < voters_quorum || max_votes < master->quorum))
2338 winner = NULL;
2339 }
2340 winner = winner ? sdsnew(winner) : NULL;
2341 sdsfree(myvote);
2342 dictRelease(counters);
2343 return winner;
2344}
2345
cada7f96 2346/* Setup the master state to start a failover as a leader.
2347 *
2348 * State can be either:
2349 *
2350 * SENTINEL_FAILOVER_STATE_WAIT_START: starts a failover from scratch.
2351 * SENTINEL_FAILOVER_STATE_RECONF_SLAVES: takedown a failed failover.
2352 */
2353void sentinelStartFailover(sentinelRedisInstance *master, int state) {
2354 redisAssert(master->flags & SRI_MASTER);
2355 redisAssert(state == SENTINEL_FAILOVER_STATE_WAIT_START ||
2356 state == SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
2357
2358 master->failover_state = state;
2359 master->flags |= SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER;
2360 sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@");
2361
2362 /* Pick a random delay if it's a fresh failover (WAIT_START), and not
2363 * a recovery of a failover started by another sentinel. */
2364 if (master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) {
2365 master->failover_start_time = mstime() +
2366 SENTINEL_FAILOVER_FIXED_DELAY +
2367 (rand() % SENTINEL_FAILOVER_MAX_RANDOM_DELAY);
2368 sentinelEvent(REDIS_WARNING,"+failover-state-wait-start",master,
2369 "%@ #starting in %lld milliseconds",
2370 master->failover_start_time-mstime());
2371 }
2372 master->failover_state_change_time = mstime();
2373}
2374
6b5daa2d 2375/* This function checks if there are the conditions to start the failover,
2376 * that is:
2377 *
2378 * 1) Enough time has passed since O_DOWN.
2379 * 2) The master is marked as SRI_CAN_FAILOVER, so we can failover it.
2380 * 3) We are the objectively leader for this master.
2381 *
2382 * If the conditions are met we flag the master as SRI_FAILOVER_IN_PROGRESS
2383 * and SRI_I_AM_THE_LEADER.
2384 */
cada7f96 2385void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) {
6b5daa2d 2386 char *leader;
2387 int isleader;
2388
2389 /* We can't failover if the master is not in O_DOWN state or if
2390 * there is not already a failover in progress (to perform the
2391 * takedown if the leader died) or if this Sentinel is not allowed
2392 * to start a failover. */
2393 if (!(master->flags & SRI_CAN_FAILOVER) ||
2394 !(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS))) return;
2395
2396 leader = sentinelGetObjectiveLeader(master);
2397 isleader = leader && strcasecmp(leader,server.runid) == 0;
2398 sdsfree(leader);
2399
2400 /* If I'm not the leader, I can't failover for sure. */
2401 if (!isleader) return;
2402
2403 /* If the failover is already in progress there are two options... */
2404 if (master->flags & SRI_FAILOVER_IN_PROGRESS) {
2405 if (master->flags & SRI_I_AM_THE_LEADER) {
2406 /* 1) I'm flagged as leader so I already started the failover.
2407 * Just return. */
2408 return;
2409 } else {
2410 mstime_t elapsed = mstime() - master->failover_state_change_time;
2411
2412 /* 2) I'm the new leader, but I'm not flagged as leader in the
2413 * master: I did not started the failover, but the original
2414 * leader has no longer the leadership.
2415 *
2416 * In this case if the failover appears to be lagging
2417 * for at least 25% of the configured failover timeout,
2418 * I can assume I can take control. Otherwise
2419 * it's better to return and wait more. */
2420 if (elapsed < (master->failover_timeout/4)) return;
2421 sentinelEvent(REDIS_WARNING,"+failover-takedown",master,"%@");
2422 /* We have already an elected slave if we are in
2423 * FAILOVER_IN_PROGRESS state, that is, the slave that we
2424 * observed turning into a master. */
cada7f96 2425 sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
6b5daa2d 2426 /* As an observer we flagged all the slaves as RECONF_SENT but
2427 * now we are in charge of actually sending the reconfiguration
2428 * command so let's clear this flag for all the instances. */
2429 sentinelDelFlagsToDictOfRedisInstances(master->slaves,
2430 SRI_RECONF_SENT);
2431 }
2432 } else {
ce7b838f 2433 /* Brand new failover as SRI_FAILOVER_IN_PROGRESS was not set.
2434 *
2435 * Do we have a slave to promote? Otherwise don't start a failover
2436 * at all. */
2437 if (sentinelSelectSlave(master) == NULL) return;
cada7f96 2438 sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_WAIT_START);
6b5daa2d 2439 }
6b5daa2d 2440}
2441
/* Select a suitable slave to promote. The current algorithm only uses
 * the following parameters:
 *
 * 1) None of the following conditions: S_DOWN, O_DOWN, DISCONNECTED.
 * 2) last_avail_time more recent than SENTINEL_INFO_VALIDITY_TIME.
 * 3) info_refresh more recent than SENTINEL_INFO_VALIDITY_TIME.
 * 4) master_link_down_time no more than:
 *    (now - master->s_down_since_time) + (master->down_after_period * 10).
 * 5) Slave priority can't be zero, otherwise the slave is discarded.
 *
 * Among all the slaves matching the above conditions we select the slave
 * with lower slave_priority. If priority is the same we select the slave
 * with lexicographically smaller runid.
 *
 * The function returns the pointer to the selected slave, otherwise
 * NULL if no suitable slave was found.
 */
2459
2460int compareSlavesForPromotion(const void *a, const void *b) {
2461 sentinelRedisInstance **sa = (sentinelRedisInstance **)a,
2462 **sb = (sentinelRedisInstance **)b;
2463 if ((*sa)->slave_priority != (*sb)->slave_priority)
2464 return (*sa)->slave_priority - (*sb)->slave_priority;
2465 return strcasecmp((*sa)->runid,(*sb)->runid);
2466}
2467
2468sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
2469 sentinelRedisInstance **instance =
2470 zmalloc(sizeof(instance[0])*dictSize(master->slaves));
2471 sentinelRedisInstance *selected = NULL;
2472 int instances = 0;
2473 dictIterator *di;
2474 dictEntry *de;
cada7f96 2475 mstime_t max_master_down_time = 0;
6b5daa2d 2476
cada7f96 2477 if (master->flags & SRI_S_DOWN)
2478 max_master_down_time += mstime() - master->s_down_since_time;
2479 max_master_down_time += master->down_after_period * 10;
6b5daa2d 2480
2481 di = dictGetIterator(master->slaves);
2482 while((de = dictNext(di)) != NULL) {
2483 sentinelRedisInstance *slave = dictGetVal(de);
2484 mstime_t info_validity_time = mstime()-SENTINEL_INFO_VALIDITY_TIME;
2485
2486 if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_DISCONNECTED)) continue;
2487 if (slave->last_avail_time < info_validity_time) continue;
3ec701e0 2488 if (slave->slave_priority == 0) continue;
cada7f96 2489
2490 /* If the master is in SDOWN state we get INFO for slaves every second.
2491 * Otherwise we get it with the usual period so we need to account for
2492 * a larger delay. */
2493 if ((master->flags & SRI_S_DOWN) == 0)
2494 info_validity_time -= SENTINEL_INFO_PERIOD;
6b5daa2d 2495 if (slave->info_refresh < info_validity_time) continue;
2496 if (slave->master_link_down_time > max_master_down_time) continue;
2497 instance[instances++] = slave;
2498 }
2499 dictReleaseIterator(di);
2500 if (instances) {
2501 qsort(instance,instances,sizeof(sentinelRedisInstance*),
2502 compareSlavesForPromotion);
2503 selected = instance[0];
2504 }
2505 zfree(instance);
2506 return selected;
2507}
2508
2509/* ---------------- Failover state machine implementation ------------------- */
2510void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
75084e05 2511 /* If we in "wait start" but the master is no longer in ODOWN nor in
2512 * SDOWN condition we abort the failover. This is important as it
2513 * prevents a useless failover in a a notable case of netsplit, where
2514 * the senitnels are split from the redis instances. In this case
2515 * the failover will not start while there is the split because no
2516 * good slave can be reached. However when the split is resolved, we
2517 * can go to waitstart if the slave is back rechable a few milliseconds
2518 * before the master is. In that case when the master is back online
2519 * we cancel the failover. */
cada7f96 2520 if ((ri->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_FORCE_FAILOVER)) == 0) {
75084e05 2521 sentinelEvent(REDIS_WARNING,"-failover-abort-master-is-back",
2522 ri,"%@");
2523 sentinelAbortFailover(ri);
2524 return;
2525 }
2526
2527 /* Start the failover going to the next state if enough time has
2528 * elapsed. */
6b5daa2d 2529 if (mstime() >= ri->failover_start_time) {
2530 ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
2531 ri->failover_state_change_time = mstime();
2532 sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@");
2533 }
2534}
2535
2536void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) {
2537 sentinelRedisInstance *slave = sentinelSelectSlave(ri);
2538
2539 if (slave == NULL) {
672102c2 2540 sentinelEvent(REDIS_WARNING,"-failover-abort-no-good-slave",ri,"%@");
2541 sentinelAbortFailover(ri);
6b5daa2d 2542 } else {
2543 sentinelEvent(REDIS_WARNING,"+selected-slave",slave,"%@");
2544 slave->flags |= SRI_PROMOTED;
2545 ri->promoted_slave = slave;
2546 ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE;
2547 ri->failover_state_change_time = mstime();
2548 sentinelEvent(REDIS_NOTICE,"+failover-state-send-slaveof-noone",
2549 slave, "%@");
2550 }
2551}
2552
2553void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) {
2554 int retval;
2555
2556 if (ri->promoted_slave->flags & SRI_DISCONNECTED) return;
2557
2558 /* Send SLAVEOF NO ONE command to turn the slave into a master.
2559 * We actually register a generic callback for this command as we don't
2560 * really care about the reply. We check if it worked indirectly observing
2561 * if INFO returns a different role (master instead of slave). */
2562 retval = redisAsyncCommand(ri->promoted_slave->cc,
2563 sentinelDiscardReplyCallback, NULL, "SLAVEOF NO ONE");
2564 if (retval != REDIS_OK) return;
2565 ri->promoted_slave->pending_commands++;
2566 sentinelEvent(REDIS_NOTICE, "+failover-state-wait-promotion",
2567 ri->promoted_slave,"%@");
2568 ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION;
2569 ri->failover_state_change_time = mstime();
2570}
2571
2572/* We actually wait for promotion indirectly checking with INFO when the
2573 * slave turns into a master. */
2574void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) {
2575 mstime_t elapsed = mstime() - ri->failover_state_change_time;
2576
2577 if (elapsed >= SENTINEL_PROMOTION_RETRY_PERIOD) {
2578 sentinelEvent(REDIS_WARNING,"-promotion-timeout",ri->promoted_slave,
2579 "%@");
2580 sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@");
2581 ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
2582 ri->failover_state_change_time = mstime();
2583 ri->promoted_slave->flags &= ~SRI_PROMOTED;
2584 ri->promoted_slave = NULL;
2585 }
2586}
2587
2588void sentinelFailoverDetectEnd(sentinelRedisInstance *master) {
2589 int not_reconfigured = 0, timeout = 0;
2590 dictIterator *di;
2591 dictEntry *de;
2592 mstime_t elapsed = mstime() - master->failover_state_change_time;
2593
2594 /* We can't consider failover finished if the promoted slave is
2595 * not reachable. */
2596 if (master->promoted_slave == NULL ||
2597 master->promoted_slave->flags & SRI_S_DOWN) return;
2598
2599 /* The failover terminates once all the reachable slaves are properly
2600 * configured. */
2601 di = dictGetIterator(master->slaves);
2602 while((de = dictNext(di)) != NULL) {
2603 sentinelRedisInstance *slave = dictGetVal(de);
2604
2605 if (slave->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue;
2606 if (slave->flags & SRI_S_DOWN) continue;
2607 not_reconfigured++;
2608 }
2609 dictReleaseIterator(di);
2610
2611 /* Force end of failover on timeout. */
2612 if (elapsed > master->failover_timeout) {
2613 not_reconfigured = 0;
2614 timeout = 1;
2615 sentinelEvent(REDIS_WARNING,"+failover-end-for-timeout",master,"%@");
2616 }
2617
2618 if (not_reconfigured == 0) {
6275004c 2619 int role = (master->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER :
2620 SENTINEL_OBSERVER;
2621
6b5daa2d 2622 sentinelEvent(REDIS_WARNING,"+failover-end",master,"%@");
2623 master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG;
2624 master->failover_state_change_time = mstime();
6275004c 2625 sentinelCallClientReconfScript(master,role,"end",master->addr,
2626 master->promoted_slave->addr);
6b5daa2d 2627 }
2628
2629 /* If I'm the leader it is a good idea to send a best effort SLAVEOF
2630 * command to all the slaves still not reconfigured to replicate with
2631 * the new master. */
2632 if (timeout && (master->flags & SRI_I_AM_THE_LEADER)) {
2633 dictIterator *di;
2634 dictEntry *de;
2635 char master_port[32];
2636
2637 ll2string(master_port,sizeof(master_port),
2638 master->promoted_slave->addr->port);
2639
2640 di = dictGetIterator(master->slaves);
2641 while((de = dictNext(di)) != NULL) {
2642 sentinelRedisInstance *slave = dictGetVal(de);
2643 int retval;
2644
2645 if (slave->flags &
2646 (SRI_RECONF_DONE|SRI_RECONF_SENT|SRI_DISCONNECTED)) continue;
2647
2648 retval = redisAsyncCommand(slave->cc,
2649 sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s",
2650 master->promoted_slave->addr->ip,
2651 master_port);
2652 if (retval == REDIS_OK) {
2653 sentinelEvent(REDIS_NOTICE,"+slave-reconf-sent-be",slave,"%@");
2654 slave->flags |= SRI_RECONF_SENT;
2655 }
2656 }
2657 dictReleaseIterator(di);
2658 }
2659}
2660
2661/* Send SLAVE OF <new master address> to all the remaining slaves that
2662 * still don't appear to have the configuration updated. */
2663void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) {
2664 dictIterator *di;
2665 dictEntry *de;
2666 int in_progress = 0;
2667
2668 di = dictGetIterator(master->slaves);
2669 while((de = dictNext(di)) != NULL) {
2670 sentinelRedisInstance *slave = dictGetVal(de);
2671
2672 if (slave->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG))
2673 in_progress++;
2674 }
2675 dictReleaseIterator(di);
2676
2677 di = dictGetIterator(master->slaves);
2678 while(in_progress < master->parallel_syncs &&
2679 (de = dictNext(di)) != NULL)
2680 {
2681 sentinelRedisInstance *slave = dictGetVal(de);
2682 int retval;
2683 char master_port[32];
2684
2685 /* Skip the promoted slave, and already configured slaves. */
2686 if (slave->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue;
2687
2688 /* Clear the SRI_RECONF_SENT flag if too much time elapsed without
2689 * the slave moving forward to the next state. */
2690 if ((slave->flags & SRI_RECONF_SENT) &&
2691 (mstime() - slave->slave_reconf_sent_time) >
2692 SENTINEL_SLAVE_RECONF_RETRY_PERIOD)
2693 {
2694 sentinelEvent(REDIS_NOTICE,"-slave-reconf-sent-timeout",slave,"%@");
2695 slave->flags &= ~SRI_RECONF_SENT;
2696 }
2697
2698 /* Nothing to do for instances that are disconnected or already
2699 * in RECONF_SENT state. */
2700 if (slave->flags & (SRI_DISCONNECTED|SRI_RECONF_SENT|SRI_RECONF_INPROG))
2701 continue;
2702
2703 /* Send SLAVEOF <new master>. */
2704 ll2string(master_port,sizeof(master_port),
2705 master->promoted_slave->addr->port);
2706 retval = redisAsyncCommand(slave->cc,
2707 sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s",
2708 master->promoted_slave->addr->ip,
2709 master_port);
2710 if (retval == REDIS_OK) {
2711 slave->flags |= SRI_RECONF_SENT;
2712 slave->pending_commands++;
2713 slave->slave_reconf_sent_time = mstime();
2714 sentinelEvent(REDIS_NOTICE,"+slave-reconf-sent",slave,"%@");
2715 in_progress++;
2716 }
2717 }
2718 dictReleaseIterator(di);
2719 sentinelFailoverDetectEnd(master);
2720}
2721
2722/* This function is called when the slave is in
2723 * SENTINEL_FAILOVER_STATE_UPDATE_CONFIG state. In this state we need
2724 * to remove it from the master table and add the promoted slave instead.
2725 *
2726 * If there are no promoted slaves as this instance is unique, we remove
2727 * and re-add it with the same address to trigger a complete state
2728 * refresh. */
2729void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) {
75fb6e5b 2730 sentinelRedisInstance *ref = master->promoted_slave ?
2731 master->promoted_slave : master;
2732
2733 sentinelEvent(REDIS_WARNING,"+switch-master",master,"%s %s %d %s %d",
2734 master->name, master->addr->ip, master->addr->port,
2735 ref->addr->ip, ref->addr->port);
2736
2737 sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port);
6b5daa2d 2738}
2739
2740void sentinelFailoverStateMachine(sentinelRedisInstance *ri) {
2741 redisAssert(ri->flags & SRI_MASTER);
2742
2743 if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS)) return;
2744
2745 switch(ri->failover_state) {
2746 case SENTINEL_FAILOVER_STATE_WAIT_START:
2747 sentinelFailoverWaitStart(ri);
2748 break;
2749 case SENTINEL_FAILOVER_STATE_SELECT_SLAVE:
2750 sentinelFailoverSelectSlave(ri);
2751 break;
2752 case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE:
2753 sentinelFailoverSendSlaveOfNoOne(ri);
2754 break;
2755 case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION:
2756 sentinelFailoverWaitPromotion(ri);
2757 break;
2758 case SENTINEL_FAILOVER_STATE_RECONF_SLAVES:
2759 sentinelFailoverReconfNextSlave(ri);
2760 break;
2761 case SENTINEL_FAILOVER_STATE_DETECT_END:
2762 sentinelFailoverDetectEnd(ri);
2763 break;
2764 }
2765}
2766
672102c2 2767/* Abort a failover in progress with the following steps:
2768 * 1) If this instance is the leaer send a SLAVEOF command to all the already
2769 * reconfigured slaves if any to configure them to replicate with the
2770 * original master.
2771 * 2) For both leaders and observers: clear the failover flags and state in
2772 * the master instance.
2773 * 3) If there is already a promoted slave and we are the leader, and this
2774 * slave is not DISCONNECTED, try to reconfigure it to replicate
2775 * back to the master as well, sending a best effort SLAVEOF command.
6b5daa2d 2776 */
672102c2 2777void sentinelAbortFailover(sentinelRedisInstance *ri) {
2778 char master_port[32];
6b5daa2d 2779 dictIterator *di;
2780 dictEntry *de;
6275004c 2781 int sentinel_role;
6b5daa2d 2782
672102c2 2783 redisAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS);
2784 ll2string(master_port,sizeof(master_port),ri->addr->port);
6b5daa2d 2785
2786 /* Clear failover related flags from slaves.
2787 * Also if we are the leader make sure to send SLAVEOF commands to all the
2788 * already reconfigured slaves in order to turn them back into slaves of
2789 * the original master. */
6b5daa2d 2790 di = dictGetIterator(ri->slaves);
2791 while((de = dictNext(di)) != NULL) {
2792 sentinelRedisInstance *slave = dictGetVal(de);
672102c2 2793 if ((ri->flags & SRI_I_AM_THE_LEADER) &&
2794 !(slave->flags & SRI_DISCONNECTED) &&
2795 (slave->flags & (SRI_PROMOTED|SRI_RECONF_SENT|SRI_RECONF_INPROG|
2796 SRI_RECONF_DONE)))
2797 {
6b5daa2d 2798 int retval;
2799
6b5daa2d 2800 retval = redisAsyncCommand(slave->cc,
2801 sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s",
2802 ri->addr->ip,
2803 master_port);
2804 if (retval == REDIS_OK)
2805 sentinelEvent(REDIS_NOTICE,"-slave-reconf-undo",slave,"%@");
2806 }
2807 slave->flags &= ~(SRI_RECONF_SENT|SRI_RECONF_INPROG|SRI_RECONF_DONE);
2808 }
2809 dictReleaseIterator(di);
2810
6275004c 2811 sentinel_role = (ri->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER :
2812 SENTINEL_OBSERVER;
cada7f96 2813 ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER|SRI_FORCE_FAILOVER);
6b5daa2d 2814 ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
2815 ri->failover_state_change_time = mstime();
672102c2 2816 if (ri->promoted_slave) {
6275004c 2817 sentinelCallClientReconfScript(ri,sentinel_role,"abort",
2818 ri->promoted_slave->addr,ri->addr);
672102c2 2819 ri->promoted_slave->flags &= ~SRI_PROMOTED;
2820 ri->promoted_slave = NULL;
2821 }
2822}
2823
2824/* The following is called only for master instances and will abort the
2825 * failover process if:
2826 *
2827 * 1) The failover is in progress.
2828 * 2) We already promoted a slave.
2829 * 3) The promoted slave is in extended SDOWN condition.
2830 */
2831void sentinelAbortFailoverIfNeeded(sentinelRedisInstance *ri) {
2832 /* Failover is in progress? Do we have a promoted slave? */
2833 if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS) || !ri->promoted_slave) return;
2834
2835 /* Is the promoted slave into an extended SDOWN state? */
2836 if (!(ri->promoted_slave->flags & SRI_S_DOWN) ||
2837 (mstime() - ri->promoted_slave->s_down_since_time) <
2838 (ri->down_after_period * SENTINEL_EXTENDED_SDOWN_MULTIPLIER)) return;
2839
2840 sentinelEvent(REDIS_WARNING,"-failover-abort-x-sdown",ri->promoted_slave,"%@");
2841 sentinelAbortFailover(ri);
6b5daa2d 2842}
2843
/* ======================== SENTINEL timer handler ==========================
 * This is the "main" of our Sentinel, Sentinel being completely non blocking
 * in design. The function is called periodically (the TILT check below
 * expects roughly 100 milliseconds between two calls).
 * -------------------------------------------------------------------------- */
2848
2849/* Perform scheduled operations for the specified Redis instance. */
2850void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
2851 /* ========== MONITORING HALF ============ */
2852 /* Every kind of instance */
2853 sentinelReconnectInstance(ri);
2854 sentinelPingInstance(ri);
2855
2856 /* Masters and slaves */
2857 if (ri->flags & (SRI_MASTER|SRI_SLAVE)) {
2858 /* Nothing so far. */
2859 }
2860
2861 /* Only masters */
2862 if (ri->flags & SRI_MASTER) {
2863 sentinelAskMasterStateToOtherSentinels(ri);
2864 }
2865
2866 /* ============== ACTING HALF ============= */
2867 /* We don't proceed with the acting half if we are in TILT mode.
2868 * TILT happens when we find something odd with the time, like a
2869 * sudden change in the clock. */
2870 if (sentinel.tilt) {
2871 if (mstime()-sentinel.tilt_start_time < SENTINEL_TILT_PERIOD) return;
2872 sentinel.tilt = 0;
2873 sentinelEvent(REDIS_WARNING,"-tilt",NULL,"#tilt mode exited");
2874 }
2875
2876 /* Every kind of instance */
2877 sentinelCheckSubjectivelyDown(ri);
2878
2879 /* Masters and slaves */
2880 if (ri->flags & (SRI_MASTER|SRI_SLAVE)) {
2881 /* Nothing so far. */
2882 }
2883
2884 /* Only masters */
2885 if (ri->flags & SRI_MASTER) {
2886 sentinelCheckObjectivelyDown(ri);
cada7f96 2887 sentinelStartFailoverIfNeeded(ri);
6b5daa2d 2888 sentinelFailoverStateMachine(ri);
2889 sentinelAbortFailoverIfNeeded(ri);
2890 }
2891}
2892
2893/* Perform scheduled operations for all the instances in the dictionary.
2894 * Recursively call the function against dictionaries of slaves. */
2895void sentinelHandleDictOfRedisInstances(dict *instances) {
2896 dictIterator *di;
2897 dictEntry *de;
2898 sentinelRedisInstance *switch_to_promoted = NULL;
2899
2900 /* There are a number of things we need to perform against every master. */
2901 di = dictGetIterator(instances);
2902 while((de = dictNext(di)) != NULL) {
2903 sentinelRedisInstance *ri = dictGetVal(de);
2904
2905 sentinelHandleRedisInstance(ri);
2906 if (ri->flags & SRI_MASTER) {
2907 sentinelHandleDictOfRedisInstances(ri->slaves);
2908 sentinelHandleDictOfRedisInstances(ri->sentinels);
2909 if (ri->failover_state == SENTINEL_FAILOVER_STATE_UPDATE_CONFIG) {
2910 switch_to_promoted = ri;
2911 }
2912 }
2913 }
2914 if (switch_to_promoted)
2915 sentinelFailoverSwitchToPromotedSlave(switch_to_promoted);
2916 dictReleaseIterator(di);
2917}
2918
2919/* This function checks if we need to enter the TITL mode.
2920 *
2921 * The TILT mode is entered if we detect that between two invocations of the
2922 * timer interrupt, a negative amount of time, or too much time has passed.
2923 * Note that we expect that more or less just 100 milliseconds will pass
2924 * if everything is fine. However we'll see a negative number or a
2925 * difference bigger than SENTINEL_TILT_TRIGGER milliseconds if one of the
2926 * following conditions happen:
2927 *
2928 * 1) The Sentiel process for some time is blocked, for every kind of
2929 * random reason: the load is huge, the computer was freezed for some time
2930 * in I/O or alike, the process was stopped by a signal. Everything.
2931 * 2) The system clock was altered significantly.
2932 *
2933 * Under both this conditions we'll see everything as timed out and failing
2934 * without good reasons. Instead we enter the TILT mode and wait
2935 * for SENTIENL_TILT_PERIOD to elapse before starting to act again.
2936 *
2937 * During TILT time we still collect information, we just do not act. */
2938void sentinelCheckTiltCondition(void) {
2939 mstime_t now = mstime();
2940 mstime_t delta = now - sentinel.previous_time;
2941
2942 if (delta < 0 || delta > SENTINEL_TILT_TRIGGER) {
2943 sentinel.tilt = 1;
2944 sentinel.tilt_start_time = mstime();
2945 sentinelEvent(REDIS_WARNING,"+tilt",NULL,"#tilt mode entered");
2946 }
2947 sentinel.previous_time = mstime();
2948}
2949
2950void sentinelTimer(void) {
2951 sentinelCheckTiltCondition();
2952 sentinelHandleDictOfRedisInstances(sentinel.masters);
3f194a9d 2953 sentinelRunPendingScripts();
2954 sentinelCollectTerminatedScripts();
2955 sentinelKillTimedoutScripts();
6b5daa2d 2956}
2957