src/sentinel.c

   1 /* Redis Sentinel implementation
   2  * -----------------------------
   3  *
   4  * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  *   * Redistributions of source code must retain the above copyright notice,
  11  *     this list of conditions and the following disclaimer.
  12  *   * Redistributions in binary form must reproduce the above copyright
  13  *     notice, this list of conditions and the following disclaimer in the
  14  *     documentation and/or other materials provided with the distribution.
  15  *   * Neither the name of Redis nor the names of its contributors may be used
  16  *     to endorse or promote products derived from this software without
  17  *     specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 #include "redis.h"
  33 #include "hiredis.h"
  34 #include "async.h"
  35
  36 #include <ctype.h>
  37 #include <arpa/inet.h>
  38 #include <sys/socket.h>
  39 #include <sys/wait.h>
  40
  41 extern char **environ;
  42
  43 #define REDIS_SENTINEL_PORT 26379
  44
  45 /* ======================== Sentinel global state =========================== */
  46
  47 typedef long long mstime_t; /* millisecond time type. */
  48
  49 /* Address object, used to describe an ip:port pair. */
typedef struct sentinelAddr {
    char *ip;   /* IP address as a string. createSentinelAddr() stores an sds
                 * here and releaseSentinelAddr() frees it with sdsfree(). */
    int port;   /* Port number. createSentinelAddr() only accepts 1..65535. */
} sentinelAddr;
  54
/* Bit flags stored in the 'flags' field of a sentinelRedisInstance.
 * Every flag occupies its own bit, so flags are tested and combined with
 * bitwise operators. */
#define SRI_MASTER  (1<<0)
#define SRI_SLAVE   (1<<1)
#define SRI_SENTINEL (1<<2)
#define SRI_DISCONNECTED (1<<3)
#define SRI_S_DOWN (1<<4)   /* Subjectively down (no quorum). */
#define SRI_O_DOWN (1<<5)   /* Objectively down (quorum reached). */
#define SRI_MASTER_DOWN (1<<6) /* A Sentinel with this flag set thinks that
                                   its master is down. */
/* SRI_CAN_FAILOVER, when set in an SRI_MASTER instance, means that we are
 * allowed to perform the failover for this master.
 * When set in an SRI_SENTINEL instance, it means that sentinel is allowed to
 * perform the failover on its master. */
#define SRI_CAN_FAILOVER (1<<7)
#define SRI_FAILOVER_IN_PROGRESS (1<<8) /* Failover is in progress for
                                           this master. */
#define SRI_I_AM_THE_LEADER (1<<9)     /* We are the leader for this master. */
#define SRI_PROMOTED (1<<10)            /* Slave selected for promotion. */
#define SRI_RECONF_SENT (1<<11)     /* SLAVEOF <newmaster> sent. */
#define SRI_RECONF_INPROG (1<<12)   /* Slave synchronization in progress. */
#define SRI_RECONF_DONE (1<<13)     /* Slave synchronized with new master. */
#define SRI_FORCE_FAILOVER (1<<14)  /* Force failover with master up. */
#define SRI_SCRIPT_KILL_SENT (1<<15) /* SCRIPT KILL already sent on -BUSY */
  78
/* Default periods, timeouts and limits.
 * NOTE(review): the time values look like milliseconds (the validity time
 * below is documented as such, and mstime_t is the millisecond time type);
 * confirm for each constant against its users. */
#define SENTINEL_INFO_PERIOD 10000
#define SENTINEL_PING_PERIOD 1000
#define SENTINEL_ASK_PERIOD 1000
#define SENTINEL_PUBLISH_PERIOD 5000
#define SENTINEL_DOWN_AFTER_PERIOD 30000
#define SENTINEL_HELLO_CHANNEL "__sentinel__:hello"
#define SENTINEL_TILT_TRIGGER 2000
#define SENTINEL_TILT_PERIOD (SENTINEL_PING_PERIOD*30)
#define SENTINEL_DEFAULT_SLAVE_PRIORITY 100
#define SENTINEL_PROMOTION_RETRY_PERIOD 30000
#define SENTINEL_SLAVE_RECONF_RETRY_PERIOD 10000
#define SENTINEL_DEFAULT_PARALLEL_SYNCS 1
#define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000
#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*15*1000)
#define SENTINEL_MAX_PENDING_COMMANDS 100
#define SENTINEL_EXTENDED_SDOWN_MULTIPLIER 10

/* For how many milliseconds a piece of information stays valid. This applies
 * for instance to the replies to SENTINEL IS-MASTER-DOWN-BY-ADDR. */
#define SENTINEL_INFO_VALIDITY_TIME 5000
#define SENTINEL_FAILOVER_FIXED_DELAY 5000
#define SENTINEL_FAILOVER_MAX_RANDOM_DELAY 10000
 101
/* States of the failover state machine (stored in the 'failover_state'
 * field of a sentinelRedisInstance). */
#define SENTINEL_FAILOVER_STATE_NONE 0  /* No failover in progress. */
#define SENTINEL_FAILOVER_STATE_WAIT_START 1  /* Wait for failover_start_time*/
#define SENTINEL_FAILOVER_STATE_SELECT_SLAVE 2 /* Select slave to promote */
#define SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE 3 /* Slave -> Master */
#define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 4 /* Wait slave to change role */
#define SENTINEL_FAILOVER_STATE_RECONF_SLAVES 5 /* SLAVEOF newmaster */
#define SENTINEL_FAILOVER_STATE_WAIT_NEXT_SLAVE 6 /* wait replication */
#define SENTINEL_FAILOVER_STATE_ALERT_CLIENTS 7 /* Run user script. */
#define SENTINEL_FAILOVER_STATE_WAIT_ALERT_SCRIPT 8 /* Wait script exec. */
#define SENTINEL_FAILOVER_STATE_DETECT_END 9 /* Check for failover end. */
#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 10 /* Monitor promoted slave. */

/* NOTE(review): presumably the values of the 'slave_master_link_status'
 * field of a sentinelRedisInstance; confirm against the INFO parser. */
#define SENTINEL_MASTER_LINK_STATUS_UP 0
#define SENTINEL_MASTER_LINK_STATUS_DOWN 1

/* Generic flags that can be used with different functions. */
#define SENTINEL_NO_FLAGS 0
#define SENTINEL_GENERATE_EVENT 1
#define SENTINEL_LEADER 2
#define SENTINEL_OBSERVER 4

/* Script execution flags (sentinelScriptJob 'flags' field) and limits. */
#define SENTINEL_SCRIPT_NONE 0
#define SENTINEL_SCRIPT_RUNNING 1
#define SENTINEL_SCRIPT_MAX_QUEUE 256   /* Max jobs kept in the scripts queue. */
#define SENTINEL_SCRIPT_MAX_RUNNING 16  /* Max scripts running at once. */
#define SENTINEL_SCRIPT_MAX_RUNTIME 60000 /* 60 seconds max exec time. */
#define SENTINEL_SCRIPT_MAX_RETRY 10
#define SENTINEL_SCRIPT_RETRY_DELAY 30000 /* 30 seconds between retries. */
 132
/* State kept for every monitored instance. The same structure describes
 * masters, slaves and other Sentinels: the SRI_MASTER, SRI_SLAVE and
 * SRI_SENTINEL bits in 'flags' tell which one it is. */
typedef struct sentinelRedisInstance {
    int flags;      /* See SRI_... defines */
    char *name;     /* Master name from the point of view of this sentinel. */
    char *runid;    /* run ID of this instance. */
    sentinelAddr *addr; /* Address (ip:port) of this instance. */
    redisAsyncContext *cc; /* Hiredis context for commands. */
    redisAsyncContext *pc; /* Hiredis context for Pub / Sub. */
    int pending_commands;   /* Number of commands sent waiting for a reply. */
    mstime_t cc_conn_time; /* cc connection time. */
    mstime_t pc_conn_time; /* pc connection time. */
    mstime_t pc_last_activity; /* Last time we received any message. */
    mstime_t last_avail_time; /* Last time the instance replied to ping with
                                 a reply we consider valid. */
    mstime_t last_pong_time;  /* Last time the instance replied to ping,
                                 whatever the reply was. That's used to check
                                 if the link is idle and must be reconnected. */
    mstime_t last_pub_time;   /* Last time we sent hello via Pub/Sub. */
    mstime_t last_hello_time; /* Only used if SRI_SENTINEL is set. Last time
                                 we received a hello from this Sentinel
                                 via Pub/Sub. */
    mstime_t last_master_down_reply_time; /* Time of last reply to
                                             SENTINEL is-master-down command. */
    mstime_t s_down_since_time; /* Subjectively down since time. */
    mstime_t o_down_since_time; /* Objectively down since time. */
    mstime_t down_after_period; /* Consider it down after that period. */
    mstime_t info_refresh;  /* Time at which we received INFO output from it. */

    /* Master specific. */
    dict *sentinels;    /* Other sentinels monitoring the same master. */
    dict *slaves;       /* Slaves for this master instance. */
    int quorum;         /* Number of sentinels that need to agree on failure. */
    int parallel_syncs; /* How many slaves to reconfigure at same time. */

    /* Slave specific. */
    mstime_t master_link_down_time; /* Slave replication link down time. */
    int slave_priority; /* Slave priority according to its INFO output. */
    mstime_t slave_reconf_sent_time; /* Time at which we sent SLAVEOF <new> */
    struct sentinelRedisInstance *master; /* Master instance if SRI_SLAVE is set. */
    char *slave_master_host;    /* Master host as reported by INFO */
    int slave_master_port;      /* Master port as reported by INFO */
    int slave_master_link_status; /* Master link status as reported by INFO */
    /* Failover */
    char *leader;       /* If this is a master instance, this is the runid of
                           the Sentinel that should perform the failover. If
                           this is a Sentinel, this is the runid of the Sentinel
                           that this other Sentinel is voting as leader.
                           This field is valid only if SRI_MASTER_DOWN is
                           set on the Sentinel instance. */
    int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */
    mstime_t failover_state_change_time;
    mstime_t failover_start_time;   /* When to start to failover if leader. */
    mstime_t failover_timeout;      /* Max time to refresh failover state. */
    struct sentinelRedisInstance *promoted_slave; /* Promoted slave instance. */
    /* Scripts executed to notify admin or reconfigure clients: when they
     * are set to NULL no script is executed. */
    char *notification_script;
    char *client_reconfig_script;
} sentinelRedisInstance;
 191
 192 /* Main state. */
/* Process-wide Sentinel state; the single instance is the global
 * 'sentinel', initialized by initSentinel(). */
struct sentinelState {
    dict *masters;      /* Dictionary of master sentinelRedisInstances.
                           Key is the instance name, value is the
                           sentinelRedisInstance structure pointer. */
    int tilt;           /* Are we in TILT mode? */
    int running_scripts;    /* Number of scripts in execution right now. */
    mstime_t tilt_start_time;   /* When TILT started. */
    mstime_t previous_time;     /* Time last time we ran the time handler. */
    list *scripts_queue;    /* Queue of user scripts to execute. */
} sentinel;
 203
 204 /* A script execution job. */
typedef struct sentinelScriptJob {
    int flags;              /* Script job flags: SENTINEL_SCRIPT_* */
    int retry_num;          /* Number of times we tried to execute it. */
    char **argv;            /* Arguments to call the script: a NULL
                               terminated array of sds strings, argv[0]
                               being the script path. */
    mstime_t start_time;    /* Script execution time if the script is running,
                               otherwise 0 if we are allowed to retry the
                               execution at any time. If the script is not
                               running and it's not 0, it means: do not run
                               before the specified time. */
    pid_t pid;              /* Script execution pid, 0 while not running. */
} sentinelScriptJob;
 216
 217 /* ======================= hiredis ae.c adapters =============================
 218  * Note: this implementation is taken from hiredis/adapters/ae.h, however
 219  * we have our modified copy for Sentinel in order to use our allocator
 220  * and to have full control over how the adapter works. */
 221
/* Glue object attached to a hiredis async context (as ac->ev.data) that
 * remembers which ae file events are currently installed for it. */
typedef struct redisAeEvents {
    redisAsyncContext *context; /* Async context served by these events. */
    aeEventLoop *loop;          /* Event loop the file events live in. */
    int fd;                     /* Descriptor copied from the context in
                                   redisAeAttach(). */
    int reading, writing;       /* Non-zero while the readable / writable
                                   handler is installed. */
} redisAeEvents;
 228
 229 static void redisAeReadEvent(aeEventLoop *el, int fd, void *privdata, int mask) {
 230     ((void)el); ((void)fd); ((void)mask);
 231
 232     redisAeEvents *e = (redisAeEvents*)privdata;
 233     redisAsyncHandleRead(e->context);
 234 }
 235
 236 static void redisAeWriteEvent(aeEventLoop *el, int fd, void *privdata, int mask) {
 237     ((void)el); ((void)fd); ((void)mask);
 238
 239     redisAeEvents *e = (redisAeEvents*)privdata;
 240     redisAsyncHandleWrite(e->context);
 241 }
 242
 243 static void redisAeAddRead(void *privdata) {
 244     redisAeEvents *e = (redisAeEvents*)privdata;
 245     aeEventLoop *loop = e->loop;
 246     if (!e->reading) {
 247         e->reading = 1;
 248         aeCreateFileEvent(loop,e->fd,AE_READABLE,redisAeReadEvent,e);
 249     }
 250 }
 251
 252 static void redisAeDelRead(void *privdata) {
 253     redisAeEvents *e = (redisAeEvents*)privdata;
 254     aeEventLoop *loop = e->loop;
 255     if (e->reading) {
 256         e->reading = 0;
 257         aeDeleteFileEvent(loop,e->fd,AE_READABLE);
 258     }
 259 }
 260
 261 static void redisAeAddWrite(void *privdata) {
 262     redisAeEvents *e = (redisAeEvents*)privdata;
 263     aeEventLoop *loop = e->loop;
 264     if (!e->writing) {
 265         e->writing = 1;
 266         aeCreateFileEvent(loop,e->fd,AE_WRITABLE,redisAeWriteEvent,e);
 267     }
 268 }
 269
 270 static void redisAeDelWrite(void *privdata) {
 271     redisAeEvents *e = (redisAeEvents*)privdata;
 272     aeEventLoop *loop = e->loop;
 273     if (e->writing) {
 274         e->writing = 0;
 275         aeDeleteFileEvent(loop,e->fd,AE_WRITABLE);
 276     }
 277 }
 278
 279 static void redisAeCleanup(void *privdata) {
 280     redisAeEvents *e = (redisAeEvents*)privdata;
 281     redisAeDelRead(privdata);
 282     redisAeDelWrite(privdata);
 283     zfree(e);
 284 }
 285
 286 static int redisAeAttach(aeEventLoop *loop, redisAsyncContext *ac) {
 287     redisContext *c = &(ac->c);
 288     redisAeEvents *e;
 289
 290     /* Nothing should be attached when something is already attached */
 291     if (ac->ev.data != NULL)
 292         return REDIS_ERR;
 293
 294     /* Create container for context and r/w events */
 295     e = (redisAeEvents*)zmalloc(sizeof(*e));
 296     e->context = ac;
 297     e->loop = loop;
 298     e->fd = c->fd;
 299     e->reading = e->writing = 0;
 300
 301     /* Register functions to start/stop listening for events */
 302     ac->ev.addRead = redisAeAddRead;
 303     ac->ev.delRead = redisAeDelRead;
 304     ac->ev.addWrite = redisAeAddWrite;
 305     ac->ev.delWrite = redisAeDelWrite;
 306     ac->ev.cleanup = redisAeCleanup;
 307     ac->ev.data = e;
 308
 309     return REDIS_OK;
 310 }
 311
 312 /* ============================= Prototypes ================================= */
 313
 314 void sentinelLinkEstablishedCallback(const redisAsyncContext *c, int status);
 315 void sentinelDisconnectCallback(const redisAsyncContext *c, int status);
 316 void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privdata);
 317 sentinelRedisInstance *sentinelGetMasterByName(char *name);
 318 char *sentinelGetSubjectiveLeader(sentinelRedisInstance *master);
 319 char *sentinelGetObjectiveLeader(sentinelRedisInstance *master);
 320 int yesnotoi(char *s);
 321 void sentinelDisconnectInstanceFromContext(const redisAsyncContext *c);
 322 void sentinelKillLink(sentinelRedisInstance *ri, redisAsyncContext *c);
 323 const char *sentinelRedisInstanceTypeStr(sentinelRedisInstance *ri);
 324 void sentinelAbortFailover(sentinelRedisInstance *ri);
 325 void sentinelEvent(int level, char *type, sentinelRedisInstance *ri, const char *fmt, ...);
 326 sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master);
 327 void sentinelScheduleScriptExecution(char *path, ...);
 328 void sentinelStartFailover(sentinelRedisInstance *master, int state);
 329
 330 /* ========================= Dictionary types =============================== */
 331
 332 unsigned int dictSdsHash(const void *key);
 333 int dictSdsKeyCompare(void *privdata, const void *key1, const void *key2);
 334 void releaseSentinelRedisInstance(sentinelRedisInstance *ri);
 335
 336 void dictInstancesValDestructor (void *privdata, void *obj) {
 337     releaseSentinelRedisInstance(obj);
 338 }
 339
/* Instance name (sds) -> instance (sentinelRedisInstance pointer)
 *
 * Used for the sentinel.masters dictionary. The value destructor releases
 * the instance, so removing an entry also frees the instance it points to.
 *
 * NOTE(review): the original comment also said this type is used for the
 * sentinelRedisInstance->sentinels dictionary, "that maps sentinels ip:port
 * to last seen time in Pub/Sub hello message"; that description does not
 * match a value destructor that releases instances -- confirm. */
dictType instancesDictType = {
    dictSdsHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictSdsKeyCompare,         /* key compare */
    NULL,                      /* key destructor */
    dictInstancesValDestructor /* val destructor */
};
 352
/* Instance runid (sds) -> votes (long cast to void*)
 *
 * This is used by the sentinelGetObjectiveLeader() function in order to
 * count the votes and understand who is the leader. Values are integers
 * stored directly in the pointer, hence no value destructor. */
dictType leaderVotesDictType = {
    dictSdsHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictSdsKeyCompare,         /* key compare */
    NULL,                      /* key destructor */
    NULL                       /* val destructor */
};
 365
 366 /* =========================== Initialization =============================== */
 367
 368 void sentinelCommand(redisClient *c);
 369
/* Commands available in Sentinel mode. initSentinel() empties the regular
 * command table and registers only these entries, keyed by command name. */
struct redisCommand sentinelcmds[] = {
    {"ping",pingCommand,1,"",0,NULL,0,0,0,0,0},
    {"sentinel",sentinelCommand,-2,"",0,NULL,0,0,0,0,0},
    {"subscribe",subscribeCommand,-2,"",0,NULL,0,0,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,"",0,NULL,0,0,0,0,0},
    {"psubscribe",psubscribeCommand,-2,"",0,NULL,0,0,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,"",0,NULL,0,0,0,0,0}
};
 378
 379 /* This function overwrites a few normal Redis config default with Sentinel
 380  * specific defaults. */
 381 void initSentinelConfig(void) {
 382     server.port = REDIS_SENTINEL_PORT;
 383 }
 384
 385 /* Perform the Sentinel mode initialization. */
 386 void initSentinel(void) {
 387     int j;
 388
 389     /* Remove usual Redis commands from the command table, then just add
 390      * the SENTINEL command. */
 391     dictEmpty(server.commands);
 392     for (j = 0; j < sizeof(sentinelcmds)/sizeof(sentinelcmds[0]); j++) {
 393         int retval;
 394         struct redisCommand *cmd = sentinelcmds+j;
 395
 396         retval = dictAdd(server.commands, sdsnew(cmd->name), cmd);
 397         redisAssert(retval == DICT_OK);
 398     }
 399
 400     /* Initialize various data structures. */
 401     sentinel.masters = dictCreate(&instancesDictType,NULL);
 402     sentinel.tilt = 0;
 403     sentinel.tilt_start_time = mstime();
 404     sentinel.previous_time = mstime();
 405     sentinel.running_scripts = 0;
 406     sentinel.scripts_queue = listCreate();
 407 }
 408
 409 /* ============================== sentinelAddr ============================== */
 410
 411 /* Create a sentinelAddr object and return it on success.
 412  * On error NULL is returned and errno is set to:
 413  *  ENOENT: Can't resolve the hostname.
 414  *  EINVAL: Invalid port number.
 415  */
 416 sentinelAddr *createSentinelAddr(char *hostname, int port) {
 417     char buf[32];
 418     sentinelAddr *sa;
 419
 420     if (port <= 0 || port > 65535) {
 421         errno = EINVAL;
 422         return NULL;
 423     }
 424     if (anetResolve(NULL,hostname,buf) == ANET_ERR) {
 425         errno = ENOENT;
 426         return NULL;
 427     }
 428     sa = zmalloc(sizeof(*sa));
 429     sa->ip = sdsnew(buf);
 430     sa->port = port;
 431     return sa;
 432 }
 433
 434 /* Free a Sentinel address. Can't fail. */
 435 void releaseSentinelAddr(sentinelAddr *sa) {
 436     sdsfree(sa->ip);
 437     zfree(sa);
 438 }
 439
 440 /* =========================== Events notification ========================== */
 441
 442 /* Send an event to log, pub/sub, user notification script.
 443  *
 444  * 'level' is the log level for logging. Only REDIS_WARNING events will trigger
 445  * the execution of the user notification script.
 446  *
 447  * 'type' is the message type, also used as a pub/sub channel name.
 448  *
 449  * 'ri', is the redis instance target of this event if applicable, and is
 450  * used to obtain the path of the notification script to execute.
 451  *
 452  * The remaining arguments are printf-alike.
 453  * If the format specifier starts with the two characters "%@" then ri is
 454  * not NULL, and the message is prefixed with an instance identifier in the
 455  * following format:
 456  *
 457  *  <instance type> <instance name> <ip> <port>
 458  *
 459  *  If the instance type is not master, than the additional string is
 460  *  added to specify the originating master:
 461  *
 462  *  @ <master name> <master ip> <master port>
 463  *
 464  *  Any other specifier after "%@" is processed by printf itself.
 465  */
 466 void sentinelEvent(int level, char *type, sentinelRedisInstance *ri,
 467                    const char *fmt, ...) {
 468     va_list ap;
 469     char msg[REDIS_MAX_LOGMSG_LEN];
 470     robj *channel, *payload;
 471
 472     /* Handle %@ */
 473     if (fmt[0] == '%' && fmt[1] == '@') {
 474         sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ?
 475                                          NULL : ri->master;
 476
 477         if (master) {
 478             snprintf(msg, sizeof(msg), "%s %s %s %d @ %s %s %d",
 479                 sentinelRedisInstanceTypeStr(ri),
 480                 ri->name, ri->addr->ip, ri->addr->port,
 481                 master->name, master->addr->ip, master->addr->port);
 482         } else {
 483             snprintf(msg, sizeof(msg), "%s %s %s %d",
 484                 sentinelRedisInstanceTypeStr(ri),
 485                 ri->name, ri->addr->ip, ri->addr->port);
 486         }
 487         fmt += 2;
 488     } else {
 489         msg[0] = '\0';
 490     }
 491
 492     /* Use vsprintf for the rest of the formatting if any. */
 493     if (fmt[0] != '\0') {
 494         va_start(ap, fmt);
 495         vsnprintf(msg+strlen(msg), sizeof(msg)-strlen(msg), fmt, ap);
 496         va_end(ap);
 497     }
 498
 499     /* Log the message if the log level allows it to be logged. */
 500     if (level >= server.verbosity)
 501         redisLog(level,"%s %s",type,msg);
 502
 503     /* Publish the message via Pub/Sub if it's not a debugging one. */
 504     if (level != REDIS_DEBUG) {
 505         channel = createStringObject(type,strlen(type));
 506         payload = createStringObject(msg,strlen(msg));
 507         pubsubPublishMessage(channel,payload);
 508         decrRefCount(channel);
 509         decrRefCount(payload);
 510     }
 511
 512     /* Call the notification script if applicable. */
 513     if (level == REDIS_WARNING && ri != NULL) {
 514         sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ?
 515                                          ri : ri->master;
 516         if (master->notification_script) {
 517             sentinelScheduleScriptExecution(master->notification_script,
 518                 type,msg,NULL);
 519         }
 520     }
 521 }
 522
 523 /* ============================ script execution ============================ */
 524
 525 /* Release a script job structure and all the associated data. */
 526 void sentinelReleaseScriptJob(sentinelScriptJob *sj) {
 527     int j = 0;
 528
 529     while(sj->argv[j]) sdsfree(sj->argv[j++]);
 530     zfree(sj->argv);
 531     zfree(sj);
 532 }
 533
 534 #define SENTINEL_SCRIPT_MAX_ARGS 16
 535 void sentinelScheduleScriptExecution(char *path, ...) {
 536     va_list ap;
 537     char *argv[SENTINEL_SCRIPT_MAX_ARGS+1];
 538     int argc = 1;
 539     sentinelScriptJob *sj;
 540
 541     va_start(ap, path);
 542     while(argc < SENTINEL_SCRIPT_MAX_ARGS) {
 543         argv[argc] = va_arg(ap,char*);
 544         if (!argv[argc]) break;
 545         argv[argc] = sdsnew(argv[argc]); /* Copy the string. */
 546         argc++;
 547     }
 548     va_end(ap);
 549     argv[0] = sdsnew(path);
 550
 551     sj = zmalloc(sizeof(*sj));
 552     sj->flags = SENTINEL_SCRIPT_NONE;
 553     sj->retry_num = 0;
 554     sj->argv = zmalloc(sizeof(char*)*(argc+1));
 555     sj->start_time = 0;
 556     sj->pid = 0;
 557     memcpy(sj->argv,argv,sizeof(char*)*(argc+1));
 558
 559     listAddNodeTail(sentinel.scripts_queue,sj);
 560
 561     /* Remove the oldest non running script if we already hit the limit. */
 562     if (listLength(sentinel.scripts_queue) > SENTINEL_SCRIPT_MAX_QUEUE) {
 563         listNode *ln;
 564         listIter li;
 565
 566         listRewind(sentinel.scripts_queue,&li);
 567         while ((ln = listNext(&li)) != NULL) {
 568             sj = ln->value;
 569
 570             if (sj->flags & SENTINEL_SCRIPT_RUNNING) continue;
 571             /* The first node is the oldest as we add on tail. */
 572             listDelNode(sentinel.scripts_queue,ln);
 573             sentinelReleaseScriptJob(sj);
 574             break;
 575         }
 576         redisAssert(listLength(sentinel.scripts_queue) <=
 577                     SENTINEL_SCRIPT_MAX_QUEUE);
 578     }
 579 }
 580
 581 /* Lookup a script in the scripts queue via pid, and returns the list node
 582  * (so that we can easily remove it from the queue if needed). */
 583 listNode *sentinelGetScriptListNodeByPid(pid_t pid) {
 584     listNode *ln;
 585     listIter li;
 586
 587     listRewind(sentinel.scripts_queue,&li);
 588     while ((ln = listNext(&li)) != NULL) {
 589         sentinelScriptJob *sj = ln->value;
 590
 591         if ((sj->flags & SENTINEL_SCRIPT_RUNNING) && sj->pid == pid)
 592             return ln;
 593     }
 594     return NULL;
 595 }
 596
 597 /* Run pending scripts if we are not already at max number of running
 598  * scripts. */
 599 void sentinelRunPendingScripts(void) {
 600     listNode *ln;
 601     listIter li;
 602     mstime_t now = mstime();
 603
 604     /* Find jobs that are not running and run them, from the top to the
 605      * tail of the queue, so we run older jobs first. */
 606     listRewind(sentinel.scripts_queue,&li);
 607     while (sentinel.running_scripts < SENTINEL_SCRIPT_MAX_RUNNING &&
 608            (ln = listNext(&li)) != NULL)
 609     {
 610         sentinelScriptJob *sj = ln->value;
 611         pid_t pid;
 612
 613         /* Skip if already running. */
 614         if (sj->flags & SENTINEL_SCRIPT_RUNNING) continue;
 615
 616         /* Skip if it's a retry, but not enough time has elapsed. */
 617         if (sj->start_time && sj->start_time > now) continue;
 618
 619         sj->flags |= SENTINEL_SCRIPT_RUNNING;
 620         sj->start_time = mstime();
 621         sj->retry_num++;
 622         pid = fork();
 623
 624         if (pid == -1) {
 625             /* Parent (fork error).
 626              * We report fork errors as signal 99, in order to unify the
 627              * reporting with other kind of errors. */
 628             sentinelEvent(REDIS_WARNING,"-script-error",NULL,
 629                           "%s %d %d", sj->argv[0], 99, 0);
 630             sj->flags &= ~SENTINEL_SCRIPT_RUNNING;
 631             sj->pid = 0;
 632         } else if (pid == 0) {
 633             /* Child */
 634             execve(sj->argv[0],sj->argv,environ);
 635             /* If we are here an error occurred. */
 636             _exit(2); /* Don't retry execution. */
 637         } else {
 638             sentinel.running_scripts++;
 639             sj->pid = pid;
 640             sentinelEvent(REDIS_DEBUG,"+script-child",NULL,"%ld",(long)pid);
 641         }
 642     }
 643 }
 644
 645 /* How much to delay the execution of a script that we need to retry after
 646  * an error?
 647  *
 648  * We double the retry delay for every further retry we do. So for instance
 649  * if RETRY_DELAY is set to 30 seconds and the max number of retries is 10
 650  * starting from the second attempt to execute the script the delays are:
 651  * 30 sec, 60 sec, 2 min, 4 min, 8 min, 16 min, 32 min, 64 min, 128 min. */
 652 mstime_t sentinelScriptRetryDelay(int retry_num) {
 653     mstime_t delay = SENTINEL_SCRIPT_RETRY_DELAY;
 654
 655     while (retry_num-- > 1) delay *= 2;
 656     return delay;
 657 }
 658
 659 /* Check for scripts that terminated, and remove them from the queue if the
 660  * script terminated successfully. If instead the script was terminated by
 661  * a signal, or returned exit code "1", it is scheduled to run again if
 662  * the max number of retries did not already elapsed. */
 663 void sentinelCollectTerminatedScripts(void) {
 664     int statloc;
 665     pid_t pid;
 666
 667     while ((pid = wait3(&statloc,WNOHANG,NULL)) > 0) {
 668         int exitcode = WEXITSTATUS(statloc);
 669         int bysignal = 0;
 670         listNode *ln;
 671         sentinelScriptJob *sj;
 672
 673         if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);
 674         sentinelEvent(REDIS_DEBUG,"-script-child",NULL,"%ld %d %d",
 675             (long)pid, exitcode, bysignal);
 676
 677         ln = sentinelGetScriptListNodeByPid(pid);
 678         if (ln == NULL) {
 679             redisLog(REDIS_WARNING,"wait3() returned a pid (%ld) we can't find in our scripts execution queue!", (long)pid);
 680             continue;
 681         }
 682         sj = ln->value;
 683
 684         /* If the script was terminated by a signal or returns an
 685          * exit code of "1" (that means: please retry), we reschedule it
 686          * if the max number of retries is not already reached. */
 687         if ((bysignal || exitcode == 1) &&
 688             sj->retry_num != SENTINEL_SCRIPT_MAX_RETRY)
 689         {
 690             sj->flags &= ~SENTINEL_SCRIPT_RUNNING;
 691             sj->pid = 0;
 692             sj->start_time = mstime() +
 693                              sentinelScriptRetryDelay(sj->retry_num);
 694         } else {
 695             /* Otherwise let's remove the script, but log the event if the
 696              * execution did not terminated in the best of the ways. */
 697             if (bysignal || exitcode != 0) {
 698                 sentinelEvent(REDIS_WARNING,"-script-error",NULL,
 699                               "%s %d %d", sj->argv[0], bysignal, exitcode);
 700             }
 701             listDelNode(sentinel.scripts_queue,ln);
 702             sentinelReleaseScriptJob(sj);
 703             sentinel.running_scripts--;
 704         }
 705     }
 706 }
 707
 708 /* Kill scripts in timeout, they'll be collected by the
 709  * sentinelCollectTerminatedScripts() function. */
 710 void sentinelKillTimedoutScripts(void) {
 711     listNode *ln;
 712     listIter li;
 713     mstime_t now = mstime();
 714
 715     listRewind(sentinel.scripts_queue,&li);
 716     while ((ln = listNext(&li)) != NULL) {
 717         sentinelScriptJob *sj = ln->value;
 718
 719         if (sj->flags & SENTINEL_SCRIPT_RUNNING &&
 720             (now - sj->start_time) > SENTINEL_SCRIPT_MAX_RUNTIME)
 721         {
 722             sentinelEvent(REDIS_WARNING,"-script-timeout",NULL,"%s %ld",
 723                 sj->argv[0], (long)sj->pid);
 724             kill(sj->pid,SIGKILL);
 725         }
 726     }
 727 }
 728
 729 /* Implements SENTINEL PENDING-SCRIPTS command. */
 730 void sentinelPendingScriptsCommand(redisClient *c) {
 731     listNode *ln;
 732     listIter li;
 733
 734     addReplyMultiBulkLen(c,listLength(sentinel.scripts_queue));
 735     listRewind(sentinel.scripts_queue,&li);
 736     while ((ln = listNext(&li)) != NULL) {
 737         sentinelScriptJob *sj = ln->value;
 738         int j = 0;
 739
 740         addReplyMultiBulkLen(c,10);
 741
 742         addReplyBulkCString(c,"argv");
 743         while (sj->argv[j]) j++;
 744         addReplyMultiBulkLen(c,j);
 745         j = 0;
 746         while (sj->argv[j]) addReplyBulkCString(c,sj->argv[j++]);
 747
 748         addReplyBulkCString(c,"flags");
 749         addReplyBulkCString(c,
 750             (sj->flags & SENTINEL_SCRIPT_RUNNING) ? "running" : "scheduled");
 751
 752         addReplyBulkCString(c,"pid");
 753         addReplyBulkLongLong(c,sj->pid);
 754
 755         if (sj->flags & SENTINEL_SCRIPT_RUNNING) {
 756             addReplyBulkCString(c,"run-time");
 757             addReplyBulkLongLong(c,mstime() - sj->start_time);
 758         } else {
 759             mstime_t delay = sj->start_time ? (sj->start_time-mstime()) : 0;
 760             if (delay < 0) delay = 0;
 761             addReplyBulkCString(c,"run-delay");
 762             addReplyBulkLongLong(c,delay);
 763         }
 764
 765         addReplyBulkCString(c,"retry-num");
 766         addReplyBulkLongLong(c,sj->retry_num);
 767     }
 768 }
 769
 770 /* This function calls, if any, the client reconfiguration script with the
 771  * following parameters:
 772  *
 773  * <master-name> <role> <state> <from-ip> <from-port> <to-ip> <to-port>
 774  *
 775  * It is called every time a failover starts, ends, or is aborted.
 776  *
 777  * <state> is "start", "end" or "abort".
 778  * <role> is either "leader" or "observer".
 779  *
 780  * from/to fields are respectively master -> promoted slave addresses for
 781  * "start" and "end", or the reverse (promoted slave -> master) in case of
 782  * "abort".
 783  */
 784 void sentinelCallClientReconfScript(sentinelRedisInstance *master, int role, char *state, sentinelAddr *from, sentinelAddr *to) {
 785     char fromport[32], toport[32];
 786
 787     if (master->client_reconfig_script == NULL) return;
 788     ll2string(fromport,sizeof(fromport),from->port);
 789     ll2string(toport,sizeof(toport),to->port);
 790     sentinelScheduleScriptExecution(master->client_reconfig_script,
 791         master->name,
 792         (role == SENTINEL_LEADER) ? "leader" : "observer",
 793         state, from->ip, fromport, to->ip, toport, NULL);
 794 }
 795
 796 /* ========================== sentinelRedisInstance ========================= */
 797
 798 /* Create a redis instance, the following fields must be populated by the
 799  * caller if needed:
 800  * runid: set to NULL but will be populated once INFO output is received.
 801  * info_refresh: is set to 0 to mean that we never received INFO so far.
 802  *
 803  * If SRI_MASTER is set into initial flags the instance is added to
 804  * sentinel.masters table.
 805  *
 806  * if SRI_SLAVE or SRI_SENTINEL is set then 'master' must be not NULL and the
 807  * instance is added into master->slaves or master->sentinels table.
 808  *
 809  * If the instance is a slave or sentinel, the name parameter is ignored and
 810  * is created automatically as hostname:port.
 811  *
 812  * The function fails if hostname can't be resolved or port is out of range.
 813  * When this happens NULL is returned and errno is set accordingly to the
 814  * createSentinelAddr() function.
 815  *
 816  * The function may also fail and return NULL with errno set to EBUSY if
 817  * a master or slave with the same name already exists. */
 818 sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char *hostname, int port, int quorum, sentinelRedisInstance *master) {
 819     sentinelRedisInstance *ri;
 820     sentinelAddr *addr;
 821     dict *table;
 822     char slavename[128], *sdsname;
 823
 824     redisAssert(flags & (SRI_MASTER|SRI_SLAVE|SRI_SENTINEL));
 825     redisAssert((flags & SRI_MASTER) || master != NULL);
 826
 827     /* Check address validity. */
 828     addr = createSentinelAddr(hostname,port);
 829     if (addr == NULL) return NULL;
 830
 831     /* For slaves and sentinel we use ip:port as name. */
 832     if (flags & (SRI_SLAVE|SRI_SENTINEL)) {
 833         snprintf(slavename,sizeof(slavename),"%s:%d",hostname,port);
 834         name = slavename;
 835     }
 836
 837     /* Make sure the entry is not duplicated. This may happen when the same
 838      * name for a master is used multiple times inside the configuration or
 839      * if we try to add multiple times a slave or sentinel with same ip/port
 840      * to a master. */
 841     if (flags & SRI_MASTER) table = sentinel.masters;
 842     else if (flags & SRI_SLAVE) table = master->slaves;
 843     else if (flags & SRI_SENTINEL) table = master->sentinels;
 844     sdsname = sdsnew(name);
 845     if (dictFind(table,sdsname)) {
 846         sdsfree(sdsname);
 847         errno = EBUSY;
 848         return NULL;
 849     }
 850
 851     /* Create the instance object. */
 852     ri = zmalloc(sizeof(*ri));
 853     /* Note that all the instances are started in the disconnected state,
 854      * the event loop will take care of connecting them. */
 855     ri->flags = flags | SRI_DISCONNECTED;
 856     ri->name = sdsname;
 857     ri->runid = NULL;
 858     ri->addr = addr;
 859     ri->cc = NULL;
 860     ri->pc = NULL;
 861     ri->pending_commands = 0;
 862     ri->cc_conn_time = 0;
 863     ri->pc_conn_time = 0;
 864     ri->pc_last_activity = 0;
 865     ri->last_avail_time = mstime();
 866     ri->last_pong_time = mstime();
 867     ri->last_pub_time = mstime();
 868     ri->last_hello_time = mstime();
 869     ri->last_master_down_reply_time = mstime();
 870     ri->s_down_since_time = 0;
 871     ri->o_down_since_time = 0;
 872     ri->down_after_period = master ? master->down_after_period :
 873                             SENTINEL_DOWN_AFTER_PERIOD;
 874     ri->master_link_down_time = 0;
 875     ri->slave_priority = SENTINEL_DEFAULT_SLAVE_PRIORITY;
 876     ri->slave_reconf_sent_time = 0;
 877     ri->slave_master_host = NULL;
 878     ri->slave_master_port = 0;
 879     ri->slave_master_link_status = SENTINEL_MASTER_LINK_STATUS_DOWN;
 880     ri->sentinels = dictCreate(&instancesDictType,NULL);
 881     ri->quorum = quorum;
 882     ri->parallel_syncs = SENTINEL_DEFAULT_PARALLEL_SYNCS;
 883     ri->master = master;
 884     ri->slaves = dictCreate(&instancesDictType,NULL);
 885     ri->info_refresh = 0;
 886
 887     /* Failover state. */
 888     ri->leader = NULL;
 889     ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
 890     ri->failover_state_change_time = 0;
 891     ri->failover_start_time = 0;
 892     ri->failover_timeout = SENTINEL_DEFAULT_FAILOVER_TIMEOUT;
 893     ri->promoted_slave = NULL;
 894     ri->notification_script = NULL;
 895     ri->client_reconfig_script = NULL;
 896
 897     /* Add into the right table. */
 898     dictAdd(table, ri->name, ri);
 899     return ri;
 900 }
 901
 902 /* Release this instance and all its slaves, sentinels, hiredis connections.
 903  * This function also takes care of unlinking the instance from the main
 904  * masters table (if it is a master) or from its master sentinels/slaves table
 905  * if it is a slave or sentinel. */
 906 void releaseSentinelRedisInstance(sentinelRedisInstance *ri) {
 907     /* Release all its slaves or sentinels if any. */
 908     dictRelease(ri->sentinels);
 909     dictRelease(ri->slaves);
 910
 911     /* Release hiredis connections. */
 912     if (ri->cc) sentinelKillLink(ri,ri->cc);
 913     if (ri->pc) sentinelKillLink(ri,ri->pc);
 914
 915     /* Free other resources. */
 916     sdsfree(ri->name);
 917     sdsfree(ri->runid);
 918     sdsfree(ri->notification_script);
 919     sdsfree(ri->client_reconfig_script);
 920     sdsfree(ri->slave_master_host);
 921     sdsfree(ri->leader);
 922     releaseSentinelAddr(ri->addr);
 923
 924     /* Clear state into the master if needed. */
 925     if ((ri->flags & SRI_SLAVE) && (ri->flags & SRI_PROMOTED) && ri->master)
 926         ri->master->promoted_slave = NULL;
 927
 928     zfree(ri);
 929 }
 930
 931 /* Lookup a slave in a master Redis instance, by ip and port. */
 932 sentinelRedisInstance *sentinelRedisInstanceLookupSlave(
 933                 sentinelRedisInstance *ri, char *ip, int port)
 934 {
 935     sds key;
 936     sentinelRedisInstance *slave;
 937
 938     redisAssert(ri->flags & SRI_MASTER);
 939     key = sdscatprintf(sdsempty(),"%s:%d",ip,port);
 940     slave = dictFetchValue(ri->slaves,key);
 941     sdsfree(key);
 942     return slave;
 943 }
 944
 945 /* Return the name of the type of the instance as a string. */
 946 const char *sentinelRedisInstanceTypeStr(sentinelRedisInstance *ri) {
 947     if (ri->flags & SRI_MASTER) return "master";
 948     else if (ri->flags & SRI_SLAVE) return "slave";
 949     else if (ri->flags & SRI_SENTINEL) return "sentinel";
 950     else return "unknown";
 951 }
 952
 953 /* This function removes all the instances found in the dictionary of instances
 954  * 'd', having either:
 955  *
 956  * 1) The same ip/port as specified.
 957  * 2) The same runid.
 958  *
 959  * "1" and "2" don't need to verify at the same time, just one is enough.
 960  * If "runid" is NULL it is not checked.
 961  * Similarly if "ip" is NULL it is not checked.
 962  *
 963  * This function is useful because every time we add a new Sentinel into
 964  * a master's Sentinels dictionary, we want to be very sure about not
 965  * having duplicated instances for any reason. This is so important because
 966  * we use those other sentinels in order to run our quorum protocol to
 967  * understand if it's time to proceeed with the fail over.
 968  *
 969  * Making sure no duplication is possible we greately improve the robustness
 970  * of the quorum (otherwise we may end counting the same instance multiple
 971  * times for some reason).
 972  *
 973  * The function returns the number of Sentinels removed. */
 974 int removeMatchingSentinelsFromMaster(sentinelRedisInstance *master, char *ip, int port, char *runid) {
 975     dictIterator *di;
 976     dictEntry *de;
 977     int removed = 0;
 978
 979     di = dictGetSafeIterator(master->sentinels);
 980     while((de = dictNext(di)) != NULL) {
 981         sentinelRedisInstance *ri = dictGetVal(de);
 982
 983         if ((ri->runid && runid && strcmp(ri->runid,runid) == 0) ||
 984             (ip && strcmp(ri->addr->ip,ip) == 0 && port == ri->addr->port))
 985         {
 986             dictDelete(master->sentinels,ri->name);
 987             removed++;
 988         }
 989     }
 990     dictReleaseIterator(di);
 991     return removed;
 992 }
 993
 994 /* Search an instance with the same runid, ip and port into a dictionary
 995  * of instances. Return NULL if not found, otherwise return the instance
 996  * pointer.
 997  *
 998  * runid or ip can be NULL. In such a case the search is performed only
 999  * by the non-NULL field. */
1000 sentinelRedisInstance *getSentinelRedisInstanceByAddrAndRunID(dict *instances, char *ip, int port, char *runid) {
1001     dictIterator *di;
1002     dictEntry *de;
1003     sentinelRedisInstance *instance = NULL;
1004
1005     redisAssert(ip || runid);   /* User must pass at least one search param. */
1006     di = dictGetIterator(instances);
1007     while((de = dictNext(di)) != NULL) {
1008         sentinelRedisInstance *ri = dictGetVal(de);
1009
1010         if (runid && !ri->runid) continue;
1011         if ((runid == NULL || strcmp(ri->runid, runid) == 0) &&
1012             (ip == NULL || (strcmp(ri->addr->ip, ip) == 0 &&
1013                             ri->addr->port == port)))
1014         {
1015             instance = ri;
1016             break;
1017         }
1018     }
1019     dictReleaseIterator(di);
1020     return instance;
1021 }
1022
1023 /* Simple master lookup by name */
1024 sentinelRedisInstance *sentinelGetMasterByName(char *name) {
1025     sentinelRedisInstance *ri;
1026     sds sdsname = sdsnew(name);
1027
1028     ri = dictFetchValue(sentinel.masters,sdsname);
1029     sdsfree(sdsname);
1030     return ri;
1031 }
1032
1033 /* Add the specified flags to all the instances in the specified dictionary. */
1034 void sentinelAddFlagsToDictOfRedisInstances(dict *instances, int flags) {
1035     dictIterator *di;
1036     dictEntry *de;
1037
1038     di = dictGetIterator(instances);
1039     while((de = dictNext(di)) != NULL) {
1040         sentinelRedisInstance *ri = dictGetVal(de);
1041         ri->flags |= flags;
1042     }
1043     dictReleaseIterator(di);
1044 }
1045
1046 /* Remove the specified flags to all the instances in the specified
1047  * dictionary. */
1048 void sentinelDelFlagsToDictOfRedisInstances(dict *instances, int flags) {
1049     dictIterator *di;
1050     dictEntry *de;
1051
1052     di = dictGetIterator(instances);
1053     while((de = dictNext(di)) != NULL) {
1054         sentinelRedisInstance *ri = dictGetVal(de);
1055         ri->flags &= ~flags;
1056     }
1057     dictReleaseIterator(di);
1058 }
1059
1060 /* Reset the state of a monitored master:
1061  * 1) Remove all slaves.
1062  * 2) Remove all sentinels.
1063  * 3) Remove most of the flags resulting from runtime operations.
1064  * 4) Reset timers to their default value.
1065  * 5) In the process of doing this undo the failover if in progress.
1066  * 6) Disconnect the connections with the master (will reconnect automatically).
1067  */
1068 void sentinelResetMaster(sentinelRedisInstance *ri, int flags) {
1069     redisAssert(ri->flags & SRI_MASTER);
1070     dictRelease(ri->slaves);
1071     dictRelease(ri->sentinels);
1072     ri->slaves = dictCreate(&instancesDictType,NULL);
1073     ri->sentinels = dictCreate(&instancesDictType,NULL);
1074     if (ri->cc) sentinelKillLink(ri,ri->cc);
1075     if (ri->pc) sentinelKillLink(ri,ri->pc);
1076     ri->flags &= SRI_MASTER|SRI_CAN_FAILOVER|SRI_DISCONNECTED;
1077     if (ri->leader) {
1078         sdsfree(ri->leader);
1079         ri->leader = NULL;
1080     }
1081     ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
1082     ri->failover_state_change_time = 0;
1083     ri->failover_start_time = 0;
1084     ri->promoted_slave = NULL;
1085     sdsfree(ri->runid);
1086     sdsfree(ri->slave_master_host);
1087     ri->runid = NULL;
1088     ri->slave_master_host = NULL;
1089     ri->last_avail_time = mstime();
1090     ri->last_pong_time = mstime();
1091     if (flags & SENTINEL_GENERATE_EVENT)
1092         sentinelEvent(REDIS_WARNING,"+reset-master",ri,"%@");
1093 }
1094
1095 /* Call sentinelResetMaster() on every master with a name matching the specified
1096  * pattern. */
1097 int sentinelResetMastersByPattern(char *pattern, int flags) {
1098     dictIterator *di;
1099     dictEntry *de;
1100     int reset = 0;
1101
1102     di = dictGetIterator(sentinel.masters);
1103     while((de = dictNext(di)) != NULL) {
1104         sentinelRedisInstance *ri = dictGetVal(de);
1105
1106         if (ri->name) {
1107             if (stringmatch(pattern,ri->name,0)) {
1108                 sentinelResetMaster(ri,flags);
1109                 reset++;
1110             }
1111         }
1112     }
1113     dictReleaseIterator(di);
1114     return reset;
1115 }
1116
1117 /* Reset the specified master with sentinelResetMaster(), and also change
1118  * the ip:port address, but take the name of the instance unmodified.
1119  *
1120  * This is used to handle the +switch-master and +redirect-to-master events.
1121  *
1122  * The function returns REDIS_ERR if the address can't be resolved for some
1123  * reason. Otherwise REDIS_OK is returned.
1124  *
1125  * TODO: make this reset so that original sentinels are re-added with
1126  * same ip / port / runid.
1127  */
1128
1129 int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, int port) {
1130     sentinelAddr *oldaddr, *newaddr;
1131
1132     newaddr = createSentinelAddr(ip,port);
1133     if (newaddr == NULL) return REDIS_ERR;
1134     sentinelResetMaster(master,SENTINEL_NO_FLAGS);
1135     oldaddr = master->addr;
1136     master->addr = newaddr;
1137     /* Release the old address at the end so we are safe even if the function
1138      * gets the master->addr->ip and master->addr->port as arguments. */
1139     releaseSentinelAddr(oldaddr);
1140     return REDIS_OK;
1141 }
1142
1143 /* ============================ Config handling ============================= */
1144 char *sentinelHandleConfiguration(char **argv, int argc) {
1145     sentinelRedisInstance *ri;
1146
1147     if (!strcasecmp(argv[0],"monitor") && argc == 5) {
1148         /* monitor <name> <host> <port> <quorum> */
1149         int quorum = atoi(argv[4]);
1150
1151         if (quorum <= 0) return "Quorum must be 1 or greater.";
1152         if (createSentinelRedisInstance(argv[1],SRI_MASTER,argv[2],
1153                                         atoi(argv[3]),quorum,NULL) == NULL)
1154         {
1155             switch(errno) {
1156             case EBUSY: return "Duplicated master name.";
1157             case ENOENT: return "Can't resolve master instance hostname.";
1158             case EINVAL: return "Invalid port number";
1159             }
1160         }
1161     } else if (!strcasecmp(argv[0],"down-after-milliseconds") && argc == 3) {
1162         /* down-after-milliseconds <name> <milliseconds> */
1163         ri = sentinelGetMasterByName(argv[1]);
1164         if (!ri) return "No such master with specified name.";
1165         ri->down_after_period = atoi(argv[2]);
1166         if (ri->down_after_period <= 0)
1167             return "negative or zero time parameter.";
1168     } else if (!strcasecmp(argv[0],"failover-timeout") && argc == 3) {
1169         /* failover-timeout <name> <milliseconds> */
1170         ri = sentinelGetMasterByName(argv[1]);
1171         if (!ri) return "No such master with specified name.";
1172         ri->failover_timeout = atoi(argv[2]);
1173         if (ri->failover_timeout <= 0)
1174             return "negative or zero time parameter.";
1175     } else if (!strcasecmp(argv[0],"can-failover") && argc == 3) {
1176         /* can-failover <name> <yes/no> */
1177         int yesno = yesnotoi(argv[2]);
1178
1179         ri = sentinelGetMasterByName(argv[1]);
1180         if (!ri) return "No such master with specified name.";
1181         if (yesno == -1) return "Argument must be either yes or no.";
1182         if (yesno)
1183             ri->flags |= SRI_CAN_FAILOVER;
1184         else
1185             ri->flags &= ~SRI_CAN_FAILOVER;
1186    } else if (!strcasecmp(argv[0],"parallel-syncs") && argc == 3) {
1187         /* parallel-syncs <name> <milliseconds> */
1188         ri = sentinelGetMasterByName(argv[1]);
1189         if (!ri) return "No such master with specified name.";
1190         ri->parallel_syncs = atoi(argv[2]);
1191    } else if (!strcasecmp(argv[0],"notification-script") && argc == 3) {
1192         /* notification-script <name> <path> */
1193         ri = sentinelGetMasterByName(argv[1]);
1194         if (!ri) return "No such master with specified name.";
1195         if (access(argv[2],X_OK) == -1)
1196             return "Notification script seems non existing or non executable.";
1197         ri->notification_script = sdsnew(argv[2]);
1198    } else if (!strcasecmp(argv[0],"client-reconfig-script") && argc == 3) {
1199         /* client-reconfig-script <name> <path> */
1200         ri = sentinelGetMasterByName(argv[1]);
1201         if (!ri) return "No such master with specified name.";
1202         if (access(argv[2],X_OK) == -1)
1203             return "Client reconfiguration script seems non existing or "
1204                    "non executable.";
1205         ri->client_reconfig_script = sdsnew(argv[2]);
1206     } else {
1207         return "Unrecognized sentinel configuration statement.";
1208     }
1209     return NULL;
1210 }
1211
1212 /* ====================== hiredis connection handling ======================= */
1213
1214 /* Completely disconnect an hiredis link from an instance. */
1215 void sentinelKillLink(sentinelRedisInstance *ri, redisAsyncContext *c) {
1216     if (ri->cc == c) {
1217         ri->cc = NULL;
1218         ri->pending_commands = 0;
1219     }
1220     if (ri->pc == c) ri->pc = NULL;
1221     c->data = NULL;
1222     ri->flags |= SRI_DISCONNECTED;
1223     redisAsyncFree(c);
1224 }
1225
1226 /* This function takes an hiredis context that is in an error condition
1227  * and make sure to mark the instance as disconnected performing the
1228  * cleanup needed.
1229  *
1230  * Note: we don't free the hiredis context as hiredis will do it for us
1231  * for async conenctions. */
1232 void sentinelDisconnectInstanceFromContext(const redisAsyncContext *c) {
1233     sentinelRedisInstance *ri = c->data;
1234     int pubsub;
1235
1236     if (ri == NULL) return; /* The instance no longer exists. */
1237
1238     pubsub = (ri->pc == c);
1239     sentinelEvent(REDIS_DEBUG, pubsub ? "-pubsub-link" : "-cmd-link", ri,
1240         "%@ #%s", c->errstr);
1241     if (pubsub)
1242         ri->pc = NULL;
1243     else
1244         ri->cc = NULL;
1245     ri->flags |= SRI_DISCONNECTED;
1246 }
1247
1248 void sentinelLinkEstablishedCallback(const redisAsyncContext *c, int status) {
1249     if (status != REDIS_OK) {
1250         sentinelDisconnectInstanceFromContext(c);
1251     } else {
1252         sentinelRedisInstance *ri = c->data;
1253         int pubsub = (ri->pc == c);
1254
1255         sentinelEvent(REDIS_DEBUG, pubsub ? "+pubsub-link" : "+cmd-link", ri,
1256             "%@");
1257     }
1258 }
1259
1260 void sentinelDisconnectCallback(const redisAsyncContext *c, int status) {
1261     sentinelDisconnectInstanceFromContext(c);
1262 }
1263
1264 /* Create the async connections for the specified instance if the instance
1265  * is disconnected. Note that the SRI_DISCONNECTED flag is set even if just
1266  * one of the two links (commands and pub/sub) is missing. */
1267 void sentinelReconnectInstance(sentinelRedisInstance *ri) {
1268     if (!(ri->flags & SRI_DISCONNECTED)) return;
1269
1270     /* Commands connection. */
1271     if (ri->cc == NULL) {
1272         ri->cc = redisAsyncConnect(ri->addr->ip,ri->addr->port);
1273         if (ri->cc->err) {
1274             sentinelEvent(REDIS_DEBUG,"-cmd-link-reconnection",ri,"%@ #%s",
1275                 ri->cc->errstr);
1276             sentinelKillLink(ri,ri->cc);
1277         } else {
1278             ri->cc_conn_time = mstime();
1279             ri->cc->data = ri;
1280             redisAeAttach(server.el,ri->cc);
1281             redisAsyncSetConnectCallback(ri->cc,
1282                                             sentinelLinkEstablishedCallback);
1283             redisAsyncSetDisconnectCallback(ri->cc,
1284                                             sentinelDisconnectCallback);
1285         }
1286     }
1287     /* Pub / Sub */
1288     if ((ri->flags & SRI_MASTER) && ri->pc == NULL) {
1289         ri->pc = redisAsyncConnect(ri->addr->ip,ri->addr->port);
1290         if (ri->pc->err) {
1291             sentinelEvent(REDIS_DEBUG,"-pubsub-link-reconnection",ri,"%@ #%s",
1292                 ri->pc->errstr);
1293             sentinelKillLink(ri,ri->pc);
1294         } else {
1295             int retval;
1296
1297             ri->pc_conn_time = mstime();
1298             ri->pc->data = ri;
1299             redisAeAttach(server.el,ri->pc);
1300             redisAsyncSetConnectCallback(ri->pc,
1301                                             sentinelLinkEstablishedCallback);
1302             redisAsyncSetDisconnectCallback(ri->pc,
1303                                             sentinelDisconnectCallback);
1304             /* Now we subscribe to the Sentinels "Hello" channel. */
1305             retval = redisAsyncCommand(ri->pc,
1306                 sentinelReceiveHelloMessages, NULL, "SUBSCRIBE %s",
1307                     SENTINEL_HELLO_CHANNEL);
1308             if (retval != REDIS_OK) {
1309                 /* If we can't subscribe, the Pub/Sub connection is useless
1310                  * and we can simply disconnect it and try again. */
1311                 sentinelKillLink(ri,ri->pc);
1312                 return;
1313             }
1314         }
1315     }
1316     /* Clear the DISCONNECTED flags only if we have both the connections
1317      * (or just the commands connection if this is a slave or a
1318      * sentinel instance). */
1319     if (ri->cc && (ri->flags & (SRI_SLAVE|SRI_SENTINEL) || ri->pc))
1320         ri->flags &= ~SRI_DISCONNECTED;
1321 }
1322
1323 /* ======================== Redis instances pinging  ======================== */
1324
1325 /* Process the INFO output from masters. */
1326 void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
1327     sds *lines;
1328     int numlines, j;
1329     int role = 0;
1330     int runid_changed = 0;  /* true if runid changed. */
1331     int first_runid = 0;    /* true if this is the first runid we receive. */
1332
1333     /* The following fields must be reset to a given value in the case they
1334      * are not found at all in the INFO output. */
1335     ri->master_link_down_time = 0;
1336
1337     /* Process line by line. */
1338     lines = sdssplitlen(info,strlen(info),"\r\n",2,&numlines);
1339     for (j = 0; j < numlines; j++) {
1340         sentinelRedisInstance *slave;
1341         sds l = lines[j];
1342
1343         /* run_id:<40 hex chars>*/
1344         if (sdslen(l) >= 47 && !memcmp(l,"run_id:",7)) {
1345             if (ri->runid == NULL) {
1346                 ri->runid = sdsnewlen(l+7,40);
1347                 first_runid = 1;
1348             } else {
1349                 if (strncmp(ri->runid,l+7,40) != 0) {
1350                     runid_changed = 1;
1351                     sentinelEvent(REDIS_NOTICE,"+reboot",ri,"%@");
1352                     sdsfree(ri->runid);
1353                     ri->runid = sdsnewlen(l+7,40);
1354                 }
1355             }
1356         }
1357
1358         /* slave0:<ip>,<port>,<state> */
1359         if ((ri->flags & SRI_MASTER) &&
1360             sdslen(l) >= 7 &&
1361             !memcmp(l,"slave",5) && isdigit(l[5]))
1362         {
1363             char *ip, *port, *end;
1364
1365             ip = strchr(l,':'); if (!ip) continue;
1366             ip++; /* Now ip points to start of ip address. */
1367             port = strchr(ip,','); if (!port) continue;
1368             *port = '\0'; /* nul term for easy access. */
1369             port++; /* Now port points to start of port number. */
1370             end = strchr(port,','); if (!end) continue;
1371             *end = '\0'; /* nul term for easy access. */
1372
1373             /* Check if we already have this slave into our table,
1374              * otherwise add it. */
1375             if (sentinelRedisInstanceLookupSlave(ri,ip,atoi(port)) == NULL) {
1376                 if ((slave = createSentinelRedisInstance(NULL,SRI_SLAVE,ip,
1377                             atoi(port), ri->quorum,ri)) != NULL)
1378                 {
1379                     sentinelEvent(REDIS_NOTICE,"+slave",slave,"%@");
1380                 }
1381             }
1382         }
1383
1384         /* master_link_down_since_seconds:<seconds> */
1385         if (sdslen(l) >= 32 &&
1386             !memcmp(l,"master_link_down_since_seconds",30))
1387         {
1388             ri->master_link_down_time = strtoll(l+31,NULL,10)*1000;
1389         }
1390
1391         /* role:<role> */
1392         if (!memcmp(l,"role:master",11)) role = SRI_MASTER;
1393         else if (!memcmp(l,"role:slave",10)) role = SRI_SLAVE;
1394
1395         if (role == SRI_SLAVE) {
1396             /* master_host:<host> */
1397             if (sdslen(l) >= 12 && !memcmp(l,"master_host:",12)) {
1398                 sdsfree(ri->slave_master_host);
1399                 ri->slave_master_host = sdsnew(l+12);
1400             }
1401
1402             /* master_port:<port> */
1403             if (sdslen(l) >= 12 && !memcmp(l,"master_port:",12))
1404                 ri->slave_master_port = atoi(l+12);
1405
1406             /* master_link_status:<status> */
1407             if (sdslen(l) >= 19 && !memcmp(l,"master_link_status:",19)) {
1408                 ri->slave_master_link_status =
1409                     (strcasecmp(l+19,"up") == 0) ?
1410                     SENTINEL_MASTER_LINK_STATUS_UP :
1411                     SENTINEL_MASTER_LINK_STATUS_DOWN;
1412             }
1413         }
1414     }
1415     ri->info_refresh = mstime();
1416     sdsfreesplitres(lines,numlines);
1417
1418     /* ---------------------------- Acting half ----------------------------- */
1419     if (sentinel.tilt) return;
1420
1421     /* Act if a master turned into a slave. */
1422     if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) {
1423         if (first_runid && ri->slave_master_host) {
1424             /* If it is the first time we receive INFO from it, but it's
1425              * a slave while it was configured as a master, we want to monitor
1426              * its master instead. */
1427             sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri,
1428                 "%s %s %d %s %d",
1429                 ri->name, ri->addr->ip, ri->addr->port,
1430                 ri->slave_master_host, ri->slave_master_port);
1431             sentinelResetMasterAndChangeAddress(ri,ri->slave_master_host,
1432                                                    ri->slave_master_port);
1433             return;
1434         }
1435     }
1436
1437     /* Act if a slave turned into a master. */
1438     if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) {
1439         if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
1440             (runid_changed || first_runid))
1441         {
1442             /* If a slave turned into maser but:
1443              *
1444              * 1) Failover not in progress.
1445              * 2) RunID hs changed, or its the first time we see an INFO output.
1446              *
1447              * We assume this is a reboot with a wrong configuration.
1448              * Log the event and remove the slave. */
1449             int retval;
1450
1451             sentinelEvent(REDIS_WARNING,"-slave-restart-as-master",ri,"%@ #removing it from the attached slaves");
1452             retval = dictDelete(ri->master->slaves,ri->name);
1453             redisAssert(retval == REDIS_OK);
1454             return;
1455         } else if (ri->flags & SRI_PROMOTED) {
1456             /* If this is a promoted slave we can change state to the
1457              * failover state machine. */
1458             if ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
1459                 (ri->master->flags & SRI_I_AM_THE_LEADER) &&
1460                 (ri->master->failover_state ==
1461                     SENTINEL_FAILOVER_STATE_WAIT_PROMOTION))
1462             {
1463                 ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES;
1464                 ri->master->failover_state_change_time = mstime();
1465                 sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@");
1466                 sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves",
1467                     ri->master,"%@");
1468                 sentinelCallClientReconfScript(ri->master,SENTINEL_LEADER,
1469                     "start",ri->master->addr,ri->addr);
1470             }
1471         } else if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) ||
1472                     ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
1473                      (ri->master->flags & SRI_I_AM_THE_LEADER) &&
1474                      ri->master->failover_state ==
1475                      SENTINEL_FAILOVER_STATE_WAIT_START))
1476         {
1477             /* No failover in progress? Then it is the start of a failover
1478              * and we are an observer.
1479              *
1480              * We also do that if we are a leader doing a failover, in wait
1481              * start, but well, somebody else started before us. */
1482
1483             if (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) {
1484                 sentinelEvent(REDIS_WARNING,"-failover-abort-race",
1485                                 ri->master, "%@");
1486                 sentinelAbortFailover(ri->master);
1487             }
1488
1489             ri->master->flags |= SRI_FAILOVER_IN_PROGRESS;
1490             sentinelEvent(REDIS_WARNING,"+failover-detected",ri->master,"%@");
1491             ri->master->failover_state = SENTINEL_FAILOVER_STATE_DETECT_END;
1492             ri->master->failover_state_change_time = mstime();
1493             ri->master->promoted_slave = ri;
1494             ri->flags |= SRI_PROMOTED;
1495             sentinelCallClientReconfScript(ri->master,SENTINEL_OBSERVER,
1496                 "start", ri->master->addr,ri->addr);
1497             /* We are an observer, so we can only assume that the leader
1498              * is reconfiguring the slave instances. For this reason we
1499              * set all the instances as RECONF_SENT waiting for progresses
1500              * on this side. */
1501             sentinelAddFlagsToDictOfRedisInstances(ri->master->slaves,
1502                 SRI_RECONF_SENT);
1503         }
1504     }
1505
1506     /* Detect if the slave that is in the process of being reconfigured
1507      * changed state. */
1508     if ((ri->flags & SRI_SLAVE) && role == SRI_SLAVE &&
1509         (ri->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG)))
1510     {
1511         /* SRI_RECONF_SENT -> SRI_RECONF_INPROG. */
1512         if ((ri->flags & SRI_RECONF_SENT) &&
1513             ri->slave_master_host &&
1514             strcmp(ri->slave_master_host,
1515                     ri->master->promoted_slave->addr->ip) == 0 &&
1516             ri->slave_master_port == ri->master->promoted_slave->addr->port)
1517         {
1518             ri->flags &= ~SRI_RECONF_SENT;
1519             ri->flags |= SRI_RECONF_INPROG;
1520             sentinelEvent(REDIS_NOTICE,"+slave-reconf-inprog",ri,"%@");
1521         }
1522
1523         /* SRI_RECONF_INPROG -> SRI_RECONF_DONE */
1524         if ((ri->flags & SRI_RECONF_INPROG) &&
1525             ri->slave_master_link_status == SENTINEL_MASTER_LINK_STATUS_UP)
1526         {
1527             ri->flags &= ~SRI_RECONF_INPROG;
1528             ri->flags |= SRI_RECONF_DONE;
1529             sentinelEvent(REDIS_NOTICE,"+slave-reconf-done",ri,"%@");
1530             /* If we are moving forward (a new slave is now configured)
1531              * we update the change_time as we are conceptually passing
1532              * to the next slave. */
1533             ri->failover_state_change_time = mstime();
1534         }
1535     }
1536 }
1537
1538 void sentinelInfoReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
1539     sentinelRedisInstance *ri = c->data;
1540     redisReply *r;
1541
1542     if (ri) ri->pending_commands--;
1543     if (!reply || !ri) return;
1544     r = reply;
1545
1546     if (r->type == REDIS_REPLY_STRING) {
1547         sentinelRefreshInstanceInfo(ri,r->str);
1548     }
1549 }
1550
1551 /* Just discard the reply. We use this when we are not monitoring the return
1552  * value of the command but its effects directly. */
1553 void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
1554     sentinelRedisInstance *ri = c->data;
1555
1556     if (ri) ri->pending_commands--;
1557 }
1558
1559 void sentinelPingReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
1560     sentinelRedisInstance *ri = c->data;
1561     redisReply *r;
1562
1563     if (ri) ri->pending_commands--;
1564     if (!reply || !ri) return;
1565     r = reply;
1566
1567     if (r->type == REDIS_REPLY_STATUS ||
1568         r->type == REDIS_REPLY_ERROR) {
1569         /* Update the "instance available" field only if this is an
1570          * acceptable reply. */
1571         if (strncmp(r->str,"PONG",4) == 0 ||
1572             strncmp(r->str,"LOADING",7) == 0 ||
1573             strncmp(r->str,"MASTERDOWN",10) == 0)
1574         {
1575             ri->last_avail_time = mstime();
1576         } else {
1577             /* Send a SCRIPT KILL command if the instance appears to be
1578              * down because of a busy script. */
1579             if (strncmp(r->str,"BUSY",4) == 0 &&
1580                 (ri->flags & SRI_S_DOWN) &&
1581                 !(ri->flags & SRI_SCRIPT_KILL_SENT))
1582             {
1583                 redisAsyncCommand(ri->cc,
1584                     sentinelDiscardReplyCallback, NULL, "SCRIPT KILL");
1585                 ri->flags |= SRI_SCRIPT_KILL_SENT;
1586             }
1587         }
1588     }
1589     ri->last_pong_time = mstime();
1590 }
1591
1592 /* This is called when we get the reply about the PUBLISH command we send
1593  * to the master to advertise this sentinel. */
1594 void sentinelPublishReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
1595     sentinelRedisInstance *ri = c->data;
1596     redisReply *r;
1597
1598     if (ri) ri->pending_commands--;
1599     if (!reply || !ri) return;
1600     r = reply;
1601
1602     /* Only update pub_time if we actually published our message. Otherwise
1603      * we'll retry against in 100 milliseconds. */
1604     if (r->type != REDIS_REPLY_ERROR)
1605         ri->last_pub_time = mstime();
1606 }
1607
1608 /* This is our Pub/Sub callback for the Hello channel. It's useful in order
1609  * to discover other sentinels attached at the same master. */
1610 void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privdata) {
1611     sentinelRedisInstance *ri = c->data;
1612     redisReply *r;
1613
1614     if (!reply || !ri) return;
1615     r = reply;
1616
1617     /* Update the last activity in the pubsub channel. Note that since we
1618      * receive our messages as well this timestamp can be used to detect
1619      * if the link is probably diconnected even if it seems otherwise. */
1620     ri->pc_last_activity = mstime();
1621
1622     /* Sanity check in the reply we expect, so that the code that follows
1623      * can avoid to check for details. */
1624     if (r->type != REDIS_REPLY_ARRAY ||
1625         r->elements != 3 ||
1626         r->element[0]->type != REDIS_REPLY_STRING ||
1627         r->element[1]->type != REDIS_REPLY_STRING ||
1628         r->element[2]->type != REDIS_REPLY_STRING ||
1629         strcmp(r->element[0]->str,"message") != 0) return;
1630
1631     /* We are not interested in meeting ourselves */
1632     if (strstr(r->element[2]->str,server.runid) != NULL) return;
1633
1634     {
1635         int numtokens, port, removed, canfailover;
1636         char **token = sdssplitlen(r->element[2]->str,
1637                                    r->element[2]->len,
1638                                    ":",1,&numtokens);
1639         sentinelRedisInstance *sentinel;
1640
1641         if (numtokens == 4) {
1642             /* First, try to see if we already have this sentinel. */
1643             port = atoi(token[1]);
1644             canfailover = atoi(token[3]);
1645             sentinel = getSentinelRedisInstanceByAddrAndRunID(
1646                             ri->sentinels,token[0],port,token[2]);
1647
1648             if (!sentinel) {
1649                 /* If not, remove all the sentinels that have the same runid
1650                  * OR the same ip/port, because it's either a restart or a
1651                  * network topology change. */
1652                 removed = removeMatchingSentinelsFromMaster(ri,token[0],port,
1653                                 token[2]);
1654                 if (removed) {
1655                     sentinelEvent(REDIS_NOTICE,"-dup-sentinel",ri,
1656                         "%@ #duplicate of %s:%d or %s",
1657                         token[0],port,token[2]);
1658                 }
1659
1660                 /* Add the new sentinel. */
1661                 sentinel = createSentinelRedisInstance(NULL,SRI_SENTINEL,
1662                                 token[0],port,ri->quorum,ri);
1663                 if (sentinel) {
1664                     sentinelEvent(REDIS_NOTICE,"+sentinel",sentinel,"%@");
1665                     /* The runid is NULL after a new instance creation and
1666                      * for Sentinels we don't have a later chance to fill it,
1667                      * so do it now. */
1668                     sentinel->runid = sdsnew(token[2]);
1669                 }
1670             }
1671
1672             /* Update the state of the Sentinel. */
1673             if (sentinel) {
1674                 sentinel->last_hello_time = mstime();
1675                 if (canfailover)
1676                     sentinel->flags |= SRI_CAN_FAILOVER;
1677                 else
1678                     sentinel->flags &= ~SRI_CAN_FAILOVER;
1679             }
1680         }
1681         sdsfreesplitres(token,numtokens);
1682     }
1683 }
1684
1685 void sentinelPingInstance(sentinelRedisInstance *ri) {
1686     mstime_t now = mstime();
1687     mstime_t info_period;
1688     int retval;
1689
1690     /* Return ASAP if we have already a PING or INFO already pending, or
1691      * in the case the instance is not properly connected. */
1692     if (ri->flags & SRI_DISCONNECTED) return;
1693
1694     /* For INFO, PING, PUBLISH that are not critical commands to send we
1695      * also have a limit of SENTINEL_MAX_PENDING_COMMANDS. We don't
1696      * want to use a lot of memory just because a link is not working
1697      * properly (note that anyway there is a redundant protection about this,
1698      * that is, the link will be disconnected and reconnected if a long
1699      * timeout condition is detected. */
1700     if (ri->pending_commands >= SENTINEL_MAX_PENDING_COMMANDS) return;
1701
1702     /* If this is a slave of a master in O_DOWN condition we start sending
1703      * it INFO every second, instead of the usual SENTINEL_INFO_PERIOD
1704      * period. In this state we want to closely monitor slaves in case they
1705      * are turned into masters by another Sentinel, or by the sysadmin. */
1706     if ((ri->flags & SRI_SLAVE) &&
1707         (ri->master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS))) {
1708         info_period = 1000;
1709     } else {
1710         info_period = SENTINEL_INFO_PERIOD;
1711     }
1712
1713     if ((ri->flags & SRI_SENTINEL) == 0 &&
1714         (ri->info_refresh == 0 ||
1715         (now - ri->info_refresh) > info_period))
1716     {
1717         /* Send INFO to masters and slaves, not sentinels. */
1718         retval = redisAsyncCommand(ri->cc,
1719             sentinelInfoReplyCallback, NULL, "INFO");
1720         if (retval != REDIS_OK) return;
1721         ri->pending_commands++;
1722     } else if ((now - ri->last_pong_time) > SENTINEL_PING_PERIOD) {
1723         /* Send PING to all the three kinds of instances. */
1724         retval = redisAsyncCommand(ri->cc,
1725             sentinelPingReplyCallback, NULL, "PING");
1726         if (retval != REDIS_OK) return;
1727         ri->pending_commands++;
1728     } else if ((ri->flags & SRI_MASTER) &&
1729                (now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD)
1730     {
1731         /* PUBLISH hello messages only to masters. */
1732         struct sockaddr_in sa;
1733         socklen_t salen = sizeof(sa);
1734
1735         if (getsockname(ri->cc->c.fd,(struct sockaddr*)&sa,&salen) != -1) {
1736             char myaddr[128];
1737
1738             snprintf(myaddr,sizeof(myaddr),"%s:%d:%s:%d",
1739                 inet_ntoa(sa.sin_addr), server.port, server.runid,
1740                 (ri->flags & SRI_CAN_FAILOVER) != 0);
1741             retval = redisAsyncCommand(ri->cc,
1742                 sentinelPublishReplyCallback, NULL, "PUBLISH %s %s",
1743                     SENTINEL_HELLO_CHANNEL,myaddr);
1744             if (retval != REDIS_OK) return;
1745             ri->pending_commands++;
1746         }
1747     }
1748 }
1749
1750 /* =========================== SENTINEL command ============================= */
1751
1752 const char *sentinelFailoverStateStr(int state) {
1753     switch(state) {
1754     case SENTINEL_FAILOVER_STATE_NONE: return "none";
1755     case SENTINEL_FAILOVER_STATE_WAIT_START: return "wait_start";
1756     case SENTINEL_FAILOVER_STATE_SELECT_SLAVE: return "select_slave";
1757     case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE: return "send_slaveof_noone";
1758     case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: return "wait_promotion";
1759     case SENTINEL_FAILOVER_STATE_RECONF_SLAVES: return "reconf_slaves";
1760     case SENTINEL_FAILOVER_STATE_ALERT_CLIENTS: return "alert_clients";
1761     case SENTINEL_FAILOVER_STATE_DETECT_END: return "detect_end";
1762     case SENTINEL_FAILOVER_STATE_UPDATE_CONFIG: return "update_config";
1763     default: return "unknown";
1764     }
1765 }
1766
1767 /* Redis instance to Redis protocol representation. */
1768 void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) {
1769     char *flags = sdsempty();
1770     void *mbl;
1771     int fields = 0;
1772
1773     mbl = addDeferredMultiBulkLength(c);
1774
1775     addReplyBulkCString(c,"name");
1776     addReplyBulkCString(c,ri->name);
1777     fields++;
1778
1779     addReplyBulkCString(c,"ip");
1780     addReplyBulkCString(c,ri->addr->ip);
1781     fields++;
1782
1783     addReplyBulkCString(c,"port");
1784     addReplyBulkLongLong(c,ri->addr->port);
1785     fields++;
1786
1787     addReplyBulkCString(c,"runid");
1788     addReplyBulkCString(c,ri->runid ? ri->runid : "");
1789     fields++;
1790
1791     addReplyBulkCString(c,"flags");
1792     if (ri->flags & SRI_S_DOWN) flags = sdscat(flags,"s_down,");
1793     if (ri->flags & SRI_O_DOWN) flags = sdscat(flags,"o_down,");
1794     if (ri->flags & SRI_MASTER) flags = sdscat(flags,"master,");
1795     if (ri->flags & SRI_SLAVE) flags = sdscat(flags,"slave,");
1796     if (ri->flags & SRI_SENTINEL) flags = sdscat(flags,"sentinel,");
1797     if (ri->flags & SRI_DISCONNECTED) flags = sdscat(flags,"disconnected,");
1798     if (ri->flags & SRI_MASTER_DOWN) flags = sdscat(flags,"master_down,");
1799     if (ri->flags & SRI_FAILOVER_IN_PROGRESS)
1800         flags = sdscat(flags,"failover_in_progress,");
1801     if (ri->flags & SRI_I_AM_THE_LEADER)
1802         flags = sdscat(flags,"i_am_the_leader,");
1803     if (ri->flags & SRI_PROMOTED) flags = sdscat(flags,"promoted,");
1804     if (ri->flags & SRI_RECONF_SENT) flags = sdscat(flags,"reconf_sent,");
1805     if (ri->flags & SRI_RECONF_INPROG) flags = sdscat(flags,"reconf_inprog,");
1806     if (ri->flags & SRI_RECONF_DONE) flags = sdscat(flags,"reconf_done,");
1807
1808     if (sdslen(flags) != 0) flags = sdsrange(flags,0,-2); /* remove last "," */
1809     addReplyBulkCString(c,flags);
1810     sdsfree(flags);
1811     fields++;
1812
1813     addReplyBulkCString(c,"pending-commands");
1814     addReplyBulkLongLong(c,ri->pending_commands);
1815     fields++;
1816
1817     if (ri->flags & SRI_FAILOVER_IN_PROGRESS) {
1818         addReplyBulkCString(c,"failover-state");
1819         addReplyBulkCString(c,(char*)sentinelFailoverStateStr(ri->failover_state));
1820         fields++;
1821     }
1822
1823     addReplyBulkCString(c,"last-ok-ping-reply");
1824     addReplyBulkLongLong(c,mstime() - ri->last_avail_time);
1825     fields++;
1826
1827     addReplyBulkCString(c,"last-ping-reply");
1828     addReplyBulkLongLong(c,mstime() - ri->last_pong_time);
1829     fields++;
1830
1831     if (ri->flags & SRI_S_DOWN) {
1832         addReplyBulkCString(c,"s-down-time");
1833         addReplyBulkLongLong(c,mstime()-ri->s_down_since_time);
1834         fields++;
1835     }
1836
1837     if (ri->flags & SRI_O_DOWN) {
1838         addReplyBulkCString(c,"o-down-time");
1839         addReplyBulkLongLong(c,mstime()-ri->o_down_since_time);
1840         fields++;
1841     }
1842
1843     /* Masters and Slaves */
1844     if (ri->flags & (SRI_MASTER|SRI_SLAVE)) {
1845         addReplyBulkCString(c,"info-refresh");
1846         addReplyBulkLongLong(c,mstime() - ri->info_refresh);
1847         fields++;
1848     }
1849
1850     /* Only masters */
1851     if (ri->flags & SRI_MASTER) {
1852         addReplyBulkCString(c,"num-slaves");
1853         addReplyBulkLongLong(c,dictSize(ri->slaves));
1854         fields++;
1855
1856         addReplyBulkCString(c,"num-other-sentinels");
1857         addReplyBulkLongLong(c,dictSize(ri->sentinels));
1858         fields++;
1859
1860         addReplyBulkCString(c,"quorum");
1861         addReplyBulkLongLong(c,ri->quorum);
1862         fields++;
1863     }
1864
1865     /* Only slaves */
1866     if (ri->flags & SRI_SLAVE) {
1867         addReplyBulkCString(c,"master-link-down-time");
1868         addReplyBulkLongLong(c,ri->master_link_down_time);
1869         fields++;
1870
1871         addReplyBulkCString(c,"master-link-status");
1872         addReplyBulkCString(c,
1873             (ri->slave_master_link_status == SENTINEL_MASTER_LINK_STATUS_UP) ?
1874             "ok" : "err");
1875         fields++;
1876
1877         addReplyBulkCString(c,"master-host");
1878         addReplyBulkCString(c,
1879             ri->slave_master_host ? ri->slave_master_host : "?");
1880         fields++;
1881
1882         addReplyBulkCString(c,"master-port");
1883         addReplyBulkLongLong(c,ri->slave_master_port);
1884         fields++;
1885     }
1886
1887     /* Only sentinels */
1888     if (ri->flags & SRI_SENTINEL) {
1889         addReplyBulkCString(c,"last-hello-message");
1890         addReplyBulkLongLong(c,mstime() - ri->last_hello_time);
1891         fields++;
1892
1893         addReplyBulkCString(c,"can-failover-its-master");
1894         addReplyBulkLongLong(c,(ri->flags & SRI_CAN_FAILOVER) != 0);
1895         fields++;
1896
1897         if (ri->flags & SRI_MASTER_DOWN) {
1898             addReplyBulkCString(c,"subjective-leader");
1899             addReplyBulkCString(c,ri->leader ? ri->leader : "?");
1900             fields++;
1901         }
1902     }
1903
1904     setDeferredMultiBulkLength(c,mbl,fields*2);
1905 }
1906
1907 /* Output a number of instances contanined inside a dictionary as
1908  * Redis protocol. */
1909 void addReplyDictOfRedisInstances(redisClient *c, dict *instances) {
1910     dictIterator *di;
1911     dictEntry *de;
1912
1913     di = dictGetIterator(instances);
1914     addReplyMultiBulkLen(c,dictSize(instances));
1915     while((de = dictNext(di)) != NULL) {
1916         sentinelRedisInstance *ri = dictGetVal(de);
1917
1918         addReplySentinelRedisInstance(c,ri);
1919     }
1920     dictReleaseIterator(di);
1921 }
1922
1923 /* Lookup the named master into sentinel.masters.
1924  * If the master is not found reply to the client with an error and returns
1925  * NULL. */
1926 sentinelRedisInstance *sentinelGetMasterByNameOrReplyError(redisClient *c,
1927                         robj *name)
1928 {
1929     sentinelRedisInstance *ri;
1930
1931     ri = dictFetchValue(sentinel.masters,c->argv[2]->ptr);
1932     if (!ri) {
1933         addReplyError(c,"No such master with that name");
1934         return NULL;
1935     }
1936     return ri;
1937 }
1938
1939 void sentinelCommand(redisClient *c) {
1940     if (!strcasecmp(c->argv[1]->ptr,"masters")) {
1941         /* SENTINEL MASTERS */
1942         if (c->argc != 2) goto numargserr;
1943
1944         addReplyDictOfRedisInstances(c,sentinel.masters);
1945     } else if (!strcasecmp(c->argv[1]->ptr,"slaves")) {
1946         /* SENTINEL SLAVES <master-name> */
1947         sentinelRedisInstance *ri;
1948
1949         if (c->argc != 3) goto numargserr;
1950         if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) == NULL)
1951             return;
1952         addReplyDictOfRedisInstances(c,ri->slaves);
1953     } else if (!strcasecmp(c->argv[1]->ptr,"sentinels")) {
1954         /* SENTINEL SENTINELS <master-name> */
1955         sentinelRedisInstance *ri;
1956
1957         if (c->argc != 3) goto numargserr;
1958         if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) == NULL)
1959             return;
1960         addReplyDictOfRedisInstances(c,ri->sentinels);
1961     } else if (!strcasecmp(c->argv[1]->ptr,"is-master-down-by-addr")) {
1962         /* SENTINEL IS-MASTER-DOWN-BY-ADDR <ip> <port> */
1963         sentinelRedisInstance *ri;
1964         char *leader = NULL;
1965         long port;
1966         int isdown = 0;
1967
1968         if (c->argc != 4) goto numargserr;
1969         if (getLongFromObjectOrReply(c,c->argv[3],&port,NULL) != REDIS_OK)
1970             return;
1971         ri = getSentinelRedisInstanceByAddrAndRunID(sentinel.masters,
1972             c->argv[2]->ptr,port,NULL);
1973
1974         /* It exists? Is actually a master? Is subjectively down? It's down.
1975          * Note: if we are in tilt mode we always reply with "0". */
1976         if (!sentinel.tilt && ri && (ri->flags & SRI_S_DOWN) &&
1977                                     (ri->flags & SRI_MASTER))
1978             isdown = 1;
1979         if (ri) leader = sentinelGetSubjectiveLeader(ri);
1980
1981         /* Reply with a two-elements multi-bulk reply: down state, leader. */
1982         addReplyMultiBulkLen(c,2);
1983         addReply(c, isdown ? shared.cone : shared.czero);
1984         addReplyBulkCString(c, leader ? leader : "?");
1985         if (leader) sdsfree(leader);
1986     } else if (!strcasecmp(c->argv[1]->ptr,"reset")) {
1987         /* SENTINEL RESET <pattern> */
1988         if (c->argc != 3) goto numargserr;
1989         addReplyLongLong(c,sentinelResetMastersByPattern(c->argv[2]->ptr,SENTINEL_GENERATE_EVENT));
1990     } else if (!strcasecmp(c->argv[1]->ptr,"get-master-addr-by-name")) {
1991         /* SENTINEL GET-MASTER-ADDR-BY-NAME <master-name> */
1992         sentinelRedisInstance *ri;
1993
1994         if (c->argc != 3) goto numargserr;
1995         ri = sentinelGetMasterByName(c->argv[2]->ptr);
1996         if (ri == NULL) {
1997             addReply(c,shared.nullmultibulk);
1998         } else {
1999             sentinelAddr *addr = ri->addr;
2000
2001             if ((ri->flags & SRI_FAILOVER_IN_PROGRESS) && ri->promoted_slave)
2002                 addr = ri->promoted_slave->addr;
2003             addReplyMultiBulkLen(c,2);
2004             addReplyBulkCString(c,addr->ip);
2005             addReplyBulkLongLong(c,addr->port);
2006         }
2007     } else if (!strcasecmp(c->argv[1]->ptr,"failover")) {
2008         /* SENTINEL FAILOVER <master-name> */
2009         sentinelRedisInstance *ri;
2010
2011         if (c->argc != 3) goto numargserr;
2012         if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) == NULL)
2013             return;
2014         if (ri->flags & SRI_FAILOVER_IN_PROGRESS) {
2015             addReplySds(c,sdsnew("-INPROG Failover already in progress\r\n"));
2016             return;
2017         }
2018         if (sentinelSelectSlave(ri) == NULL) {
2019             addReplySds(c,sdsnew("-NOGOODSLAVE No suitable slave to promote\r\n"));
2020             return;
2021         }
2022         sentinelStartFailover(ri,SENTINEL_FAILOVER_STATE_WAIT_START);
2023         ri->flags |= SRI_FORCE_FAILOVER;
2024         addReply(c,shared.ok);
2025     } else if (!strcasecmp(c->argv[1]->ptr,"pending-scripts")) {
2026         /* SENTINEL PENDING-SCRIPTS */
2027
2028         if (c->argc != 2) goto numargserr;
2029         sentinelPendingScriptsCommand(c);
2030     } else {
2031         addReplyErrorFormat(c,"Unknown sentinel subcommand '%s'",
2032                                (char*)c->argv[1]->ptr);
2033     }
2034     return;
2035
2036 numargserr:
2037     addReplyErrorFormat(c,"Wrong number of commands for 'sentinel %s'",
2038                           (char*)c->argv[1]->ptr);
2039 }
2040
2041 /* ===================== SENTINEL availability checks ======================= */
2042
2043 /* Is this instance down from our point of view? */
2044 void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
2045     mstime_t elapsed = mstime() - ri->last_avail_time;
2046
2047     /* Check if we are in need for a reconnection of one of the
2048      * links, because we are detecting low activity.
2049      *
2050      * 1) Check if the command link seems connected, was connected not less
2051      *    than SENTINEL_MIN_LINK_RECONNECT_PERIOD, but still we have an
2052      *    idle time that is greater than down_after_period / 2 seconds. */
2053     if (ri->cc &&
2054         (mstime() - ri->cc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
2055         (mstime() - ri->last_pong_time) > (ri->down_after_period/2))
2056     {
2057         sentinelKillLink(ri,ri->cc);
2058     }
2059
2060     /* 2) Check if the pubsub link seems connected, was connected not less
2061      *    than SENTINEL_MIN_LINK_RECONNECT_PERIOD, but still we have no
2062      *    activity in the Pub/Sub channel for more than
2063      *    SENTINEL_PUBLISH_PERIOD * 3.
2064      */
2065     if (ri->pc &&
2066         (mstime() - ri->pc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
2067         (mstime() - ri->pc_last_activity) > (SENTINEL_PUBLISH_PERIOD*3))
2068     {
2069         sentinelKillLink(ri,ri->pc);
2070     }
2071
2072     /* Update the subjectively down flag. */
2073     if (elapsed > ri->down_after_period) {
2074         /* Is subjectively down */
2075         if ((ri->flags & SRI_S_DOWN) == 0) {
2076             sentinelEvent(REDIS_WARNING,"+sdown",ri,"%@");
2077             ri->s_down_since_time = mstime();
2078             ri->flags |= SRI_S_DOWN;
2079         }
2080     } else {
2081         /* Is subjectively up */
2082         if (ri->flags & SRI_S_DOWN) {
2083             sentinelEvent(REDIS_WARNING,"-sdown",ri,"%@");
2084             ri->flags &= ~(SRI_S_DOWN|SRI_SCRIPT_KILL_SENT);
2085         }
2086     }
2087 }
2088
2089 /* Is this instance down accordingly to the configured quorum? */
2090 void sentinelCheckObjectivelyDown(sentinelRedisInstance *master) {
2091     dictIterator *di;
2092     dictEntry *de;
2093     int quorum = 0, odown = 0;
2094
2095     if (master->flags & SRI_S_DOWN) {
2096         /* Is down for enough sentinels? */
2097         quorum = 1; /* the current sentinel. */
2098         /* Count all the other sentinels. */
2099         di = dictGetIterator(master->sentinels);
2100         while((de = dictNext(di)) != NULL) {
2101             sentinelRedisInstance *ri = dictGetVal(de);
2102
2103             if (ri->flags & SRI_MASTER_DOWN) quorum++;
2104         }
2105         dictReleaseIterator(di);
2106         if (quorum >= master->quorum) odown = 1;
2107     }
2108
2109     /* Set the flag accordingly to the outcome. */
2110     if (odown) {
2111         if ((master->flags & SRI_O_DOWN) == 0) {
2112             sentinelEvent(REDIS_WARNING,"+odown",master,"%@ #quorum %d/%d",
2113                 quorum, master->quorum);
2114             master->flags |= SRI_O_DOWN;
2115             master->o_down_since_time = mstime();
2116         }
2117     } else {
2118         if (master->flags & SRI_O_DOWN) {
2119             sentinelEvent(REDIS_WARNING,"-odown",master,"%@");
2120             master->flags &= ~SRI_O_DOWN;
2121         }
2122     }
2123 }
2124
2125 /* Receive the SENTINEL is-master-down-by-addr reply, see the
2126  * sentinelAskMasterStateToOtherSentinels() function for more information. */
2127 void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *privdata) {
2128     sentinelRedisInstance *ri = c->data;
2129     redisReply *r;
2130
2131     if (ri) ri->pending_commands--;
2132     if (!reply || !ri) return;
2133     r = reply;
2134
2135     /* Ignore every error or unexpected reply.
2136      * Note that if the command returns an error for any reason we'll
2137      * end clearing the SRI_MASTER_DOWN flag for timeout anyway. */
2138     if (r->type == REDIS_REPLY_ARRAY && r->elements == 2 &&
2139         r->element[0]->type == REDIS_REPLY_INTEGER &&
2140         r->element[1]->type == REDIS_REPLY_STRING)
2141     {
2142         ri->last_master_down_reply_time = mstime();
2143         if (r->element[0]->integer == 1) {
2144             ri->flags |= SRI_MASTER_DOWN;
2145         } else {
2146             ri->flags &= ~SRI_MASTER_DOWN;
2147         }
2148         sdsfree(ri->leader);
2149         ri->leader = sdsnew(r->element[1]->str);
2150     }
2151 }
2152
2153 /* If we think (subjectively) the master is down, we start sending
2154  * SENTINEL IS-MASTER-DOWN-BY-ADDR requests to other sentinels
2155  * in order to get the replies that allow to reach the quorum and
2156  * possibly also mark the master as objectively down. */
2157 void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) {
2158     dictIterator *di;
2159     dictEntry *de;
2160
2161     di = dictGetIterator(master->sentinels);
2162     while((de = dictNext(di)) != NULL) {
2163         sentinelRedisInstance *ri = dictGetVal(de);
2164         mstime_t elapsed = mstime() - ri->last_master_down_reply_time;
2165         char port[32];
2166         int retval;
2167
2168         /* If the master state from other sentinel is too old, we clear it. */
2169         if (elapsed > SENTINEL_INFO_VALIDITY_TIME) {
2170             ri->flags &= ~SRI_MASTER_DOWN;
2171             sdsfree(ri->leader);
2172             ri->leader = NULL;
2173         }
2174
2175         /* Only ask if master is down to other sentinels if:
2176          *
2177          * 1) We believe it is down, or there is a failover in progress.
2178          * 2) Sentinel is connected.
2179          * 3) We did not received the info within SENTINEL_ASK_PERIOD ms. */
2180         if ((master->flags & (SRI_S_DOWN|SRI_FAILOVER_IN_PROGRESS)) == 0)
2181             continue;
2182         if (ri->flags & SRI_DISCONNECTED) continue;
2183         if (mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD)
2184             continue;
2185
2186         /* Ask */
2187         ll2string(port,sizeof(port),master->addr->port);
2188         retval = redisAsyncCommand(ri->cc,
2189                     sentinelReceiveIsMasterDownReply, NULL,
2190                     "SENTINEL is-master-down-by-addr %s %s",
2191                     master->addr->ip, port);
2192         if (retval == REDIS_OK) ri->pending_commands++;
2193     }
2194     dictReleaseIterator(di);
2195 }
2196
2197 /* =============================== FAILOVER ================================= */
2198
2199 /* Given a master get the "subjective leader", that is, among all the sentinels
2200  * with given characteristics, the one with the lexicographically smaller
2201  * runid. The characteristics required are:
2202  *
2203  * 1) Has SRI_CAN_FAILOVER flag.
2204  * 2) Is not disconnected.
2205  * 3) Recently answered to our ping (no longer than
2206  *    SENTINEL_INFO_VALIDITY_TIME milliseconds ago).
2207  *
2208  * The function returns a pointer to an sds string representing the runid of the
2209  * leader sentinel instance (from our point of view). Otherwise NULL is
2210  * returned if there are no suitable sentinels.
2211  */
2212
/* qsort() comparison function for arrays of C string pointers: 'a' and 'b'
 * point to the array elements, and the strings they reference are compared
 * ignoring case. */
int compareRunID(const void *a, const void *b) {
    const char *left = *(char * const *)a;
    const char *right = *(char * const *)b;

    return strcasecmp(left,right);
}
2217
2218 char *sentinelGetSubjectiveLeader(sentinelRedisInstance *master) {
2219     dictIterator *di;
2220     dictEntry *de;
2221     char **instance =
2222         zmalloc(sizeof(char*)*(dictSize(master->sentinels)+1));
2223     int instances = 0;
2224     char *leader = NULL;
2225
2226     if (master->flags & SRI_CAN_FAILOVER) {
2227         /* Add myself if I'm a Sentinel that can failover this master. */
2228         instance[instances++] = server.runid;
2229     }
2230
2231     di = dictGetIterator(master->sentinels);
2232     while((de = dictNext(di)) != NULL) {
2233         sentinelRedisInstance *ri = dictGetVal(de);
2234         mstime_t lag = mstime() - ri->last_avail_time;
2235
2236         if (lag > SENTINEL_INFO_VALIDITY_TIME ||
2237             !(ri->flags & SRI_CAN_FAILOVER) ||
2238             (ri->flags & SRI_DISCONNECTED) ||
2239             ri->runid == NULL)
2240             continue;
2241         instance[instances++] = ri->runid;
2242     }
2243     dictReleaseIterator(di);
2244
2245     /* If we have at least one instance passing our checks, order the array
2246      * by runid. */
2247     if (instances) {
2248         qsort(instance,instances,sizeof(char*),compareRunID);
2249         leader = sdsnew(instance[0]);
2250     }
2251     zfree(instance);
2252     return leader;
2253 }
2254
/* A run ID paired with a vote counter.
 * NOTE(review): not referenced in the visible part of this file; the field
 * meanings below are taken from their names -- confirm against users. */
struct sentinelLeader {
    char *runid;            /* Run ID the votes refer to. */
    unsigned long votes;    /* Number of votes counted for 'runid'. */
};
2259
2260 /* Helper function for sentinelGetObjectiveLeader, increment the counter
2261  * relative to the specified runid. */
2262 void sentinelObjectiveLeaderIncr(dict *counters, char *runid) {
2263     dictEntry *de = dictFind(counters,runid);
2264     uint64_t oldval;
2265
2266     if (de) {
2267         oldval = dictGetUnsignedIntegerVal(de);
2268         dictSetUnsignedIntegerVal(de,oldval+1);
2269     } else {
2270         de = dictAddRaw(counters,runid);
2271         redisAssert(de != NULL);
2272         dictSetUnsignedIntegerVal(de,1);
2273     }
2274 }
2275
2276 /* Scan all the Sentinels attached to this master to check what is the
2277  * most voted leader among Sentinels. */
2278 char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) {
2279     dict *counters;
2280     dictIterator *di;
2281     dictEntry *de;
2282     unsigned int voters = 0, voters_quorum;
2283     char *myvote;
2284     char *winner = NULL;
2285
2286     redisAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS));
2287     counters = dictCreate(&leaderVotesDictType,NULL);
2288
2289     /* Count my vote. */
2290     myvote = sentinelGetSubjectiveLeader(master);
2291     if (myvote) {
2292         sentinelObjectiveLeaderIncr(counters,myvote);
2293         voters++;
2294     }
2295
2296     /* Count other sentinels votes */
2297     di = dictGetIterator(master->sentinels);
2298     while((de = dictNext(di)) != NULL) {
2299         sentinelRedisInstance *ri = dictGetVal(de);
2300         if (ri->leader == NULL) continue;
2301         /* If the failover is not already in progress we are only interested
2302          * in Sentinels that believe the master is down. Otherwise the leader
2303          * selection is useful for the "failover-takedown" when the original
2304          * leader fails. In that case we consider all the voters. */
2305         if (!(master->flags & SRI_FAILOVER_IN_PROGRESS) &&
2306             !(ri->flags & SRI_MASTER_DOWN)) continue;
2307         sentinelObjectiveLeaderIncr(counters,ri->leader);
2308         voters++;
2309     }
2310     dictReleaseIterator(di);
2311     voters_quorum = voters/2+1;
2312
2313     /* Check what's the winner. For the winner to win, it needs two conditions:
2314      * 1) Absolute majority between voters (50% + 1).
2315      * 2) And anyway at least master->quorum votes. */
2316     {
2317         uint64_t max_votes = 0; /* Max votes so far. */
2318
2319         di = dictGetIterator(counters);
2320         while((de = dictNext(di)) != NULL) {
2321             uint64_t votes = dictGetUnsignedIntegerVal(de);
2322
2323             if (max_votes < votes) {
2324                 max_votes = votes;
2325                 winner = dictGetKey(de);
2326             }
2327         }
2328         dictReleaseIterator(di);
2329         if (winner && (max_votes < voters_quorum || max_votes < master->quorum))
2330             winner = NULL;
2331     }
2332     winner = winner ? sdsnew(winner) : NULL;
2333     sdsfree(myvote);
2334     dictRelease(counters);
2335     return winner;
2336 }
2337
2338 /* Setup the master state to start a failover as a leader.
2339  *
2340  * State can be either:
2341  *
2342  * SENTINEL_FAILOVER_STATE_WAIT_START: starts a failover from scratch.
2343  * SENTINEL_FAILOVER_STATE_RECONF_SLAVES: takedown a failed failover.
2344  */
2345 void sentinelStartFailover(sentinelRedisInstance *master, int state) {
2346     redisAssert(master->flags & SRI_MASTER);
2347     redisAssert(state == SENTINEL_FAILOVER_STATE_WAIT_START ||
2348                 state == SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
2349
2350     master->failover_state = state;
2351     master->flags |= SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER;
2352     sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@");
2353
2354     /* Pick a random delay if it's a fresh failover (WAIT_START), and not
2355      * a recovery of a failover started by another sentinel. */
2356     if (master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) {
2357         master->failover_start_time = mstime() +
2358             SENTINEL_FAILOVER_FIXED_DELAY +
2359             (rand() % SENTINEL_FAILOVER_MAX_RANDOM_DELAY);
2360         sentinelEvent(REDIS_WARNING,"+failover-state-wait-start",master,
2361             "%@ #starting in %lld milliseconds",
2362             master->failover_start_time-mstime());
2363     }
2364     master->failover_state_change_time = mstime();
2365 }
2366
2367 /* This function checks if there are the conditions to start the failover,
2368  * that is:
2369  *
2370  * 1) Enough time has passed since O_DOWN.
2371  * 2) The master is marked as SRI_CAN_FAILOVER, so we can failover it.
2372  * 3) We are the objectively leader for this master.
2373  *
2374  * If the conditions are met we flag the master as SRI_FAILOVER_IN_PROGRESS
2375  * and SRI_I_AM_THE_LEADER.
2376  */
2377 void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) {
2378     char *leader;
2379     int isleader;
2380
2381     /* We can't failover if the master is not in O_DOWN state or if
2382      * there is not already a failover in progress (to perform the
2383      * takedown if the leader died) or if this Sentinel is not allowed
2384      * to start a failover. */
2385     if (!(master->flags & SRI_CAN_FAILOVER) ||
2386         !(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS))) return;
2387
2388     leader = sentinelGetObjectiveLeader(master);
2389     isleader = leader && strcasecmp(leader,server.runid) == 0;
2390     sdsfree(leader);
2391
2392     /* If I'm not the leader, I can't failover for sure. */
2393     if (!isleader) return;
2394
2395     /* If the failover is already in progress there are two options... */
2396     if (master->flags & SRI_FAILOVER_IN_PROGRESS) {
2397         if (master->flags & SRI_I_AM_THE_LEADER) {
2398             /* 1) I'm flagged as leader so I already started the failover.
2399              *    Just return. */
2400             return;
2401         } else {
2402             mstime_t elapsed = mstime() - master->failover_state_change_time;
2403
2404             /* 2) I'm the new leader, but I'm not flagged as leader in the
2405              *    master: I did not started the failover, but the original
2406              *    leader has no longer the leadership.
2407              *
2408              *    In this case if the failover appears to be lagging
2409              *    for at least 25% of the configured failover timeout,
2410              *    I can assume I can take control. Otherwise
2411              *    it's better to return and wait more. */
2412             if (elapsed < (master->failover_timeout/4)) return;
2413             sentinelEvent(REDIS_WARNING,"+failover-takedown",master,"%@");
2414             /* We have already an elected slave if we are in
2415              * FAILOVER_IN_PROGRESS state, that is, the slave that we
2416              * observed turning into a master. */
2417             sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
2418             /* As an observer we flagged all the slaves as RECONF_SENT but
2419              * now we are in charge of actually sending the reconfiguration
2420              * command so let's clear this flag for all the instances. */
2421             sentinelDelFlagsToDictOfRedisInstances(master->slaves,
2422                 SRI_RECONF_SENT);
2423         }
2424     } else {
2425         /* Brand new failover as SRI_FAILOVER_IN_PROGRESS was not set.
2426          *
2427          * Do we have a slave to promote? Otherwise don't start a failover
2428          * at all. */
2429         if (sentinelSelectSlave(master) == NULL) return;
2430         sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_WAIT_START);
2431     }
2432 }
2433
2434 /* Select a suitable slave to promote. The current algorithm only uses
2435  * the following parameters:
2436  *
2437  * 1) None of the following conditions: S_DOWN, O_DOWN, DISCONNECTED.
2438  * 2) last_avail_time more recent than SENTINEL_INFO_VALIDITY_TIME.
2439  * 3) info_refresh more recent than SENTINEL_INFO_VALIDITY_TIME.
2440  * 4) master_link_down_time no more than:
2441  *     (now - master->s_down_since_time) + (master->down_after_period * 10).
2442  *
2443  * Among all the slaves matching the above conditions we select the slave
2444  * with lower slave_priority. If priority is the same we select the slave
2445  * with lexicographically smaller runid.
2446  *
2447  * The function returns the pointer to the selected slave, otherwise
2448  * NULL if no suitable slave was found.
2449  */
2450
2451 int compareSlavesForPromotion(const void *a, const void *b) {
2452     sentinelRedisInstance **sa = (sentinelRedisInstance **)a,
2453                           **sb = (sentinelRedisInstance **)b;
2454     if ((*sa)->slave_priority != (*sb)->slave_priority)
2455         return (*sa)->slave_priority - (*sb)->slave_priority;
2456     return strcasecmp((*sa)->runid,(*sb)->runid);
2457 }
2458
2459 sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
2460     sentinelRedisInstance **instance =
2461         zmalloc(sizeof(instance[0])*dictSize(master->slaves));
2462     sentinelRedisInstance *selected = NULL;
2463     int instances = 0;
2464     dictIterator *di;
2465     dictEntry *de;
2466     mstime_t max_master_down_time = 0;
2467
2468     if (master->flags & SRI_S_DOWN)
2469         max_master_down_time += mstime() - master->s_down_since_time;
2470     max_master_down_time += master->down_after_period * 10;
2471
2472     di = dictGetIterator(master->slaves);
2473     while((de = dictNext(di)) != NULL) {
2474         sentinelRedisInstance *slave = dictGetVal(de);
2475         mstime_t info_validity_time = mstime()-SENTINEL_INFO_VALIDITY_TIME;
2476
2477         if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_DISCONNECTED)) continue;
2478         if (slave->last_avail_time < info_validity_time) continue;
2479
2480         /* If the master is in SDOWN state we get INFO for slaves every second.
2481          * Otherwise we get it with the usual period so we need to account for
2482          * a larger delay. */
2483         if ((master->flags & SRI_S_DOWN) == 0)
2484             info_validity_time -= SENTINEL_INFO_PERIOD;
2485         if (slave->info_refresh < info_validity_time) continue;
2486         if (slave->master_link_down_time > max_master_down_time) continue;
2487         instance[instances++] = slave;
2488     }
2489     dictReleaseIterator(di);
2490     if (instances) {
2491         qsort(instance,instances,sizeof(sentinelRedisInstance*),
2492             compareSlavesForPromotion);
2493         selected = instance[0];
2494     }
2495     zfree(instance);
2496     return selected;
2497 }
2498
2499 /* ---------------- Failover state machine implementation ------------------- */
2500 void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
2501     /* If we in "wait start" but the master is no longer in ODOWN nor in
2502      * SDOWN condition we abort the failover. This is important as it
2503      * prevents a useless failover in a a notable case of netsplit, where
2504      * the senitnels are split from the redis instances. In this case
2505      * the failover will not start while there is the split because no
2506      * good slave can be reached. However when the split is resolved, we
2507      * can go to waitstart if the slave is back rechable a few milliseconds
2508      * before the master is. In that case when the master is back online
2509      * we cancel the failover. */
2510     if ((ri->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_FORCE_FAILOVER)) == 0) {
2511         sentinelEvent(REDIS_WARNING,"-failover-abort-master-is-back",
2512             ri,"%@");
2513         sentinelAbortFailover(ri);
2514         return;
2515     }
2516
2517     /* Start the failover going to the next state if enough time has
2518      * elapsed. */
2519     if (mstime() >= ri->failover_start_time) {
2520         ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
2521         ri->failover_state_change_time = mstime();
2522         sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@");
2523     }
2524 }
2525
2526 void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) {
2527     sentinelRedisInstance *slave = sentinelSelectSlave(ri);
2528
2529     if (slave == NULL) {
2530         sentinelEvent(REDIS_WARNING,"-failover-abort-no-good-slave",ri,"%@");
2531         sentinelAbortFailover(ri);
2532     } else {
2533         sentinelEvent(REDIS_WARNING,"+selected-slave",slave,"%@");
2534         slave->flags |= SRI_PROMOTED;
2535         ri->promoted_slave = slave;
2536         ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE;
2537         ri->failover_state_change_time = mstime();
2538         sentinelEvent(REDIS_NOTICE,"+failover-state-send-slaveof-noone",
2539             slave, "%@");
2540     }
2541 }
2542
2543 void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) {
2544     int retval;
2545
2546     if (ri->promoted_slave->flags & SRI_DISCONNECTED) return;
2547
2548     /* Send SLAVEOF NO ONE command to turn the slave into a master.
2549      * We actually register a generic callback for this command as we don't
2550      * really care about the reply. We check if it worked indirectly observing
2551      * if INFO returns a different role (master instead of slave). */
2552     retval = redisAsyncCommand(ri->promoted_slave->cc,
2553         sentinelDiscardReplyCallback, NULL, "SLAVEOF NO ONE");
2554     if (retval != REDIS_OK) return;
2555     ri->promoted_slave->pending_commands++;
2556     sentinelEvent(REDIS_NOTICE, "+failover-state-wait-promotion",
2557         ri->promoted_slave,"%@");
2558     ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION;
2559     ri->failover_state_change_time = mstime();
2560 }
2561
2562 /* We actually wait for promotion indirectly checking with INFO when the
2563  * slave turns into a master. */
2564 void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) {
2565     mstime_t elapsed = mstime() - ri->failover_state_change_time;
2566
2567     if (elapsed >= SENTINEL_PROMOTION_RETRY_PERIOD) {
2568         sentinelEvent(REDIS_WARNING,"-promotion-timeout",ri->promoted_slave,
2569             "%@");
2570         sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@");
2571         ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
2572         ri->failover_state_change_time = mstime();
2573         ri->promoted_slave->flags &= ~SRI_PROMOTED;
2574         ri->promoted_slave = NULL;
2575     }
2576 }
2577
2578 void sentinelFailoverDetectEnd(sentinelRedisInstance *master) {
2579     int not_reconfigured = 0, timeout = 0;
2580     dictIterator *di;
2581     dictEntry *de;
2582     mstime_t elapsed = mstime() - master->failover_state_change_time;
2583
2584     /* We can't consider failover finished if the promoted slave is
2585      * not reachable. */
2586     if (master->promoted_slave == NULL ||
2587         master->promoted_slave->flags & SRI_S_DOWN) return;
2588
2589     /* The failover terminates once all the reachable slaves are properly
2590      * configured. */
2591     di = dictGetIterator(master->slaves);
2592     while((de = dictNext(di)) != NULL) {
2593         sentinelRedisInstance *slave = dictGetVal(de);
2594
2595         if (slave->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue;
2596         if (slave->flags & SRI_S_DOWN) continue;
2597         not_reconfigured++;
2598     }
2599     dictReleaseIterator(di);
2600
2601     /* Force end of failover on timeout. */
2602     if (elapsed > master->failover_timeout) {
2603         not_reconfigured = 0;
2604         timeout = 1;
2605         sentinelEvent(REDIS_WARNING,"+failover-end-for-timeout",master,"%@");
2606     }
2607
2608     if (not_reconfigured == 0) {
2609         int role = (master->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER :
2610                                                            SENTINEL_OBSERVER;
2611
2612         sentinelEvent(REDIS_WARNING,"+failover-end",master,"%@");
2613         master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG;
2614         master->failover_state_change_time = mstime();
2615         sentinelCallClientReconfScript(master,role,"end",master->addr,
2616             master->promoted_slave->addr);
2617     }
2618
2619     /* If I'm the leader it is a good idea to send a best effort SLAVEOF
2620      * command to all the slaves still not reconfigured to replicate with
2621      * the new master. */
2622     if (timeout && (master->flags & SRI_I_AM_THE_LEADER)) {
2623         dictIterator *di;
2624         dictEntry *de;
2625         char master_port[32];
2626
2627         ll2string(master_port,sizeof(master_port),
2628             master->promoted_slave->addr->port);
2629
2630         di = dictGetIterator(master->slaves);
2631         while((de = dictNext(di)) != NULL) {
2632             sentinelRedisInstance *slave = dictGetVal(de);
2633             int retval;
2634
2635             if (slave->flags &
2636                 (SRI_RECONF_DONE|SRI_RECONF_SENT|SRI_DISCONNECTED)) continue;
2637
2638             retval = redisAsyncCommand(slave->cc,
2639                 sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s",
2640                     master->promoted_slave->addr->ip,
2641                     master_port);
2642             if (retval == REDIS_OK) {
2643                 sentinelEvent(REDIS_NOTICE,"+slave-reconf-sent-be",slave,"%@");
2644                 slave->flags |= SRI_RECONF_SENT;
2645             }
2646         }
2647         dictReleaseIterator(di);
2648     }
2649 }
2650
2651 /* Send SLAVE OF <new master address> to all the remaining slaves that
2652  * still don't appear to have the configuration updated. */
2653 void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) {
2654     dictIterator *di;
2655     dictEntry *de;
2656     int in_progress = 0;
2657
2658     di = dictGetIterator(master->slaves);
2659     while((de = dictNext(di)) != NULL) {
2660         sentinelRedisInstance *slave = dictGetVal(de);
2661
2662         if (slave->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG))
2663             in_progress++;
2664     }
2665     dictReleaseIterator(di);
2666
2667     di = dictGetIterator(master->slaves);
2668     while(in_progress < master->parallel_syncs &&
2669           (de = dictNext(di)) != NULL)
2670     {
2671         sentinelRedisInstance *slave = dictGetVal(de);
2672         int retval;
2673         char master_port[32];
2674
2675         /* Skip the promoted slave, and already configured slaves. */
2676         if (slave->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue;
2677
2678         /* Clear the SRI_RECONF_SENT flag if too much time elapsed without
2679          * the slave moving forward to the next state. */
2680         if ((slave->flags & SRI_RECONF_SENT) &&
2681             (mstime() - slave->slave_reconf_sent_time) >
2682             SENTINEL_SLAVE_RECONF_RETRY_PERIOD)
2683         {
2684             sentinelEvent(REDIS_NOTICE,"-slave-reconf-sent-timeout",slave,"%@");
2685             slave->flags &= ~SRI_RECONF_SENT;
2686         }
2687
2688         /* Nothing to do for instances that are disconnected or already
2689          * in RECONF_SENT state. */
2690         if (slave->flags & (SRI_DISCONNECTED|SRI_RECONF_SENT|SRI_RECONF_INPROG))
2691             continue;
2692
2693         /* Send SLAVEOF <new master>. */
2694         ll2string(master_port,sizeof(master_port),
2695             master->promoted_slave->addr->port);
2696         retval = redisAsyncCommand(slave->cc,
2697             sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s",
2698                 master->promoted_slave->addr->ip,
2699                 master_port);
2700         if (retval == REDIS_OK) {
2701             slave->flags |= SRI_RECONF_SENT;
2702             slave->pending_commands++;
2703             slave->slave_reconf_sent_time = mstime();
2704             sentinelEvent(REDIS_NOTICE,"+slave-reconf-sent",slave,"%@");
2705             in_progress++;
2706         }
2707     }
2708     dictReleaseIterator(di);
2709     sentinelFailoverDetectEnd(master);
2710 }
2711
2712 /* This function is called when the slave is in
2713  * SENTINEL_FAILOVER_STATE_UPDATE_CONFIG state. In this state we need
2714  * to remove it from the master table and add the promoted slave instead.
2715  *
2716  * If there are no promoted slaves as this instance is unique, we remove
2717  * and re-add it with the same address to trigger a complete state
2718  * refresh. */
2719 void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) {
2720     sentinelRedisInstance *ref = master->promoted_slave ?
2721                                  master->promoted_slave : master;
2722
2723     sentinelEvent(REDIS_WARNING,"+switch-master",master,"%s %s %d %s %d",
2724         master->name, master->addr->ip, master->addr->port,
2725         ref->addr->ip, ref->addr->port);
2726
2727     sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port);
2728 }
2729
2730 void sentinelFailoverStateMachine(sentinelRedisInstance *ri) {
2731     redisAssert(ri->flags & SRI_MASTER);
2732
2733     if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS)) return;
2734
2735     switch(ri->failover_state) {
2736         case SENTINEL_FAILOVER_STATE_WAIT_START:
2737             sentinelFailoverWaitStart(ri);
2738             break;
2739         case SENTINEL_FAILOVER_STATE_SELECT_SLAVE:
2740             sentinelFailoverSelectSlave(ri);
2741             break;
2742         case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE:
2743             sentinelFailoverSendSlaveOfNoOne(ri);
2744             break;
2745         case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION:
2746             sentinelFailoverWaitPromotion(ri);
2747             break;
2748         case SENTINEL_FAILOVER_STATE_RECONF_SLAVES:
2749             sentinelFailoverReconfNextSlave(ri);
2750             break;
2751         case SENTINEL_FAILOVER_STATE_DETECT_END:
2752             sentinelFailoverDetectEnd(ri);
2753             break;
2754     }
2755 }
2756
2757 /* Abort a failover in progress with the following steps:
2758  * 1) If this instance is the leaer send a SLAVEOF command to all the already
2759  *    reconfigured slaves if any to configure them to replicate with the
2760  *    original master.
2761  * 2) For both leaders and observers: clear the failover flags and state in
2762  *    the master instance.
2763  * 3) If there is already a promoted slave and we are the leader, and this
2764  *    slave is not DISCONNECTED, try to reconfigure it to replicate
2765  *    back to the master as well, sending a best effort SLAVEOF command.
2766  */
2767 void sentinelAbortFailover(sentinelRedisInstance *ri) {
2768     char master_port[32];
2769     dictIterator *di;
2770     dictEntry *de;
2771     int sentinel_role;
2772
2773     redisAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS);
2774     ll2string(master_port,sizeof(master_port),ri->addr->port);
2775
2776     /* Clear failover related flags from slaves.
2777      * Also if we are the leader make sure to send SLAVEOF commands to all the
2778      * already reconfigured slaves in order to turn them back into slaves of
2779      * the original master. */
2780     di = dictGetIterator(ri->slaves);
2781     while((de = dictNext(di)) != NULL) {
2782         sentinelRedisInstance *slave = dictGetVal(de);
2783         if ((ri->flags & SRI_I_AM_THE_LEADER) &&
2784             !(slave->flags & SRI_DISCONNECTED) &&
2785              (slave->flags & (SRI_PROMOTED|SRI_RECONF_SENT|SRI_RECONF_INPROG|
2786                               SRI_RECONF_DONE)))
2787         {
2788             int retval;
2789
2790             retval = redisAsyncCommand(slave->cc,
2791                 sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s",
2792                     ri->addr->ip,
2793                     master_port);
2794             if (retval == REDIS_OK)
2795                 sentinelEvent(REDIS_NOTICE,"-slave-reconf-undo",slave,"%@");
2796         }
2797         slave->flags &= ~(SRI_RECONF_SENT|SRI_RECONF_INPROG|SRI_RECONF_DONE);
2798     }
2799     dictReleaseIterator(di);
2800
2801     sentinel_role = (ri->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER :
2802                                                         SENTINEL_OBSERVER;
2803     ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER|SRI_FORCE_FAILOVER);
2804     ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
2805     ri->failover_state_change_time = mstime();
2806     if (ri->promoted_slave) {
2807         sentinelCallClientReconfScript(ri,sentinel_role,"abort",
2808             ri->promoted_slave->addr,ri->addr);
2809         ri->promoted_slave->flags &= ~SRI_PROMOTED;
2810         ri->promoted_slave = NULL;
2811     }
2812 }
2813
2814 /* The following is called only for master instances and will abort the
2815  * failover process if:
2816  *
2817  * 1) The failover is in progress.
2818  * 2) We already promoted a slave.
2819  * 3) The promoted slave is in extended SDOWN condition.
2820  */
2821 void sentinelAbortFailoverIfNeeded(sentinelRedisInstance *ri) {
2822     /* Failover is in progress? Do we have a promoted slave? */
2823     if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS) || !ri->promoted_slave) return;
2824
2825     /* Is the promoted slave into an extended SDOWN state? */
2826     if (!(ri->promoted_slave->flags & SRI_S_DOWN) ||
2827         (mstime() - ri->promoted_slave->s_down_since_time) <
2828         (ri->down_after_period * SENTINEL_EXTENDED_SDOWN_MULTIPLIER)) return;
2829
2830     sentinelEvent(REDIS_WARNING,"-failover-abort-x-sdown",ri->promoted_slave,"%@");
2831     sentinelAbortFailover(ri);
2832 }
2833
/* ======================== SENTINEL timer handler ==========================
 * This is the "main" of our Sentinel, as Sentinel is completely non blocking
 * in design. The function is called every second.
 * -------------------------------------------------------------------------- */
2838
2839 /* Perform scheduled operations for the specified Redis instance. */
2840 void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
2841     /* ========== MONITORING HALF ============ */
2842     /* Every kind of instance */
2843     sentinelReconnectInstance(ri);
2844     sentinelPingInstance(ri);
2845
2846     /* Masters and slaves */
2847     if (ri->flags & (SRI_MASTER|SRI_SLAVE)) {
2848         /* Nothing so far. */
2849     }
2850
2851     /* Only masters */
2852     if (ri->flags & SRI_MASTER) {
2853         sentinelAskMasterStateToOtherSentinels(ri);
2854     }
2855
2856     /* ============== ACTING HALF ============= */
2857     /* We don't proceed with the acting half if we are in TILT mode.
2858      * TILT happens when we find something odd with the time, like a
2859      * sudden change in the clock. */
2860     if (sentinel.tilt) {
2861         if (mstime()-sentinel.tilt_start_time < SENTINEL_TILT_PERIOD) return;
2862         sentinel.tilt = 0;
2863         sentinelEvent(REDIS_WARNING,"-tilt",NULL,"#tilt mode exited");
2864     }
2865
2866     /* Every kind of instance */
2867     sentinelCheckSubjectivelyDown(ri);
2868
2869     /* Masters and slaves */
2870     if (ri->flags & (SRI_MASTER|SRI_SLAVE)) {
2871         /* Nothing so far. */
2872     }
2873
2874     /* Only masters */
2875     if (ri->flags & SRI_MASTER) {
2876         sentinelCheckObjectivelyDown(ri);
2877         sentinelStartFailoverIfNeeded(ri);
2878         sentinelFailoverStateMachine(ri);
2879         sentinelAbortFailoverIfNeeded(ri);
2880     }
2881 }
2882
2883 /* Perform scheduled operations for all the instances in the dictionary.
2884  * Recursively call the function against dictionaries of slaves. */
2885 void sentinelHandleDictOfRedisInstances(dict *instances) {
2886     dictIterator *di;
2887     dictEntry *de;
2888     sentinelRedisInstance *switch_to_promoted = NULL;
2889
2890     /* There are a number of things we need to perform against every master. */
2891     di = dictGetIterator(instances);
2892     while((de = dictNext(di)) != NULL) {
2893         sentinelRedisInstance *ri = dictGetVal(de);
2894
2895         sentinelHandleRedisInstance(ri);
2896         if (ri->flags & SRI_MASTER) {
2897             sentinelHandleDictOfRedisInstances(ri->slaves);
2898             sentinelHandleDictOfRedisInstances(ri->sentinels);
2899             if (ri->failover_state == SENTINEL_FAILOVER_STATE_UPDATE_CONFIG) {
2900                 switch_to_promoted = ri;
2901             }
2902         }
2903     }
2904     if (switch_to_promoted)
2905         sentinelFailoverSwitchToPromotedSlave(switch_to_promoted);
2906     dictReleaseIterator(di);
2907 }
2908
2909 /* This function checks if we need to enter the TITL mode.
2910  *
2911  * The TILT mode is entered if we detect that between two invocations of the
2912  * timer interrupt, a negative amount of time, or too much time has passed.
2913  * Note that we expect that more or less just 100 milliseconds will pass
2914  * if everything is fine. However we'll see a negative number or a
2915  * difference bigger than SENTINEL_TILT_TRIGGER milliseconds if one of the
2916  * following conditions happen:
2917  *
2918  * 1) The Sentiel process for some time is blocked, for every kind of
2919  * random reason: the load is huge, the computer was freezed for some time
2920  * in I/O or alike, the process was stopped by a signal. Everything.
2921  * 2) The system clock was altered significantly.
2922  *
2923  * Under both this conditions we'll see everything as timed out and failing
2924  * without good reasons. Instead we enter the TILT mode and wait
2925  * for SENTIENL_TILT_PERIOD to elapse before starting to act again.
2926  *
2927  * During TILT time we still collect information, we just do not act. */
2928 void sentinelCheckTiltCondition(void) {
2929     mstime_t now = mstime();
2930     mstime_t delta = now - sentinel.previous_time;
2931
2932     if (delta < 0 || delta > SENTINEL_TILT_TRIGGER) {
2933         sentinel.tilt = 1;
2934         sentinel.tilt_start_time = mstime();
2935         sentinelEvent(REDIS_WARNING,"+tilt",NULL,"#tilt mode entered");
2936     }
2937     sentinel.previous_time = mstime();
2938 }
2939
2940 void sentinelTimer(void) {
2941     sentinelCheckTiltCondition();
2942     sentinelHandleDictOfRedisInstances(sentinel.masters);
2943     sentinelRunPendingScripts();
2944     sentinelCollectTerminatedScripts();
2945     sentinelKillTimedoutScripts();
2946 }
2947