Sentinel: SENTINEL FAILOVER command implemented.

[redis.git] / src / sentinel.c
diff --git a/src/sentinel.c b/src/sentinel.c

index d1c6befe2b8de44894181c52dc6b76b89d8e7b23..36bf233745092622c0f3b38894d696319a6cea5d 100644 (file)
--- a/src/sentinel.c
+++ b/src/sentinel.c
@@ -73,6 +73,7 @@ typedef struct sentinelAddr {
  #define SRI_RECONF_SENT (1<<11)     /* SLAVEOF <newmaster> sent. */
  #define SRI_RECONF_INPROG (1<<12)   /* Slave synchronization in progress. */
  #define SRI_RECONF_DONE (1<<13)     /* Slave synchronized with new master. */
+#define SRI_FORCE_FAILOVER (1<<14)  /* Force failover with master up. */
  
  #define SENTINEL_INFO_PERIOD 10000
  #define SENTINEL_PING_PERIOD 1000
@@ -116,6 +117,8 @@ typedef struct sentinelAddr {
  /* Generic flags that can be used with different functions. */
  #define SENTINEL_NO_FLAGS 0
  #define SENTINEL_GENERATE_EVENT 1
+#define SENTINEL_LEADER 2
+#define SENTINEL_OBSERVER 4
  
  /* Script execution flags and limits. */
  #define SENTINEL_SCRIPT_NONE 0
@@ -321,6 +324,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri);
  void sentinelEvent(int level, char *type, sentinelRedisInstance *ri, const char *fmt, ...);
  sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master);
  void sentinelScheduleScriptExecution(char *path, ...);
+void sentinelStartFailover(sentinelRedisInstance *master, int state);
  
  /* ========================= Dictionary types =============================== */
  
@@ -762,6 +766,32 @@ void sentinelPendingScriptsCommand(redisClient *c) {
      }
  }
  
+/* This function calls, if any, the client reconfiguration script with the
+ * following parameters:
+ *
+ * <master-name> <role> <state> <from-ip> <from-port> <to-ip> <to-port>
+ *
+ * It is called every time a failover starts, ends, or is aborted.
+ *
+ * <state> is "start", "end" or "abort".
+ * <role> is either "leader" or "observer".
+ *
+ * from/to fields are respectively master -> promoted slave addresses for
+ * "start" and "end", or the reverse (promoted slave -> master) in case of
+ * "abort".
+ */
+void sentinelCallClientReconfScript(sentinelRedisInstance *master, int role, char *state, sentinelAddr *from, sentinelAddr *to) {
+    char fromport[32], toport[32];
+
+    if (master->client_reconfig_script == NULL) return;
+    ll2string(fromport,sizeof(fromport),from->port);
+    ll2string(toport,sizeof(toport),to->port);
+    sentinelScheduleScriptExecution(master->client_reconfig_script,
+        master->name,
+        (role == SENTINEL_LEADER) ? "leader" : "observer",
+        state, from->ip, fromport, to->ip, toport);
+}
+
  /* ========================== sentinelRedisInstance ========================= */
  
  /* Create a redis instance, the following fields must be populated by the
@@ -1384,6 +1414,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
      ri->info_refresh = mstime();
      sdsfreesplitres(lines,numlines);
  
+    /* ---------------------------- Acting half ----------------------------- */
      if (sentinel.tilt) return;
  
      /* Act if a master turned into a slave. */
@@ -1407,11 +1438,12 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
          if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
              (runid_changed || first_runid))
          {
-            /* If a slave turned into a master, but at the same time the
-             * runid has changed, or it is simply the first time we see and
-             * INFO output from this instance, this is a reboot with a wrong
-             * configuration.
+            /* If a slave turned into a master but:
               *
+             * 1) Failover not in progress.
+             * 2) RunID has changed, or it's the first time we see an INFO output.
+             * 
+             * We assume this is a reboot with a wrong configuration.
               * Log the event and remove the slave. */
              int retval;
  
@@ -1422,8 +1454,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
          } else if (ri->flags & SRI_PROMOTED) {
              /* If this is a promoted slave we can change state to the
               * failover state machine. */
-            if (ri->master &&
-                (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
+            if ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
                  (ri->master->flags & SRI_I_AM_THE_LEADER) &&
                  (ri->master->failover_state ==
                      SENTINEL_FAILOVER_STATE_WAIT_PROMOTION))
@@ -1433,25 +1464,41 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
                  sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@");
                  sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves",
                      ri->master,"%@");
+                sentinelCallClientReconfScript(ri->master,SENTINEL_LEADER,
+                    "start",ri->master->addr,ri->addr);
              }
-        } else {
-            /* Otherwise we interpret this as the start of the failover. */
-            if (ri->master &&
-                (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) == 0)
-            {
-                ri->master->flags |= SRI_FAILOVER_IN_PROGRESS;
-                sentinelEvent(REDIS_WARNING,"failover-detected",ri->master,"%@");
-                ri->master->failover_state = SENTINEL_FAILOVER_STATE_DETECT_END;
-                ri->master->failover_state_change_time = mstime();
-                ri->master->promoted_slave = ri;
-                ri->flags |= SRI_PROMOTED;
-                /* We are an observer, so we can only assume that the leader
-                 * is reconfiguring the slave instances. For this reason we
-                 * set all the instances as RECONF_SENT waiting for progresses
-                 * on this side. */
-                sentinelAddFlagsToDictOfRedisInstances(ri->master->slaves,
-                    SRI_RECONF_SENT);
+        } else if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) ||
+                    ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
+                     (ri->master->flags & SRI_I_AM_THE_LEADER) &&
+                     ri->master->failover_state ==
+                     SENTINEL_FAILOVER_STATE_WAIT_START))
+        {
+            /* No failover in progress? Then it is the start of a failover
+             * and we are an observer.
+             *
+             * We also do that if we are a leader with a failover still in
+             * WAIT_START state, but another sentinel started before us. */
+
+            if (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) {
+                sentinelEvent(REDIS_WARNING,"-failover-abort-race",
+                                ri->master, "%@");
+                sentinelAbortFailover(ri->master);
              }
+
+            ri->master->flags |= SRI_FAILOVER_IN_PROGRESS;
+            sentinelEvent(REDIS_WARNING,"+failover-detected",ri->master,"%@");
+            ri->master->failover_state = SENTINEL_FAILOVER_STATE_DETECT_END;
+            ri->master->failover_state_change_time = mstime();
+            ri->master->promoted_slave = ri;
+            ri->flags |= SRI_PROMOTED;
+            sentinelCallClientReconfScript(ri->master,SENTINEL_OBSERVER,
+                "start", ri->master->addr,ri->addr);
+            /* We are an observer, so we can only assume that the leader
+             * is reconfiguring the slave instances. For this reason we
+             * set all the instances as RECONF_SENT waiting for progresses
+             * on this side. */
+            sentinelAddFlagsToDictOfRedisInstances(ri->master->slaves,
+                SRI_RECONF_SENT);
          }
      }
  
@@ -1945,6 +1992,24 @@ void sentinelCommand(redisClient *c) {
              addReplyBulkCString(c,addr->ip);
              addReplyBulkLongLong(c,addr->port);
          }
+    } else if (!strcasecmp(c->argv[1]->ptr,"failover")) {
+        /* SENTINEL FAILOVER <master-name> */
+        sentinelRedisInstance *ri;
+
+        if (c->argc != 3) goto numargserr;
+        if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) == NULL)
+            return;
+        if (ri->flags & SRI_FAILOVER_IN_PROGRESS) {
+            addReplySds(c,sdsnew("-INPROG Failover already in progress\r\n"));
+            return;
+        }
+        if (sentinelSelectSlave(ri) == NULL) {
+            addReplySds(c,sdsnew("-NOGOODSLAVE No suitable slave to promote\r\n"));
+            return;
+        }
+        sentinelStartFailover(ri,SENTINEL_FAILOVER_STATE_WAIT_START);
+        ri->flags |= SRI_FORCE_FAILOVER;
+        addReply(c,shared.ok);
      } else if (!strcasecmp(c->argv[1]->ptr,"pending-scripts")) {
          /* SENTINEL PENDING-SCRIPTS */
  
@@ -2258,6 +2323,35 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) {
      return winner;
  }
  
+/* Setup the master state to start a failover as a leader.
+ *
+ * State can be either:
+ *
+ * SENTINEL_FAILOVER_STATE_WAIT_START: starts a failover from scratch.
+ * SENTINEL_FAILOVER_STATE_RECONF_SLAVES: take over a failed failover.
+ */
+void sentinelStartFailover(sentinelRedisInstance *master, int state) {
+    redisAssert(master->flags & SRI_MASTER);
+    redisAssert(state == SENTINEL_FAILOVER_STATE_WAIT_START ||
+                state == SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
+
+    master->failover_state = state;
+    master->flags |= SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER;
+    sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@");
+
+    /* Pick a random delay if it's a fresh failover (WAIT_START), and not
+     * a recovery of a failover started by another sentinel. */
+    if (master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) {
+        master->failover_start_time = mstime() +
+            SENTINEL_FAILOVER_FIXED_DELAY +
+            (rand() % SENTINEL_FAILOVER_MAX_RANDOM_DELAY);
+        sentinelEvent(REDIS_WARNING,"+failover-state-wait-start",master,
+            "%@ #starting in %lld milliseconds",
+            master->failover_start_time-mstime());
+    }
+    master->failover_state_change_time = mstime();
+}
+
  /* This function checks if there are the conditions to start the failover,
   * that is:
   *
@@ -2268,7 +2362,7 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) {
   * If the conditions are met we flag the master as SRI_FAILOVER_IN_PROGRESS
   * and SRI_I_AM_THE_LEADER.
   */
-void sentinelStartFailover(sentinelRedisInstance *master) {
+void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) {
      char *leader;
      int isleader;
  
@@ -2308,7 +2402,7 @@ void sentinelStartFailover(sentinelRedisInstance *master) {
              /* We have already an elected slave if we are in
               * FAILOVER_IN_PROGRESS state, that is, the slave that we
               * observed turning into a master. */
-            master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES;
+            sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
              /* As an observer we flagged all the slaves as RECONF_SENT but
               * now we are in charge of actually sending the reconfiguration
               * command so let's clear this flag for all the instances. */
@@ -2321,23 +2415,8 @@ void sentinelStartFailover(sentinelRedisInstance *master) {
           * Do we have a slave to promote? Otherwise don't start a failover
           * at all. */
          if (sentinelSelectSlave(master) == NULL) return;
-        master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START;
+        sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_WAIT_START);
      }
-
-    master->flags |= SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER;
-    sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@");
-
-    /* Pick a random delay if it's a fresh failover (WAIT_START), and not
-     * a recovery of a failover started by another sentinel. */
-    if (master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) {
-        master->failover_start_time = mstime() +
-            SENTINEL_FAILOVER_FIXED_DELAY +
-            (rand() % SENTINEL_FAILOVER_MAX_RANDOM_DELAY);
-        sentinelEvent(REDIS_WARNING,"+failover-state-wait-start",master,
-            "%@ #starting in %lld milliseconds",
-            master->failover_start_time-mstime());
-    }
-    master->failover_state_change_time = mstime();
  }
  
  /* Select a suitable slave to promote. The current algorithm only uses
@@ -2372,10 +2451,11 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
      int instances = 0;
      dictIterator *di;
      dictEntry *de;
-    mstime_t max_master_down_time;
+    mstime_t max_master_down_time = 0;
  
-    max_master_down_time = (mstime() - master->s_down_since_time) +
-                           (master->down_after_period * 10);
+    if (master->flags & SRI_S_DOWN)
+        max_master_down_time += mstime() - master->s_down_since_time;
+    max_master_down_time += master->down_after_period * 10;
  
      di = dictGetIterator(master->slaves);
      while((de = dictNext(di)) != NULL) {
@@ -2384,6 +2464,12 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
  
          if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_DISCONNECTED)) continue;
          if (slave->last_avail_time < info_validity_time) continue;
+
+        /* If the master is in SDOWN state we get INFO for slaves every second.
+         * Otherwise we get it with the usual period so we need to account for
+         * a larger delay. */
+        if ((master->flags & SRI_S_DOWN) == 0)
+            info_validity_time -= SENTINEL_INFO_PERIOD;
          if (slave->info_refresh < info_validity_time) continue;
          if (slave->master_link_down_time > max_master_down_time) continue;
          instance[instances++] = slave;
@@ -2409,7 +2495,7 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
       * can go to waitstart if the slave is back rechable a few milliseconds
       * before the master is. In that case when the master is back online
       * we cancel the failover. */
-    if ((ri->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0) {
+    if ((ri->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_FORCE_FAILOVER)) == 0) {
          sentinelEvent(REDIS_WARNING,"-failover-abort-master-is-back",
              ri,"%@");
          sentinelAbortFailover(ri);
@@ -2508,9 +2594,14 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) {
      }
  
      if (not_reconfigured == 0) {
+        int role = (master->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER :
+                                                           SENTINEL_OBSERVER;
+
          sentinelEvent(REDIS_WARNING,"+failover-end",master,"%@");
          master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG;
          master->failover_state_change_time = mstime();
+        sentinelCallClientReconfScript(master,role,"end",master->addr,
+            master->promoted_slave->addr);
      }
  
      /* If I'm the leader it is a good idea to send a best effort SLAVEOF
@@ -2665,6 +2756,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) {
      char master_port[32];
      dictIterator *di;
      dictEntry *de;
+    int sentinel_role;
  
      redisAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS);
      ll2string(master_port,sizeof(master_port),ri->addr->port);
@@ -2694,10 +2786,14 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) {
      }
      dictReleaseIterator(di);
  
-    ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER);
+    sentinel_role = (ri->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER :
+                                                        SENTINEL_OBSERVER;
+    ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER|SRI_FORCE_FAILOVER);
      ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
      ri->failover_state_change_time = mstime();
      if (ri->promoted_slave) {
+        sentinelCallClientReconfScript(ri,sentinel_role,"abort",
+            ri->promoted_slave->addr,ri->addr);
          ri->promoted_slave->flags &= ~SRI_PROMOTED;
          ri->promoted_slave = NULL;
      }
@@ -2766,7 +2862,7 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
      /* Only masters */
      if (ri->flags & SRI_MASTER) {
          sentinelCheckObjectivelyDown(ri);
-        sentinelStartFailover(ri);
+        sentinelStartFailoverIfNeeded(ri);
          sentinelFailoverStateMachine(ri);
          sentinelAbortFailoverIfNeeded(ri);
      }