Sentinel: easier master redirection if the master is a slave.

[redis.git] / src / sentinel.c
diff --git a/src/sentinel.c b/src/sentinel.c

index 1c37d8e01701e0ed20a74eff35597e123cc1c62b..7fdc45e36bed6cd271913f4fa157307fdf354060 100644 (file)
--- a/src/sentinel.c
+++ b/src/sentinel.c
@@ -73,6 +73,8 @@ typedef struct sentinelAddr {
  #define SRI_RECONF_SENT (1<<11)     /* SLAVEOF <newmaster> sent. */
  #define SRI_RECONF_INPROG (1<<12)   /* Slave synchronization in progress. */
  #define SRI_RECONF_DONE (1<<13)     /* Slave synchronized with new master. */
+#define SRI_FORCE_FAILOVER (1<<14)  /* Force failover with master up. */
+#define SRI_SCRIPT_KILL_SENT (1<<15) /* SCRIPT KILL already sent on -BUSY */
  
  #define SENTINEL_INFO_PERIOD 10000
  #define SENTINEL_PING_PERIOD 1000
@@ -323,6 +325,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri);
  void sentinelEvent(int level, char *type, sentinelRedisInstance *ri, const char *fmt, ...);
  sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master);
  void sentinelScheduleScriptExecution(char *path, ...);
+void sentinelStartFailover(sentinelRedisInstance *master, int state);
  
  /* ========================= Dictionary types =============================== */
  
@@ -363,6 +366,7 @@ dictType leaderVotesDictType = {
  /* =========================== Initialization =============================== */
  
  void sentinelCommand(redisClient *c);
+void sentinelInfoCommand(redisClient *c);
  
  struct redisCommand sentinelcmds[] = {
      {"ping",pingCommand,1,"",0,NULL,0,0,0,0,0},
@@ -370,7 +374,8 @@ struct redisCommand sentinelcmds[] = {
      {"subscribe",subscribeCommand,-2,"",0,NULL,0,0,0,0,0},
      {"unsubscribe",unsubscribeCommand,-1,"",0,NULL,0,0,0,0,0},
      {"psubscribe",psubscribeCommand,-2,"",0,NULL,0,0,0,0,0},
-    {"punsubscribe",punsubscribeCommand,-1,"",0,NULL,0,0,0,0,0}
+    {"punsubscribe",punsubscribeCommand,-1,"",0,NULL,0,0,0,0,0},
+    {"info",sentinelInfoCommand,-1,"",0,NULL,0,0,0,0,0}
  };
  
  /* This function overwrites a few normal Redis config default with Sentinel
@@ -787,7 +792,7 @@ void sentinelCallClientReconfScript(sentinelRedisInstance *master, int role, cha
      sentinelScheduleScriptExecution(master->client_reconfig_script,
          master->name,
          (role == SENTINEL_LEADER) ? "leader" : "observer",
-        state, from->ip, fromport, to->ip, toport);
+        state, from->ip, fromport, to->ip, toport, NULL);
  }
  
  /* ========================== sentinelRedisInstance ========================= */
@@ -815,7 +820,7 @@ void sentinelCallClientReconfScript(sentinelRedisInstance *master, int role, cha
  sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char *hostname, int port, int quorum, sentinelRedisInstance *master) {
      sentinelRedisInstance *ri;
      sentinelAddr *addr;
-    dict *table;
+    dict *table = NULL;
      char slavename[128], *sdsname;
  
      redisAssert(flags & (SRI_MASTER|SRI_SLAVE|SRI_SENTINEL));
@@ -1407,6 +1412,10 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
                      SENTINEL_MASTER_LINK_STATUS_UP :
                      SENTINEL_MASTER_LINK_STATUS_DOWN;
              }
+
+            /* slave_priority:<priority> */
+            if (sdslen(l) >= 15 && !memcmp(l,"slave_priority:",15))
+                ri->slave_priority = atoi(l+15);
          }
      }
      ri->info_refresh = mstime();
@@ -1417,7 +1426,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
  
      /* Act if a master turned into a slave. */
      if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) {
-        if (first_runid && ri->slave_master_host) {
+        if ((first_runid || runid_changed) && ri->slave_master_host) {
              /* If it is the first time we receive INFO from it, but it's
               * a slave while it was configured as a master, we want to monitor
               * its master instead. */
@@ -1436,7 +1445,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
          if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
              (runid_changed || first_runid))
          {
-            /* If a slave turned into maser but:
+            /* If a slave turned into master but:
               *
               * 1) Failover not in progress.
               * 2) RunID hs changed, or its the first time we see an INFO output.
@@ -1570,6 +1579,17 @@ void sentinelPingReplyCallback(redisAsyncContext *c, void *reply, void *privdata
              strncmp(r->str,"MASTERDOWN",10) == 0)
          {
              ri->last_avail_time = mstime();
+        } else {
+            /* Send a SCRIPT KILL command if the instance appears to be
+             * down because of a busy script. */
+            if (strncmp(r->str,"BUSY",4) == 0 &&
+                (ri->flags & SRI_S_DOWN) &&
+                !(ri->flags & SRI_SCRIPT_KILL_SENT))
+            {
+                redisAsyncCommand(ri->cc,
+                    sentinelDiscardReplyCallback, NULL, "SCRIPT KILL");
+                ri->flags |= SRI_SCRIPT_KILL_SENT;
+            }
          }
      }
      ri->last_pong_time = mstime();
@@ -1868,6 +1888,10 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) {
          addReplyBulkCString(c,"master-port");
          addReplyBulkLongLong(c,ri->slave_master_port);
          fields++;
+
+        addReplyBulkCString(c,"slave-priority");
+        addReplyBulkLongLong(c,ri->slave_priority);
+        fields++;
      }
  
      /* Only sentinels */
@@ -1990,6 +2014,24 @@ void sentinelCommand(redisClient *c) {
              addReplyBulkCString(c,addr->ip);
              addReplyBulkLongLong(c,addr->port);
          }
+    } else if (!strcasecmp(c->argv[1]->ptr,"failover")) {
+        /* SENTINEL FAILOVER <master-name> */
+        sentinelRedisInstance *ri;
+
+        if (c->argc != 3) goto numargserr;
+        if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) == NULL)
+            return;
+        if (ri->flags & SRI_FAILOVER_IN_PROGRESS) {
+            addReplySds(c,sdsnew("-INPROG Failover already in progress\r\n"));
+            return;
+        }
+        if (sentinelSelectSlave(ri) == NULL) {
+            addReplySds(c,sdsnew("-NOGOODSLAVE No suitable slave to promote\r\n"));
+            return;
+        }
+        sentinelStartFailover(ri,SENTINEL_FAILOVER_STATE_WAIT_START);
+        ri->flags |= SRI_FORCE_FAILOVER;
+        addReply(c,shared.ok);
      } else if (!strcasecmp(c->argv[1]->ptr,"pending-scripts")) {
          /* SENTINEL PENDING-SCRIPTS */
  
@@ -2006,6 +2048,65 @@ numargserr:
                            (char*)c->argv[1]->ptr);
  }
  
+/* INFO [section], Sentinel flavour.
+ *
+ * With no argument, or with "default", the "server" and "sentinel" sections
+ * are both emitted, separated by an empty line. With one argument only the
+ * section it names is emitted (names are compared case-insensitively; an
+ * unrecognized name produces an empty payload). More than one argument is
+ * answered with shared.syntaxerr.
+ *
+ * The reply is a single bulk string: "$<len>\r\n", the payload, and a
+ * trailing CRLF. */
+void sentinelInfoCommand(redisClient *c) {
+    char *section = c->argc == 2 ? c->argv[1]->ptr : "default";
+    int defsections = !strcasecmp(section,"default");
+    int sections = 0;
+    sds info;
+
+    /* Check the arity before allocating the reply buffer, so that this
+     * early return has nothing to release. */
+    if (c->argc > 2) {
+        addReply(c,shared.syntaxerr);
+        return;
+    }
+    info = sdsempty();
+
+    if (!strcasecmp(section,"server") || defsections) {
+        sds serversection = genRedisInfoString("server");
+
+        if (sections++) info = sdscat(info,"\r\n");
+        info = sdscatlen(info,serversection,sdslen(serversection));
+        sdsfree(serversection);
+    }
+
+    if (!strcasecmp(section,"sentinel") || defsections) {
+        dictIterator *di;
+        dictEntry *de;
+        int master_id = 0;
+
+        /* Empty line between sections when "server" was emitted as well. */
+        if (sections++) info = sdscat(info,"\r\n");
+        info = sdscatprintf(info,
+            "# Sentinel\r\n"
+            "sentinel_masters:%lu\r\n"
+            "sentinel_tilt:%d\r\n"
+            "sentinel_running_scripts:%d\r\n"
+            "sentinel_scripts_queue_length:%ld\r\n",
+            dictSize(sentinel.masters),
+            sentinel.tilt,
+            sentinel.running_scripts,
+            listLength(sentinel.scripts_queue));
+
+        /* One "master<N>:" line for every entry of sentinel.masters. */
+        di = dictGetIterator(sentinel.masters);
+        while((de = dictNext(di)) != NULL) {
+            sentinelRedisInstance *ri = dictGetVal(de);
+            char *status = "ok";
+
+            /* O_DOWN wins over S_DOWN when both flags are set. */
+            if (ri->flags & SRI_O_DOWN) status = "odown";
+            else if (ri->flags & SRI_S_DOWN) status = "sdown";
+            /* NOTE(review): the +1 on the sentinels count presumably
+             * accounts for this Sentinel itself -- confirm. */
+            info = sdscatprintf(info,
+                "master%d:name=%s,status=%s,address=%s:%d,"
+                "slaves=%lu,sentinels=%lu\r\n",
+                master_id++, ri->name, status,
+                ri->addr->ip, ri->addr->port,
+                dictSize(ri->slaves),
+                dictSize(ri->sentinels)+1);
+        }
+        dictReleaseIterator(di);
+    }
+
+    /* Frame the accumulated text as a bulk reply: length header, payload,
+     * terminating CRLF. */
+    addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
+        (unsigned long)sdslen(info)));
+    addReplySds(c,info);
+    addReply(c,shared.crlf);
+}
+
  /* ===================== SENTINEL availability checks ======================= */
  
  /* Is this instance down from our point of view? */
@@ -2049,7 +2150,7 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
          /* Is subjectively up */
          if (ri->flags & SRI_S_DOWN) {
              sentinelEvent(REDIS_WARNING,"-sdown",ri,"%@");
-            ri->flags &= ~SRI_S_DOWN;
+            ri->flags &= ~(SRI_S_DOWN|SRI_SCRIPT_KILL_SENT);
          }
      }
  }
@@ -2303,6 +2404,35 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) {
      return winner;
  }
  
+/* Setup the master state to start a failover as a leader.
+ *
+ * 'master' must be flagged SRI_MASTER and 'state' must be one of the two
+ * values below, otherwise the assertions at the top of the function fail.
+ *
+ * State can be either:
+ *
+ * SENTINEL_FAILOVER_STATE_WAIT_START: starts a failover from scratch.
+ * SENTINEL_FAILOVER_STATE_RECONF_SLAVES: take over a failover that was
+ * started by another Sentinel.
+ *
+ * In both cases the function stores 'state' in master->failover_state, sets
+ * SRI_FAILOVER_IN_PROGRESS and SRI_I_AM_THE_LEADER on the master, emits the
+ * "+failover-triggered" event and refreshes failover_state_change_time.
+ * It does not itself check that a slave is available for promotion.
+ */
+void sentinelStartFailover(sentinelRedisInstance *master, int state) {
+    redisAssert(master->flags & SRI_MASTER);
+    redisAssert(state == SENTINEL_FAILOVER_STATE_WAIT_START ||
+                state == SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
+
+    master->failover_state = state;
+    master->flags |= SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER;
+    sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@");
+
+    /* Pick a random delay if it's a fresh failover (WAIT_START), and not
+     * a recovery of a failover started by another sentinel.
+     *
+     * The start time is now + SENTINEL_FAILOVER_FIXED_DELAY + a random
+     * amount below SENTINEL_FAILOVER_MAX_RANDOM_DELAY. The delay reported
+     * in the event is recomputed against a second mstime() reading, so it
+     * can be slightly smaller than the delay that was actually chosen. */
+    if (master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) {
+        master->failover_start_time = mstime() +
+            SENTINEL_FAILOVER_FIXED_DELAY +
+            (rand() % SENTINEL_FAILOVER_MAX_RANDOM_DELAY);
+        sentinelEvent(REDIS_WARNING,"+failover-state-wait-start",master,
+            "%@ #starting in %lld milliseconds",
+            master->failover_start_time-mstime());
+    }
+    master->failover_state_change_time = mstime();
+}
+
  /* This function checks if there are the conditions to start the failover,
   * that is:
   *
@@ -2313,7 +2443,7 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) {
   * If the conditions are met we flag the master as SRI_FAILOVER_IN_PROGRESS
   * and SRI_I_AM_THE_LEADER.
   */
-void sentinelStartFailover(sentinelRedisInstance *master) {
+void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) {
      char *leader;
      int isleader;
  
@@ -2353,7 +2483,7 @@ void sentinelStartFailover(sentinelRedisInstance *master) {
              /* We have already an elected slave if we are in
               * FAILOVER_IN_PROGRESS state, that is, the slave that we
               * observed turning into a master. */
-            master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES;
+            sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
              /* As an observer we flagged all the slaves as RECONF_SENT but
               * now we are in charge of actually sending the reconfiguration
               * command so let's clear this flag for all the instances. */
@@ -2366,23 +2496,8 @@ void sentinelStartFailover(sentinelRedisInstance *master) {
           * Do we have a slave to promote? Otherwise don't start a failover
           * at all. */
          if (sentinelSelectSlave(master) == NULL) return;
-        master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START;
-    }
-
-    master->flags |= SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER;
-    sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@");
-
-    /* Pick a random delay if it's a fresh failover (WAIT_START), and not
-     * a recovery of a failover started by another sentinel. */
-    if (master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) {
-        master->failover_start_time = mstime() +
-            SENTINEL_FAILOVER_FIXED_DELAY +
-            (rand() % SENTINEL_FAILOVER_MAX_RANDOM_DELAY);
-        sentinelEvent(REDIS_WARNING,"+failover-state-wait-start",master,
-            "%@ #starting in %lld milliseconds",
-            master->failover_start_time-mstime());
+        sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_WAIT_START);
      }
-    master->failover_state_change_time = mstime();
  }
  
  /* Select a suitable slave to promote. The current algorithm only uses
@@ -2393,6 +2508,7 @@ void sentinelStartFailover(sentinelRedisInstance *master) {
   * 3) info_refresh more recent than SENTINEL_INFO_VALIDITY_TIME.
   * 4) master_link_down_time no more than:
   *     (now - master->s_down_since_time) + (master->down_after_period * 10).
+ * 5) Slave priority can't be zero, otherwise the slave is discarded.
   *
   * Among all the slaves matching the above conditions we select the slave
   * with lower slave_priority. If priority is the same we select the slave
@@ -2405,9 +2521,21 @@ void sentinelStartFailover(sentinelRedisInstance *master) {
  int compareSlavesForPromotion(const void *a, const void *b) {
      sentinelRedisInstance **sa = (sentinelRedisInstance **)a,
                            **sb = (sentinelRedisInstance **)b;
+    char *sa_runid, *sb_runid;
+
      if ((*sa)->slave_priority != (*sb)->slave_priority)
          return (*sa)->slave_priority - (*sb)->slave_priority;
-    return strcasecmp((*sa)->runid,(*sb)->runid);
+
+    /* When priorities differ, the check above orders the slave with the
+     * numerically lower slave_priority first. If priority is the same,
+     * select the slave that has the lexicographically smaller runid
+     * (compared case-insensitively). Note that we try to handle runid
+     * == NULL as there are old Redis versions that don't publish runid in
+     * INFO. A NULL runid is considered bigger than any other runid, and
+     * two NULL runids compare as equal. */
+    sa_runid = (*sa)->runid;
+    sb_runid = (*sb)->runid;
+    if (sa_runid == NULL && sb_runid == NULL) return 0;
+    else if (sa_runid == NULL) return 1;  /* a > b */
+    else if (sb_runid == NULL) return -1; /* a < b */
+    return strcasecmp(sa_runid, sb_runid);
  }
  
  sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
@@ -2417,10 +2545,11 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
      int instances = 0;
      dictIterator *di;
      dictEntry *de;
-    mstime_t max_master_down_time;
+    mstime_t max_master_down_time = 0;
  
-    max_master_down_time = (mstime() - master->s_down_since_time) +
-                           (master->down_after_period * 10);
+    if (master->flags & SRI_S_DOWN)
+        max_master_down_time += mstime() - master->s_down_since_time;
+    max_master_down_time += master->down_after_period * 10;
  
      di = dictGetIterator(master->slaves);
      while((de = dictNext(di)) != NULL) {
@@ -2429,6 +2558,13 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
  
          if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_DISCONNECTED)) continue;
          if (slave->last_avail_time < info_validity_time) continue;
+        if (slave->slave_priority == 0) continue;
+
+        /* If the master is in SDOWN state we get INFO for slaves every second.
+         * Otherwise we get it with the usual period so we need to account for
+         * a larger delay. */
+        if ((master->flags & SRI_S_DOWN) == 0)
+            info_validity_time -= SENTINEL_INFO_PERIOD;
          if (slave->info_refresh < info_validity_time) continue;
          if (slave->master_link_down_time > max_master_down_time) continue;
          instance[instances++] = slave;
@@ -2454,7 +2590,7 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
       * can go to waitstart if the slave is back rechable a few milliseconds
       * before the master is. In that case when the master is back online
       * we cancel the failover. */
-    if ((ri->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0) {
+    if ((ri->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_FORCE_FAILOVER)) == 0) {
          sentinelEvent(REDIS_WARNING,"-failover-abort-master-is-back",
              ri,"%@");
          sentinelAbortFailover(ri);
@@ -2747,7 +2883,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) {
  
      sentinel_role = (ri->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER :
                                                          SENTINEL_OBSERVER;
-    ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER);
+    ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER|SRI_FORCE_FAILOVER);
      ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
      ri->failover_state_change_time = mstime();
      if (ri->promoted_slave) {
@@ -2821,7 +2957,7 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
      /* Only masters */
      if (ri->flags & SRI_MASTER) {
          sentinelCheckObjectivelyDown(ri);
-        sentinelStartFailover(ri);
+        sentinelStartFailoverIfNeeded(ri);
          sentinelFailoverStateMachine(ri);
          sentinelAbortFailoverIfNeeded(ri);
      }