X-Git-Url: https://git.saurik.com/redis.git/blobdiff_plain/e01a415d3727f7972f2da46b662ccdfac95f839a..374eed7d2a8de9cc74ce5565b201645cb4ef5f29:/src/sentinel.c diff --git a/src/sentinel.c b/src/sentinel.c index 39a278fe..54c675b5 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -110,6 +110,10 @@ typedef struct sentinelAddr { #define SENTINEL_MASTER_LINK_STATUS_UP 0 #define SENTINEL_MASTER_LINK_STATUS_DOWN 1 +/* Generic flags that can be used with different functions. */ +#define SENTINEL_NO_FLAGS 0 +#define SENTINEL_GENERATE_EVENT 1 + typedef struct sentinelRedisInstance { int flags; /* See SRI_... defines */ char *name; /* Master name from the point of view of this sentinel. */ @@ -284,7 +288,9 @@ char *sentinelGetSubjectiveLeader(sentinelRedisInstance *master); char *sentinelGetObjectiveLeader(sentinelRedisInstance *master); int yesnotoi(char *s); void sentinelDisconnectInstanceFromContext(const redisAsyncContext *c); +void sentinelKillLink(sentinelRedisInstance *ri, redisAsyncContext *c); const char *sentinelRedisInstanceTypeStr(sentinelRedisInstance *ri); +void sentinelAbortFailover(sentinelRedisInstance *ri); /* ========================= Dictionary types =============================== */ @@ -595,18 +601,9 @@ void releaseSentinelRedisInstance(sentinelRedisInstance *ri) { dictRelease(ri->sentinels); dictRelease(ri->slaves); - /* Release hiredis connections. Note that redisAsyncFree() will call - * the disconnection callback. */ - if (ri->cc) { - ri->cc->data = NULL; - redisAsyncFree(ri->cc); - ri->cc = NULL; - } - if (ri->pc) { - ri->pc->data = NULL; - redisAsyncFree(ri->pc); - ri->pc = NULL; - } + /* Release hiredis connections. */ + if (ri->cc) sentinelKillLink(ri,ri->cc); + if (ri->pc) sentinelKillLink(ri,ri->pc); /* Free other resources. */ sdsfree(ri->name); @@ -761,14 +758,14 @@ void sentinelDelFlagsToDictOfRedisInstances(dict *instances, int flags) { * 5) In the process of doing this undo the failover if in progress. 
* 6) Disconnect the connections with the master (will reconnect automatically). */ -void sentinelResetMaster(sentinelRedisInstance *ri) { +void sentinelResetMaster(sentinelRedisInstance *ri, int flags) { redisAssert(ri->flags & SRI_MASTER); dictRelease(ri->slaves); dictRelease(ri->sentinels); ri->slaves = dictCreate(&instancesDictType,NULL); ri->sentinels = dictCreate(&instancesDictType,NULL); - if (ri->cc) redisAsyncFree(ri->cc); - if (ri->pc) redisAsyncFree(ri->pc); + if (ri->cc) sentinelKillLink(ri,ri->cc); + if (ri->pc) sentinelKillLink(ri,ri->pc); ri->flags &= SRI_MASTER|SRI_CAN_FAILOVER|SRI_DISCONNECTED; if (ri->leader) { sdsfree(ri->leader); @@ -778,12 +775,19 @@ void sentinelResetMaster(sentinelRedisInstance *ri) { ri->failover_state_change_time = 0; ri->failover_start_time = 0; ri->promoted_slave = NULL; - sentinelEvent(REDIS_WARNING,"+reset-master",ri,"%@"); + sdsfree(ri->runid); + sdsfree(ri->slave_master_host); + ri->runid = NULL; + ri->slave_master_host = NULL; + ri->last_avail_time = mstime(); + ri->last_pong_time = mstime(); + if (flags & SENTINEL_GENERATE_EVENT) + sentinelEvent(REDIS_WARNING,"+reset-master",ri,"%@"); } /* Call sentinelResetMaster() on every master with a name matching the specified * pattern. */ -int sentinelResetMastersByPattern(char *pattern) { +int sentinelResetMastersByPattern(char *pattern, int flags) { dictIterator *di; dictEntry *de; int reset = 0; @@ -794,7 +798,7 @@ int sentinelResetMastersByPattern(char *pattern) { if (ri->name) { if (stringmatch(pattern,ri->name,0)) { - sentinelResetMaster(ri); + sentinelResetMaster(ri,flags); reset++; } } @@ -803,6 +807,32 @@ int sentinelResetMastersByPattern(char *pattern) { return reset; } +/* Reset the specified master with sentinelResetMaster(), and also change + * the ip:port address, but take the name of the instance unmodified. + * + * This is used to handle the +switch-master and +redirect-to-master events. 
+ * + * The function returns REDIS_ERR if the address can't be resolved for some + * reason. Otherwise REDIS_OK is returned. + * + * TODO: make this reset so that original sentinels are re-added with + * same ip / port / runid. + */ + +int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, int port) { + sentinelAddr *oldaddr, *newaddr; + + newaddr = createSentinelAddr(ip,port); + if (newaddr == NULL) return REDIS_ERR; + sentinelResetMaster(master,SENTINEL_NO_FLAGS); + oldaddr = master->addr; + master->addr = newaddr; + /* Release the old address at the end so we are safe even if the function + * gets the master->addr->ip and master->addr->port as arguments. */ + releaseSentinelAddr(oldaddr); + return REDIS_OK; +} + /* ============================ Config handling ============================= */ char *sentinelHandleConfiguration(char **argv, int argc) { sentinelRedisInstance *ri; @@ -859,6 +889,18 @@ char *sentinelHandleConfiguration(char **argv, int argc) { /* ====================== hiredis connection handling ======================= */ +/* Completely disconnect an hiredis link from an instance. */ +void sentinelKillLink(sentinelRedisInstance *ri, redisAsyncContext *c) { + if (ri->cc == c) { + ri->cc = NULL; + ri->pending_commands = 0; + } + if (ri->pc == c) ri->pc = NULL; + c->data = NULL; + ri->flags |= SRI_DISCONNECTED; + redisAsyncFree(c); +} + /* This function takes an hiredis context that is in an error condition * and make sure to mark the instance as disconnected performing the * cleanup needed. 
@@ -909,8 +951,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { if (ri->cc->err) { sentinelEvent(REDIS_DEBUG,"-cmd-link-reconnection",ri,"%@ #%s", ri->cc->errstr); - redisAsyncFree(ri->cc); - ri->cc = NULL; + sentinelKillLink(ri,ri->cc); } else { ri->cc_conn_time = mstime(); ri->cc->data = ri; @@ -927,8 +968,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { if (ri->pc->err) { sentinelEvent(REDIS_DEBUG,"-pubsub-link-reconnection",ri,"%@ #%s", ri->pc->errstr); - redisAsyncFree(ri->pc); - ri->pc = NULL; + sentinelKillLink(ri,ri->pc); } else { int retval; @@ -946,8 +986,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { if (retval != REDIS_OK) { /* If we can't subscribe, the Pub/Sub connection is useless * and we can simply disconnect it and try again. */ - redisAsyncFree(ri->pc); - ri->pc = NULL; + sentinelKillLink(ri,ri->pc); return; } } @@ -1056,19 +1095,35 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { if (sentinel.tilt) return; + /* Act if a master turned into a slave. */ + if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) { + if (first_runid && ri->slave_master_host) { + /* If it is the first time we receive INFO from it, but it's + * a slave while it was configured as a master, we want to monitor + * its master instead. */ + sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri, + "%s %s %d %s %d", + ri->name, ri->addr->ip, ri->addr->port, + ri->slave_master_host, ri->slave_master_port); + sentinelResetMasterAndChangeAddress(ri,ri->slave_master_host, + ri->slave_master_port); + return; + } + } + /* Act if a slave turned into a master. 
*/ if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) { if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && (runid_changed || first_runid)) { - int retval; - /* If a slave turned into a master, but at the same time the * runid has changed, or it is simply the first time we see and * INFO output from this instance, this is a reboot with a wrong * configuration. * * Log the event and remove the slave. */ + int retval; + sentinelEvent(REDIS_WARNING,"-slave-restart-as-master",ri,"%@ #removing it from the attached slaves"); retval = dictDelete(ri->master->slaves,ri->name); redisAssert(retval == REDIS_OK); @@ -1145,8 +1200,8 @@ void sentinelInfoReplyCallback(redisAsyncContext *c, void *reply, void *privdata sentinelRedisInstance *ri = c->data; redisReply *r; - ri->pending_commands--; - if (!reply) return; + if (ri) ri->pending_commands--; + if (!reply || !ri) return; r = reply; if (r->type == REDIS_REPLY_STRING) { @@ -1159,15 +1214,15 @@ void sentinelInfoReplyCallback(redisAsyncContext *c, void *reply, void *privdata void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata) { sentinelRedisInstance *ri = c->data; - ri->pending_commands--; + if (ri) ri->pending_commands--; } void sentinelPingReplyCallback(redisAsyncContext *c, void *reply, void *privdata) { sentinelRedisInstance *ri = c->data; redisReply *r; - ri->pending_commands--; - if (!reply) return; + if (ri) ri->pending_commands--; + if (!reply || !ri) return; r = reply; if (r->type == REDIS_REPLY_STATUS || @@ -1190,8 +1245,8 @@ void sentinelPublishReplyCallback(redisAsyncContext *c, void *reply, void *privd sentinelRedisInstance *ri = c->data; redisReply *r; - ri->pending_commands--; - if (!reply) return; + if (ri) ri->pending_commands--; + if (!reply || !ri) return; r = reply; /* Only update pub_time if we actually published our message. 
Otherwise @@ -1206,7 +1261,7 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd sentinelRedisInstance *ri = c->data; redisReply *r; - if (!reply) return; + if (!reply || !ri) return; r = reply; /* Update the last activity in the pubsub channel. Note that since we @@ -1581,7 +1636,7 @@ void sentinelCommand(redisClient *c) { } else if (!strcasecmp(c->argv[1]->ptr,"reset")) { /* SENTINEL RESET */ if (c->argc != 3) goto numargserr; - addReplyLongLong(c,sentinelResetMastersByPattern(c->argv[2]->ptr)); + addReplyLongLong(c,sentinelResetMastersByPattern(c->argv[2]->ptr,SENTINEL_GENERATE_EVENT)); } else if (!strcasecmp(c->argv[1]->ptr,"get-master-addr-by-name")) { /* SENTINEL GET-MASTER-ADDR-BY-NAME */ sentinelRedisInstance *ri; @@ -1626,7 +1681,7 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { (mstime() - ri->cc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD && (mstime() - ri->last_pong_time) > (ri->down_after_period/2)) { - redisAsyncFree(ri->cc); /* will call the disconnection callback */ + sentinelKillLink(ri,ri->cc); } /* 2) Check if the pubsub link seems connected, was connected not less @@ -1638,7 +1693,7 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { (mstime() - ri->pc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD && (mstime() - ri->pc_last_activity) > (SENTINEL_PUBLISH_PERIOD*3)) { - redisAsyncFree(ri->pc); /* will call the disconnection callback */ + sentinelKillLink(ri,ri->pc); } /* Update the subjectively down flag. */ @@ -1700,8 +1755,8 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p sentinelRedisInstance *ri = c->data; redisReply *r; - ri->pending_commands--; - if (!reply) return; + if (ri) ri->pending_commands--; + if (!reply || !ri) return; r = reply; /* Ignore every error or unexpected reply. 
@@ -2056,13 +2111,8 @@ void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) { sentinelRedisInstance *slave = sentinelSelectSlave(ri); if (slave == NULL) { - sentinelEvent(REDIS_WARNING,"-no-good-slave",ri, - "%@ #retrying in %d seconds", - (SENTINEL_FAILOVER_FIXED_DELAY+ - SENTINEL_FAILOVER_MAX_RANDOM_DELAY)/1000); - ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START; - ri->failover_start_time = mstime() + SENTINEL_FAILOVER_FIXED_DELAY + - SENTINEL_FAILOVER_MAX_RANDOM_DELAY; + sentinelEvent(REDIS_WARNING,"-failover-abort-no-good-slave",ri,"%@"); + sentinelAbortFailover(ri); } else { sentinelEvent(REDIS_WARNING,"+selected-slave",slave,"%@"); slave->flags |= SRI_PROMOTED; @@ -2246,30 +2296,14 @@ void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) { * and re-add it with the same address to trigger a complete state * refresh. */ void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) { - sentinelRedisInstance *new, *ref = master->promoted_slave ? - master->promoted_slave : master; - int quorum = ref->quorum, parallel_syncs = ref->parallel_syncs; - char *name = sdsnew(master->name); - char *ip = sdsnew(ref->addr->ip), *oldip = sdsnew(master->addr->ip); - int port = ref->addr->port, oldport = master->addr->port; - int retval, oldflags = master->flags; - mstime_t old_down_after_period = master->down_after_period; - mstime_t old_failover_timeout = master->failover_timeout; - - retval = dictDelete(sentinel.masters,master->name); - redisAssert(retval == DICT_OK); - new = createSentinelRedisInstance(name,SRI_MASTER,ip,port,quorum,NULL); - redisAssert(new != NULL); - new->parallel_syncs = parallel_syncs; - new->flags |= (oldflags & SRI_CAN_FAILOVER); - new->down_after_period = old_down_after_period; - new->failover_timeout = old_failover_timeout; - /* TODO: ... set the scripts as well. 
*/ - sentinelEvent(REDIS_WARNING,"+switch-master",new,"%s %s %d %s %d", - name, oldip, oldport, ip, port); - sdsfree(name); - sdsfree(ip); - sdsfree(oldip); + sentinelRedisInstance *ref = master->promoted_slave ? + master->promoted_slave : master; + + sentinelEvent(REDIS_WARNING,"+switch-master",master,"%s %s %d %s %d", + master->name, master->addr->ip, master->addr->port, + ref->addr->ip, ref->addr->port); + + sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port); } void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { @@ -2299,40 +2333,38 @@ void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { } } -/* The following is called only for master instances and will abort the - * failover process if: - * - * 1) The failover is in progress. - * 2) We already promoted a slave. - * 3) The promoted slave is in extended SDOWN condition. +/* Abort a failover in progress with the following steps: + * 1) If this instance is the leader send a SLAVEOF command to all the already + * reconfigured slaves if any to configure them to replicate with the + * original master. + * 2) For both leaders and observers: clear the failover flags and state in + * the master instance. + * 3) If there is already a promoted slave and we are the leader, and this + * slave is not DISCONNECTED, try to reconfigure it to replicate + * back to the master as well, sending a best effort SLAVEOF command. */ -void sentinelAbortFailoverIfNeeded(sentinelRedisInstance *ri) { +void sentinelAbortFailover(sentinelRedisInstance *ri) { + char master_port[32]; dictIterator *di; dictEntry *de; - /* Failover is in progress? Do we have a promoted slave? */ - if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS) || !ri->promoted_slave) return; - - /* Is the promoted slave into an extended SDOWN state?
*/ - if (!(ri->promoted_slave->flags & SRI_S_DOWN) || - (mstime() - ri->promoted_slave->s_down_since_time) < - (ri->down_after_period * SENTINEL_EXTENDED_SDOWN_MULTIPLIER)) return; - - sentinelEvent(REDIS_WARNING,"-failover-abort-x-sdown",ri->promoted_slave,"%@"); + redisAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS); + ll2string(master_port,sizeof(master_port),ri->addr->port); /* Clear failover related flags from slaves. * Also if we are the leader make sure to send SLAVEOF commands to all the * already reconfigured slaves in order to turn them back into slaves of * the original master. */ - di = dictGetIterator(ri->slaves); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *slave = dictGetVal(de); - if (ri->flags & SRI_I_AM_THE_LEADER) { - char master_port[32]; + if ((ri->flags & SRI_I_AM_THE_LEADER) && + !(slave->flags & SRI_DISCONNECTED) && + (slave->flags & (SRI_PROMOTED|SRI_RECONF_SENT|SRI_RECONF_INPROG| + SRI_RECONF_DONE))) + { int retval; - ll2string(master_port,sizeof(master_port),ri->addr->port); retval = redisAsyncCommand(slave->cc, sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %s", ri->addr->ip, @@ -2347,8 +2379,30 @@ void sentinelAbortFailoverIfNeeded(sentinelRedisInstance *ri) { ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER); ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = mstime(); - ri->promoted_slave->flags &= ~SRI_PROMOTED; - ri->promoted_slave = NULL; + if (ri->promoted_slave) { + ri->promoted_slave->flags &= ~SRI_PROMOTED; + ri->promoted_slave = NULL; + } +} + +/* The following is called only for master instances and will abort the + * failover process if: + * + * 1) The failover is in progress. + * 2) We already promoted a slave. + * 3) The promoted slave is in extended SDOWN condition. + */ +void sentinelAbortFailoverIfNeeded(sentinelRedisInstance *ri) { + /* Failover is in progress? Do we have a promoted slave? 
*/ + if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS) || !ri->promoted_slave) return; + + /* Is the promoted slave into an extended SDOWN state? */ + if (!(ri->promoted_slave->flags & SRI_S_DOWN) || + (mstime() - ri->promoted_slave->s_down_since_time) < + (ri->down_after_period * SENTINEL_EXTENDED_SDOWN_MULTIPLIER)) return; + + sentinelEvent(REDIS_WARNING,"-failover-abort-x-sdown",ri->promoted_slave,"%@"); + sentinelAbortFailover(ri); } /* ======================== SENTINEL timer handler ==========================