void clusterUpdateState(void);
int clusterNodeGetSlotBit(clusterNode *n, int slot);
sds clusterGenNodesDescription(void);
+clusterNode *clusterLookupNode(char *name);
+int clusterNodeAddSlave(clusterNode *master, clusterNode *slave);
+int clusterAddSlot(clusterNode *n, int slot);
/* -----------------------------------------------------------------------------
* Initialization
+/* Load the cluster configuration stored in 'filename' and rebuild the node
+ * table from it.
+ *
+ * Every line describes one node. The line is split with sdssplitargs() and
+ * the fields are read as: node name, ip:port, comma separated flags, master
+ * name (or a field starting with '-'), ping-sent marker, pong-received
+ * marker, a seventh field that is not parsed here, then zero or more hash
+ * slots written as "N" or "START-STOP".
+ *
+ * Returns REDIS_ERR if the file cannot be opened, REDIS_OK once it has been
+ * loaded. A malformed line is fatal: the error is logged and the process
+ * exits. */
int clusterLoadConfig(char *filename) {
     FILE *fp = fopen(filename,"r");
+    char *line;
+    int maxline, j;
-    return REDIS_ERR;
     if (fp == NULL) return REDIS_ERR;
+
+    /* Parse the file. Note that single lines of the cluster config file can
+     * be really long as they include all the hash slots of the node.
+     * This means in the worst possible case REDIS_CLUSTER_SLOTS/2 integers.
+     * To simplify we allocate 1024+REDIS_CLUSTER_SLOTS*16 bytes per line. */
+    maxline = 1024+REDIS_CLUSTER_SLOTS*16;
+    line = zmalloc(maxline);
+    while(fgets(line,maxline,fp) != NULL) {
+        int argc;
+        sds *argv = sdssplitargs(line,&argc);
+        clusterNode *n, *master;
+        char *p, *s;
+
+        /* argv[0]..argv[5] are dereferenced unconditionally below, so a
+         * missing vector or a line with fewer than the seven leading
+         * fields must be rejected before any of them is touched.
+         * NOTE(review): presumably sdssplitargs() yields NULL for blank or
+         * unparsable lines -- confirm against its implementation. */
+        if (argv == NULL || argc < 7) goto fmterr;
+
+        /* Create this node if it does not exist */
+        n = clusterLookupNode(argv[0]);
+        if (!n) {
+            n = createClusterNode(argv[0],0);
+            clusterAddNode(n);
+        }
+        /* Address and port */
+        if ((p = strchr(argv[1],':')) == NULL) goto fmterr;
+        *p = '\0';
+        /* NOTE(review): the copy is bounded only by the length of the text
+         * in the file; confirm n->ip is large enough for any address that
+         * can appear here. */
+        memcpy(n->ip,argv[1],strlen(argv[1])+1);
+        n->port = atoi(p+1);
+
+        /* Parse flags */
+        p = s = argv[2];
+        while(p) {
+            p = strchr(s,',');
+            if (p) *p = '\0';
+            if (!strcasecmp(s,"myself")) {
+                redisAssert(server.cluster.myself == NULL);
+                server.cluster.myself = n;
+                n->flags |= REDIS_NODE_MYSELF;
+            } else if (!strcasecmp(s,"master")) {
+                n->flags |= REDIS_NODE_MASTER;
+            } else if (!strcasecmp(s,"slave")) {
+                n->flags |= REDIS_NODE_SLAVE;
+            } else if (!strcasecmp(s,"fail?")) {
+                n->flags |= REDIS_NODE_PFAIL;
+            } else if (!strcasecmp(s,"fail")) {
+                n->flags |= REDIS_NODE_FAIL;
+            } else if (!strcasecmp(s,"handshake")) {
+                n->flags |= REDIS_NODE_HANDSHAKE;
+            } else if (!strcasecmp(s,"noaddr")) {
+                n->flags |= REDIS_NODE_NOADDR;
+            } else if (!strcasecmp(s,"noflags")) {
+                /* nothing to do */
+            } else {
+                redisPanic("Unknown flag in redis cluster config file");
+            }
+            if (p) s = p+1;
+        }
+
+        /* Get master if any. Set the master and populate master's
+         * slave list. */
+        if (argv[3][0] != '-') {
+            master = clusterLookupNode(argv[3]);
+            if (!master) {
+                master = createClusterNode(argv[3],0);
+                clusterAddNode(master);
+            }
+            n->slaveof = master;
+            clusterNodeAddSlave(master,n);
+        }
+
+        /* Set ping sent / pong received timestamps. The stored values are
+         * only tested for being non-zero: the fields are set to the
+         * current time, not to the value found in the file. */
+        if (atoi(argv[4])) n->ping_sent = time(NULL);
+        if (atoi(argv[5])) n->pong_received = time(NULL);
+
+        /* Populate hash slots served by this instance. */
+        for (j = 7; j < argc; j++) {
+            int start, stop;
+
+            if ((p = strchr(argv[j],'-')) != NULL) {
+                *p = '\0';
+                start = atoi(argv[j]);
+                stop = atoi(p+1);
+            } else {
+                start = stop = atoi(argv[j]);
+            }
+            /* Slot numbers index server.cluster.slots inside
+             * clusterAddSlot(): never hand it a value outside the table. */
+            if (start < 0 || stop >= REDIS_CLUSTER_SLOTS) goto fmterr;
+            while(start <= stop) clusterAddSlot(n, start++);
+        }
+
+        sdssplitargs_free(argv,argc);
+    }
+    zfree(line);
     fclose(fp);
+    /* Config sanity check */
+    redisAssert(server.cluster.myself != NULL);
     redisLog(REDIS_NOTICE,"Node configuration loaded, I'm %.40s",
         server.cluster.myself->name);
+    clusterUpdateState();
     return REDIS_OK;
fmterr:
+    /* 'line' and 'argv' are not released here: the process exits below. */
-    redisLog(REDIS_WARNING,"Unrecovarable error: corrupted redis-cluster.conf file.");
+    redisLog(REDIS_WARNING,"Unrecovarable error: corrupted cluster config file.");
     fclose(fp);
     exit(1);
}
*
* This function writes the node config and returns 0, on error -1
* is returned. */
-int clusterSaveConfig(char *filename) {
+int clusterSaveConfig(void) {
sds ci = clusterGenNodesDescription();
int fd;
- if ((fd = open(filename,O_WRONLY|O_CREAT,0644)) == -1) goto err;
+ if ((fd = open(server.cluster.configfile,O_WRONLY|O_CREAT|O_TRUNC,0644))
+ == -1) goto err;
if (write(fd,ci,sdslen(ci)) != (ssize_t)sdslen(ci)) goto err;
close(fd);
sdsfree(ci);
return -1;
}
+/* Persist the cluster configuration through clusterSaveConfig(). When that
+ * call reports failure (-1) a warning is logged and the process exits with
+ * status 1; otherwise the function simply returns. */
+void clusterSaveConfigOrDie(void) {
+    if (clusterSaveConfig() != -1) return;
+
+    redisLog(REDIS_WARNING,"Fatal: can't update cluster config file.");
+    exit(1);
+}
+
void clusterInit(void) {
int saveconf = 0;
- server.cluster.myself = createClusterNode(NULL,REDIS_NODE_MYSELF);
+ server.cluster.myself = NULL;
server.cluster.state = REDIS_CLUSTER_FAIL;
server.cluster.nodes = dictCreate(&clusterNodesDictType,NULL);
server.cluster.node_timeout = 15;
sizeof(server.cluster.importing_slots_from));
memset(server.cluster.slots,0,
sizeof(server.cluster.slots));
- if (clusterLoadConfig("redis-cluster.conf") == REDIS_ERR) {
+ if (clusterLoadConfig(server.cluster.configfile) == REDIS_ERR) {
/* No configuration found. We will just use the random name provided
* by the createClusterNode() function. */
+ server.cluster.myself = createClusterNode(NULL,REDIS_NODE_MYSELF);
redisLog(REDIS_NOTICE,"No cluster configuration found, I'm %.40s",
server.cluster.myself->name);
+ clusterAddNode(server.cluster.myself);
saveconf = 1;
}
- clusterAddNode(server.cluster.myself);
- if (saveconf) {
- if (clusterSaveConfig("redis-cluster.conf") == -1) {
- redisLog(REDIS_WARNING,"Fatal: can't update cluster config file.");
- exit(1);
- }
- }
+ if (saveconf) clusterSaveConfigOrDie();
/* We need a listening TCP port for our cluster messaging needs */
server.cfd = anetTcpServer(server.neterr,
server.port+REDIS_CLUSTER_PORT_INCR, server.bindaddr);
/* Broadcast the failing node name to everybody */
clusterSendFail(node->name);
clusterUpdateState();
+ clusterSaveConfigOrDie();
}
} else {
/* If it's not in NOADDR state and we don't have it, we
sender = clusterLookupNode(hdr->sender);
if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) {
+ int update_config = 0;
redisLog(REDIS_DEBUG,"Ping packet received: %p", link->node);
/* Add this node if it is new for us and the msg type is MEET.
nodeIp2String(node->ip,link);
node->port = ntohs(hdr->port);
clusterAddNode(node);
+ update_config = 1;
}
/* Get info from the gossip section */
/* Anyway reply with a PONG */
clusterSendPing(link,CLUSTERMSG_TYPE_PONG);
+
+ /* Update config if needed */
+ if (update_config) clusterSaveConfigOrDie();
} else if (type == CLUSTERMSG_TYPE_PONG) {
- int update = 0;
+ int update_state = 0;
+ int update_config = 0;
redisLog(REDIS_DEBUG,"Pong packet received: %p", link->node);
if (link->node) {
redisLog(REDIS_DEBUG,"Handshake with node %.40s completed.",
link->node->name);
link->node->flags &= ~REDIS_NODE_HANDSHAKE;
+ update_config = 1;
} else if (memcmp(link->node->name,hdr->sender,
REDIS_CLUSTER_NAMELEN) != 0)
{
redisLog(REDIS_DEBUG,"PONG contains mismatching sender ID");
link->node->flags |= REDIS_NODE_NOADDR;
freeClusterLink(link);
+ update_config = 1;
/* FIXME: remove this node if we already have it.
*
* If we already have it but the IP is different, use
server.cluster.slots[j]->flags & REDIS_NODE_FAIL)
{
server.cluster.slots[j] = sender;
- update = 1;
+ update_state = update_config = 1;
}
}
}
clusterProcessGossipSection(hdr,link);
/* Update the cluster state if needed */
- if (update) clusterUpdateState();
+ if (update_state) clusterUpdateState();
+ if (update_config) clusterSaveConfigOrDie();
} else if (type == CLUSTERMSG_TYPE_FAIL && sender) {
clusterNode *failing;
failing = clusterLookupNode(hdr->data.fail.about.nodename);
- if (failing && !(failing->flags & REDIS_NODE_FAIL)) {
+ if (failing && !(failing->flags & (REDIS_NODE_FAIL|REDIS_NODE_MYSELF)))
+ {
redisLog(REDIS_NOTICE,
"FAIL message received from %.40s about %.40s",
hdr->sender, hdr->data.fail.about.nodename);
failing->flags |= REDIS_NODE_FAIL;
failing->flags &= ~REDIS_NODE_PFAIL;
clusterUpdateState();
+ clusterSaveConfigOrDie();
}
} else {
redisLog(REDIS_NOTICE,"Received unknown packet type: %d", type);
* normal PING packets. */
node->flags &= ~REDIS_NODE_MEET;
- redisLog(REDIS_NOTICE,"Connecting with Node %.40s at %s:%d\n", node->name, node->ip, node->port+REDIS_CLUSTER_PORT_INCR);
+ redisLog(REDIS_NOTICE,"Connecting with Node %.40s at %s:%d", node->name, node->ip, node->port+REDIS_CLUSTER_PORT_INCR);
}
}
dictReleaseIterator(di);
int delay;
if (node->flags &
- (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE|
- REDIS_NODE_FAIL)) continue;
+ (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE))
+ continue;
/* Check only if we already sent a ping and did not receive
* a reply yet. */
if (node->ping_sent == 0 ||
node->ping_sent <= node->pong_received) continue;
delay = time(NULL) - node->pong_received;
- if (node->flags & REDIS_NODE_PFAIL) {
+ if (delay < server.cluster.node_timeout) {
/* The PFAIL condition can be reversed without external
* help if it is not transitive (that is, if it does not
- * turn into a FAIL state). */
- if (delay < server.cluster.node_timeout)
+ * turn into a FAIL state).
+ *
+ * The FAIL condition is also reversible if there are no slaves
+ * for this host, so no slave election should be in progress.
+ *
+ * TODO: consider all the implications of resurrecting a
+ * FAIL node. */
+ if (node->flags & REDIS_NODE_PFAIL) {
node->flags &= ~REDIS_NODE_PFAIL;
+ } else if (node->flags & REDIS_NODE_FAIL && !node->numslaves) {
+ node->flags &= ~REDIS_NODE_FAIL;
+ clusterUpdateState();
+ }
} else {
- if (delay >= server.cluster.node_timeout) {
+ /* Timeout reached. Set the node as possibly failing if it is
+ * not already in this state. */
+ if (!(node->flags & (REDIS_NODE_PFAIL|REDIS_NODE_FAIL))) {
redisLog(REDIS_DEBUG,"*** NODE %.40s possibly failing",
node->name);
node->flags |= REDIS_NODE_PFAIL;
* an error and REDIS_ERR is returned. */
int clusterAddSlot(clusterNode *n, int slot) { /* Register 'slot' as served by node 'n'. */
redisAssert(clusterNodeSetSlotBit(n,slot) == 0); /* Sets the slot bit on n; any non-zero result aborts. Presumably 0 means the bit was previously clear -- confirm. */
- server.cluster.slots[slot] = server.cluster.myself;
- printf("SLOT %d added to %.40s\n", slot, n->name);
+ server.cluster.slots[slot] = n; /* The slot table entry now points at n itself (the removed line always stored myself). */
return REDIS_OK; /* No path in this body returns REDIS_ERR. */
}
sds ci = sdsempty();
dictIterator *di;
dictEntry *de;
+ int j, start;
di = dictGetIterator(server.cluster.nodes);
while((de = dictNext(di)) != NULL) {
ci = sdscatprintf(ci,"- ");
/* Latency from the POV of this node, link status */
- ci = sdscatprintf(ci,"%ld %ld %s\n",
+ ci = sdscatprintf(ci,"%ld %ld %s",
(long) node->ping_sent,
(long) node->pong_received,
node->link ? "connected" : "disconnected");
+
+ /* Slots served by this instance */
+ start = -1;
+ for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
+ int bit;
+
+ if ((bit = clusterNodeGetSlotBit(node,j)) != 0) {
+ if (start == -1) start = j;
+ }
+ if (start != -1 && (!bit || j == REDIS_CLUSTER_SLOTS-1)) {
+ if (j == REDIS_CLUSTER_SLOTS-1) j++;
+
+ if (start == j-1) {
+ ci = sdscatprintf(ci," %d",start);
+ } else {
+ ci = sdscatprintf(ci," %d-%d",start,j-1);
+ }
+ start = -1;
+ }
+ }
+ ci = sdscatlen(ci,"\n",1);
}
dictReleaseIterator(di);
return ci;
}
zfree(slots);
clusterUpdateState();
+ clusterSaveConfigOrDie();
addReply(c,shared.ok);
} else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) {
char *statestr[] = {"ok","fail","needhelp"};
"cluster_slots_ok:%d\r\n"
"cluster_slots_pfail:%d\r\n"
"cluster_slots_fail:%d\r\n"
+ "cluster_known_nodes:%lu\r\n"
, statestr[server.cluster.state],
slots_assigned,
slots_ok,
slots_pfail,
- slots_fail
+ slots_fail,
+ dictSize(server.cluster.nodes)
);
addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
(unsigned long)sdslen(info)));
/* Finally create the object from the serialized dump and
* store it at the specified key. */
- o = rdbLoadObject(data[0],fp);
- if (o == NULL) {
+ if ((data[0] > 4 && data[0] < 9) ||
+ data[0] > 11 ||
+ (o = rdbLoadObject(data[0],fp)) == NULL)
+ {
addReplyError(c,"Bad data format.");
fclose(fp);
return;
strerror(errno));
fclose(fp);
close(fd);
+ return;
file_rd_err:
redisLog(REDIS_WARNING,"Can't read from tmp file for MIGRATE: %s",
strerror(errno));
fclose(fp);
close(fd);
+ return;
socket_wr_err:
redisLog(REDIS_NOTICE,"Can't write to target node for MIGRATE: %s",
strerror(errno));
fclose(fp);
close(fd);
+ return;
socket_rd_err:
redisLog(REDIS_NOTICE,"Can't read from target node for MIGRATE: %s",
strerror(errno));
fclose(fp);
close(fd);
+ return;
+}
+
+/* DUMP keyname
+ * DUMP is actually not used by Redis Cluster but it is the obvious
+ * complement of RESTORE and can be useful for different applications.
+ *
+ * Replies with a null bulk when the key does not exist. Otherwise the value
+ * is serialized with rdbSaveObject() into a temporary file, read back into
+ * memory, and sent as a bulk reply made of one type ID byte followed by the
+ * serialized payload. Temporary file failures are logged and reported to
+ * the client as an error reply. */
+void dumpCommand(redisClient *c) {
+    char buf[64];
+    FILE *fp;
+    robj *o, *dumpobj;
+    sds dump = NULL;
+    off_t payload_len;
+    unsigned int type;
+    int saved_errno; /* errno of the failed call, kept across logging */
+
+    /* Check if the key is here. */
+    if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+
+    /* Create temp file */
+    snprintf(buf,sizeof(buf),"redis-dump-%d.tmp",getpid());
+    fp = fopen(buf,"w+");
+    if (!fp) {
+        saved_errno = errno;
+        redisLog(REDIS_WARNING,"Can't open tmp file for DUMP: %s",
+            strerror(saved_errno));
+        addReplyErrorFormat(c,"DUMP failed, tmp file creation error: %s.",
+            strerror(saved_errno));
+        return;
+    }
+    /* Remove the name right away: from here on the data is only reached
+     * through fp. */
+    unlink(buf);
+
+    /* Dump the serialized object and read it back in memory.
+     * We prefix it with a one byte containing the type ID.
+     * This is the serialization format understood by RESTORE. */
+    if (rdbSaveObject(fp,o) == -1) goto file_wr_err;
+    /* ftello() reports failure as -1. Without this check the +1 below
+     * would allocate an empty buffer while fread() is asked for a huge
+     * size. */
+    if ((payload_len = ftello(fp)) == -1) goto file_rd_err;
+    if (fseeko(fp,0,SEEK_SET) == -1) goto file_rd_err;
+    dump = sdsnewlen(NULL,payload_len+1);
+    if (payload_len && fread(dump+1,payload_len,1,fp) != 1) goto file_rd_err;
+    fclose(fp);
+
+    /* The type byte also tells the compact encodings apart; anything else
+     * is stored under the plain object type. */
+    type = o->type;
+    if (type == REDIS_LIST && o->encoding == REDIS_ENCODING_ZIPLIST)
+        type = REDIS_LIST_ZIPLIST;
+    else if (type == REDIS_HASH && o->encoding == REDIS_ENCODING_ZIPMAP)
+        type = REDIS_HASH_ZIPMAP;
+    else if (type == REDIS_SET && o->encoding == REDIS_ENCODING_INTSET)
+        type = REDIS_SET_INTSET;
+    dump[0] = type;
+
+    /* Transfer to the client */
+    dumpobj = createObject(REDIS_STRING,dump);
+    addReplyBulk(c,dumpobj);
+    decrRefCount(dumpobj);
+    return;
+
+file_wr_err:
+    saved_errno = errno;
+    redisLog(REDIS_WARNING,"Can't write on tmp file for DUMP: %s",
+        strerror(saved_errno));
+    addReplyErrorFormat(c,"DUMP failed, tmp file write error: %s.",
+        strerror(saved_errno));
+    sdsfree(dump);
+    fclose(fp);
+    return;
+
+file_rd_err:
+    saved_errno = errno;
+    redisLog(REDIS_WARNING,"Can't read from tmp file for DUMP: %s",
+        strerror(saved_errno));
+    addReplyErrorFormat(c,"DUMP failed, tmp file read error: %s.",
+        strerror(saved_errno));
+    sdsfree(dump);
+    fclose(fp);
+    return;
}
/* -----------------------------------------------------------------------------