void clusterUpdateState(void);
int clusterNodeGetSlotBit(clusterNode *n, int slot);
sds clusterGenNodesDescription(void);
+clusterNode *clusterLookupNode(char *name);
+int clusterNodeAddSlave(clusterNode *master, clusterNode *slave);
+int clusterAddSlot(clusterNode *n, int slot);
/* -----------------------------------------------------------------------------
* Initialization
char *charset = "0123456789abcdef";
int j;
- if (!fp) {
- redisLog(REDIS_WARNING,
- "Unrecovarable error: can't open /dev/urandom:%s" ,strerror(errno));
- exit(1);
+ if (fp == NULL || fread(p,REDIS_CLUSTER_NAMELEN,1,fp) == 0) {
+ for (j = 0; j < REDIS_CLUSTER_NAMELEN; j++)
+ p[j] = rand();
}
- fread(p,REDIS_CLUSTER_NAMELEN,1,fp);
for (j = 0; j < REDIS_CLUSTER_NAMELEN; j++)
p[j] = charset[p[j] & 0x0F];
fclose(fp);
int clusterLoadConfig(char *filename) {
FILE *fp = fopen(filename,"r");
+ char *line;
+ int maxline, j;
- return REDIS_ERR;
if (fp == NULL) return REDIS_ERR;
+
+ /* Parse the file. Note that single liens of the cluster config file can
+ * be really long as they include all the hash slots of the node.
+ * This means in the worst possible case REDIS_CLUSTER_SLOTS/2 integers.
+ * To simplify we allocate 1024+REDIS_CLUSTER_SLOTS*16 bytes per line. */
+ maxline = 1024+REDIS_CLUSTER_SLOTS*16;
+ line = zmalloc(maxline);
+ while(fgets(line,maxline,fp) != NULL) {
+ int argc;
+ sds *argv = sdssplitargs(line,&argc);
+ clusterNode *n, *master;
+ char *p, *s;
+
+ /* Create this node if it does not exist */
+ n = clusterLookupNode(argv[0]);
+ if (!n) {
+ n = createClusterNode(argv[0],0);
+ clusterAddNode(n);
+ }
+ /* Address and port */
+ if ((p = strchr(argv[1],':')) == NULL) goto fmterr;
+ *p = '\0';
+ memcpy(n->ip,argv[1],strlen(argv[1])+1);
+ n->port = atoi(p+1);
+
+ /* Parse flags */
+ p = s = argv[2];
+ while(p) {
+ p = strchr(s,',');
+ if (p) *p = '\0';
+ if (!strcasecmp(s,"myself")) {
+ redisAssert(server.cluster.myself == NULL);
+ server.cluster.myself = n;
+ n->flags |= REDIS_NODE_MYSELF;
+ } else if (!strcasecmp(s,"master")) {
+ n->flags |= REDIS_NODE_MASTER;
+ } else if (!strcasecmp(s,"slave")) {
+ n->flags |= REDIS_NODE_SLAVE;
+ } else if (!strcasecmp(s,"fail?")) {
+ n->flags |= REDIS_NODE_PFAIL;
+ } else if (!strcasecmp(s,"fail")) {
+ n->flags |= REDIS_NODE_FAIL;
+ } else if (!strcasecmp(s,"handshake")) {
+ n->flags |= REDIS_NODE_HANDSHAKE;
+ } else if (!strcasecmp(s,"noaddr")) {
+ n->flags |= REDIS_NODE_NOADDR;
+ } else if (!strcasecmp(s,"noflags")) {
+ /* nothing to do */
+ } else {
+ redisPanic("Unknown flag in redis cluster config file");
+ }
+ if (p) s = p+1;
+ }
+
+ /* Get master if any. Set the master and populate master's
+ * slave list. */
+ if (argv[3][0] != '-') {
+ master = clusterLookupNode(argv[3]);
+ if (!master) {
+ master = createClusterNode(argv[3],0);
+ clusterAddNode(master);
+ }
+ n->slaveof = master;
+ clusterNodeAddSlave(master,n);
+ }
+
+ /* Set ping sent / pong received timestamps */
+ if (atoi(argv[4])) n->ping_sent = time(NULL);
+ if (atoi(argv[5])) n->pong_received = time(NULL);
+
+ /* Populate hash slots served by this instance. */
+ for (j = 7; j < argc; j++) {
+ int start, stop;
+
+ if (argv[j][0] == '[') {
+ /* Here we handle migrating / importing slots */
+ int slot;
+ char direction;
+ clusterNode *cn;
+
+ p = strchr(argv[j],'-');
+ redisAssert(p != NULL);
+ *p = '\0';
+ direction = p[1]; /* Either '>' or '<' */
+ slot = atoi(argv[j]+1);
+ p += 3;
+ cn = clusterLookupNode(p);
+ if (!cn) {
+ cn = createClusterNode(p,0);
+ clusterAddNode(cn);
+ }
+ if (direction == '>') {
+ server.cluster.migrating_slots_to[slot] = cn;
+ } else {
+ server.cluster.importing_slots_from[slot] = cn;
+ }
+ continue;
+ } else if ((p = strchr(argv[j],'-')) != NULL) {
+ *p = '\0';
+ start = atoi(argv[j]);
+ stop = atoi(p+1);
+ } else {
+ start = stop = atoi(argv[j]);
+ }
+ while(start <= stop) clusterAddSlot(n, start++);
+ }
+
+ sdssplitargs_free(argv,argc);
+ }
+ zfree(line);
fclose(fp);
+ /* Config sanity check */
+ redisAssert(server.cluster.myself != NULL);
redisLog(REDIS_NOTICE,"Node configuration loaded, I'm %.40s",
server.cluster.myself->name);
+ clusterUpdateState();
return REDIS_OK;
fmterr:
- redisLog(REDIS_WARNING,"Unrecovarable error: corrupted cluster.conf file.");
+ redisLog(REDIS_WARNING,"Unrecovarable error: corrupted cluster config file.");
fclose(fp);
exit(1);
}
*
* This function writes the node config and returns 0, on error -1
* is returned. */
-int clusterSaveConfig(char *filename) {
+int clusterSaveConfig(void) {
sds ci = clusterGenNodesDescription();
int fd;
- if ((fd = open(filename,O_WRONLY|O_CREAT,0644)) == -1) goto err;
+ if ((fd = open(server.cluster.configfile,O_WRONLY|O_CREAT|O_TRUNC,0644))
+ == -1) goto err;
if (write(fd,ci,sdslen(ci)) != (ssize_t)sdslen(ci)) goto err;
close(fd);
sdsfree(ci);
return -1;
}
+void clusterSaveConfigOrDie(void) {
+ if (clusterSaveConfig() == -1) {
+ redisLog(REDIS_WARNING,"Fatal: can't update cluster config file.");
+ exit(1);
+ }
+}
+
void clusterInit(void) {
- server.cluster.myself = createClusterNode(NULL,REDIS_NODE_MYSELF);
+ int saveconf = 0;
+
+ server.cluster.myself = NULL;
server.cluster.state = REDIS_CLUSTER_FAIL;
server.cluster.nodes = dictCreate(&clusterNodesDictType,NULL);
server.cluster.node_timeout = 15;
sizeof(server.cluster.importing_slots_from));
memset(server.cluster.slots,0,
sizeof(server.cluster.slots));
- if (clusterLoadConfig("cluster.conf") == REDIS_ERR) {
+ if (clusterLoadConfig(server.cluster.configfile) == REDIS_ERR) {
/* No configuration found. We will just use the random name provided
* by the createClusterNode() function. */
+ server.cluster.myself = createClusterNode(NULL,REDIS_NODE_MYSELF);
redisLog(REDIS_NOTICE,"No cluster configuration found, I'm %.40s",
server.cluster.myself->name);
- if (clusterSaveConfig("cluster.conf") == -1) {
- redisLog(REDIS_WARNING,"Fatal: can't update cluster config file.");
- exit(1);
- }
+ clusterAddNode(server.cluster.myself);
+ saveconf = 1;
}
- clusterAddNode(server.cluster.myself);
+ if (saveconf) clusterSaveConfigOrDie();
/* We need a listening TCP port for our cluster messaging needs */
server.cfd = anetTcpServer(server.neterr,
server.port+REDIS_CLUSTER_PORT_INCR, server.bindaddr);
}
if (aeCreateFileEvent(server.el, server.cfd, AE_READABLE,
clusterAcceptHandler, NULL) == AE_ERR) oom("creating file event");
+ server.cluster.slots_to_keys = zslCreate();
}
/* -----------------------------------------------------------------------------
de = dictFind(server.cluster.nodes,s);
sdsfree(s);
if (de == NULL) return NULL;
- return dictGetEntryVal(de);
+ return dictGetVal(de);
}
/* This is only used after the handshake. When we connect a given IP/PORT
* time PONG figure if it is newer than our figure.
* Note that it's not a problem if we have a PING already
* in progress against this node. */
- if (node->pong_received < ntohl(g->pong_received)) {
+ if (node->pong_received < (signed) ntohl(g->pong_received)) {
redisLog(REDIS_DEBUG,"Node pong_received updated by gossip");
node->pong_received = ntohl(g->pong_received);
}
/* Broadcast the failing node name to everybody */
clusterSendFail(node->name);
clusterUpdateState();
+ clusterSaveConfigOrDie();
}
} else {
/* If it's not in NOADDR state and we don't have it, we
/* Update the node address to the IP address that can be extracted
* from link->fd, and at the specified port. */
void nodeUpdateAddress(clusterNode *node, clusterLink *link, int port) {
+ /* TODO */
}
/* When this function is called, there is a packet to process starting
uint16_t type = ntohs(hdr->type);
clusterNode *sender;
- redisLog(REDIS_DEBUG,"--- packet to process %lu bytes (%lu) ---",
- (unsigned long) totlen, sdslen(link->rcvbuf));
+ redisLog(REDIS_DEBUG,"--- Processing packet of type %d, %lu bytes",
+ type, (unsigned long) totlen);
+
+ /* Perform sanity checks */
if (totlen < 8) return 1;
if (totlen > sdslen(link->rcvbuf)) return 1;
if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG ||
explen += sizeof(clusterMsgDataFail);
if (totlen != explen) return 1;
}
+ if (type == CLUSTERMSG_TYPE_PUBLISH) {
+ uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
+
+ explen += sizeof(clusterMsgDataPublish) +
+ ntohl(hdr->data.publish.msg.channel_len) +
+ ntohl(hdr->data.publish.msg.message_len);
+ if (totlen != explen) return 1;
+ }
+ /* Ready to process the packet. Dispatch by type. */
sender = clusterLookupNode(hdr->sender);
if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) {
+ int update_config = 0;
redisLog(REDIS_DEBUG,"Ping packet received: %p", link->node);
/* Add this node if it is new for us and the msg type is MEET.
nodeIp2String(node->ip,link);
node->port = ntohs(hdr->port);
clusterAddNode(node);
+ update_config = 1;
}
/* Get info from the gossip section */
/* Anyway reply with a PONG */
clusterSendPing(link,CLUSTERMSG_TYPE_PONG);
+
+ /* Update config if needed */
+ if (update_config) clusterSaveConfigOrDie();
} else if (type == CLUSTERMSG_TYPE_PONG) {
- int update = 0;
+ int update_state = 0;
+ int update_config = 0;
redisLog(REDIS_DEBUG,"Pong packet received: %p", link->node);
if (link->node) {
redisLog(REDIS_DEBUG,"Handshake with node %.40s completed.",
link->node->name);
link->node->flags &= ~REDIS_NODE_HANDSHAKE;
+ update_config = 1;
} else if (memcmp(link->node->name,hdr->sender,
REDIS_CLUSTER_NAMELEN) != 0)
{
redisLog(REDIS_DEBUG,"PONG contains mismatching sender ID");
link->node->flags |= REDIS_NODE_NOADDR;
freeClusterLink(link);
+ update_config = 1;
/* FIXME: remove this node if we already have it.
*
* If we already have it but the IP is different, use
server.cluster.slots[j]->flags & REDIS_NODE_FAIL)
{
server.cluster.slots[j] = sender;
- update = 1;
+ update_state = update_config = 1;
}
}
}
clusterProcessGossipSection(hdr,link);
/* Update the cluster state if needed */
- if (update) clusterUpdateState();
+ if (update_state) clusterUpdateState();
+ if (update_config) clusterSaveConfigOrDie();
} else if (type == CLUSTERMSG_TYPE_FAIL && sender) {
clusterNode *failing;
failing = clusterLookupNode(hdr->data.fail.about.nodename);
- if (failing && !(failing->flags & REDIS_NODE_FAIL)) {
+ if (failing && !(failing->flags & (REDIS_NODE_FAIL|REDIS_NODE_MYSELF)))
+ {
redisLog(REDIS_NOTICE,
"FAIL message received from %.40s about %.40s",
hdr->sender, hdr->data.fail.about.nodename);
failing->flags |= REDIS_NODE_FAIL;
failing->flags &= ~REDIS_NODE_PFAIL;
clusterUpdateState();
+ clusterSaveConfigOrDie();
+ }
+ } else if (type == CLUSTERMSG_TYPE_PUBLISH) {
+ robj *channel, *message;
+ uint32_t channel_len, message_len;
+
+ /* Don't bother creating useless objects if there are no Pub/Sub subscribers. */
+ if (dictSize(server.pubsub_channels) || listLength(server.pubsub_patterns)) {
+ channel_len = ntohl(hdr->data.publish.msg.channel_len);
+ message_len = ntohl(hdr->data.publish.msg.message_len);
+ channel = createStringObject(
+ (char*)hdr->data.publish.msg.bulk_data,channel_len);
+ message = createStringObject(
+ (char*)hdr->data.publish.msg.bulk_data+channel_len, message_len);
+ pubsubPublishMessage(channel,message);
+ decrRefCount(channel);
+ decrRefCount(message);
}
} else {
- redisLog(REDIS_NOTICE,"Received unknown packet type: %d", type);
+ redisLog(REDIS_WARNING,"Received unknown packet type: %d", type);
}
return 1;
}
link->sndbuf = sdscatlen(link->sndbuf, msg, msglen);
}
+/* Send a message to all the nodes with a reliable link */
+void clusterBroadcastMessage(void *buf, size_t len) {
+ dictIterator *di;
+ dictEntry *de;
+
+ di = dictGetIterator(server.cluster.nodes);
+ while((de = dictNext(di)) != NULL) {
+ clusterNode *node = dictGetVal(de);
+
+ if (!node->link) continue;
+ if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue;
+ clusterSendMessage(node->link,buf,len);
+ }
+ dictReleaseIterator(di);
+}
+
/* Build the message header */
void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
int totlen;
/* Populate the gossip fields */
while(freshnodes > 0 && gossipcount < 3) {
struct dictEntry *de = dictGetRandomKey(server.cluster.nodes);
- clusterNode *this = dictGetEntryVal(de);
+ clusterNode *this = dictGetVal(de);
clusterMsgDataGossip *gossip;
int j;
clusterSendMessage(link,buf,totlen);
}
-/* Send a message to all the nodes with a reliable link */
-void clusterBroadcastMessage(void *buf, size_t len) {
- dictIterator *di;
- dictEntry *de;
+/* Send a PUBLISH message.
+ *
+ * If link is NULL, then the message is broadcasted to the whole cluster. */
+void clusterSendPublish(clusterLink *link, robj *channel, robj *message) {
+ unsigned char buf[4096], *payload;
+ clusterMsg *hdr = (clusterMsg*) buf;
+ uint32_t totlen;
+ uint32_t channel_len, message_len;
- di = dictGetIterator(server.cluster.nodes);
- while((de = dictNext(di)) != NULL) {
- clusterNode *node = dictGetEntryVal(de);
+ channel = getDecodedObject(channel);
+ message = getDecodedObject(message);
+ channel_len = sdslen(channel->ptr);
+ message_len = sdslen(message->ptr);
- if (!node->link) continue;
- if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue;
- clusterSendMessage(node->link,buf,len);
+ clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_PUBLISH);
+ totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
+ totlen += sizeof(clusterMsgDataPublish) + channel_len + message_len;
+
+ hdr->data.publish.msg.channel_len = htonl(channel_len);
+ hdr->data.publish.msg.message_len = htonl(message_len);
+ hdr->totlen = htonl(totlen);
+
+ /* Try to use the local buffer if possible */
+ if (totlen < sizeof(buf)) {
+ payload = buf;
+ } else {
+ payload = zmalloc(totlen);
+ hdr = (clusterMsg*) payload;
+ memcpy(payload,hdr,sizeof(hdr));
}
- dictReleaseIterator(di);
+ memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr));
+ memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr),
+ message->ptr,sdslen(message->ptr));
+
+ if (link)
+ clusterSendMessage(link,payload,totlen);
+ else
+ clusterBroadcastMessage(payload,totlen);
+
+ decrRefCount(channel);
+ decrRefCount(message);
+ if (payload != buf) zfree(payload);
}
/* Send a FAIL message to all the nodes we are able to contact.
clusterBroadcastMessage(buf,ntohl(hdr->totlen));
}
+/* -----------------------------------------------------------------------------
+ * CLUSTER Pub/Sub support
+ *
+ * For now we do very little, just propagating PUBLISH messages across the whole
+ * cluster. In the future we'll try to get smarter and avoiding propagating those
+ * messages to hosts without receives for a given channel.
+ * -------------------------------------------------------------------------- */
+void clusterPropagatePublish(robj *channel, robj *message) {
+ clusterSendPublish(NULL, channel, message);
+}
+
/* -----------------------------------------------------------------------------
* CLUSTER cron job
* -------------------------------------------------------------------------- */
/* Check if we have disconnected nodes and reestablish the connection. */
di = dictGetIterator(server.cluster.nodes);
while((de = dictNext(di)) != NULL) {
- clusterNode *node = dictGetEntryVal(de);
+ clusterNode *node = dictGetVal(de);
if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue;
if (node->link == NULL) {
* normal PING packets. */
node->flags &= ~REDIS_NODE_MEET;
- redisLog(REDIS_NOTICE,"Connecting with Node %.40s at %s:%d\n", node->name, node->ip, node->port+REDIS_CLUSTER_PORT_INCR);
+ redisLog(REDIS_NOTICE,"Connecting with Node %.40s at %s:%d", node->name, node->ip, node->port+REDIS_CLUSTER_PORT_INCR);
}
}
dictReleaseIterator(di);
* the oldest ping_sent time */
for (j = 0; j < 5; j++) {
de = dictGetRandomKey(server.cluster.nodes);
- clusterNode *this = dictGetEntryVal(de);
+ clusterNode *this = dictGetVal(de);
if (this->link == NULL) continue;
if (this->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue;
/* Iterate nodes to check if we need to flag something as failing */
di = dictGetIterator(server.cluster.nodes);
while((de = dictNext(di)) != NULL) {
- clusterNode *node = dictGetEntryVal(de);
+ clusterNode *node = dictGetVal(de);
int delay;
if (node->flags &
- (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE|
- REDIS_NODE_FAIL)) continue;
+ (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE))
+ continue;
/* Check only if we already sent a ping and did not received
* a reply yet. */
if (node->ping_sent == 0 ||
node->ping_sent <= node->pong_received) continue;
delay = time(NULL) - node->pong_received;
- if (node->flags & REDIS_NODE_PFAIL) {
+ if (delay < server.cluster.node_timeout) {
/* The PFAIL condition can be reversed without external
* help if it is not transitive (that is, if it does not
- * turn into a FAIL state). */
- if (delay < server.cluster.node_timeout)
+ * turn into a FAIL state).
+ *
+ * The FAIL condition is also reversible if there are no slaves
+ * for this host, so no slave election should be in progress.
+ *
+ * TODO: consider all the implications of resurrecting a
+ * FAIL node. */
+ if (node->flags & REDIS_NODE_PFAIL) {
node->flags &= ~REDIS_NODE_PFAIL;
+ } else if (node->flags & REDIS_NODE_FAIL && !node->numslaves) {
+ node->flags &= ~REDIS_NODE_FAIL;
+ clusterUpdateState();
+ }
} else {
- if (delay >= server.cluster.node_timeout) {
+ /* Timeout reached. Set the noad se possibly failing if it is
+ * not already in this state. */
+ if (!(node->flags & (REDIS_NODE_PFAIL|REDIS_NODE_FAIL))) {
redisLog(REDIS_DEBUG,"*** NODE %.40s possibly failing",
node->name);
node->flags |= REDIS_NODE_PFAIL;
* If the slot is already assigned to another instance this is considered
* an error and REDIS_ERR is returned. */
int clusterAddSlot(clusterNode *n, int slot) {
- redisAssert(clusterNodeSetSlotBit(n,slot) == 0);
- server.cluster.slots[slot] = server.cluster.myself;
- printf("SLOT %d added to %.40s\n", slot, n->name);
+ if (clusterNodeSetSlotBit(n,slot) != 0)
+ return REDIS_ERR;
+ server.cluster.slots[slot] = n;
+ return REDIS_OK;
+}
+
+/* Delete the specified slot marking it as unassigned.
+ * Returns REDIS_OK if the slot was assigned, otherwise if the slot was
+ * already unassigned REDIS_ERR is returned. */
+int clusterDelSlot(int slot) {
+ clusterNode *n = server.cluster.slots[slot];
+
+ if (!n) return REDIS_ERR;
+ redisAssert(clusterNodeClearSlotBit(n,slot) == 1);
+ server.cluster.slots[slot] = NULL;
return REDIS_OK;
}
sds ci = sdsempty();
dictIterator *di;
dictEntry *de;
+ int j, start;
di = dictGetIterator(server.cluster.nodes);
while((de = dictNext(di)) != NULL) {
- clusterNode *node = dictGetEntryVal(de);
+ clusterNode *node = dictGetVal(de);
/* Node coordinates */
ci = sdscatprintf(ci,"%.40s %s:%d ",
ci = sdscatprintf(ci,"- ");
/* Latency from the POV of this node, link status */
- ci = sdscatprintf(ci,"%ld %ld %s\n",
+ ci = sdscatprintf(ci,"%ld %ld %s",
(long) node->ping_sent,
(long) node->pong_received,
- node->link ? "connected" : "disconnected");
+ (node->link || node->flags & REDIS_NODE_MYSELF) ?
+ "connected" : "disconnected");
+
+ /* Slots served by this instance */
+ start = -1;
+ for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
+ int bit;
+
+ if ((bit = clusterNodeGetSlotBit(node,j)) != 0) {
+ if (start == -1) start = j;
+ }
+ if (start != -1 && (!bit || j == REDIS_CLUSTER_SLOTS-1)) {
+ if (j == REDIS_CLUSTER_SLOTS-1) j++;
+
+ if (start == j-1) {
+ ci = sdscatprintf(ci," %d",start);
+ } else {
+ ci = sdscatprintf(ci," %d-%d",start,j-1);
+ }
+ start = -1;
+ }
+ }
+
+ /* Just for MYSELF node we also dump info about slots that
+ * we are migrating to other instances or importing from other
+ * instances. */
+ if (node->flags & REDIS_NODE_MYSELF) {
+ for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
+ if (server.cluster.migrating_slots_to[j]) {
+ ci = sdscatprintf(ci," [%d->-%.40s]",j,
+ server.cluster.migrating_slots_to[j]->name);
+ } else if (server.cluster.importing_slots_from[j]) {
+ ci = sdscatprintf(ci," [%d-<-%.40s]",j,
+ server.cluster.importing_slots_from[j]->name);
+ }
+ }
+ }
+ ci = sdscatlen(ci,"\n",1);
}
dictReleaseIterator(di);
return ci;
}
+int getSlotOrReply(redisClient *c, robj *o) {
+ long long slot;
+
+ if (getLongLongFromObject(o,&slot) != REDIS_OK ||
+ slot < 0 || slot > REDIS_CLUSTER_SLOTS)
+ {
+ addReplyError(c,"Invalid or out of range slot");
+ return -1;
+ }
+ return (int) slot;
+}
+
void clusterCommand(redisClient *c) {
if (server.cluster_enabled == 0) {
addReplyError(c,"This instance has cluster support disabled");
o = createObject(REDIS_STRING,ci);
addReplyBulk(c,o);
decrRefCount(o);
- } else if (!strcasecmp(c->argv[1]->ptr,"addslots") && c->argc >= 3) {
- int j;
- long long slot;
+ } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") ||
+ !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3)
+ {
+ /* CLUSTER ADDSLOTS <slot> [slot] ... */
+ /* CLUSTER DELSLOTS <slot> [slot] ... */
+ int j, slot;
unsigned char *slots = zmalloc(REDIS_CLUSTER_SLOTS);
+ int del = !strcasecmp(c->argv[1]->ptr,"delslots");
memset(slots,0,REDIS_CLUSTER_SLOTS);
/* Check that all the arguments are parsable and that all the
* slots are not already busy. */
for (j = 2; j < c->argc; j++) {
- if (getLongLongFromObject(c->argv[j],&slot) != REDIS_OK ||
- slot < 0 || slot > REDIS_CLUSTER_SLOTS)
- {
- addReplyError(c,"Invalid or out of range slot index");
+ if ((slot = getSlotOrReply(c,c->argv[j])) == -1) {
zfree(slots);
return;
}
- if (server.cluster.slots[slot]) {
- addReplyErrorFormat(c,"Slot %lld is already busy", slot);
+ if (del && server.cluster.slots[slot] == NULL) {
+ addReplyErrorFormat(c,"Slot %d is already unassigned", slot);
+ zfree(slots);
+ return;
+ } else if (!del && server.cluster.slots[slot]) {
+ addReplyErrorFormat(c,"Slot %d is already busy", slot);
zfree(slots);
return;
}
}
for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
if (slots[j]) {
- int retval = clusterAddSlot(server.cluster.myself,j);
-
- redisAssert(retval == REDIS_OK);
+ int retval;
+
+ /* If this slot was set as importing we can clear this
+ * state as now we are the real owner of the slot. */
+ if (server.cluster.importing_slots_from[j])
+ server.cluster.importing_slots_from[j] = NULL;
+
+ retval = del ? clusterDelSlot(j) :
+ clusterAddSlot(server.cluster.myself,j);
+ redisAssertWithInfo(c,NULL,retval == REDIS_OK);
}
}
zfree(slots);
clusterUpdateState();
+ clusterSaveConfigOrDie();
+ addReply(c,shared.ok);
+ } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) {
+ /* SETSLOT 10 MIGRATING <node ID> */
+ /* SETSLOT 10 IMPORTING <node ID> */
+ /* SETSLOT 10 STABLE */
+ /* SETSLOT 10 NODE <node ID> */
+ int slot;
+ clusterNode *n;
+
+ if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return;
+
+ if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) {
+ if (server.cluster.slots[slot] != server.cluster.myself) {
+ addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot);
+ return;
+ }
+ if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) {
+ addReplyErrorFormat(c,"I don't know about node %s",
+ (char*)c->argv[4]->ptr);
+ return;
+ }
+ server.cluster.migrating_slots_to[slot] = n;
+ } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) {
+ if (server.cluster.slots[slot] == server.cluster.myself) {
+ addReplyErrorFormat(c,
+ "I'm already the owner of hash slot %u",slot);
+ return;
+ }
+ if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) {
+ addReplyErrorFormat(c,"I don't know about node %s",
+ (char*)c->argv[3]->ptr);
+ return;
+ }
+ server.cluster.importing_slots_from[slot] = n;
+ } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) {
+ /* CLUSTER SETSLOT <SLOT> STABLE */
+ server.cluster.importing_slots_from[slot] = NULL;
+ server.cluster.migrating_slots_to[slot] = NULL;
+ } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) {
+ /* CLUSTER SETSLOT <SLOT> NODE <NODE ID> */
+ clusterNode *n = clusterLookupNode(c->argv[4]->ptr);
+
+ if (!n) addReplyErrorFormat(c,"Unknown node %s",
+ (char*)c->argv[4]->ptr);
+ /* If this hash slot was served by 'myself' before to switch
+ * make sure there are no longer local keys for this hash slot. */
+ if (server.cluster.slots[slot] == server.cluster.myself &&
+ n != server.cluster.myself)
+ {
+ int numkeys;
+ robj **keys;
+
+ keys = zmalloc(sizeof(robj*)*1);
+ numkeys = GetKeysInSlot(slot, keys, 1);
+ zfree(keys);
+ if (numkeys != 0) {
+ addReplyErrorFormat(c, "Can't assign hashslot %d to a different node while I still hold keys for this hash slot.", slot);
+ return;
+ }
+ }
+ /* If this node was the slot owner and the slot was marked as
+ * migrating, assigning the slot to another node will clear
+ * the migratig status. */
+ if (server.cluster.slots[slot] == server.cluster.myself &&
+ server.cluster.migrating_slots_to[slot])
+ server.cluster.migrating_slots_to[slot] = NULL;
+
+ /* If this node was importing this slot, assigning the slot to
+ * itself also clears the importing status. */
+ if (n == server.cluster.myself && server.cluster.importing_slots_from[slot])
+ server.cluster.importing_slots_from[slot] = NULL;
+
+ clusterDelSlot(slot);
+ clusterAddSlot(n,slot);
+ } else {
+ addReplyError(c,"Invalid CLUSTER SETSLOT action or number of arguments");
+ return;
+ }
+ clusterSaveConfigOrDie();
addReply(c,shared.ok);
} else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) {
char *statestr[] = {"ok","fail","needhelp"};
"cluster_slots_ok:%d\r\n"
"cluster_slots_pfail:%d\r\n"
"cluster_slots_fail:%d\r\n"
+ "cluster_known_nodes:%lu\r\n"
, statestr[server.cluster.state],
slots_assigned,
slots_ok,
slots_pfail,
- slots_fail
+ slots_fail,
+ dictSize(server.cluster.nodes)
);
addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
(unsigned long)sdslen(info)));
addReplySds(c,info);
addReply(c,shared.crlf);
+ } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) {
+ sds key = c->argv[2]->ptr;
+
+ addReplyLongLong(c,keyHashSlot(key,sdslen(key)));
+ } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) {
+ long long maxkeys, slot;
+ unsigned int numkeys, j;
+ robj **keys;
+
+ if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != REDIS_OK)
+ return;
+ if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) != REDIS_OK)
+ return;
+ if (slot < 0 || slot >= REDIS_CLUSTER_SLOTS || maxkeys < 0 ||
+ maxkeys > 1024*1024) {
+ addReplyError(c,"Invalid slot or number of keys");
+ return;
+ }
+
+ keys = zmalloc(sizeof(robj*)*maxkeys);
+ numkeys = GetKeysInSlot(slot, keys, maxkeys);
+ addReplyMultiBulkLen(c,numkeys);
+ for (j = 0; j < numkeys; j++) addReplyBulk(c,keys[j]);
+ zfree(keys);
} else {
addReplyError(c,"Wrong CLUSTER subcommand or number of arguments");
}
/* RESTORE key ttl serialized-value */
void restoreCommand(redisClient *c) {
- FILE *fp;
- char buf[64];
- robj *o;
- unsigned char *data;
long ttl;
+ rio payload;
+ int type;
+ robj *obj;
/* Make sure this key does not already exist here... */
- if (dbExists(c->db,c->argv[1])) {
+ if (lookupKeyWrite(c->db,c->argv[1]) != NULL) {
addReplyError(c,"Target key name is busy.");
return;
}
return;
}
- /* rdbLoadObject() only works against file descriptors so we need to
- * dump the serialized object into a file and reload. */
- snprintf(buf,sizeof(buf),"redis-restore-%d.tmp",getpid());
- fp = fopen(buf,"w+");
- if (!fp) {
- redisLog(REDIS_WARNING,"Can't open tmp file for RESTORE: %s",
- strerror(errno));
- addReplyErrorFormat(c,"RESTORE failed, tmp file creation error: %s",
- strerror(errno));
- return;
- }
- unlink(buf);
-
- /* Write the actual data and rewind the file */
- data = (unsigned char*) c->argv[3]->ptr;
- if (fwrite(data+1,sdslen((sds)data)-1,1,fp) != 1) {
- redisLog(REDIS_WARNING,"Can't write against tmp file for RESTORE: %s",
- strerror(errno));
- addReplyError(c,"RESTORE failed, tmp file I/O error.");
- fclose(fp);
- return;
- }
- rewind(fp);
-
- /* Finally create the object from the serialized dump and
- * store it at the specified key. */
- o = rdbLoadObject(data[0],fp);
- if (o == NULL) {
- addReplyError(c,"Bad data format.");
- fclose(fp);
+ rioInitWithBuffer(&payload,c->argv[3]->ptr);
+ if (((type = rdbLoadObjectType(&payload)) == -1) ||
+ ((obj = rdbLoadObject(type,&payload)) == NULL))
+ {
+ addReplyError(c,"Bad data format");
return;
}
- fclose(fp);
/* Create the key and set the TTL if any */
- dbAdd(c->db,c->argv[1],o);
+ dbAdd(c->db,c->argv[1],obj);
if (ttl) setExpire(c->db,c->argv[1],time(NULL)+ttl);
+ signalModifiedKey(c->db,c->argv[1]);
addReply(c,shared.ok);
+ server.dirty++;
}
/* MIGRATE host port key dbid timeout */
int fd;
long timeout;
long dbid;
- char buf[64];
- FILE *fp;
time_t ttl;
robj *o;
- unsigned char type;
- off_t payload_len;
+ rio cmd, payload;
/* Sanity check */
if (getLongFromObjectOrReply(c,c->argv[5],&timeout,NULL) != REDIS_OK)
* nothing to migrate (for instance the key expired in the meantime), but
* we include such information in the reply string. */
if ((o = lookupKeyRead(c->db,c->argv[3])) == NULL) {
- addReplySds(c,sdsnew("+NOKEY"));
+ addReplySds(c,sdsnew("+NOKEY\r\n"));
return;
}
return;
}
- /* Create temp file */
- snprintf(buf,sizeof(buf),"redis-migrate-%d.tmp",getpid());
- fp = fopen(buf,"w+");
- if (!fp) {
- redisLog(REDIS_WARNING,"Can't open tmp file for MIGRATE: %s",
- strerror(errno));
- addReplyErrorFormat(c,"MIGRATE failed, tmp file creation error: %s.",
- strerror(errno));
- return;
- }
- unlink(buf);
-
- /* Build the SELECT + RESTORE query writing it in our temp file. */
- if (fwriteBulkCount(fp,'*',2) == 0) goto file_wr_err;
- if (fwriteBulkString(fp,"SELECT",6) == 0) goto file_wr_err;
- if (fwriteBulkLongLong(fp,dbid) == 0) goto file_wr_err;
+ rioInitWithBuffer(&cmd,sdsempty());
+ redisAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',2));
+ redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"SELECT",6));
+ redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,dbid));
ttl = getExpire(c->db,c->argv[3]);
- type = o->type;
- if (fwriteBulkCount(fp,'*',4) == 0) goto file_wr_err;
- if (fwriteBulkString(fp,"RESTORE",7) == 0) goto file_wr_err;
- if (fwriteBulkObject(fp,c->argv[3]) == 0) goto file_wr_err;
- if (fwriteBulkLongLong(fp, (ttl == -1) ? 0 : ttl) == 0) goto file_wr_err;
+ redisAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',4));
+ redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7));
+ redisAssertWithInfo(c,NULL,c->argv[3]->encoding == REDIS_ENCODING_RAW);
+ redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,c->argv[3]->ptr,sdslen(c->argv[3]->ptr)));
+ redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,(ttl == -1) ? 0 : ttl));
/* Finally the last argument that is the serailized object payload
- * in the form: <type><rdb-serailized-object>. */
- payload_len = rdbSavedObjectLen(o);
- if (fwriteBulkCount(fp,'$',payload_len+1) == 0) goto file_wr_err;
- if (fwrite(&type,1,1,fp) == 0) goto file_wr_err;
- if (rdbSaveObject(fp,o) == -1) goto file_wr_err;
- if (fwrite("\r\n",2,1,fp) == 0) goto file_wr_err;
-
- /* Tranfer the query to the other node */
- rewind(fp);
+ * in the form: <type><rdb-serialized-object>. */
+ rioInitWithBuffer(&payload,sdsempty());
+ redisAssertWithInfo(c,NULL,rdbSaveObjectType(&payload,o));
+ redisAssertWithInfo(c,NULL,rdbSaveObject(&payload,o) != -1);
+ redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,payload.io.buffer.ptr,sdslen(payload.io.buffer.ptr)));
+ sdsfree(payload.io.buffer.ptr);
+
+ /* Tranfer the query to the other node in 64K chunks. */
{
- char buf[4096];
- size_t nread;
-
- while ((nread = fread(buf,1,sizeof(buf),fp)) != 0) {
- int nwritten;
-
- nwritten = syncWrite(fd,buf,nread,timeout);
- if (nwritten != (signed)nread) goto socket_wr_err;
+ sds buf = cmd.io.buffer.ptr;
+ size_t pos = 0, towrite;
+ int nwritten = 0;
+
+ while ((towrite = sdslen(buf)-pos) > 0) {
+ towrite = (towrite > (64*1024) ? (64*1024) : towrite);
+ nwritten = syncWrite(fd,buf+nwritten,towrite,timeout);
+ if (nwritten != (signed)towrite) goto socket_wr_err;
+ pos += nwritten;
}
- if (ferror(fp)) goto file_rd_err;
}
- /* Read back the reply */
+ /* Read back the reply. */
{
char buf1[1024];
char buf2[1024];
if (syncReadLine(fd, buf1, sizeof(buf1), timeout) <= 0)
goto socket_rd_err;
if (syncReadLine(fd, buf2, sizeof(buf2), timeout) <= 0)
- goto socket_rd_err;
+ goto socket_rd_err;
if (buf1[0] == '-' || buf2[0] == '-') {
addReplyErrorFormat(c,"Target instance replied with error: %s",
(buf1[0] == '-') ? buf1+1 : buf2+1);
} else {
+ robj *aux;
+
dbDelete(c->db,c->argv[3]);
+ signalModifiedKey(c->db,c->argv[3]);
addReply(c,shared.ok);
+ server.dirty++;
+
+ /* Translate MIGRATE as DEL for replication/AOF. */
+ aux = createStringObject("DEL",3);
+ rewriteClientCommandVector(c,2,aux,c->argv[3]);
+ decrRefCount(aux);
}
}
- fclose(fp);
- close(fd);
- return;
-
-file_wr_err:
- redisLog(REDIS_WARNING,"Can't write on tmp file for MIGRATE: %s",
- strerror(errno));
- addReplyErrorFormat(c,"MIGRATE failed, tmp file write error: %s.",
- strerror(errno));
- fclose(fp);
- close(fd);
-file_rd_err:
- redisLog(REDIS_WARNING,"Can't read from tmp file for MIGRATE: %s",
- strerror(errno));
- addReplyErrorFormat(c,"MIGRATE failed, tmp file read error: %s.",
- strerror(errno));
- fclose(fp);
+ sdsfree(cmd.io.buffer.ptr);
close(fd);
+ return;
socket_wr_err:
redisLog(REDIS_NOTICE,"Can't write to target node for MIGRATE: %s",
strerror(errno));
addReplyErrorFormat(c,"MIGRATE failed, writing to target node: %s.",
strerror(errno));
- fclose(fp);
+ sdsfree(cmd.io.buffer.ptr);
close(fd);
+ return;
socket_rd_err:
redisLog(REDIS_NOTICE,"Can't read from target node for MIGRATE: %s",
strerror(errno));
addReplyErrorFormat(c,"MIGRATE failed, reading from target node: %s.",
strerror(errno));
- fclose(fp);
+ sdsfree(cmd.io.buffer.ptr);
close(fd);
+ return;
+}
+
+/* DUMP keyname
+ * DUMP is actually not used by Redis Cluster but it is the obvious
+ * complement of RESTORE and can be useful for different applications. */
+void dumpCommand(redisClient *c) {
+ robj *o, *dumpobj;
+ rio payload;
+
+ /* Check if the key is here. */
+ if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) {
+ addReply(c,shared.nullbulk);
+ return;
+ }
+
+ /* Serialize the object in a RDB-like format. It consist of an object type
+ * byte followed by the serialized object. This is understood by RESTORE. */
+ rioInitWithBuffer(&payload,sdsempty());
+ redisAssertWithInfo(c,NULL,rdbSaveObjectType(&payload,o));
+ redisAssertWithInfo(c,NULL,rdbSaveObject(&payload,o));
+
+ /* Transfer to the client */
+ dumpobj = createObject(REDIS_STRING,payload.io.buffer.ptr);
+ addReplyBulk(c,dumpobj);
+ decrRefCount(dumpobj);
+ return;
+}
+
+/* The ASKING command is required after a -ASK redirection.
+ * The client should issue ASKING before to actualy send the command to
+ * the target instance. See the Redis Cluster specification for more
+ * information. */
+void askingCommand(redisClient *c) {
+ if (server.cluster_enabled == 0) {
+ addReplyError(c,"This instance has cluster support disabled");
+ return;
+ }
+ c->flags |= REDIS_ASKING;
+ addReply(c,shared.ok);
}
/* -----------------------------------------------------------------------------
/* Return the pointer to the cluster node that is able to serve the query
* as all the keys belong to hash slots for which the node is in charge.
*
- * If keys in query spawn multiple nodes NULL is returned. */
-clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot) {
+ * If the returned node should be used only for this request, the *ask
+ * integer is set to '1', otherwise to '0'. This is used in order to
+ * let the caller know if we should reply with -MOVED or with -ASK.
+ *
+ * If the request contains more than a single key NULL is returned,
+ * however a request with more then a key argument where the key is always
+ * the same is valid, like in: RPOPLPUSH mylist mylist.*/
+clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask) {
clusterNode *n = NULL;
+ robj *firstkey = NULL;
multiState *ms, _ms;
multiCmd mc;
- int i;
+ int i, slot = 0;
/* We handle all the cases as if they were EXEC commands, so we have
* a common code path for everything */
if (!(c->flags & REDIS_MULTI)) return server.cluster.myself;
ms = &c->mstate;
} else {
- /* Create a fake Multi State structure, with just one command */
+ /* In order to have a single codepath create a fake Multi State
+ * structure if the client is not in MULTI/EXEC state, this way
+ * we have a single codepath below. */
ms = &_ms;
_ms.commands = &mc;
_ms.count = 1;
mc.cmd = cmd;
}
+ /* Check that all the keys are the same key, and get the slot and
+ * node for this key. */
for (i = 0; i < ms->count; i++) {
struct redisCommand *mcmd;
robj **margv;
margv = ms->commands[i].argv;
keyindex = getKeysFromCommand(mcmd,margv,margc,&numkeys,
- REDIS_GETKEYS_PRELOAD);
+ REDIS_GETKEYS_ALL);
for (j = 0; j < numkeys; j++) {
- int slot = keyHashSlot((char*)margv[keyindex[j]]->ptr,
- sdslen(margv[keyindex[j]]->ptr));
- struct clusterNode *slotnode;
-
- slotnode = server.cluster.slots[slot];
- if (hashslot) *hashslot = slot;
- /* Node not assigned? (Should never happen actually
- * if we reached this function).
- * Different node than the previous one?
- * Return NULL, the cluster can't serve multi-node requests */
- if (slotnode == NULL || (n && slotnode != n)) {
- getKeysFreeResult(keyindex);
- return NULL;
+ if (firstkey == NULL) {
+ /* This is the first key we see. Check what is the slot
+ * and node. */
+ firstkey = margv[keyindex[j]];
+
+ slot = keyHashSlot((char*)firstkey->ptr, sdslen(firstkey->ptr));
+ n = server.cluster.slots[slot];
+ redisAssertWithInfo(c,firstkey,n != NULL);
} else {
- n = slotnode;
+ /* If it is not the first key, make sure it is exactly
+ * the same key as the first we saw. */
+ if (!equalStringObjects(firstkey,margv[keyindex[j]])) {
+ decrRefCount(firstkey);
+ getKeysFreeResult(keyindex);
+ return NULL;
+ }
}
}
getKeysFreeResult(keyindex);
}
- return (n == NULL) ? server.cluster.myself : n;
+ if (ask) *ask = 0; /* This is the default. Set to 1 if needed later. */
+ /* No key at all in command? then we can serve the request
+ * without redirections. */
+ if (n == NULL) return server.cluster.myself;
+ if (hashslot) *hashslot = slot;
+ /* This request is about a slot we are migrating into another instance?
+ * Then we need to check if we have the key. If we have it we can reply.
+ * If instead is a new key, we pass the request to the node that is
+ * receiving the slot. */
+ if (n == server.cluster.myself &&
+ server.cluster.migrating_slots_to[slot] != NULL)
+ {
+ if (lookupKeyRead(&server.db[0],firstkey) == NULL) {
+ if (ask) *ask = 1;
+ return server.cluster.migrating_slots_to[slot];
+ }
+ }
+ /* Handle the case in which we are receiving this hash slot from
+ * another instance, so we'll accept the query even if in the table
+ * it is assigned to a different node, but only if the client
+ * issued an ASKING command before. */
+ if (server.cluster.importing_slots_from[slot] != NULL &&
+ c->flags & REDIS_ASKING) {
+ return server.cluster.myself;
+ }
+ /* It's not a -ASK case. Base case: just return the right node. */
+ return n;
}