X-Git-Url: https://git.saurik.com/redis.git/blobdiff_plain/6c390c0b2303247c16f42160fec3fd609cb99cb7..fcdeb98568708b8d9ef1415aaeef75ee5fe488f2:/src/cluster.c?ds=sidebyside diff --git a/src/cluster.c b/src/cluster.c index 76ea894d..85cb1198 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -11,6 +11,9 @@ void clusterSendFail(char *nodename); void clusterUpdateState(void); int clusterNodeGetSlotBit(clusterNode *n, int slot); sds clusterGenNodesDescription(void); +clusterNode *clusterLookupNode(char *name); +int clusterNodeAddSlave(clusterNode *master, clusterNode *slave); +int clusterAddSlot(clusterNode *n, int slot); /* ----------------------------------------------------------------------------- * Initialization @@ -21,12 +24,10 @@ void clusterGetRandomName(char *p) { char *charset = "0123456789abcdef"; int j; - if (!fp) { - redisLog(REDIS_WARNING, - "Unrecovarable error: can't open /dev/urandom:%s" ,strerror(errno)); - exit(1); + if (fp == NULL || fread(p,REDIS_CLUSTER_NAMELEN,1,fp) == 0) { + for (j = 0; j < REDIS_CLUSTER_NAMELEN; j++) + p[j] = rand(); } - fread(p,REDIS_CLUSTER_NAMELEN,1,fp); for (j = 0; j < REDIS_CLUSTER_NAMELEN; j++) p[j] = charset[p[j] & 0x0F]; fclose(fp); @@ -34,13 +35,127 @@ void clusterGetRandomName(char *p) { int clusterLoadConfig(char *filename) { FILE *fp = fopen(filename,"r"); + char *line; + int maxline, j; - return REDIS_ERR; if (fp == NULL) return REDIS_ERR; + + /* Parse the file. Note that single liens of the cluster config file can + * be really long as they include all the hash slots of the node. + * This means in the worst possible case REDIS_CLUSTER_SLOTS/2 integers. + * To simplify we allocate 1024+REDIS_CLUSTER_SLOTS*16 bytes per line. */ + maxline = 1024+REDIS_CLUSTER_SLOTS*16; + line = zmalloc(maxline); + while(fgets(line,maxline,fp) != NULL) { + int argc; + sds *argv = sdssplitargs(line,&argc); + clusterNode *n, *master; + char *p, *s; + + /* Create this node if it does not exist */ + n = clusterLookupNode(argv[0]); + if (!n) { + n = createClusterNode(argv[0],0); + clusterAddNode(n); + } + /* Address and port */ + if ((p = strchr(argv[1],':')) == NULL) goto fmterr; + *p = '\0'; + memcpy(n->ip,argv[1],strlen(argv[1])+1); + n->port = atoi(p+1); + + /* Parse flags */ + p = s = argv[2]; + while(p) { + p = strchr(s,','); + if (p) *p = '\0'; + if (!strcasecmp(s,"myself")) { + redisAssert(server.cluster.myself == NULL); + server.cluster.myself = n; + n->flags |= REDIS_NODE_MYSELF; + } else if (!strcasecmp(s,"master")) { + n->flags |= REDIS_NODE_MASTER; + } else if (!strcasecmp(s,"slave")) { + n->flags |= REDIS_NODE_SLAVE; + } else if (!strcasecmp(s,"fail?")) { + n->flags |= REDIS_NODE_PFAIL; + } else if (!strcasecmp(s,"fail")) { + n->flags |= REDIS_NODE_FAIL; + } else if (!strcasecmp(s,"handshake")) { + n->flags |= REDIS_NODE_HANDSHAKE; + } else if (!strcasecmp(s,"noaddr")) { + n->flags |= REDIS_NODE_NOADDR; + } else if (!strcasecmp(s,"noflags")) { + /* nothing to do */ + } else { + redisPanic("Unknown flag in redis cluster config file"); + } + if (p) s = p+1; + } + + /* Get master if any. Set the master and populate master's + * slave list. */ + if (argv[3][0] != '-') { + master = clusterLookupNode(argv[3]); + if (!master) { + master = createClusterNode(argv[3],0); + clusterAddNode(master); + } + n->slaveof = master; + clusterNodeAddSlave(master,n); + } + + /* Set ping sent / pong received timestamps */ + if (atoi(argv[4])) n->ping_sent = time(NULL); + if (atoi(argv[5])) n->pong_received = time(NULL); + + /* Populate hash slots served by this instance. */ + for (j = 7; j < argc; j++) { + int start, stop; + + if (argv[j][0] == '[') { + /* Here we handle migrating / importing slots */ + int slot; + char direction; + clusterNode *cn; + + p = strchr(argv[j],'-'); + redisAssert(p != NULL); + *p = '\0'; + direction = p[1]; /* Either '>' or '<' */ + slot = atoi(argv[j]+1); + p += 3; + cn = clusterLookupNode(p); + if (!cn) { + cn = createClusterNode(p,0); + clusterAddNode(cn); + } + if (direction == '>') { + server.cluster.migrating_slots_to[slot] = cn; + } else { + server.cluster.importing_slots_from[slot] = cn; + } + continue; + } else if ((p = strchr(argv[j],'-')) != NULL) { + *p = '\0'; + start = atoi(argv[j]); + stop = atoi(p+1); + } else { + start = stop = atoi(argv[j]); + } + while(start <= stop) clusterAddSlot(n, start++); + } + + sdssplitargs_free(argv,argc); + } + zfree(line); fclose(fp); + /* Config sanity check */ + redisAssert(server.cluster.myself != NULL); redisLog(REDIS_NOTICE,"Node configuration loaded, I'm %.40s", server.cluster.myself->name); + clusterUpdateState(); return REDIS_OK; fmterr: @@ -57,8 +172,8 @@ int clusterSaveConfig(void) { sds ci = clusterGenNodesDescription(); int fd; - if ((fd = open(server.cluster.configfile,O_WRONLY|O_CREAT,0644)) == -1) - goto err; + if ((fd = open(server.cluster.configfile,O_WRONLY|O_CREAT|O_TRUNC,0644)) + == -1) goto err; if (write(fd,ci,sdslen(ci)) != (ssize_t)sdslen(ci)) goto err; close(fd); sdsfree(ci); @@ -79,7 +194,7 @@ void clusterSaveConfigOrDie(void) { void clusterInit(void) { int saveconf = 0; - server.cluster.myself = createClusterNode(NULL,REDIS_NODE_MYSELF); + server.cluster.myself = NULL; server.cluster.state = REDIS_CLUSTER_FAIL; server.cluster.nodes = dictCreate(&clusterNodesDictType,NULL); server.cluster.node_timeout = 15; @@ -92,6 +207,7 @@ void clusterInit(void) { if (clusterLoadConfig(server.cluster.configfile) == REDIS_ERR) { /* No configuration found. We will just use the random name provided * by the createClusterNode() function. */ + server.cluster.myself = createClusterNode(NULL,REDIS_NODE_MYSELF); redisLog(REDIS_NOTICE,"No cluster configuration found, I'm %.40s", server.cluster.myself->name); clusterAddNode(server.cluster.myself); @@ -107,6 +223,7 @@ void clusterInit(void) { } if (aeCreateFileEvent(server.el, server.cfd, AE_READABLE, clusterAcceptHandler, NULL) == AE_ERR) oom("creating file event"); + server.cluster.slots_to_keys = zslCreate(); } /* ----------------------------------------------------------------------------- @@ -260,7 +377,7 @@ clusterNode *clusterLookupNode(char *name) { de = dictFind(server.cluster.nodes,s); sdsfree(s); if (de == NULL) return NULL; - return dictGetEntryVal(de); + return dictGetVal(de); } /* This is only used after the handshake. When we connect a given IP/PORT @@ -322,7 +439,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { * time PONG figure if it is newer than our figure. * Note that it's not a problem if we have a PING already * in progress against this node. */ - if (node->pong_received < ntohl(g->pong_received)) { + if (node->pong_received < (signed) ntohl(g->pong_received)) { redisLog(REDIS_DEBUG,"Node pong_received updated by gossip"); node->pong_received = ntohl(g->pong_received); } @@ -337,6 +454,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { /* Broadcast the failing node name to everybody */ clusterSendFail(node->name); clusterUpdateState(); + clusterSaveConfigOrDie(); } } else { /* If it's not in NOADDR state and we don't have it, we @@ -375,6 +493,7 @@ void nodeIp2String(char *buf, clusterLink *link) { /* Update the node address to the IP address that can be extracted * from link->fd, and at the specified port. */ void nodeUpdateAddress(clusterNode *node, clusterLink *link, int port) { + /* TODO */ } /* When this function is called, there is a packet to process starting @@ -392,8 +511,10 @@ int clusterProcessPacket(clusterLink *link) { uint16_t type = ntohs(hdr->type); clusterNode *sender; - redisLog(REDIS_DEBUG,"--- packet to process %lu bytes (%lu) ---", - (unsigned long) totlen, sdslen(link->rcvbuf)); + redisLog(REDIS_DEBUG,"--- Processing packet of type %d, %lu bytes", + type, (unsigned long) totlen); + + /* Perform sanity checks */ if (totlen < 8) return 1; if (totlen > sdslen(link->rcvbuf)) return 1; if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || @@ -412,9 +533,19 @@ int clusterProcessPacket(clusterLink *link) { explen += sizeof(clusterMsgDataFail); if (totlen != explen) return 1; } + if (type == CLUSTERMSG_TYPE_PUBLISH) { + uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + + explen += sizeof(clusterMsgDataPublish) + + ntohl(hdr->data.publish.msg.channel_len) + + ntohl(hdr->data.publish.msg.message_len); + if (totlen != explen) return 1; + } + /* Ready to process the packet. Dispatch by type. */ sender = clusterLookupNode(hdr->sender); if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) { + int update_config = 0; redisLog(REDIS_DEBUG,"Ping packet received: %p", link->node); /* Add this node if it is new for us and the msg type is MEET. @@ -428,6 +559,7 @@ int clusterProcessPacket(clusterLink *link) { nodeIp2String(node->ip,link); node->port = ntohs(hdr->port); clusterAddNode(node); + update_config = 1; } /* Get info from the gossip section */ @@ -435,8 +567,12 @@ int clusterProcessPacket(clusterLink *link) { /* Anyway reply with a PONG */ clusterSendPing(link,CLUSTERMSG_TYPE_PONG); + + /* Update config if needed */ + if (update_config) clusterSaveConfigOrDie(); } else if (type == CLUSTERMSG_TYPE_PONG) { - int update = 0; + int update_state = 0; + int update_config = 0; redisLog(REDIS_DEBUG,"Pong packet received: %p", link->node); if (link->node) { @@ -457,6 +593,7 @@ int clusterProcessPacket(clusterLink *link) { redisLog(REDIS_DEBUG,"Handshake with node %.40s completed.", link->node->name); link->node->flags &= ~REDIS_NODE_HANDSHAKE; + update_config = 1; } else if (memcmp(link->node->name,hdr->sender, REDIS_CLUSTER_NAMELEN) != 0) { @@ -466,6 +603,7 @@ int clusterProcessPacket(clusterLink *link) { redisLog(REDIS_DEBUG,"PONG contains mismatching sender ID"); link->node->flags |= REDIS_NODE_NOADDR; freeClusterLink(link); + update_config = 1; /* FIXME: remove this node if we already have it. * * If we already have it but the IP is different, use @@ -475,7 +613,7 @@ int clusterProcessPacket(clusterLink *link) { } } /* Update our info about the node */ - link->node->pong_received = time(NULL); + if (link->node) link->node->pong_received = time(NULL); /* Update master/slave info */ if (sender) { @@ -511,7 +649,7 @@ int clusterProcessPacket(clusterLink *link) { server.cluster.slots[j]->flags & REDIS_NODE_FAIL) { server.cluster.slots[j] = sender; - update = 1; + update_state = update_config = 1; } } } @@ -522,21 +660,40 @@ int clusterProcessPacket(clusterLink *link) { clusterProcessGossipSection(hdr,link); /* Update the cluster state if needed */ - if (update) clusterUpdateState(); + if (update_state) clusterUpdateState(); + if (update_config) clusterSaveConfigOrDie(); } else if (type == CLUSTERMSG_TYPE_FAIL && sender) { clusterNode *failing; failing = clusterLookupNode(hdr->data.fail.about.nodename); - if (failing && !(failing->flags & REDIS_NODE_FAIL)) { + if (failing && !(failing->flags & (REDIS_NODE_FAIL|REDIS_NODE_MYSELF))) + { redisLog(REDIS_NOTICE, "FAIL message received from %.40s about %.40s", hdr->sender, hdr->data.fail.about.nodename); failing->flags |= REDIS_NODE_FAIL; failing->flags &= ~REDIS_NODE_PFAIL; clusterUpdateState(); + clusterSaveConfigOrDie(); + } + } else if (type == CLUSTERMSG_TYPE_PUBLISH) { + robj *channel, *message; + uint32_t channel_len, message_len; + + /* Don't bother creating useless objects if there are no Pub/Sub subscribers. */ + if (dictSize(server.pubsub_channels) || listLength(server.pubsub_patterns)) { + channel_len = ntohl(hdr->data.publish.msg.channel_len); + message_len = ntohl(hdr->data.publish.msg.message_len); + channel = createStringObject( + (char*)hdr->data.publish.msg.bulk_data,channel_len); + message = createStringObject( + (char*)hdr->data.publish.msg.bulk_data+channel_len, message_len); + pubsubPublishMessage(channel,message); + decrRefCount(channel); + decrRefCount(message); } } else { - redisLog(REDIS_NOTICE,"Received unknown packet type: %d", type); + redisLog(REDIS_WARNING,"Received unknown packet type: %d", type); } return 1; } @@ -629,9 +786,25 @@ void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) { link->sndbuf = sdscatlen(link->sndbuf, msg, msglen); } +/* Send a message to all the nodes with a reliable link */ +void clusterBroadcastMessage(void *buf, size_t len) { + dictIterator *di; + dictEntry *de; + + di = dictGetIterator(server.cluster.nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (!node->link) continue; + if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue; + clusterSendMessage(node->link,buf,len); + } + dictReleaseIterator(di); +} + /* Build the message header */ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { - int totlen; + int totlen = 0; memset(hdr,0,sizeof(*hdr)); hdr->type = htons(type); @@ -676,7 +849,7 @@ void clusterSendPing(clusterLink *link, int type) { /* Populate the gossip fields */ while(freshnodes > 0 && gossipcount < 3) { struct dictEntry *de = dictGetRandomKey(server.cluster.nodes); - clusterNode *this = dictGetEntryVal(de); + clusterNode *this = dictGetVal(de); clusterMsgDataGossip *gossip; int j; @@ -713,20 +886,48 @@ void clusterSendPing(clusterLink *link, int type) { clusterSendMessage(link,buf,totlen); } -/* Send a message to all the nodes with a reliable link */ -void clusterBroadcastMessage(void *buf, size_t len) { - dictIterator *di; - dictEntry *de; +/* Send a PUBLISH message. + * + * If link is NULL, then the message is broadcasted to the whole cluster. */ +void clusterSendPublish(clusterLink *link, robj *channel, robj *message) { + unsigned char buf[4096], *payload; + clusterMsg *hdr = (clusterMsg*) buf; + uint32_t totlen; + uint32_t channel_len, message_len; - di = dictGetIterator(server.cluster.nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetEntryVal(de); + channel = getDecodedObject(channel); + message = getDecodedObject(message); + channel_len = sdslen(channel->ptr); + message_len = sdslen(message->ptr); - if (!node->link) continue; - if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue; - clusterSendMessage(node->link,buf,len); + clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_PUBLISH); + totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + totlen += sizeof(clusterMsgDataPublish) + channel_len + message_len; + + hdr->data.publish.msg.channel_len = htonl(channel_len); + hdr->data.publish.msg.message_len = htonl(message_len); + hdr->totlen = htonl(totlen); + + /* Try to use the local buffer if possible */ + if (totlen < sizeof(buf)) { + payload = buf; + } else { + payload = zmalloc(totlen); + hdr = (clusterMsg*) payload; + memcpy(payload,hdr,sizeof(hdr)); } - dictReleaseIterator(di); + memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr)); + memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr), + message->ptr,sdslen(message->ptr)); + + if (link) + clusterSendMessage(link,payload,totlen); + else + clusterBroadcastMessage(payload,totlen); + + decrRefCount(channel); + decrRefCount(message); + if (payload != buf) zfree(payload); } /* Send a FAIL message to all the nodes we are able to contact. @@ -743,6 +944,17 @@ void clusterSendFail(char *nodename) { clusterBroadcastMessage(buf,ntohl(hdr->totlen)); } +/* ----------------------------------------------------------------------------- + * CLUSTER Pub/Sub support + * + * For now we do very little, just propagating PUBLISH messages across the whole + * cluster. In the future we'll try to get smarter and avoiding propagating those + * messages to hosts without receives for a given channel. + * -------------------------------------------------------------------------- */ +void clusterPropagatePublish(robj *channel, robj *message) { + clusterSendPublish(NULL, channel, message); +} + /* ----------------------------------------------------------------------------- * CLUSTER cron job * -------------------------------------------------------------------------- */ @@ -758,7 +970,7 @@ void clusterCron(void) { /* Check if we have disconnected nodes and reestablish the connection. */ di = dictGetIterator(server.cluster.nodes); while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetEntryVal(de); + clusterNode *node = dictGetVal(de); if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue; if (node->link == NULL) { @@ -784,7 +996,7 @@ void clusterCron(void) { * normal PING packets. */ node->flags &= ~REDIS_NODE_MEET; - redisLog(REDIS_NOTICE,"Connecting with Node %.40s at %s:%d\n", node->name, node->ip, node->port+REDIS_CLUSTER_PORT_INCR); + redisLog(REDIS_NOTICE,"Connecting with Node %.40s at %s:%d", node->name, node->ip, node->port+REDIS_CLUSTER_PORT_INCR); } } dictReleaseIterator(di); @@ -793,7 +1005,7 @@ void clusterCron(void) { * the oldest ping_sent time */ for (j = 0; j < 5; j++) { de = dictGetRandomKey(server.cluster.nodes); - clusterNode *this = dictGetEntryVal(de); + clusterNode *this = dictGetVal(de); if (this->link == NULL) continue; if (this->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue; @@ -810,26 +1022,38 @@ void clusterCron(void) { /* Iterate nodes to check if we need to flag something as failing */ di = dictGetIterator(server.cluster.nodes); while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetEntryVal(de); + clusterNode *node = dictGetVal(de); int delay; if (node->flags & - (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE| - REDIS_NODE_FAIL)) continue; + (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE)) + continue; /* Check only if we already sent a ping and did not received * a reply yet. */ if (node->ping_sent == 0 || node->ping_sent <= node->pong_received) continue; delay = time(NULL) - node->pong_received; - if (node->flags & REDIS_NODE_PFAIL) { + if (delay < server.cluster.node_timeout) { /* The PFAIL condition can be reversed without external * help if it is not transitive (that is, if it does not - * turn into a FAIL state). */ - if (delay < server.cluster.node_timeout) + * turn into a FAIL state). + * + * The FAIL condition is also reversible if there are no slaves + * for this host, so no slave election should be in progress. + * + * TODO: consider all the implications of resurrecting a + * FAIL node. */ + if (node->flags & REDIS_NODE_PFAIL) { node->flags &= ~REDIS_NODE_PFAIL; + } else if (node->flags & REDIS_NODE_FAIL && !node->numslaves) { + node->flags &= ~REDIS_NODE_FAIL; + clusterUpdateState(); + } } else { - if (delay >= server.cluster.node_timeout) { + /* Timeout reached. Set the noad se possibly failing if it is + * not already in this state. */ + if (!(node->flags & (REDIS_NODE_PFAIL|REDIS_NODE_FAIL))) { redisLog(REDIS_DEBUG,"*** NODE %.40s possibly failing", node->name); node->flags |= REDIS_NODE_PFAIL; @@ -873,9 +1097,21 @@ int clusterNodeGetSlotBit(clusterNode *n, int slot) { * If the slot is already assigned to another instance this is considered * an error and REDIS_ERR is returned. */ int clusterAddSlot(clusterNode *n, int slot) { - redisAssert(clusterNodeSetSlotBit(n,slot) == 0); - server.cluster.slots[slot] = server.cluster.myself; - printf("SLOT %d added to %.40s\n", slot, n->name); + if (clusterNodeSetSlotBit(n,slot) != 0) + return REDIS_ERR; + server.cluster.slots[slot] = n; + return REDIS_OK; +} + +/* Delete the specified slot marking it as unassigned. + * Returns REDIS_OK if the slot was assigned, otherwise if the slot was + * already unassigned REDIS_ERR is returned. */ +int clusterDelSlot(int slot) { + clusterNode *n = server.cluster.slots[slot]; + + if (!n) return REDIS_ERR; + redisAssert(clusterNodeClearSlotBit(n,slot) == 1); + server.cluster.slots[slot] = NULL; return REDIS_OK; } @@ -917,7 +1153,7 @@ sds clusterGenNodesDescription(void) { di = dictGetIterator(server.cluster.nodes); while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetEntryVal(de); + clusterNode *node = dictGetVal(de); /* Node coordinates */ ci = sdscatprintf(ci,"%.40s %s:%d ", @@ -946,7 +1182,8 @@ sds clusterGenNodesDescription(void) { ci = sdscatprintf(ci,"%ld %ld %s", (long) node->ping_sent, (long) node->pong_received, - node->link ? "connected" : "disconnected"); + (node->link || node->flags & REDIS_NODE_MYSELF) ? + "connected" : "disconnected"); /* Slots served by this instance */ start = -1; @@ -967,12 +1204,39 @@ sds clusterGenNodesDescription(void) { start = -1; } } + + /* Just for MYSELF node we also dump info about slots that + * we are migrating to other instances or importing from other + * instances. */ + if (node->flags & REDIS_NODE_MYSELF) { + for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { + if (server.cluster.migrating_slots_to[j]) { + ci = sdscatprintf(ci," [%d->-%.40s]",j, + server.cluster.migrating_slots_to[j]->name); + } else if (server.cluster.importing_slots_from[j]) { + ci = sdscatprintf(ci," [%d-<-%.40s]",j, + server.cluster.importing_slots_from[j]->name); + } + } + } + ci = sdscatlen(ci,"\n",1); } - ci = sdscatlen(ci,"\n",1); dictReleaseIterator(di); return ci; } +int getSlotOrReply(redisClient *c, robj *o) { + long long slot; + + if (getLongLongFromObject(o,&slot) != REDIS_OK || + slot < 0 || slot > REDIS_CLUSTER_SLOTS) + { + addReplyError(c,"Invalid or out of range slot"); + return -1; + } + return (int) slot; +} + void clusterCommand(redisClient *c) { if (server.cluster_enabled == 0) { addReplyError(c,"This instance has cluster support disabled"); @@ -1010,24 +1274,29 @@ void clusterCommand(redisClient *c) { o = createObject(REDIS_STRING,ci); addReplyBulk(c,o); decrRefCount(o); - } else if (!strcasecmp(c->argv[1]->ptr,"addslots") && c->argc >= 3) { - int j; - long long slot; + } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || + !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) + { + /* CLUSTER ADDSLOTS [slot] ... */ + /* CLUSTER DELSLOTS [slot] ... */ + int j, slot; unsigned char *slots = zmalloc(REDIS_CLUSTER_SLOTS); + int del = !strcasecmp(c->argv[1]->ptr,"delslots"); memset(slots,0,REDIS_CLUSTER_SLOTS); /* Check that all the arguments are parsable and that all the * slots are not already busy. */ for (j = 2; j < c->argc; j++) { - if (getLongLongFromObject(c->argv[j],&slot) != REDIS_OK || - slot < 0 || slot > REDIS_CLUSTER_SLOTS) - { - addReplyError(c,"Invalid or out of range slot index"); + if ((slot = getSlotOrReply(c,c->argv[j])) == -1) { zfree(slots); return; } - if (server.cluster.slots[slot]) { - addReplyErrorFormat(c,"Slot %lld is already busy", slot); + if (del && server.cluster.slots[slot] == NULL) { + addReplyErrorFormat(c,"Slot %d is already unassigned", slot); + zfree(slots); + return; + } else if (!del && server.cluster.slots[slot]) { + addReplyErrorFormat(c,"Slot %d is already busy", slot); zfree(slots); return; } @@ -1040,13 +1309,100 @@ void clusterCommand(redisClient *c) { } for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { if (slots[j]) { - int retval = clusterAddSlot(server.cluster.myself,j); - - redisAssert(retval == REDIS_OK); + int retval; + + /* If this slot was set as importing we can clear this + * state as now we are the real owner of the slot. */ + if (server.cluster.importing_slots_from[j]) + server.cluster.importing_slots_from[j] = NULL; + + retval = del ? clusterDelSlot(j) : + clusterAddSlot(server.cluster.myself,j); + redisAssertWithInfo(c,NULL,retval == REDIS_OK); } } zfree(slots); clusterUpdateState(); + clusterSaveConfigOrDie(); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { + /* SETSLOT 10 MIGRATING */ + /* SETSLOT 10 IMPORTING */ + /* SETSLOT 10 STABLE */ + /* SETSLOT 10 NODE */ + int slot; + clusterNode *n; + + if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return; + + if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { + if (server.cluster.slots[slot] != server.cluster.myself) { + addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); + return; + } + if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[4]->ptr); + return; + } + server.cluster.migrating_slots_to[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { + if (server.cluster.slots[slot] == server.cluster.myself) { + addReplyErrorFormat(c, + "I'm already the owner of hash slot %u",slot); + return; + } + if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[3]->ptr); + return; + } + server.cluster.importing_slots_from[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) { + /* CLUSTER SETSLOT STABLE */ + server.cluster.importing_slots_from[slot] = NULL; + server.cluster.migrating_slots_to[slot] = NULL; + } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) { + /* CLUSTER SETSLOT NODE */ + clusterNode *n = clusterLookupNode(c->argv[4]->ptr); + + if (!n) addReplyErrorFormat(c,"Unknown node %s", + (char*)c->argv[4]->ptr); + /* If this hash slot was served by 'myself' before to switch + * make sure there are no longer local keys for this hash slot. */ + if (server.cluster.slots[slot] == server.cluster.myself && + n != server.cluster.myself) + { + int numkeys; + robj **keys; + + keys = zmalloc(sizeof(robj*)*1); + numkeys = GetKeysInSlot(slot, keys, 1); + zfree(keys); + if (numkeys != 0) { + addReplyErrorFormat(c, "Can't assign hashslot %d to a different node while I still hold keys for this hash slot.", slot); + return; + } + } + /* If this node was the slot owner and the slot was marked as + * migrating, assigning the slot to another node will clear + * the migratig status. */ + if (server.cluster.slots[slot] == server.cluster.myself && + server.cluster.migrating_slots_to[slot]) + server.cluster.migrating_slots_to[slot] = NULL; + + /* If this node was importing this slot, assigning the slot to + * itself also clears the importing status. */ + if (n == server.cluster.myself && server.cluster.importing_slots_from[slot]) + server.cluster.importing_slots_from[slot] = NULL; + + clusterDelSlot(slot); + clusterAddSlot(n,slot); + } else { + addReplyError(c,"Invalid CLUSTER SETSLOT action or number of arguments"); + return; + } + clusterSaveConfigOrDie(); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { char *statestr[] = {"ok","fail","needhelp"}; @@ -1073,16 +1429,42 @@ void clusterCommand(redisClient *c) { "cluster_slots_ok:%d\r\n" "cluster_slots_pfail:%d\r\n" "cluster_slots_fail:%d\r\n" + "cluster_known_nodes:%lu\r\n" , statestr[server.cluster.state], slots_assigned, slots_ok, slots_pfail, - slots_fail + slots_fail, + dictSize(server.cluster.nodes) ); addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n", (unsigned long)sdslen(info))); addReplySds(c,info); addReply(c,shared.crlf); + } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { + sds key = c->argv[2]->ptr; + + addReplyLongLong(c,keyHashSlot(key,sdslen(key))); + } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) { + long long maxkeys, slot; + unsigned int numkeys, j; + robj **keys; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != REDIS_OK) + return; + if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) != REDIS_OK) + return; + if (slot < 0 || slot >= REDIS_CLUSTER_SLOTS || maxkeys < 0 || + maxkeys > 1024*1024) { + addReplyError(c,"Invalid slot or number of keys"); + return; + } + + keys = zmalloc(sizeof(robj*)*maxkeys); + numkeys = GetKeysInSlot(slot, keys, maxkeys); + addReplyMultiBulkLen(c,numkeys); + for (j = 0; j < numkeys; j++) addReplyBulk(c,keys[j]); + zfree(keys); } else { addReplyError(c,"Wrong CLUSTER subcommand or number of arguments"); } @@ -1094,14 +1476,13 @@ void clusterCommand(redisClient *c) { /* RESTORE key ttl serialized-value */ void restoreCommand(redisClient *c) { - FILE *fp; - char buf[64]; - robj *o; - unsigned char *data; long ttl; + rio payload; + int type; + robj *obj; /* Make sure this key does not already exist here... */ - if (dbExists(c->db,c->argv[1])) { + if (lookupKeyWrite(c->db,c->argv[1]) != NULL) { addReplyError(c,"Target key name is busy."); return; } @@ -1114,44 +1495,20 @@ void restoreCommand(redisClient *c) { return; } - /* rdbLoadObject() only works against file descriptors so we need to - * dump the serialized object into a file and reload. */ - snprintf(buf,sizeof(buf),"redis-restore-%d.tmp",getpid()); - fp = fopen(buf,"w+"); - if (!fp) { - redisLog(REDIS_WARNING,"Can't open tmp file for RESTORE: %s", - strerror(errno)); - addReplyErrorFormat(c,"RESTORE failed, tmp file creation error: %s", - strerror(errno)); - return; - } - unlink(buf); - - /* Write the actual data and rewind the file */ - data = (unsigned char*) c->argv[3]->ptr; - if (fwrite(data+1,sdslen((sds)data)-1,1,fp) != 1) { - redisLog(REDIS_WARNING,"Can't write against tmp file for RESTORE: %s", - strerror(errno)); - addReplyError(c,"RESTORE failed, tmp file I/O error."); - fclose(fp); - return; - } - rewind(fp); - - /* Finally create the object from the serialized dump and - * store it at the specified key. */ - o = rdbLoadObject(data[0],fp); - if (o == NULL) { - addReplyError(c,"Bad data format."); - fclose(fp); + rioInitWithBuffer(&payload,c->argv[3]->ptr); + if (((type = rdbLoadObjectType(&payload)) == -1) || + ((obj = rdbLoadObject(type,&payload)) == NULL)) + { + addReplyError(c,"Bad data format"); return; } - fclose(fp); /* Create the key and set the TTL if any */ - dbAdd(c->db,c->argv[1],o); + dbAdd(c->db,c->argv[1],obj); if (ttl) setExpire(c->db,c->argv[1],time(NULL)+ttl); + signalModifiedKey(c->db,c->argv[1]); addReply(c,shared.ok); + server.dirty++; } /* MIGRATE host port key dbid timeout */ @@ -1159,12 +1516,9 @@ void migrateCommand(redisClient *c) { int fd; long timeout; long dbid; - char buf[64]; - FILE *fp; time_t ttl; robj *o; - unsigned char type; - off_t payload_len; + rio cmd, payload; /* Sanity check */ if (getLongFromObjectOrReply(c,c->argv[5],&timeout,NULL) != REDIS_OK) @@ -1177,7 +1531,7 @@ void migrateCommand(redisClient *c) { * nothing to migrate (for instance the key expired in the meantime), but * we include such information in the reply string. */ if ((o = lookupKeyRead(c->db,c->argv[3])) == NULL) { - addReplySds(c,sdsnew("+NOKEY")); + addReplySds(c,sdsnew("+NOKEY\r\n")); return; } @@ -1194,54 +1548,41 @@ void migrateCommand(redisClient *c) { return; } - /* Create temp file */ - snprintf(buf,sizeof(buf),"redis-migrate-%d.tmp",getpid()); - fp = fopen(buf,"w+"); - if (!fp) { - redisLog(REDIS_WARNING,"Can't open tmp file for MIGRATE: %s", - strerror(errno)); - addReplyErrorFormat(c,"MIGRATE failed, tmp file creation error: %s.", - strerror(errno)); - return; - } - unlink(buf); - - /* Build the SELECT + RESTORE query writing it in our temp file. */ - if (fwriteBulkCount(fp,'*',2) == 0) goto file_wr_err; - if (fwriteBulkString(fp,"SELECT",6) == 0) goto file_wr_err; - if (fwriteBulkLongLong(fp,dbid) == 0) goto file_wr_err; + rioInitWithBuffer(&cmd,sdsempty()); + redisAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',2)); + redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"SELECT",6)); + redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,dbid)); ttl = getExpire(c->db,c->argv[3]); - type = o->type; - if (fwriteBulkCount(fp,'*',4) == 0) goto file_wr_err; - if (fwriteBulkString(fp,"RESTORE",7) == 0) goto file_wr_err; - if (fwriteBulkObject(fp,c->argv[3]) == 0) goto file_wr_err; - if (fwriteBulkLongLong(fp, (ttl == -1) ? 0 : ttl) == 0) goto file_wr_err; + redisAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',4)); + redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7)); + redisAssertWithInfo(c,NULL,c->argv[3]->encoding == REDIS_ENCODING_RAW); + redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,c->argv[3]->ptr,sdslen(c->argv[3]->ptr))); + redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,(ttl == -1) ? 0 : ttl)); /* Finally the last argument that is the serailized object payload - * in the form: . */ - payload_len = rdbSavedObjectLen(o); - if (fwriteBulkCount(fp,'$',payload_len+1) == 0) goto file_wr_err; - if (fwrite(&type,1,1,fp) == 0) goto file_wr_err; - if (rdbSaveObject(fp,o) == -1) goto file_wr_err; - if (fwrite("\r\n",2,1,fp) == 0) goto file_wr_err; - - /* Tranfer the query to the other node */ - rewind(fp); + * in the form: . */ + rioInitWithBuffer(&payload,sdsempty()); + redisAssertWithInfo(c,NULL,rdbSaveObjectType(&payload,o)); + redisAssertWithInfo(c,NULL,rdbSaveObject(&payload,o) != -1); + redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,payload.io.buffer.ptr,sdslen(payload.io.buffer.ptr))); + sdsfree(payload.io.buffer.ptr); + + /* Tranfer the query to the other node in 64K chunks. */ { - char buf[4096]; - size_t nread; - - while ((nread = fread(buf,1,sizeof(buf),fp)) != 0) { - int nwritten; - - nwritten = syncWrite(fd,buf,nread,timeout); - if (nwritten != (signed)nread) goto socket_wr_err; + sds buf = cmd.io.buffer.ptr; + size_t pos = 0, towrite; + int nwritten = 0; + + while ((towrite = sdslen(buf)-pos) > 0) { + towrite = (towrite > (64*1024) ? (64*1024) : towrite); + nwritten = syncWrite(fd,buf+nwritten,towrite,timeout); + if (nwritten != (signed)towrite) goto socket_wr_err; + pos += nwritten; } - if (ferror(fp)) goto file_rd_err; } - /* Read back the reply */ + /* Read back the reply. */ { char buf1[1024]; char buf2[1024]; @@ -1250,50 +1591,85 @@ void migrateCommand(redisClient *c) { if (syncReadLine(fd, buf1, sizeof(buf1), timeout) <= 0) goto socket_rd_err; if (syncReadLine(fd, buf2, sizeof(buf2), timeout) <= 0) - goto socket_rd_err; + goto socket_rd_err; if (buf1[0] == '-' || buf2[0] == '-') { addReplyErrorFormat(c,"Target instance replied with error: %s", (buf1[0] == '-') ? buf1+1 : buf2+1); } else { + robj *aux; + dbDelete(c->db,c->argv[3]); + signalModifiedKey(c->db,c->argv[3]); addReply(c,shared.ok); + server.dirty++; + + /* Translate MIGRATE as DEL for replication/AOF. */ + aux = createStringObject("DEL",3); + rewriteClientCommandVector(c,2,aux,c->argv[3]); + decrRefCount(aux); } } - fclose(fp); - close(fd); - return; - -file_wr_err: - redisLog(REDIS_WARNING,"Can't write on tmp file for MIGRATE: %s", - strerror(errno)); - addReplyErrorFormat(c,"MIGRATE failed, tmp file write error: %s.", - strerror(errno)); - fclose(fp); - close(fd); -file_rd_err: - redisLog(REDIS_WARNING,"Can't read from tmp file for MIGRATE: %s", - strerror(errno)); - addReplyErrorFormat(c,"MIGRATE failed, tmp file read error: %s.", - strerror(errno)); - fclose(fp); + sdsfree(cmd.io.buffer.ptr); close(fd); + return; socket_wr_err: redisLog(REDIS_NOTICE,"Can't write to target node for MIGRATE: %s", strerror(errno)); addReplyErrorFormat(c,"MIGRATE failed, writing to target node: %s.", strerror(errno)); - fclose(fp); + sdsfree(cmd.io.buffer.ptr); close(fd); + return; socket_rd_err: redisLog(REDIS_NOTICE,"Can't read from target node for MIGRATE: %s", strerror(errno)); addReplyErrorFormat(c,"MIGRATE failed, reading from target node: %s.", strerror(errno)); - fclose(fp); + sdsfree(cmd.io.buffer.ptr); close(fd); + return; +} + +/* DUMP keyname + * DUMP is actually not used by Redis Cluster but it is the obvious + * complement of RESTORE and can be useful for different applications. */ +void dumpCommand(redisClient *c) { + robj *o, *dumpobj; + rio payload; + + /* Check if the key is here. */ + if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) { + addReply(c,shared.nullbulk); + return; + } + + /* Serialize the object in a RDB-like format. It consist of an object type + * byte followed by the serialized object. This is understood by RESTORE. */ + rioInitWithBuffer(&payload,sdsempty()); + redisAssertWithInfo(c,NULL,rdbSaveObjectType(&payload,o)); + redisAssertWithInfo(c,NULL,rdbSaveObject(&payload,o)); + + /* Transfer to the client */ + dumpobj = createObject(REDIS_STRING,payload.io.buffer.ptr); + addReplyBulk(c,dumpobj); + decrRefCount(dumpobj); + return; +} + +/* The ASKING command is required after a -ASK redirection. + * The client should issue ASKING before to actualy send the command to + * the target instance. See the Redis Cluster specification for more + * information. */ +void askingCommand(redisClient *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + c->flags |= REDIS_ASKING; + addReply(c,shared.ok); } /* ----------------------------------------------------------------------------- @@ -1303,12 +1679,19 @@ socket_rd_err: /* Return the pointer to the cluster node that is able to serve the query * as all the keys belong to hash slots for which the node is in charge. * - * If keys in query spawn multiple nodes NULL is returned. */ -clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot) { + * If the returned node should be used only for this request, the *ask + * integer is set to '1', otherwise to '0'. This is used in order to + * let the caller know if we should reply with -MOVED or with -ASK. + * + * If the request contains more than a single key NULL is returned, + * however a request with more then a key argument where the key is always + * the same is valid, like in: RPOPLPUSH mylist mylist.*/ +clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask) { clusterNode *n = NULL; + robj *firstkey = NULL; multiState *ms, _ms; multiCmd mc; - int i; + int i, slot = 0; /* We handle all the cases as if they were EXEC commands, so we have * a common code path for everything */ @@ -1318,7 +1701,9 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg if (!(c->flags & REDIS_MULTI)) return server.cluster.myself; ms = &c->mstate; } else { - /* Create a fake Multi State structure, with just one command */ + /* In order to have a single codepath create a fake Multi State + * structure if the client is not in MULTI/EXEC state, this way + * we have a single codepath below. */ ms = &_ms; _ms.commands = &mc; _ms.count = 1; @@ -1327,6 +1712,8 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg mc.cmd = cmd; } + /* Check that all the keys are the same key, and get the slot and + * node for this key. */ for (i = 0; i < ms->count; i++) { struct redisCommand *mcmd; robj **margv; @@ -1337,26 +1724,53 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg margv = ms->commands[i].argv; keyindex = getKeysFromCommand(mcmd,margv,margc,&numkeys, - REDIS_GETKEYS_PRELOAD); + REDIS_GETKEYS_ALL); for (j = 0; j < numkeys; j++) { - int slot = keyHashSlot((char*)margv[keyindex[j]]->ptr, - sdslen(margv[keyindex[j]]->ptr)); - struct clusterNode *slotnode; - - slotnode = server.cluster.slots[slot]; - if (hashslot) *hashslot = slot; - /* Node not assigned? (Should never happen actually - * if we reached this function). - * Different node than the previous one? - * Return NULL, the cluster can't serve multi-node requests */ - if (slotnode == NULL || (n && slotnode != n)) { - getKeysFreeResult(keyindex); - return NULL; + if (firstkey == NULL) { + /* This is the first key we see. Check what is the slot + * and node. */ + firstkey = margv[keyindex[j]]; + + slot = keyHashSlot((char*)firstkey->ptr, sdslen(firstkey->ptr)); + n = server.cluster.slots[slot]; + redisAssertWithInfo(c,firstkey,n != NULL); } else { - n = slotnode; + /* If it is not the first key, make sure it is exactly + * the same key as the first we saw. */ + if (!equalStringObjects(firstkey,margv[keyindex[j]])) { + decrRefCount(firstkey); + getKeysFreeResult(keyindex); + return NULL; + } } } getKeysFreeResult(keyindex); } - return (n == NULL) ? server.cluster.myself : n; + if (ask) *ask = 0; /* This is the default. Set to 1 if needed later. */ + /* No key at all in command? then we can serve the request + * without redirections. */ + if (n == NULL) return server.cluster.myself; + if (hashslot) *hashslot = slot; + /* This request is about a slot we are migrating into another instance? + * Then we need to check if we have the key. If we have it we can reply. + * If instead is a new key, we pass the request to the node that is + * receiving the slot. */ + if (n == server.cluster.myself && + server.cluster.migrating_slots_to[slot] != NULL) + { + if (lookupKeyRead(&server.db[0],firstkey) == NULL) { + if (ask) *ask = 1; + return server.cluster.migrating_slots_to[slot]; + } + } + /* Handle the case in which we are receiving this hash slot from + * another instance, so we'll accept the query even if in the table + * it is assigned to a different node, but only if the client + * issued an ASKING command before. */ + if (server.cluster.importing_slots_from[slot] != NULL && + c->flags & REDIS_ASKING) { + return server.cluster.myself; + } + /* It's not a -ASK case. Base case: just return the right node. */ + return n; }