X-Git-Url: https://git.saurik.com/redis.git/blobdiff_plain/73fac227a0dd3a7d25e29b4718f8588f55db07d6..af0b220756571bc8faf57a0c7b7389dd86a60376:/src/cluster.c diff --git a/src/cluster.c b/src/cluster.c index 42b01113..cbcdf373 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1,8 +1,40 @@ +/* Redis Cluster implementation. + * + * Copyright (c) 2009-2012, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + #include "redis.h" +#include "endianconv.h" #include #include #include +#include void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask); @@ -19,20 +51,6 @@ int clusterAddSlot(clusterNode *n, int slot); * Initialization * -------------------------------------------------------------------------- */ -void clusterGetRandomName(char *p) { - FILE *fp = fopen("/dev/urandom","r"); - char *charset = "0123456789abcdef"; - int j; - - if (fp == NULL || fread(p,REDIS_CLUSTER_NAMELEN,1,fp) == 0) { - for (j = 0; j < REDIS_CLUSTER_NAMELEN; j++) - p[j] = rand(); - } - for (j = 0; j < REDIS_CLUSTER_NAMELEN; j++) - p[j] = charset[p[j] & 0x0F]; - fclose(fp); -} - int clusterLoadConfig(char *filename) { FILE *fp = fopen(filename,"r"); char *line; @@ -222,7 +240,7 @@ void clusterInit(void) { exit(1); } if (aeCreateFileEvent(server.el, server.cfd, AE_READABLE, - clusterAcceptHandler, NULL) == AE_ERR) oom("creating file event"); + clusterAcceptHandler, NULL) == AE_ERR) redisPanic("Unrecoverable error creating Redis Cluster file event."); server.cluster.slots_to_keys = zslCreate(); } @@ -304,7 +322,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { if (nodename) memcpy(node->name, nodename, REDIS_CLUSTER_NAMELEN); else - clusterGetRandomName(node->name); + getRandomHexChars(node->name, REDIS_CLUSTER_NAMELEN); node->flags = flags; memset(node->slots,0,sizeof(node->slots)); node->numslaves = 0; @@ -377,7 +395,7 @@ clusterNode *clusterLookupNode(char *name) { de = dictFind(server.cluster.nodes,s); sdsfree(s); if (de == NULL) return NULL; - return dictGetEntryVal(de); + return dictGetVal(de); } /* This is only used after the handshake. When we connect a given IP/PORT @@ -439,7 +457,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { * time PONG figure if it is newer than our figure. * Note that it's not a problem if we have a PING already * in progress against this node. */ - if (node->pong_received < ntohl(g->pong_received)) { + if (node->pong_received < (signed) ntohl(g->pong_received)) { redisLog(REDIS_DEBUG,"Node pong_received updated by gossip"); node->pong_received = ntohl(g->pong_received); } @@ -613,7 +631,7 @@ int clusterProcessPacket(clusterLink *link) { } } /* Update our info about the node */ - link->node->pong_received = time(NULL); + if (link->node) link->node->pong_received = time(NULL); /* Update master/slave info */ if (sender) { @@ -793,7 +811,7 @@ void clusterBroadcastMessage(void *buf, size_t len) { di = dictGetIterator(server.cluster.nodes); while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetEntryVal(de); + clusterNode *node = dictGetVal(de); if (!node->link) continue; if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue; @@ -804,7 +822,7 @@ void clusterBroadcastMessage(void *buf, size_t len) { /* Build the message header */ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { - int totlen; + int totlen = 0; memset(hdr,0,sizeof(*hdr)); hdr->type = htons(type); @@ -849,7 +867,7 @@ void clusterSendPing(clusterLink *link, int type) { /* Populate the gossip fields */ while(freshnodes > 0 && gossipcount < 3) { struct dictEntry *de = dictGetRandomKey(server.cluster.nodes); - clusterNode *this = dictGetEntryVal(de); + clusterNode *this = dictGetVal(de); clusterMsgDataGossip *gossip; int j; @@ -914,7 +932,7 @@ void clusterSendPublish(clusterLink *link, robj *channel, robj *message) { } else { payload = zmalloc(totlen); hdr = (clusterMsg*) payload; - memcpy(payload,hdr,sizeof(hdr)); + memcpy(payload,hdr,sizeof(*hdr)); } memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr)); memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr), @@ -970,7 +988,7 @@ void clusterCron(void) { /* Check if we have disconnected nodes and reestablish the connection. */ di = dictGetIterator(server.cluster.nodes); while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetEntryVal(de); + clusterNode *node = dictGetVal(de); if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR)) continue; if (node->link == NULL) { @@ -1005,7 +1023,7 @@ void clusterCron(void) { * the oldest ping_sent time */ for (j = 0; j < 5; j++) { de = dictGetRandomKey(server.cluster.nodes); - clusterNode *this = dictGetEntryVal(de); + clusterNode *this = dictGetVal(de); if (this->link == NULL) continue; if (this->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue; @@ -1022,7 +1040,7 @@ void clusterCron(void) { /* Iterate nodes to check if we need to flag something as failing */ di = dictGetIterator(server.cluster.nodes); while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetEntryVal(de); + clusterNode *node = dictGetVal(de); int delay; if (node->flags & @@ -1153,7 +1171,7 @@ sds clusterGenNodesDescription(void) { di = dictGetIterator(server.cluster.nodes); while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetEntryVal(de); + clusterNode *node = dictGetVal(de); /* Node coordinates */ ci = sdscatprintf(ci,"%.40s %s:%d ", @@ -1471,18 +1489,105 @@ void clusterCommand(redisClient *c) { } /* ----------------------------------------------------------------------------- - * RESTORE and MIGRATE commands + * DUMP, RESTORE and MIGRATE commands * -------------------------------------------------------------------------- */ -/* RESTORE key ttl serialized-value */ +/* Generates a DUMP-format representation of the object 'o', adding it to the + * io stream pointed by 'rio'. This function can't fail. */ +void createDumpPayload(rio *payload, robj *o) { + unsigned char buf[2]; + uint64_t crc; + + /* Serialize the object in a RDB-like format. It consist of an object type + * byte followed by the serialized object. This is understood by RESTORE. */ + rioInitWithBuffer(payload,sdsempty()); + redisAssert(rdbSaveObjectType(payload,o)); + redisAssert(rdbSaveObject(payload,o)); + + /* Write the footer, this is how it looks like: + * ----------------+---------------------+---------------+ + * ... RDB payload | 2 bytes RDB version | 8 bytes CRC64 | + * ----------------+---------------------+---------------+ + * RDB version and CRC are both in little endian. + */ + + /* RDB version */ + buf[0] = REDIS_RDB_VERSION & 0xff; + buf[1] = (REDIS_RDB_VERSION >> 8) & 0xff; + payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,buf,2); + + /* CRC64 */ + crc = crc64(0,(unsigned char*)payload->io.buffer.ptr, + sdslen(payload->io.buffer.ptr)); + memrev64ifbe(&crc); + payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,&crc,8); +} + +/* Verify that the RDB version of the dump payload matches the one of this Redis + * instance and that the checksum is ok. + * If the DUMP payload looks valid REDIS_OK is returned, otherwise REDIS_ERR + * is returned. */ +int verifyDumpPayload(unsigned char *p, size_t len) { + unsigned char *footer; + uint16_t rdbver; + uint64_t crc; + + /* At least 2 bytes of RDB version and 8 of CRC64 should be present. */ + if (len < 10) return REDIS_ERR; + footer = p+(len-10); + + /* Verify RDB version */ + rdbver = (footer[1] << 8) | footer[0]; + if (rdbver != REDIS_RDB_VERSION) return REDIS_ERR; + + /* Verify CRC64 */ + crc = crc64(0,p,len-8); + memrev64ifbe(&crc); + return (memcmp(&crc,footer+2,8) == 0) ? REDIS_OK : REDIS_ERR; +} + +/* DUMP keyname + * DUMP is actually not used by Redis Cluster but it is the obvious + * complement of RESTORE and can be useful for different applications. */ +void dumpCommand(redisClient *c) { + robj *o, *dumpobj; + rio payload; + + /* Check if the key is here. */ + if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) { + addReply(c,shared.nullbulk); + return; + } + + /* Create the DUMP encoded representation. */ + createDumpPayload(&payload,o); + + /* Transfer to the client */ + dumpobj = createObject(REDIS_STRING,payload.io.buffer.ptr); + addReplyBulk(c,dumpobj); + decrRefCount(dumpobj); + return; +} + +/* RESTORE key ttl serialized-value [REPLACE] */ void restoreCommand(redisClient *c) { long ttl; rio payload; - int type; + int j, type, replace = 0; robj *obj; + /* Parse additional options */ + for (j = 4; j < c->argc; j++) { + if (!strcasecmp(c->argv[j]->ptr,"replace")) { + replace = 1; + } else { + addReply(c,shared.syntaxerr); + return; + } + } + /* Make sure this key does not already exist here... */ - if (lookupKeyWrite(c->db,c->argv[1]) != NULL) { + if (!replace && lookupKeyWrite(c->db,c->argv[1]) != NULL) { addReplyError(c,"Target key name is busy."); return; } @@ -1495,6 +1600,12 @@ void restoreCommand(redisClient *c) { return; } + /* Verify RDB version and data checksum. */ + if (verifyDumpPayload(c->argv[3]->ptr,sdslen(c->argv[3]->ptr)) == REDIS_ERR) { + addReplyError(c,"DUMP payload version or checksum are wrong"); + return; + } + rioInitWithBuffer(&payload,c->argv[3]->ptr); if (((type = rdbLoadObjectType(&payload)) == -1) || ((obj = rdbLoadObject(type,&payload)) == NULL)) @@ -1503,29 +1614,165 @@ void restoreCommand(redisClient *c) { return; } + /* Remove the old key if needed. */ + if (replace) dbDelete(c->db,c->argv[1]); + /* Create the key and set the TTL if any */ dbAdd(c->db,c->argv[1],obj); - if (ttl) setExpire(c->db,c->argv[1],time(NULL)+ttl); + if (ttl) setExpire(c->db,c->argv[1],mstime()+ttl); signalModifiedKey(c->db,c->argv[1]); addReply(c,shared.ok); server.dirty++; } -/* MIGRATE host port key dbid timeout */ -void migrateCommand(redisClient *c) { +/* MIGRATE socket cache implementation. + * + * We take a map between host:ip and a TCP socket that we used to connect + * to this instance in recent time. + * This sockets are closed when the max number we cache is reached, and also + * in serverCron() when they are around for more than a few seconds. */ +#define MIGRATE_SOCKET_CACHE_ITEMS 64 /* max num of items in the cache. */ +#define MIGRATE_SOCKET_CACHE_TTL 10 /* close cached socekts after 10 sec. */ + +typedef struct migrateCachedSocket { + int fd; + time_t last_use_time; +} migrateCachedSocket; + +/* Return a TCP scoket connected with the target instance, possibly returning + * a cached one. + * + * This function is responsible of sending errors to the client if a + * connection can't be established. In this case -1 is returned. + * Otherwise on success the socket is returned, and the caller should not + * attempt to free it after usage. + * + * If the caller detects an error while using the socket, migrateCloseSocket() + * should be called so that the connection will be craeted from scratch + * the next time. */ +int migrateGetSocket(redisClient *c, robj *host, robj *port, long timeout) { int fd; + sds name = sdsempty(); + migrateCachedSocket *cs; + + /* Check if we have an already cached socket for this ip:port pair. */ + name = sdscatlen(name,host->ptr,sdslen(host->ptr)); + name = sdscatlen(name,":",1); + name = sdscatlen(name,port->ptr,sdslen(port->ptr)); + cs = dictFetchValue(server.migrate_cached_sockets,name); + if (cs) { + sdsfree(name); + cs->last_use_time = server.unixtime; + return cs->fd; + } + + /* No cached socket, create one. */ + if (dictSize(server.migrate_cached_sockets) == MIGRATE_SOCKET_CACHE_ITEMS) { + /* Too many items, drop one at random. */ + dictEntry *de = dictGetRandomKey(server.migrate_cached_sockets); + cs = dictGetVal(de); + close(cs->fd); + zfree(cs); + dictDelete(server.migrate_cached_sockets,dictGetKey(de)); + } + + /* Create the socket */ + fd = anetTcpNonBlockConnect(server.neterr,c->argv[1]->ptr, + atoi(c->argv[2]->ptr)); + if (fd == -1) { + sdsfree(name); + addReplyErrorFormat(c,"Can't connect to target node: %s", + server.neterr); + return -1; + } + anetTcpNoDelay(server.neterr,fd); + + /* Check if it connects within the specified timeout. */ + if ((aeWait(fd,AE_WRITABLE,timeout) & AE_WRITABLE) == 0) { + sdsfree(name); + addReplySds(c,sdsnew("-IOERR error or timeout connecting to the client\r\n")); + close(fd); + return -1; + } + + /* Add to the cache and return it to the caller. */ + cs = zmalloc(sizeof(*cs)); + cs->fd = fd; + cs->last_use_time = server.unixtime; + dictAdd(server.migrate_cached_sockets,name,cs); + return fd; +} + +/* Free a migrate cached connection. */ +void migrateCloseSocket(robj *host, robj *port) { + sds name = sdsempty(); + migrateCachedSocket *cs; + + name = sdscatlen(name,host->ptr,sdslen(host->ptr)); + name = sdscatlen(name,":",1); + name = sdscatlen(name,port->ptr,sdslen(port->ptr)); + cs = dictFetchValue(server.migrate_cached_sockets,name); + if (!cs) { + sdsfree(name); + return; + } + + close(cs->fd); + zfree(cs); + dictDelete(server.migrate_cached_sockets,name); + sdsfree(name); +} + +void migrateCloseTimedoutSockets(void) { + dictIterator *di = dictGetSafeIterator(server.migrate_cached_sockets); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + migrateCachedSocket *cs = dictGetVal(de); + + if ((server.unixtime - cs->last_use_time) > MIGRATE_SOCKET_CACHE_TTL) { + close(cs->fd); + zfree(cs); + dictDelete(server.migrate_cached_sockets,dictGetKey(de)); + } + } + dictReleaseIterator(di); +} + +/* MIGRATE host port key dbid timeout [COPY | REPLACE] */ +void migrateCommand(redisClient *c) { + int fd, copy, replace, j; long timeout; long dbid; - time_t ttl; + long long ttl, expireat; robj *o; rio cmd, payload; + int retry_num = 0; + +try_again: + /* Initialization */ + copy = 0; + replace = 0; + ttl = 0; + + /* Parse additional options */ + for (j = 6; j < c->argc; j++) { + if (!strcasecmp(c->argv[j]->ptr,"copy")) { + copy = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"replace")) { + replace = 1; + } else { + addReply(c,shared.syntaxerr); + return; + } + } /* Sanity check */ if (getLongFromObjectOrReply(c,c->argv[5],&timeout,NULL) != REDIS_OK) return; if (getLongFromObjectOrReply(c,c->argv[4],&dbid,NULL) != REDIS_OK) return; - if (timeout <= 0) timeout = 1; + if (timeout <= 0) timeout = 1000; /* Check if the key is here. If not we reply with success as there is * nothing to migrate (for instance the key expired in the meantime), but @@ -1536,39 +1783,40 @@ void migrateCommand(redisClient *c) { } /* Connect */ - fd = anetTcpNonBlockConnect(server.neterr,c->argv[1]->ptr, - atoi(c->argv[2]->ptr)); - if (fd == -1) { - addReplyErrorFormat(c,"Can't connect to target node: %s", - server.neterr); - return; - } - if ((aeWait(fd,AE_WRITABLE,timeout*1000) & AE_WRITABLE) == 0) { - addReplyError(c,"Timeout connecting to the client"); - return; - } + fd = migrateGetSocket(c,c->argv[1],c->argv[2],timeout); + if (fd == -1) return; /* error sent to the client by migrateGetSocket() */ + /* Create RESTORE payload and generate the protocol to call the command. */ rioInitWithBuffer(&cmd,sdsempty()); redisAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',2)); redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"SELECT",6)); redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,dbid)); - ttl = getExpire(c->db,c->argv[3]); - redisAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',4)); + expireat = getExpire(c->db,c->argv[3]); + if (expireat != -1) { + ttl = expireat-mstime(); + if (ttl < 1) ttl = 1; + } + redisAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',replace ? 5 : 4)); redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7)); redisAssertWithInfo(c,NULL,c->argv[3]->encoding == REDIS_ENCODING_RAW); redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,c->argv[3]->ptr,sdslen(c->argv[3]->ptr))); - redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,(ttl == -1) ? 0 : ttl)); - - /* Finally the last argument that is the serailized object payload - * in the form: . */ - rioInitWithBuffer(&payload,sdsempty()); - redisAssertWithInfo(c,NULL,rdbSaveObjectType(&payload,o)); - redisAssertWithInfo(c,NULL,rdbSaveObject(&payload,o) != -1); - redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,payload.io.buffer.ptr,sdslen(payload.io.buffer.ptr))); + redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl)); + + /* Emit the payload argument, that is the serailized object using + * the DUMP format. */ + createDumpPayload(&payload,o); + redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,payload.io.buffer.ptr, + sdslen(payload.io.buffer.ptr))); sdsfree(payload.io.buffer.ptr); + /* Add the REPLACE option to the RESTORE command if it was specified + * as a MIGRATE option. */ + if (replace) + redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"REPLACE",7)); + /* Tranfer the query to the other node in 64K chunks. */ + errno = 0; { sds buf = cmd.io.buffer.ptr; size_t pos = 0, towrite; @@ -1576,7 +1824,7 @@ void migrateCommand(redisClient *c) { while ((towrite = sdslen(buf)-pos) > 0) { towrite = (towrite > (64*1024) ? (64*1024) : towrite); - nwritten = syncWrite(fd,buf+nwritten,towrite,timeout); + nwritten = syncWrite(fd,buf+pos,towrite,timeout); if (nwritten != (signed)towrite) goto socket_wr_err; pos += nwritten; } @@ -1598,8 +1846,11 @@ void migrateCommand(redisClient *c) { } else { robj *aux; - dbDelete(c->db,c->argv[3]); - signalModifiedKey(c->db,c->argv[3]); + if (!copy) { + /* No COPY option: remove the local key, signal the change. */ + dbDelete(c->db,c->argv[3]); + signalModifiedKey(c->db,c->argv[3]); + } addReply(c,shared.ok); server.dirty++; @@ -1611,51 +1862,22 @@ void migrateCommand(redisClient *c) { } sdsfree(cmd.io.buffer.ptr); - close(fd); return; socket_wr_err: - redisLog(REDIS_NOTICE,"Can't write to target node for MIGRATE: %s", - strerror(errno)); - addReplyErrorFormat(c,"MIGRATE failed, writing to target node: %s.", - strerror(errno)); sdsfree(cmd.io.buffer.ptr); - close(fd); + migrateCloseSocket(c->argv[1],c->argv[2]); + if (errno != ETIMEDOUT && retry_num++ == 0) goto try_again; + addReplySds(c, + sdsnew("-IOERR error or timeout writing to target instance\r\n")); return; socket_rd_err: - redisLog(REDIS_NOTICE,"Can't read from target node for MIGRATE: %s", - strerror(errno)); - addReplyErrorFormat(c,"MIGRATE failed, reading from target node: %s.", - strerror(errno)); sdsfree(cmd.io.buffer.ptr); - close(fd); - return; -} - -/* DUMP keyname - * DUMP is actually not used by Redis Cluster but it is the obvious - * complement of RESTORE and can be useful for different applications. */ -void dumpCommand(redisClient *c) { - robj *o, *dumpobj; - rio payload; - - /* Check if the key is here. */ - if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) { - addReply(c,shared.nullbulk); - return; - } - - /* Serialize the object in a RDB-like format. It consist of an object type - * byte followed by the serialized object. This is understood by RESTORE. */ - rioInitWithBuffer(&payload,sdsempty()); - redisAssertWithInfo(c,NULL,rdbSaveObjectType(&payload,o)); - redisAssertWithInfo(c,NULL,rdbSaveObject(&payload,o)); - - /* Transfer to the client */ - dumpobj = createObject(REDIS_STRING,payload.io.buffer.ptr); - addReplyBulk(c,dumpobj); - decrRefCount(dumpobj); + migrateCloseSocket(c->argv[1],c->argv[2]); + if (errno != ETIMEDOUT && retry_num++ == 0) goto try_again; + addReplySds(c, + sdsnew("-IOERR error or timeout reading from target node\r\n")); return; }