X-Git-Url: https://git.saurik.com/redis.git/blobdiff_plain/74a640492fe2547ff6737d598b32896840c2bd1d..af0b220756571bc8faf57a0c7b7389dd86a60376:/src/networking.c?ds=inline diff --git a/src/networking.c b/src/networking.c index 29eed888..c0dd4d0d 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1,6 +1,45 @@ +/* + * Copyright (c) 2009-2012, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + #include "redis.h" #include +static void setProtocolError(redisClient *c, int pos); + +/* To evaluate the output buffer size of a client we need to get size of + * allocated objects, however we can't used zmalloc_size() directly on sds + * strings because of the trick they use to work (the header is before the + * returned pointer), so we use this helper function. */ +size_t zmalloc_size_sds(sds s) { + return zmalloc_size(s-sizeof(struct sdshdr)); +} + void *dupClientReplyValue(void *o) { incrRefCount((robj*)o); return o; @@ -12,32 +51,43 @@ int listMatchObjects(void *a, void *b) { redisClient *createClient(int fd) { redisClient *c = zmalloc(sizeof(redisClient)); - c->bufpos = 0; - anetNonBlock(NULL,fd); - anetTcpNoDelay(NULL,fd); - if (aeCreateFileEvent(server.el,fd,AE_READABLE, - readQueryFromClient, c) == AE_ERR) - { - close(fd); - zfree(c); - return NULL; + /* passing -1 as fd it is possible to create a non connected client. + * This is useful since all the Redis commands needs to be executed + * in the context of a client. When commands are executed in other + * contexts (for instance a Lua script) we need a non connected client. */ + if (fd != -1) { + anetNonBlock(NULL,fd); + anetTcpNoDelay(NULL,fd); + if (aeCreateFileEvent(server.el,fd,AE_READABLE, + readQueryFromClient, c) == AE_ERR) + { + close(fd); + zfree(c); + return NULL; + } } selectDb(c,0); c->fd = fd; + c->bufpos = 0; c->querybuf = sdsempty(); + c->querybuf_peak = 0; c->reqtype = 0; c->argc = 0; c->argv = NULL; + c->cmd = c->lastcmd = NULL; c->multibulklen = 0; c->bulklen = -1; c->sentlen = 0; c->flags = 0; - c->lastinteraction = time(NULL); + c->ctime = c->lastinteraction = server.unixtime; c->authenticated = 0; c->replstate = REDIS_REPL_NONE; + c->slave_listening_port = 0; c->reply = listCreate(); + c->reply_bytes = 0; + c->obuf_soft_limit_reached_time = 0; listSetFreeMethod(c->reply,decrRefCount); listSetDupMethod(c->reply,dupClientReplyValue); c->bpop.keys = NULL; @@ -51,15 +101,28 @@ redisClient *createClient(int fd) { c->pubsub_patterns = listCreate(); listSetFreeMethod(c->pubsub_patterns,decrRefCount); listSetMatchMethod(c->pubsub_patterns,listMatchObjects); - listAddNodeTail(server.clients,c); + if (fd != -1) listAddNodeTail(server.clients,c); initClientMultiState(c); return c; } -/* Set the event loop to listen for write events on the client's socket. - * Typically gets called every time a reply is built. */ -int _installWriteEvent(redisClient *c) { - if (c->fd <= 0) return REDIS_ERR; +/* This function is called every time we are going to transmit new data + * to the client. The behavior is the following: + * + * If the client should receive new data (normal clients will) the function + * returns REDIS_OK, and make sure to install the write handler in our event + * loop so that when the socket is writable new data gets written. + * + * If the client should not receive new data, because it is a fake client + * or a slave, or because the setup of the write handler failed, the function + * returns REDIS_ERR. + * + * Typically gets called every time a reply is built, before adding more + * data to the clients output buffers. If the function returns REDIS_ERR no + * data should be appended to the output buffers. */ +int prepareClientToWrite(redisClient *c) { + if (c->flags & REDIS_LUA_CLIENT) return REDIS_OK; + if (c->fd <= 0) return REDIS_ERR; /* Fake client */ if (c->bufpos == 0 && listLength(c->reply) == 0 && (c->replstate == REDIS_REPL_NONE || c->replstate == REDIS_REPL_ONLINE) && @@ -113,6 +176,7 @@ void _addReplyObjectToList(redisClient *c, robj *o) { if (listLength(c->reply) == 0) { incrRefCount(o); listAddNodeTail(c->reply,o); + c->reply_bytes += zmalloc_size_sds(o->ptr); } else { tail = listNodeValue(listLast(c->reply)); @@ -120,13 +184,17 @@ void _addReplyObjectToList(redisClient *c, robj *o) { if (tail->ptr != NULL && sdslen(tail->ptr)+sdslen(o->ptr) <= REDIS_REPLY_CHUNK_BYTES) { + c->reply_bytes -= zmalloc_size_sds(tail->ptr); tail = dupLastObjectIfNeeded(c->reply); tail->ptr = sdscatlen(tail->ptr,o->ptr,sdslen(o->ptr)); + c->reply_bytes += zmalloc_size_sds(tail->ptr); } else { incrRefCount(o); listAddNodeTail(c->reply,o); + c->reply_bytes += zmalloc_size_sds(o->ptr); } } + asyncCloseClientOnOutputBufferLimitReached(c); } /* This method takes responsibility over the sds. When it is no longer @@ -141,6 +209,7 @@ void _addReplySdsToList(redisClient *c, sds s) { if (listLength(c->reply) == 0) { listAddNodeTail(c->reply,createObject(REDIS_STRING,s)); + c->reply_bytes += zmalloc_size_sds(s); } else { tail = listNodeValue(listLast(c->reply)); @@ -148,13 +217,17 @@ void _addReplySdsToList(redisClient *c, sds s) { if (tail->ptr != NULL && sdslen(tail->ptr)+sdslen(s) <= REDIS_REPLY_CHUNK_BYTES) { + c->reply_bytes -= zmalloc_size_sds(tail->ptr); tail = dupLastObjectIfNeeded(c->reply); tail->ptr = sdscatlen(tail->ptr,s,sdslen(s)); + c->reply_bytes += zmalloc_size_sds(tail->ptr); sdsfree(s); } else { listAddNodeTail(c->reply,createObject(REDIS_STRING,s)); + c->reply_bytes += zmalloc_size_sds(s); } } + asyncCloseClientOnOutputBufferLimitReached(c); } void _addReplyStringToList(redisClient *c, char *s, size_t len) { @@ -163,7 +236,10 @@ void _addReplyStringToList(redisClient *c, char *s, size_t len) { if (c->flags & REDIS_CLOSE_AFTER_REPLY) return; if (listLength(c->reply) == 0) { - listAddNodeTail(c->reply,createStringObject(s,len)); + robj *o = createStringObject(s,len); + + listAddNodeTail(c->reply,o); + c->reply_bytes += zmalloc_size_sds(o->ptr); } else { tail = listNodeValue(listLast(c->reply)); @@ -171,12 +247,18 @@ void _addReplyStringToList(redisClient *c, char *s, size_t len) { if (tail->ptr != NULL && sdslen(tail->ptr)+len <= REDIS_REPLY_CHUNK_BYTES) { + c->reply_bytes -= zmalloc_size_sds(tail->ptr); tail = dupLastObjectIfNeeded(c->reply); tail->ptr = sdscatlen(tail->ptr,s,len); + c->reply_bytes += zmalloc_size_sds(tail->ptr); } else { - listAddNodeTail(c->reply,createStringObject(s,len)); + robj *o = createStringObject(s,len); + + listAddNodeTail(c->reply,o); + c->reply_bytes += zmalloc_size_sds(o->ptr); } } + asyncCloseClientOnOutputBufferLimitReached(c); } /* ----------------------------------------------------------------------------- @@ -185,7 +267,7 @@ void _addReplyStringToList(redisClient *c, char *s, size_t len) { * -------------------------------------------------------------------------- */ void addReply(redisClient *c, robj *obj) { - if (_installWriteEvent(c) != REDIS_OK) return; + if (prepareClientToWrite(c) != REDIS_OK) return; /* This is an important place where we can avoid copy-on-write * when there is a saving child running, avoiding touching the @@ -197,19 +279,31 @@ void addReply(redisClient *c, robj *obj) { if (obj->encoding == REDIS_ENCODING_RAW) { if (_addReplyToBuffer(c,obj->ptr,sdslen(obj->ptr)) != REDIS_OK) _addReplyObjectToList(c,obj); - } else { - /* FIXME: convert the long into string and use _addReplyToBuffer() - * instead of calling getDecodedObject. As this place in the - * code is too performance critical. */ + } else if (obj->encoding == REDIS_ENCODING_INT) { + /* Optimization: if there is room in the static buffer for 32 bytes + * (more than the max chars a 64 bit integer can take as string) we + * avoid decoding the object and go for the lower level approach. */ + if (listLength(c->reply) == 0 && (sizeof(c->buf) - c->bufpos) >= 32) { + char buf[32]; + int len; + + len = ll2string(buf,sizeof(buf),(long)obj->ptr); + if (_addReplyToBuffer(c,buf,len) == REDIS_OK) + return; + /* else... continue with the normal code path, but should never + * happen actually since we verified there is room. */ + } obj = getDecodedObject(obj); if (_addReplyToBuffer(c,obj->ptr,sdslen(obj->ptr)) != REDIS_OK) _addReplyObjectToList(c,obj); decrRefCount(obj); + } else { + redisPanic("Wrong obj->encoding in addReply()"); } } void addReplySds(redisClient *c, sds s) { - if (_installWriteEvent(c) != REDIS_OK) { + if (prepareClientToWrite(c) != REDIS_OK) { /* The caller expects the sds to be free'd. */ sdsfree(s); return; @@ -223,38 +317,45 @@ void addReplySds(redisClient *c, sds s) { } void addReplyString(redisClient *c, char *s, size_t len) { - if (_installWriteEvent(c) != REDIS_OK) return; + if (prepareClientToWrite(c) != REDIS_OK) return; if (_addReplyToBuffer(c,s,len) != REDIS_OK) _addReplyStringToList(c,s,len); } -void _addReplyError(redisClient *c, char *s, size_t len) { +void addReplyErrorLength(redisClient *c, char *s, size_t len) { addReplyString(c,"-ERR ",5); addReplyString(c,s,len); addReplyString(c,"\r\n",2); } void addReplyError(redisClient *c, char *err) { - _addReplyError(c,err,strlen(err)); + addReplyErrorLength(c,err,strlen(err)); } void addReplyErrorFormat(redisClient *c, const char *fmt, ...) { + size_t l, j; va_list ap; va_start(ap,fmt); sds s = sdscatvprintf(sdsempty(),fmt,ap); va_end(ap); - _addReplyError(c,s,sdslen(s)); + /* Make sure there are no newlines in the string, otherwise invalid protocol + * is emitted. */ + l = sdslen(s); + for (j = 0; j < l; j++) { + if (s[j] == '\r' || s[j] == '\n') s[j] = ' '; + } + addReplyErrorLength(c,s,sdslen(s)); sdsfree(s); } -void _addReplyStatus(redisClient *c, char *s, size_t len) { +void addReplyStatusLength(redisClient *c, char *s, size_t len) { addReplyString(c,"+",1); addReplyString(c,s,len); addReplyString(c,"\r\n",2); } void addReplyStatus(redisClient *c, char *status) { - _addReplyStatus(c,status,strlen(status)); + addReplyStatusLength(c,status,strlen(status)); } void addReplyStatusFormat(redisClient *c, const char *fmt, ...) { @@ -262,7 +363,7 @@ void addReplyStatusFormat(redisClient *c, const char *fmt, ...) { va_start(ap,fmt); sds s = sdscatvprintf(sdsempty(),fmt,ap); va_end(ap); - _addReplyStatus(c,s,sdslen(s)); + addReplyStatusLength(c,s,sdslen(s)); sdsfree(s); } @@ -272,7 +373,7 @@ void *addDeferredMultiBulkLength(redisClient *c) { /* Note that we install the write event here even if the object is not * ready to be sent, since we are sure that before returning to the * event loop setDeferredMultiBulkLength() will be called. */ - if (_installWriteEvent(c) != REDIS_OK) return NULL; + if (prepareClientToWrite(c) != REDIS_OK) return NULL; listAddNodeTail(c->reply,createObject(REDIS_STRING,NULL)); return listLast(c->reply); } @@ -287,15 +388,20 @@ void setDeferredMultiBulkLength(redisClient *c, void *node, long length) { len = listNodeValue(ln); len->ptr = sdscatprintf(sdsempty(),"*%ld\r\n",length); + c->reply_bytes += zmalloc_size_sds(len->ptr); if (ln->next != NULL) { next = listNodeValue(ln->next); /* Only glue when the next node is non-NULL (an sds in this case) */ if (next->ptr != NULL) { + c->reply_bytes -= zmalloc_size_sds(len->ptr); + c->reply_bytes -= zmalloc_size_sds(next->ptr); len->ptr = sdscatlen(len->ptr,next->ptr,sdslen(next->ptr)); + c->reply_bytes += zmalloc_size_sds(len->ptr); listDelNode(c->reply,ln->next); } } + asyncCloseClientOnOutputBufferLimitReached(c); } /* Add a duble as a bulk reply */ @@ -309,9 +415,21 @@ void addReplyDouble(redisClient *c, double d) { /* Add a long long as integer reply or bulk len / multi bulk count. * Basically this is used to output . */ -void _addReplyLongLong(redisClient *c, long long ll, char prefix) { +void addReplyLongLongWithPrefix(redisClient *c, long long ll, char prefix) { char buf[128]; int len; + + /* Things like $3\r\n or *2\r\n are emitted very often by the protocol + * so we have a few shared objects to use if the integer is small + * like it is most of the times. */ + if (prefix == '*' && ll < REDIS_SHARED_BULKHDR_LEN) { + addReply(c,shared.mbulkhdr[ll]); + return; + } else if (prefix == '$' && ll < REDIS_SHARED_BULKHDR_LEN) { + addReply(c,shared.bulkhdr[ll]); + return; + } + buf[0] = prefix; len = ll2string(buf+1,sizeof(buf)-1,ll); buf[len+1] = '\r'; @@ -325,11 +443,11 @@ void addReplyLongLong(redisClient *c, long long ll) { else if (ll == 1) addReply(c,shared.cone); else - _addReplyLongLong(c,ll,':'); + addReplyLongLongWithPrefix(c,ll,':'); } void addReplyMultiBulkLen(redisClient *c, long length) { - _addReplyLongLong(c,length,'*'); + addReplyLongLongWithPrefix(c,length,'*'); } /* Create the length prefix of a bulk reply, example: $2234 */ @@ -351,7 +469,7 @@ void addReplyBulkLen(redisClient *c, robj *obj) { len++; } } - _addReplyLongLong(c,len,'$'); + addReplyLongLongWithPrefix(c,len,'$'); } /* Add a Redis Object as a bulk reply */ @@ -363,7 +481,7 @@ void addReplyBulk(redisClient *c, robj *obj) { /* Add a C buffer as bulk reply */ void addReplyBulkCBuffer(redisClient *c, void *p, size_t len) { - _addReplyLongLong(c,len,'$'); + addReplyLongLongWithPrefix(c,len,'$'); addReplyString(c,p,len); addReply(c,shared.crlf); } @@ -386,28 +504,41 @@ void addReplyBulkLongLong(redisClient *c, long long ll) { addReplyBulkCBuffer(c,buf,len); } -static void acceptCommonHandler(int fd) { +/* Copy 'src' client output buffers into 'dst' client output buffers. + * The function takes care of freeing the old output buffers of the + * destination client. */ +void copyClientOutputBuffer(redisClient *dst, redisClient *src) { + listRelease(dst->reply); + dst->reply = listDup(src->reply); + memcpy(dst->buf,src->buf,src->bufpos); + dst->bufpos = src->bufpos; + dst->reply_bytes = src->reply_bytes; +} + +static void acceptCommonHandler(int fd, int flags) { redisClient *c; if ((c = createClient(fd)) == NULL) { - redisLog(REDIS_WARNING,"Error allocating resoures for the client"); - close(fd); /* May be already closed, just ingore errors */ + redisLog(REDIS_WARNING,"Error allocating resources for the client"); + close(fd); /* May be already closed, just ignore errors */ return; } /* If maxclient directive is set and this is one client more... close the * connection. Note that we create the client instead to check before * for this condition, since now the socket is already set in nonblocking * mode and we can send an error for free using the Kernel I/O */ - if (server.maxclients && listLength(server.clients) > server.maxclients) { + if (listLength(server.clients) > server.maxclients) { char *err = "-ERR max number of clients reached\r\n"; /* That's a best effort error message, don't check write errors */ if (write(c->fd,err,strlen(err)) == -1) { /* Nothing to do, Just to avoid the warning... */ } + server.stat_rejected_conn++; freeClient(c); return; } server.stat_numconnections++; + c->flags |= flags; } void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) { @@ -419,11 +550,11 @@ void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) { cfd = anetTcpAccept(server.neterr, fd, cip, &cport); if (cfd == AE_ERR) { - redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr); + redisLog(REDIS_WARNING,"Accepting client connection: %s", server.neterr); return; } redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport); - acceptCommonHandler(cfd); + acceptCommonHandler(cfd,0); } void acceptUnixHandler(aeEventLoop *el, int fd, void *privdata, int mask) { @@ -434,11 +565,11 @@ void acceptUnixHandler(aeEventLoop *el, int fd, void *privdata, int mask) { cfd = anetUnixAccept(server.neterr, fd); if (cfd == AE_ERR) { - redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr); + redisLog(REDIS_WARNING,"Accepting client connection: %s", server.neterr); return; } redisLog(REDIS_VERBOSE,"Accepted connection to %s", server.unixsocket); - acceptCommonHandler(cfd); + acceptCommonHandler(cfd,REDIS_UNIX_SOCKET); } @@ -447,11 +578,25 @@ static void freeClientArgv(redisClient *c) { for (j = 0; j < c->argc; j++) decrRefCount(c->argv[j]); c->argc = 0; + c->cmd = NULL; +} + +/* Close all the slaves connections. This is useful in chained replication + * when we resync with our own master and want to force all our slaves to + * resync with us as well. */ +void disconnectSlaves(void) { + while (listLength(server.slaves)) { + listNode *ln = listFirst(server.slaves); + freeClient((redisClient*)ln->value); + } } void freeClient(redisClient *c) { listNode *ln; + /* If this is marked as current client unset it */ + if (server.current_client == c) server.current_client = NULL; + /* Note that if the client we are freeing is blocked into a blocking * call, we have to set querybuf to NULL *before* to call * unblockClientWaitingData() to avoid processInputBuffer() will get @@ -487,25 +632,6 @@ void freeClient(redisClient *c) { redisAssert(ln != NULL); listDelNode(server.unblocked_clients,ln); } - /* Remove from the list of clients waiting for swapped keys, or ready - * to be restarted, but not yet woken up again. */ - if (c->flags & REDIS_IO_WAIT) { - redisAssert(server.ds_enabled); - if (listLength(c->io_keys) == 0) { - ln = listSearchKey(server.io_ready_clients,c); - - /* When this client is waiting to be woken up (REDIS_IO_WAIT), - * it should be present in the list io_ready_clients */ - redisAssert(ln != NULL); - listDelNode(server.io_ready_clients,ln); - } else { - while (listLength(c->io_keys)) { - ln = listFirst(c->io_keys); - dontWaitForSwappedKey(c,ln->value); - } - } - server.cache_blocked_clients--; - } listRelease(c->io_keys); /* Master/slave cleanup. * Case 1: we lost the connection with a slave. */ @@ -521,32 +647,55 @@ void freeClient(redisClient *c) { /* Case 2: we lost the connection with the master. */ if (c->flags & REDIS_MASTER) { server.master = NULL; - server.replstate = REDIS_REPL_CONNECT; - /* Since we lost the connection with the master, we should also - * close the connection with all our slaves if we have any, so - * when we'll resync with the master the other slaves will sync again - * with us as well. Note that also when the slave is not connected - * to the master it will keep refusing connections by other slaves. + server.repl_state = REDIS_REPL_CONNECT; + server.repl_down_since = server.unixtime; + /* We lost connection with our master, force our slaves to resync + * with us as well to load the new data set. * - * We do this only if server.masterhost != NULL. If it is NULL this - * means the user called SLAVEOF NO ONE and we are freeing our - * link with the master, so no need to close link with slaves. */ - if (server.masterhost != NULL) { - while (listLength(server.slaves)) { - ln = listFirst(server.slaves); - freeClient((redisClient*)ln->value); - } - } + * If server.masterhost is NULL the user called SLAVEOF NO ONE so + * slave resync is not needed. */ + if (server.masterhost != NULL) disconnectSlaves(); + } + + /* If this client was scheduled for async freeing we need to remove it + * from the queue. */ + if (c->flags & REDIS_CLOSE_ASAP) { + ln = listSearchKey(server.clients_to_close,c); + redisAssert(ln != NULL); + listDelNode(server.clients_to_close,ln); } + /* Release memory */ zfree(c->argv); freeClientMultiState(c); zfree(c); } +/* Schedule a client to free it at a safe time in the serverCron() function. + * This function is useful when we need to terminate a client but we are in + * a context where calling freeClient() is not possible, because the client + * should be valid for the continuation of the flow of the program. */ +void freeClientAsync(redisClient *c) { + if (c->flags & REDIS_CLOSE_ASAP) return; + c->flags |= REDIS_CLOSE_ASAP; + listAddNodeTail(server.clients_to_close,c); +} + +void freeClientsInAsyncFreeQueue(void) { + while (listLength(server.clients_to_close)) { + listNode *ln = listFirst(server.clients_to_close); + redisClient *c = listNodeValue(ln); + + c->flags &= ~REDIS_CLOSE_ASAP; + freeClient(c); + listDelNode(server.clients_to_close,ln); + } +} + void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { redisClient *c = privdata; int nwritten = 0, totwritten = 0, objlen; + size_t objmem; robj *o; REDIS_NOTUSED(el); REDIS_NOTUSED(mask); @@ -572,6 +721,7 @@ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { } else { o = listNodeValue(listFirst(c->reply)); objlen = sdslen(o->ptr); + objmem = zmalloc_size_sds(o->ptr); if (objlen == 0) { listDelNode(c->reply,listFirst(c->reply)); @@ -592,14 +742,20 @@ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { if (c->sentlen == objlen) { listDelNode(c->reply,listFirst(c->reply)); c->sentlen = 0; + c->reply_bytes -= objmem; } } - /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT + /* Note that we avoid to send more than REDIS_MAX_WRITE_PER_EVENT * bytes, in a single threaded server it's a good idea to serve * other clients as well, even if a very large request comes from * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interfae) */ - if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break; + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. */ + if (totwritten > REDIS_MAX_WRITE_PER_EVENT && + (server.maxmemory == 0 || + zmalloc_used_memory() < server.maxmemory)) break; } if (nwritten == -1) { if (errno == EAGAIN) { @@ -611,8 +767,8 @@ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { return; } } - if (totwritten > 0) c->lastinteraction = time(NULL); - if (listLength(c->reply) == 0) { + if (totwritten > 0) c->lastinteraction = server.unixtime; + if (c->bufpos == 0 && listLength(c->reply) == 0) { c->sentlen = 0; aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); @@ -627,34 +783,8 @@ void resetClient(redisClient *c) { c->reqtype = 0; c->multibulklen = 0; c->bulklen = -1; -} - -void closeTimedoutClients(void) { - redisClient *c; - listNode *ln; - time_t now = time(NULL); - listIter li; - - listRewind(server.clients,&li); - while ((ln = listNext(&li)) != NULL) { - c = listNodeValue(ln); - if (server.maxidletime && - !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */ - !(c->flags & REDIS_MASTER) && /* no timeout for masters */ - !(c->flags & REDIS_BLOCKED) && /* no timeout for BLPOP */ - dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */ - listLength(c->pubsub_patterns) == 0 && - (now - c->lastinteraction > server.maxidletime)) - { - redisLog(REDIS_VERBOSE,"Closing idle client"); - freeClient(c); - } else if (c->flags & REDIS_BLOCKED) { - if (c->bpop.timeout != 0 && c->bpop.timeout < now) { - addReply(c,shared.nullmultibulk); - unblockClientWaitingData(c); - } - } - } + /* We clear the ASKING flag as well if we are not inside a MULTI. */ + if (!(c->flags & REDIS_MULTI)) c->flags &= (~REDIS_ASKING); } int processInlineBuffer(redisClient *c) { @@ -664,8 +794,13 @@ int processInlineBuffer(redisClient *c) { size_t querylen; /* Nothing to do without a \r\n */ - if (newline == NULL) + if (newline == NULL) { + if (sdslen(c->querybuf) > REDIS_INLINE_MAX_SIZE) { + addReplyError(c,"Protocol error: too big inline request"); + setProtocolError(c,0); + } return REDIS_ERR; + } /* Split the input buffer up to the \r\n */ querylen = newline-(c->querybuf); @@ -694,76 +829,107 @@ int processInlineBuffer(redisClient *c) { /* Helper function. Trims query buffer to make the function that processes * multi bulk requests idempotent. */ static void setProtocolError(redisClient *c, int pos) { + if (server.verbosity >= REDIS_VERBOSE) { + sds client = getClientInfoString(c); + redisLog(REDIS_VERBOSE, + "Protocol error from client: %s", client); + sdsfree(client); + } c->flags |= REDIS_CLOSE_AFTER_REPLY; c->querybuf = sdsrange(c->querybuf,pos,-1); } int processMultibulkBuffer(redisClient *c) { char *newline = NULL; - char *eptr; - int pos = 0, tolerr; - long bulklen; + int pos = 0, ok; + long long ll; if (c->multibulklen == 0) { /* The client should have been reset */ - redisAssert(c->argc == 0); + redisAssertWithInfo(c,NULL,c->argc == 0); /* Multi bulk length cannot be read without a \r\n */ - newline = strstr(c->querybuf,"\r\n"); - if (newline == NULL) + newline = strchr(c->querybuf,'\r'); + if (newline == NULL) { + if (sdslen(c->querybuf) > REDIS_INLINE_MAX_SIZE) { + addReplyError(c,"Protocol error: too big mbulk count string"); + setProtocolError(c,0); + } + return REDIS_ERR; + } + + /* Buffer should also contain \n */ + if (newline-(c->querybuf) > ((signed)sdslen(c->querybuf)-2)) return REDIS_ERR; /* We know for sure there is a whole line since newline != NULL, * so go ahead and find out the multi bulk length. */ - redisAssert(c->querybuf[0] == '*'); - c->multibulklen = strtol(c->querybuf+1,&eptr,10); - pos = (newline-c->querybuf)+2; - if (c->multibulklen <= 0) { - c->querybuf = sdsrange(c->querybuf,pos,-1); - return REDIS_OK; - } else if (c->multibulklen > 1024*1024) { + redisAssertWithInfo(c,NULL,c->querybuf[0] == '*'); + ok = string2ll(c->querybuf+1,newline-(c->querybuf+1),&ll); + if (!ok || ll > 1024*1024) { addReplyError(c,"Protocol error: invalid multibulk length"); setProtocolError(c,pos); return REDIS_ERR; } + pos = (newline-c->querybuf)+2; + if (ll <= 0) { + c->querybuf = sdsrange(c->querybuf,pos,-1); + return REDIS_OK; + } + + c->multibulklen = ll; + /* Setup argv array on client structure */ if (c->argv) zfree(c->argv); c->argv = zmalloc(sizeof(robj*)*c->multibulklen); - - /* Search new newline */ - newline = strstr(c->querybuf+pos,"\r\n"); } - redisAssert(c->multibulklen > 0); + redisAssertWithInfo(c,NULL,c->multibulklen > 0); while(c->multibulklen) { /* Read bulk length if unknown */ if (c->bulklen == -1) { - newline = strstr(c->querybuf+pos,"\r\n"); - if (newline != NULL) { - if (c->querybuf[pos] != '$') { - addReplyErrorFormat(c, - "Protocol error: expected '$', got '%c'", - c->querybuf[pos]); - setProtocolError(c,pos); - return REDIS_ERR; + newline = strchr(c->querybuf+pos,'\r'); + if (newline == NULL) { + if (sdslen(c->querybuf) > REDIS_INLINE_MAX_SIZE) { + addReplyError(c,"Protocol error: too big bulk count string"); + setProtocolError(c,0); } + break; + } - bulklen = strtol(c->querybuf+pos+1,&eptr,10); - tolerr = (eptr[0] != '\r'); - if (tolerr || bulklen == LONG_MIN || bulklen == LONG_MAX || - bulklen < 0 || bulklen > 512*1024*1024) - { - addReplyError(c,"Protocol error: invalid bulk length"); - setProtocolError(c,pos); - return REDIS_ERR; - } - pos += eptr-(c->querybuf+pos)+2; - c->bulklen = bulklen; - } else { - /* No newline in current buffer, so wait for more data */ + /* Buffer should also contain \n */ + if (newline-(c->querybuf) > ((signed)sdslen(c->querybuf)-2)) break; + + if (c->querybuf[pos] != '$') { + addReplyErrorFormat(c, + "Protocol error: expected '$', got '%c'", + c->querybuf[pos]); + setProtocolError(c,pos); + return REDIS_ERR; + } + + ok = string2ll(c->querybuf+pos+1,newline-(c->querybuf+pos+1),&ll); + if (!ok || ll < 0 || ll > 512*1024*1024) { + addReplyError(c,"Protocol error: invalid bulk length"); + setProtocolError(c,pos); + return REDIS_ERR; + } + + pos += newline-(c->querybuf+pos)+2; + if (ll >= REDIS_MBULK_BIG_ARG) { + /* If we are going to read a large object from network + * try to make it likely that it will start at c->querybuf + * boundary so that we can optimized object creation + * avoiding a large copy of data. */ + c->querybuf = sdsrange(c->querybuf,pos,-1); + pos = 0; + /* Hint the sds library about the amount of bytes this string is + * going to contain. */ + c->querybuf = sdsMakeRoomFor(c->querybuf,ll+2); } + c->bulklen = ll; } /* Read bulk argument */ @@ -771,20 +937,37 @@ int processMultibulkBuffer(redisClient *c) { /* Not enough data (+2 == trailing \r\n) */ break; } else { - c->argv[c->argc++] = createStringObject(c->querybuf+pos,c->bulklen); - pos += c->bulklen+2; + /* Optimization: if the buffer contanins JUST our bulk element + * instead of creating a new object by *copying* the sds we + * just use the current sds string. */ + if (pos == 0 && + c->bulklen >= REDIS_MBULK_BIG_ARG && + (signed) sdslen(c->querybuf) == c->bulklen+2) + { + c->argv[c->argc++] = createObject(REDIS_STRING,c->querybuf); + sdsIncrLen(c->querybuf,-2); /* remove CRLF */ + c->querybuf = sdsempty(); + /* Assume that if we saw a fat argument we'll see another one + * likely... */ + c->querybuf = sdsMakeRoomFor(c->querybuf,c->bulklen+2); + pos = 0; + } else { + c->argv[c->argc++] = + createStringObject(c->querybuf+pos,c->bulklen); + pos += c->bulklen+2; + } c->bulklen = -1; c->multibulklen--; } } /* Trim to pos */ - c->querybuf = sdsrange(c->querybuf,pos,-1); + if (pos) c->querybuf = sdsrange(c->querybuf,pos,-1); /* We're done when c->multibulk == 0 */ - if (c->multibulklen == 0) { - return REDIS_OK; - } + if (c->multibulklen == 0) return REDIS_OK; + + /* Still not read to process the command */ return REDIS_ERR; } @@ -792,7 +975,7 @@ void processInputBuffer(redisClient *c) { /* Keep processing while there is something in the input buffer */ while(sdslen(c->querybuf)) { /* Immediately abort if the client is in the middle of something. */ - if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return; + if (c->flags & REDIS_BLOCKED) return; /* REDIS_CLOSE_AFTER_REPLY closes the connection once the reply is * written to the client. Make sure to not let the reply grow after @@ -829,12 +1012,31 @@ void processInputBuffer(redisClient *c) { void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { redisClient *c = (redisClient*) privdata; - char buf[REDIS_IOBUF_LEN]; - int nread; + int nread, readlen; + size_t qblen; REDIS_NOTUSED(el); REDIS_NOTUSED(mask); - nread = read(fd, buf, REDIS_IOBUF_LEN); + server.current_client = c; + readlen = REDIS_IOBUF_LEN; + /* If this is a multi bulk request, and we are processing a bulk reply + * that is large enough, try to maximize the probability that the query + * buffer contains exactly the SDS string representing the object, even + * at the risk of requiring more read(2) calls. This way the function + * processMultiBulkBuffer() can avoid copying buffers to create the + * Redis Object representing the argument. */ + if (c->reqtype == REDIS_REQ_MULTIBULK && c->multibulklen && c->bulklen != -1 + && c->bulklen >= REDIS_MBULK_BIG_ARG) + { + int remaining = (unsigned)(c->bulklen+2)-sdslen(c->querybuf); + + if (remaining < readlen) readlen = remaining; + } + + qblen = sdslen(c->querybuf); + if (c->querybuf_peak < qblen) c->querybuf_peak = qblen; + c->querybuf = sdsMakeRoomFor(c->querybuf, readlen); + nread = read(fd, c->querybuf+qblen, readlen); if (nread == -1) { if (errno == EAGAIN) { nread = 0; @@ -849,12 +1051,24 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { return; } if (nread) { - c->querybuf = sdscatlen(c->querybuf,buf,nread); - c->lastinteraction = time(NULL); + sdsIncrLen(c->querybuf,nread); + c->lastinteraction = server.unixtime; } else { + server.current_client = NULL; + return; + } + if (sdslen(c->querybuf) > server.client_max_querybuf_len) { + sds ci = getClientInfoString(c), bytes = sdsempty(); + + bytes = sdscatrepr(bytes,c->querybuf,64); + redisLog(REDIS_WARNING,"Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, bytes); + sdsfree(ci); + sdsfree(bytes); + freeClient(c); return; } processInputBuffer(c); + server.current_client = NULL; } void getClientsMaxBuffers(unsigned long *longest_output_list, @@ -875,47 +1089,83 @@ void getClientsMaxBuffers(unsigned long *longest_output_list, *biggest_input_buffer = bib; } -void clientCommand(redisClient *c) { +/* Turn a Redis client into an sds string representing its state. */ +sds getClientInfoString(redisClient *client) { + char ip[32], flags[16], events[3], *p; + int port = 0; /* initialized to zero for the unix socket case. */ + int emask; + + if (!(client->flags & REDIS_UNIX_SOCKET)) + anetPeerToString(client->fd,ip,&port); + p = flags; + if (client->flags & REDIS_SLAVE) { + if (client->flags & REDIS_MONITOR) + *p++ = 'O'; + else + *p++ = 'S'; + } + if (client->flags & REDIS_MASTER) *p++ = 'M'; + if (client->flags & REDIS_MULTI) *p++ = 'x'; + if (client->flags & REDIS_BLOCKED) *p++ = 'b'; + if (client->flags & REDIS_DIRTY_CAS) *p++ = 'd'; + if (client->flags & REDIS_CLOSE_AFTER_REPLY) *p++ = 'c'; + if (client->flags & REDIS_UNBLOCKED) *p++ = 'u'; + if (client->flags & REDIS_CLOSE_ASAP) *p++ = 'A'; + if (client->flags & REDIS_UNIX_SOCKET) *p++ = 'U'; + if (p == flags) *p++ = 'N'; + *p++ = '\0'; + + emask = client->fd == -1 ? 0 : aeGetFileEvents(server.el,client->fd); + p = events; + if (emask & AE_READABLE) *p++ = 'r'; + if (emask & AE_WRITABLE) *p++ = 'w'; + *p = '\0'; + return sdscatprintf(sdsempty(), + "addr=%s:%d fd=%d age=%ld idle=%ld flags=%s db=%d sub=%d psub=%d multi=%d qbuf=%lu qbuf-free=%lu obl=%lu oll=%lu omem=%lu events=%s cmd=%s", + (client->flags & REDIS_UNIX_SOCKET) ? server.unixsocket : ip, + port,client->fd, + (long)(server.unixtime - client->ctime), + (long)(server.unixtime - client->lastinteraction), + flags, + client->db->id, + (int) dictSize(client->pubsub_channels), + (int) listLength(client->pubsub_patterns), + (client->flags & REDIS_MULTI) ? client->mstate.count : -1, + (unsigned long) sdslen(client->querybuf), + (unsigned long) sdsavail(client->querybuf), + (unsigned long) client->bufpos, + (unsigned long) listLength(client->reply), + getClientOutputBufferMemoryUsage(client), + events, + client->lastcmd ? client->lastcmd->name : "NULL"); +} + +sds getAllClientsInfoString(void) { listNode *ln; listIter li; redisClient *client; + sds o = sdsempty(); - if (!strcasecmp(c->argv[1]->ptr,"list") && c->argc == 2) { - sds o = sdsempty(); - time_t now = time(NULL); + listRewind(server.clients,&li); + while ((ln = listNext(&li)) != NULL) { + sds cs; - listRewind(server.clients,&li); - while ((ln = listNext(&li)) != NULL) { - char ip[32], flags[16], *p; - int port; + client = listNodeValue(ln); + cs = getClientInfoString(client); + o = sdscatsds(o,cs); + sdsfree(cs); + o = sdscatlen(o,"\n",1); + } + return o; +} - client = listNodeValue(ln); - if (anetPeerToString(client->fd,ip,&port) == -1) continue; - p = flags; - if (client->flags & REDIS_SLAVE) { - if (client->flags & REDIS_MONITOR) - *p++ = 'O'; - else - *p++ = 'S'; - } - if (client->flags & REDIS_MASTER) *p++ = 'M'; - if (p == flags) *p++ = 'N'; - if (client->flags & REDIS_MULTI) *p++ = 'x'; - if (client->flags & REDIS_BLOCKED) *p++ = 'b'; - if (client->flags & REDIS_IO_WAIT) *p++ = 'i'; - if (client->flags & REDIS_DIRTY_CAS) *p++ = 'd'; - if (client->flags & REDIS_CLOSE_AFTER_REPLY) *p++ = 'c'; - if (client->flags & REDIS_UNBLOCKED) *p++ = 'u'; - *p++ = '\0'; - o = sdscatprintf(o, - "addr=%s:%d fd=%d idle=%ld flags=%s db=%d sub=%d psub=%d\n", - ip,port,client->fd, - (long)(now - client->lastinteraction), - flags, - client->db->id, - (int) dictSize(client->pubsub_channels), - (int) listLength(client->pubsub_patterns)); - } +void clientCommand(redisClient *c) { + listNode *ln; + listIter li; + redisClient *client; + + if (!strcasecmp(c->argv[1]->ptr,"list") && c->argc == 2) { + sds o = getAllClientsInfoString(); addReplyBulkCBuffer(c,o,sdslen(o)); sdsfree(o); } else if (!strcasecmp(c->argv[1]->ptr,"kill") && c->argc == 3) { @@ -942,3 +1192,181 @@ void clientCommand(redisClient *c) { addReplyError(c, "Syntax error, try CLIENT (LIST | KILL ip:port)"); } } + +/* Rewrite the command vector of the client. All the new objects ref count + * is incremented. The old command vector is freed, and the old objects + * ref count is decremented. */ +void rewriteClientCommandVector(redisClient *c, int argc, ...) { + va_list ap; + int j; + robj **argv; /* The new argument vector */ + + argv = zmalloc(sizeof(robj*)*argc); + va_start(ap,argc); + for (j = 0; j < argc; j++) { + robj *a; + + a = va_arg(ap, robj*); + argv[j] = a; + incrRefCount(a); + } + /* We free the objects in the original vector at the end, so we are + * sure that if the same objects are reused in the new vector the + * refcount gets incremented before it gets decremented. */ + for (j = 0; j < c->argc; j++) decrRefCount(c->argv[j]); + zfree(c->argv); + /* Replace argv and argc with our new versions. */ + c->argv = argv; + c->argc = argc; + c->cmd = lookupCommand(c->argv[0]->ptr); + redisAssertWithInfo(c,NULL,c->cmd != NULL); + va_end(ap); +} + +/* Rewrite a single item in the command vector. + * The new val ref count is incremented, and the old decremented. */ +void rewriteClientCommandArgument(redisClient *c, int i, robj *newval) { + robj *oldval; + + redisAssertWithInfo(c,NULL,i < c->argc); + oldval = c->argv[i]; + c->argv[i] = newval; + incrRefCount(newval); + decrRefCount(oldval); + + /* If this is the command name make sure to fix c->cmd. */ + if (i == 0) { + c->cmd = lookupCommand(c->argv[0]->ptr); + redisAssertWithInfo(c,NULL,c->cmd != NULL); + } +} + +/* This function returns the number of bytes that Redis is virtually + * using to store the reply still not read by the client. + * It is "virtual" since the reply output list may contain objects that + * are shared and are not really using additional memory. + * + * The function returns the total sum of the length of all the objects + * stored in the output list, plus the memory used to allocate every + * list node. The static reply buffer is not taken into account since it + * is allocated anyway. + * + * Note: this function is very fast so can be called as many time as + * the caller wishes. The main usage of this function currently is + * enforcing the client output length limits. */ +unsigned long getClientOutputBufferMemoryUsage(redisClient *c) { + unsigned long list_item_size = sizeof(listNode)+sizeof(robj); + + return c->reply_bytes + (list_item_size*listLength(c->reply)); +} + +/* Get the class of a client, used in order to enforce limits to different + * classes of clients. + * + * The function will return one of the following: + * REDIS_CLIENT_LIMIT_CLASS_NORMAL -> Normal client + * REDIS_CLIENT_LIMIT_CLASS_SLAVE -> Slave or client executing MONITOR command + * REDIS_CLIENT_LIMIT_CLASS_PUBSUB -> Client subscribed to Pub/Sub channels + */ +int getClientLimitClass(redisClient *c) { + if (c->flags & REDIS_SLAVE) return REDIS_CLIENT_LIMIT_CLASS_SLAVE; + if (dictSize(c->pubsub_channels) || listLength(c->pubsub_patterns)) + return REDIS_CLIENT_LIMIT_CLASS_PUBSUB; + return REDIS_CLIENT_LIMIT_CLASS_NORMAL; +} + +int getClientLimitClassByName(char *name) { + if (!strcasecmp(name,"normal")) return REDIS_CLIENT_LIMIT_CLASS_NORMAL; + else if (!strcasecmp(name,"slave")) return REDIS_CLIENT_LIMIT_CLASS_SLAVE; + else if (!strcasecmp(name,"pubsub")) return REDIS_CLIENT_LIMIT_CLASS_PUBSUB; + else return -1; +} + +char *getClientLimitClassName(int class) { + switch(class) { + case REDIS_CLIENT_LIMIT_CLASS_NORMAL: return "normal"; + case REDIS_CLIENT_LIMIT_CLASS_SLAVE: return "slave"; + case REDIS_CLIENT_LIMIT_CLASS_PUBSUB: return "pubsub"; + default: return NULL; + } +} + +/* The function checks if the client reached output buffer soft or hard + * limit, and also update the state needed to check the soft limit as + * a side effect. + * + * Return value: non-zero if the client reached the soft or the hard limit. + * Otherwise zero is returned. */ +int checkClientOutputBufferLimits(redisClient *c) { + int soft = 0, hard = 0, class; + unsigned long used_mem = getClientOutputBufferMemoryUsage(c); + + class = getClientLimitClass(c); + if (server.client_obuf_limits[class].hard_limit_bytes && + used_mem >= server.client_obuf_limits[class].hard_limit_bytes) + hard = 1; + if (server.client_obuf_limits[class].soft_limit_bytes && + used_mem >= server.client_obuf_limits[class].soft_limit_bytes) + soft = 1; + + /* We need to check if the soft limit is reached continuously for the + * specified amount of seconds. */ + if (soft) { + if (c->obuf_soft_limit_reached_time == 0) { + c->obuf_soft_limit_reached_time = server.unixtime; + soft = 0; /* First time we see the soft limit reached */ + } else { + time_t elapsed = server.unixtime - c->obuf_soft_limit_reached_time; + + if (elapsed <= + server.client_obuf_limits[class].soft_limit_seconds) { + soft = 0; /* The client still did not reached the max number of + seconds for the soft limit to be considered + reached. */ + } + } + } else { + c->obuf_soft_limit_reached_time = 0; + } + return soft || hard; +} + +/* Asynchronously close a client if soft or hard limit is reached on the + * output buffer size. The caller can check if the client will be closed + * checking if the client REDIS_CLOSE_ASAP flag is set. + * + * Note: we need to close the client asynchronously because this function is + * called from contexts where the client can't be freed safely, i.e. from the + * lower level functions pushing data inside the client output buffers. */ +void asyncCloseClientOnOutputBufferLimitReached(redisClient *c) { + redisAssert(c->reply_bytes < ULONG_MAX-(1024*64)); + if (c->reply_bytes == 0 || c->flags & REDIS_CLOSE_ASAP) return; + if (checkClientOutputBufferLimits(c)) { + sds client = getClientInfoString(c); + + freeClientAsync(c); + redisLog(REDIS_WARNING,"Client %s scheduled to be closed ASAP for overcoming of output buffer limits.", client); + sdsfree(client); + } +} + +/* Helper function used by freeMemoryIfNeeded() in order to flush slaves + * output buffers without returning control to the event loop. */ +void flushSlavesOutputBuffers(void) { + listIter li; + listNode *ln; + + listRewind(server.slaves,&li); + while((ln = listNext(&li))) { + redisClient *slave = listNodeValue(ln); + int events; + + events = aeGetFileEvents(server.el,slave->fd); + if (events & AE_WRITABLE && + slave->replstate == REDIS_REPL_ONLINE && + listLength(slave->reply)) + { + sendReplyToClient(server.el,slave->fd,slave,0); + } + } +}