2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
30 #define REDIS_VERSION "1.3.4"
40 #define __USE_POSIX199309
47 #endif /* HAVE_BACKTRACE */
55 #include <arpa/inet.h>
59 #include <sys/resource.h>
66 #include "solarisfixes.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 4
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
116 #define REDIS_STRING 0
122 /* Object encodings. Some kinds of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
125 #define REDIS_ENCODING_RAW 0 /* Raw representation */
126 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
127 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
130 /* Object types only used for dumping to disk */
131 #define REDIS_EXPIRETIME 253
132 #define REDIS_SELECTDB 254
133 #define REDIS_EOF 255
135 /* Defines related to the dump file format. To store 32 bits lengths for short
136 * keys requires a lot of space, so we check the most significant 2 bits of
137 * the first byte to interpret the length:
139 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
140 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
141 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
142 * 11|000000 this means: specially encoded object will follow. The six bits
143 * number specify the kind of object that follows.
144 * See the REDIS_RDB_ENC_* defines.
146 * Lengths up to 63 are stored using a single byte; most DB keys, and many
147 * values, will fit inside. */
148 #define REDIS_RDB_6BITLEN 0
149 #define REDIS_RDB_14BITLEN 1
150 #define REDIS_RDB_32BITLEN 2
151 #define REDIS_RDB_ENCVAL 3
152 #define REDIS_RDB_LENERR UINT_MAX
154 /* When a length of a string object stored on disk has the first two bits
155 * set, the remaining six bits specify a special encoding for the object
156 * according to the following defines: */
157 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
158 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
159 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
160 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
162 /* Virtual memory object->where field. */
163 #define REDIS_VM_MEMORY 0 /* The object is on memory */
164 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
165 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
166 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
168 /* Virtual memory static configuration stuff.
169 * Check vmFindContiguousPages() to know more about these magic numbers. */
170 #define REDIS_VM_MAX_NEAR_PAGES 65536
171 #define REDIS_VM_MAX_RANDOM_JUMP 4096
172 #define REDIS_VM_MAX_THREADS 32
173 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
174 /* The following is the *percentage* of completed I/O jobs to process when the
175 * handler is called. While Virtual Memory I/O operations are performed by
176 * threads, these operations must be processed by the main thread when completed
177 * in order to take effect. */
178 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
181 #define REDIS_SLAVE 1 /* This client is a slave server */
182 #define REDIS_MASTER 2 /* This client is a master server */
183 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
184 #define REDIS_MULTI 8 /* This client is in a MULTI context */
185 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
186 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
188 /* Slave replication state - slave side */
189 #define REDIS_REPL_NONE 0 /* No active replication */
190 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
191 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
193 /* Slave replication state - from the point of view of master
194 * Note that in SEND_BULK and ONLINE state the slave receives new updates
195 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
196 * to start the next background saving in order to send updates to it. */
197 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
198 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
199 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
200 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
202 /* List related stuff */
206 /* Sort operations */
207 #define REDIS_SORT_GET 0
208 #define REDIS_SORT_ASC 1
209 #define REDIS_SORT_DESC 2
210 #define REDIS_SORTKEY_MAX 1024
213 #define REDIS_DEBUG 0
214 #define REDIS_VERBOSE 1
215 #define REDIS_NOTICE 2
216 #define REDIS_WARNING 3
218 /* Anti-warning macro... */
219 #define REDIS_NOTUSED(V) ((void) V)
221 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
222 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
224 /* Append only defines */
225 #define APPENDFSYNC_NO 0
226 #define APPENDFSYNC_ALWAYS 1
227 #define APPENDFSYNC_EVERYSEC 2
229 /* We can print the stacktrace, so our assert is defined this way: */
/* redisAssert(_e): when the expression _e evaluates to false, call
 * _redisAssert() with the stringified expression, the current file name and
 * the current line number, then terminate the process with _exit(1).
 * When _e is true the macro expands to a no-op expression of type void. */
230 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
/* Assertion-failure reporter invoked by redisAssert(). Only the declaration
 * is visible here; estr receives the text of the failed expression, file and
 * line the location of the redisAssert() call. */
231 static void _redisAssert(char *estr
, char *file
, int line
);
233 /*================================= Data types ============================== */
235 /* A redis object, that is a type able to hold a string / list / set */
237 /* The VM object structure */
238 struct redisObjectVM
{
239 off_t page
; /* the page at witch the object is stored on disk */
240 off_t usedpages
; /* number of pages used on disk */
241 time_t atime
; /* Last access time */
244 /* The actual Redis Object */
245 typedef struct redisObject
{
248 unsigned char encoding
;
249 unsigned char storage
; /* If this object is a key, where is the value?
250 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
251 unsigned char vtype
; /* If this object is a key, and value is swapped out,
252 * this is the type of the swapped out object. */
254 /* VM fields, these are only allocated if VM is active, otherwise the
255 * object allocation function will just allocate
256 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
257 * Redis without VM active will not have any overhead. */
258 struct redisObjectVM vm
;
261 /* Macro used to initialize a Redis object allocated on the stack.
262 * Note that this macro is kept near the structure definition to make sure
263 * we'll update it when the structure is changed, to avoid bugs like
264 * bug #85 introduced exactly in this way. */
265 #define initStaticStringObject(_var,_ptr) do { \
267 _var.type = REDIS_STRING; \
268 _var.encoding = REDIS_ENCODING_RAW; \
270 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
273 typedef struct redisDb
{
274 dict
*dict
; /* The keyspace for this DB */
275 dict
*expires
; /* Timeout of keys with a timeout set */
276 dict
*blockingkeys
; /* Keys with clients waiting for data (BLPOP) */
277 dict
*io_keys
; /* Keys with clients waiting for VM I/O */
281 /* Client MULTI/EXEC state */
282 typedef struct multiCmd
{
285 struct redisCommand
*cmd
;
288 typedef struct multiState
{
289 multiCmd
*commands
; /* Array of MULTI commands */
290 int count
; /* Total number of MULTI commands */
293 /* With multiplexing we need to keep per-client state.
294 * Clients are kept in a linked list. */
295 typedef struct redisClient
{
300 robj
**argv
, **mbargv
;
302 int bulklen
; /* bulk read len. -1 if not in bulk read mode */
303 int multibulk
; /* multi bulk command format active */
306 time_t lastinteraction
; /* time of the last interaction, used for timeout */
307 int flags
; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
308 int slaveseldb
; /* slave selected db, if this client is a slave */
309 int authenticated
; /* when requirepass is non-NULL */
310 int replstate
; /* replication state if this is a slave */
311 int repldbfd
; /* replication DB file descriptor */
312 long repldboff
; /* replication DB file offset */
313 off_t repldbsize
; /* replication DB file size */
314 multiState mstate
; /* MULTI/EXEC state */
315 robj
**blockingkeys
; /* The key we are waiting to terminate a blocking
316 * operation such as BLPOP. Otherwise NULL. */
317 int blockingkeysnum
; /* Number of blocking keys */
318 time_t blockingto
; /* Blocking operation timeout. If UNIX current time
319 * is >= blockingto then the operation timed out. */
320 list
*io_keys
; /* Keys this client is waiting to be loaded from the
321 * swap file in order to continue. */
329 /* Global server state structure */
334 dict
*sharingpool
; /* Poll used for object sharing */
335 unsigned int sharingpoolsize
;
336 long long dirty
; /* changes to DB from the last save */
338 list
*slaves
, *monitors
;
339 char neterr
[ANET_ERR_LEN
];
341 int cronloops
; /* number of times the cron function run */
342 list
*objfreelist
; /* A list of freed objects to avoid malloc() */
343 time_t lastsave
; /* Unix time of last save succeeede */
344 /* Fields used only for stats */
345 time_t stat_starttime
; /* server start time */
346 long long stat_numcommands
; /* number of processed commands */
347 long long stat_numconnections
; /* number of connections received */
360 pid_t bgsavechildpid
;
361 pid_t bgrewritechildpid
;
362 sds bgrewritebuf
; /* buffer taken by parent during oppend only rewrite */
363 struct saveparam
*saveparams
;
368 char *appendfilename
;
372 /* Replication related */
377 redisClient
*master
; /* client that is master for this slave */
379 unsigned int maxclients
;
380 unsigned long long maxmemory
;
381 unsigned int blpop_blocked_clients
;
382 unsigned int vm_blocked_clients
;
383 /* Sort parameters - qsort_r() is only available under BSD so we
384 * have to keep this state global, in order to pass it to sortCompare() */
388 /* Virtual memory configuration */
393 unsigned long long vm_max_memory
;
394 /* Virtual memory state */
397 off_t vm_next_page
; /* Next probably empty page */
398 off_t vm_near_pages
; /* Number of pages allocated sequentially */
399 unsigned char *vm_bitmap
; /* Bitmap of free/used pages */
400 time_t unixtime
; /* Unix time sampled every second. */
401 /* Virtual memory I/O threads stuff */
402 /* An I/O thread processes an element taken from the io_newjobs queue and
403 * puts the result of the operation in the io_processed list. While the
404 * job is being processed, it's put on the io_processing queue. */
405 list
*io_newjobs
; /* List of VM I/O jobs yet to be processed */
406 list
*io_processing
; /* List of VM I/O jobs being processed */
407 list
*io_processed
; /* List of VM I/O jobs already processed */
408 list
*io_ready_clients
; /* Clients ready to be unblocked. All keys loaded */
409 pthread_mutex_t io_mutex
; /* lock to access io_jobs/io_done/io_thread_job */
410 pthread_mutex_t obj_freelist_mutex
; /* safe redis objects creation/free */
411 pthread_mutex_t io_swapfile_mutex
; /* So we can lseek + write */
412 pthread_attr_t io_threads_attr
; /* attributes for threads creation */
413 int io_active_threads
; /* Number of running I/O threads */
414 int vm_max_threads
; /* Max number of I/O threads running at the same time */
415 /* Our main thread is blocked on the event loop, looking for sockets ready
416 * to be read or written, so when a threaded I/O operation is ready to be
417 * processed by the main thread, the I/O thread will use a unix pipe to
418 * wake up the main thread. The following are the two pipe FDs. */
419 int io_ready_pipe_read
;
420 int io_ready_pipe_write
;
421 /* Virtual memory stats */
422 unsigned long long vm_stats_used_pages
;
423 unsigned long long vm_stats_swapped_objects
;
424 unsigned long long vm_stats_swapouts
;
425 unsigned long long vm_stats_swapins
;
429 typedef void redisCommandProc(redisClient
*c
);
430 struct redisCommand
{
432 redisCommandProc
*proc
;
435 /* What keys should be loaded in background when calling this command? */
436 int vm_firstkey
; /* The first argument that's a key (0 = no keys) */
437 int vm_lastkey
; /* THe last argument that's a key */
438 int vm_keystep
; /* The step between first and last key */
441 struct redisFunctionSym
{
443 unsigned long pointer
;
446 typedef struct _redisSortObject
{
454 typedef struct _redisSortOperation
{
457 } redisSortOperation
;
459 /* ZSETs use a specialized version of Skiplists */
461 typedef struct zskiplistNode
{
462 struct zskiplistNode
**forward
;
463 struct zskiplistNode
*backward
;
469 typedef struct zskiplist
{
470 struct zskiplistNode
*header
, *tail
;
471 unsigned long length
;
475 typedef struct zset
{
480 /* Our shared "common" objects */
482 struct sharedObjectsStruct
{
483 robj
*crlf
, *ok
, *err
, *emptybulk
, *czero
, *cone
, *pong
, *space
,
484 *colon
, *nullbulk
, *nullmultibulk
, *queued
,
485 *emptymultibulk
, *wrongtypeerr
, *nokeyerr
, *syntaxerr
, *sameobjecterr
,
486 *outofrangeerr
, *plus
,
487 *select0
, *select1
, *select2
, *select3
, *select4
,
488 *select5
, *select6
, *select7
, *select8
, *select9
;
491 /* Global vars that are actually used as constants. The following double
492 * values are used for double on-disk serialization, and are initialized
493 * at runtime to avoid strange compiler optimizations. */
495 static double R_Zero
, R_PosInf
, R_NegInf
, R_Nan
;
497 /* VM threaded I/O request message */
498 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
499 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
500 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
501 typedef struct iojob
{
502 int type
; /* Request type, REDIS_IOJOB_* */
503 redisDb
*db
;/* Redis database */
504 robj
*key
; /* This I/O request is about swapping this key */
505 robj
*val
; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
506 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
507 off_t page
; /* Swap page where to read/write the object */
508 off_t pages
; /* Swap pages needed to safe object. PREPARE_SWAP return val */
509 int canceled
; /* True if this command was canceled by blocking side of VM */
510 pthread_t thread
; /* ID of the thread processing this entry */
513 /*================================ Prototypes =============================== */
515 static void freeStringObject(robj
*o
);
516 static void freeListObject(robj
*o
);
517 static void freeSetObject(robj
*o
);
518 static void decrRefCount(void *o
);
519 static robj
*createObject(int type
, void *ptr
);
520 static void freeClient(redisClient
*c
);
521 static int rdbLoad(char *filename
);
522 static void addReply(redisClient
*c
, robj
*obj
);
523 static void addReplySds(redisClient
*c
, sds s
);
524 static void incrRefCount(robj
*o
);
525 static int rdbSaveBackground(char *filename
);
526 static robj
*createStringObject(char *ptr
, size_t len
);
527 static robj
*dupStringObject(robj
*o
);
528 static void replicationFeedSlaves(list
*slaves
, struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
529 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
530 static int syncWithMaster(void);
531 static robj
*tryObjectSharing(robj
*o
);
532 static int tryObjectEncoding(robj
*o
);
533 static robj
*getDecodedObject(robj
*o
);
534 static int removeExpire(redisDb
*db
, robj
*key
);
535 static int expireIfNeeded(redisDb
*db
, robj
*key
);
536 static int deleteIfVolatile(redisDb
*db
, robj
*key
);
537 static int deleteIfSwapped(redisDb
*db
, robj
*key
);
538 static int deleteKey(redisDb
*db
, robj
*key
);
539 static time_t getExpire(redisDb
*db
, robj
*key
);
540 static int setExpire(redisDb
*db
, robj
*key
, time_t when
);
541 static void updateSlavesWaitingBgsave(int bgsaveerr
);
542 static void freeMemoryIfNeeded(void);
543 static int processCommand(redisClient
*c
);
544 static void setupSigSegvAction(void);
545 static void rdbRemoveTempFile(pid_t childpid
);
546 static void aofRemoveTempFile(pid_t childpid
);
547 static size_t stringObjectLen(robj
*o
);
548 static void processInputBuffer(redisClient
*c
);
549 static zskiplist
*zslCreate(void);
550 static void zslFree(zskiplist
*zsl
);
551 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
);
552 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
553 static void initClientMultiState(redisClient
*c
);
554 static void freeClientMultiState(redisClient
*c
);
555 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
);
556 static void unblockClientWaitingData(redisClient
*c
);
557 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
);
558 static void vmInit(void);
559 static void vmMarkPagesFree(off_t page
, off_t count
);
560 static robj
*vmLoadObject(robj
*key
);
561 static robj
*vmPreviewObject(robj
*key
);
562 static int vmSwapOneObjectBlocking(void);
563 static int vmSwapOneObjectThreaded(void);
564 static int vmCanSwapOut(void);
565 static int tryFreeOneObjectFromFreelist(void);
566 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
567 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
568 static void vmCancelThreadedIOJob(robj
*o
);
569 static void lockThreadedIO(void);
570 static void unlockThreadedIO(void);
571 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
);
572 static void freeIOJob(iojob
*j
);
573 static void queueIOJob(iojob
*j
);
574 static int vmWriteObjectOnSwap(robj
*o
, off_t page
);
575 static robj
*vmReadObjectFromSwap(off_t page
, int type
);
576 static void waitEmptyIOJobsQueue(void);
577 static void vmReopenSwapFile(void);
578 static int vmFreePage(off_t page
);
579 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
);
580 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
);
581 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
);
582 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
583 static struct redisCommand
*lookupCommand(char *name
);
584 static void call(redisClient
*c
, struct redisCommand
*cmd
);
585 static void resetClient(redisClient
*c
);
587 static void authCommand(redisClient
*c
);
588 static void pingCommand(redisClient
*c
);
589 static void echoCommand(redisClient
*c
);
590 static void setCommand(redisClient
*c
);
591 static void setnxCommand(redisClient
*c
);
592 static void getCommand(redisClient
*c
);
593 static void delCommand(redisClient
*c
);
594 static void existsCommand(redisClient
*c
);
595 static void incrCommand(redisClient
*c
);
596 static void decrCommand(redisClient
*c
);
597 static void incrbyCommand(redisClient
*c
);
598 static void decrbyCommand(redisClient
*c
);
599 static void selectCommand(redisClient
*c
);
600 static void randomkeyCommand(redisClient
*c
);
601 static void keysCommand(redisClient
*c
);
602 static void dbsizeCommand(redisClient
*c
);
603 static void lastsaveCommand(redisClient
*c
);
604 static void saveCommand(redisClient
*c
);
605 static void bgsaveCommand(redisClient
*c
);
606 static void bgrewriteaofCommand(redisClient
*c
);
607 static void shutdownCommand(redisClient
*c
);
608 static void moveCommand(redisClient
*c
);
609 static void renameCommand(redisClient
*c
);
610 static void renamenxCommand(redisClient
*c
);
611 static void lpushCommand(redisClient
*c
);
612 static void rpushCommand(redisClient
*c
);
613 static void lpopCommand(redisClient
*c
);
614 static void rpopCommand(redisClient
*c
);
615 static void llenCommand(redisClient
*c
);
616 static void lindexCommand(redisClient
*c
);
617 static void lrangeCommand(redisClient
*c
);
618 static void ltrimCommand(redisClient
*c
);
619 static void typeCommand(redisClient
*c
);
620 static void lsetCommand(redisClient
*c
);
621 static void saddCommand(redisClient
*c
);
622 static void sremCommand(redisClient
*c
);
623 static void smoveCommand(redisClient
*c
);
624 static void sismemberCommand(redisClient
*c
);
625 static void scardCommand(redisClient
*c
);
626 static void spopCommand(redisClient
*c
);
627 static void srandmemberCommand(redisClient
*c
);
628 static void sinterCommand(redisClient
*c
);
629 static void sinterstoreCommand(redisClient
*c
);
630 static void sunionCommand(redisClient
*c
);
631 static void sunionstoreCommand(redisClient
*c
);
632 static void sdiffCommand(redisClient
*c
);
633 static void sdiffstoreCommand(redisClient
*c
);
634 static void syncCommand(redisClient
*c
);
635 static void flushdbCommand(redisClient
*c
);
636 static void flushallCommand(redisClient
*c
);
637 static void sortCommand(redisClient
*c
);
638 static void lremCommand(redisClient
*c
);
639 static void rpoplpushcommand(redisClient
*c
);
640 static void infoCommand(redisClient
*c
);
641 static void mgetCommand(redisClient
*c
);
642 static void monitorCommand(redisClient
*c
);
643 static void expireCommand(redisClient
*c
);
644 static void expireatCommand(redisClient
*c
);
645 static void getsetCommand(redisClient
*c
);
646 static void ttlCommand(redisClient
*c
);
647 static void slaveofCommand(redisClient
*c
);
648 static void debugCommand(redisClient
*c
);
649 static void msetCommand(redisClient
*c
);
650 static void msetnxCommand(redisClient
*c
);
651 static void zaddCommand(redisClient
*c
);
652 static void zincrbyCommand(redisClient
*c
);
653 static void zrangeCommand(redisClient
*c
);
654 static void zrangebyscoreCommand(redisClient
*c
);
655 static void zcountCommand(redisClient
*c
);
656 static void zrevrangeCommand(redisClient
*c
);
657 static void zcardCommand(redisClient
*c
);
658 static void zremCommand(redisClient
*c
);
659 static void zscoreCommand(redisClient
*c
);
660 static void zremrangebyscoreCommand(redisClient
*c
);
661 static void multiCommand(redisClient
*c
);
662 static void execCommand(redisClient
*c
);
663 static void discardCommand(redisClient
*c
);
664 static void blpopCommand(redisClient
*c
);
665 static void brpopCommand(redisClient
*c
);
666 static void appendCommand(redisClient
*c
);
667 static void substrCommand(redisClient
*c
);
668 static void zrankCommand(redisClient
*c
);
670 /*================================= Globals ================================= */
673 static struct redisServer server
; /* server global state */
674 static struct redisCommand cmdTable
[] = {
675 {"get",getCommand
,2,REDIS_CMD_INLINE
,1,1,1},
676 {"set",setCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,0,0,0},
677 {"setnx",setnxCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,0,0,0},
678 {"append",appendCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
679 {"substr",substrCommand
,4,REDIS_CMD_INLINE
,1,1,1},
680 {"del",delCommand
,-2,REDIS_CMD_INLINE
,0,0,0},
681 {"exists",existsCommand
,2,REDIS_CMD_INLINE
,1,1,1},
682 {"incr",incrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
683 {"decr",decrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
684 {"mget",mgetCommand
,-2,REDIS_CMD_INLINE
,1,-1,1},
685 {"rpush",rpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
686 {"lpush",lpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
687 {"rpop",rpopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
688 {"lpop",lpopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
689 {"brpop",brpopCommand
,-3,REDIS_CMD_INLINE
,1,1,1},
690 {"blpop",blpopCommand
,-3,REDIS_CMD_INLINE
,1,1,1},
691 {"llen",llenCommand
,2,REDIS_CMD_INLINE
,1,1,1},
692 {"lindex",lindexCommand
,3,REDIS_CMD_INLINE
,1,1,1},
693 {"lset",lsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
694 {"lrange",lrangeCommand
,4,REDIS_CMD_INLINE
,1,1,1},
695 {"ltrim",ltrimCommand
,4,REDIS_CMD_INLINE
,1,1,1},
696 {"lrem",lremCommand
,4,REDIS_CMD_BULK
,1,1,1},
697 {"rpoplpush",rpoplpushcommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,2,1},
698 {"sadd",saddCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
699 {"srem",sremCommand
,3,REDIS_CMD_BULK
,1,1,1},
700 {"smove",smoveCommand
,4,REDIS_CMD_BULK
,1,2,1},
701 {"sismember",sismemberCommand
,3,REDIS_CMD_BULK
,1,1,1},
702 {"scard",scardCommand
,2,REDIS_CMD_INLINE
,1,1,1},
703 {"spop",spopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
704 {"srandmember",srandmemberCommand
,2,REDIS_CMD_INLINE
,1,1,1},
705 {"sinter",sinterCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
706 {"sinterstore",sinterstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
707 {"sunion",sunionCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
708 {"sunionstore",sunionstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
709 {"sdiff",sdiffCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
710 {"sdiffstore",sdiffstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
711 {"smembers",sinterCommand
,2,REDIS_CMD_INLINE
,1,1,1},
712 {"zadd",zaddCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
713 {"zincrby",zincrbyCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
714 {"zrem",zremCommand
,3,REDIS_CMD_BULK
,1,1,1},
715 {"zremrangebyscore",zremrangebyscoreCommand
,4,REDIS_CMD_INLINE
,1,1,1},
716 {"zrange",zrangeCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
717 {"zrangebyscore",zrangebyscoreCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
718 {"zcount",zcountCommand
,4,REDIS_CMD_INLINE
,1,1,1},
719 {"zrevrange",zrevrangeCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
720 {"zcard",zcardCommand
,2,REDIS_CMD_INLINE
,1,1,1},
721 {"zscore",zscoreCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
722 {"zrank",zrankCommand
,3,REDIS_CMD_INLINE
,1,1,1},
723 {"incrby",incrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
724 {"decrby",decrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
725 {"getset",getsetCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
726 {"mset",msetCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,-1,2},
727 {"msetnx",msetnxCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,-1,2},
728 {"randomkey",randomkeyCommand
,1,REDIS_CMD_INLINE
,0,0,0},
729 {"select",selectCommand
,2,REDIS_CMD_INLINE
,0,0,0},
730 {"move",moveCommand
,3,REDIS_CMD_INLINE
,1,1,1},
731 {"rename",renameCommand
,3,REDIS_CMD_INLINE
,1,1,1},
732 {"renamenx",renamenxCommand
,3,REDIS_CMD_INLINE
,1,1,1},
733 {"expire",expireCommand
,3,REDIS_CMD_INLINE
,0,0,0},
734 {"expireat",expireatCommand
,3,REDIS_CMD_INLINE
,0,0,0},
735 {"keys",keysCommand
,2,REDIS_CMD_INLINE
,0,0,0},
736 {"dbsize",dbsizeCommand
,1,REDIS_CMD_INLINE
,0,0,0},
737 {"auth",authCommand
,2,REDIS_CMD_INLINE
,0,0,0},
738 {"ping",pingCommand
,1,REDIS_CMD_INLINE
,0,0,0},
739 {"echo",echoCommand
,2,REDIS_CMD_BULK
,0,0,0},
740 {"save",saveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
741 {"bgsave",bgsaveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
742 {"bgrewriteaof",bgrewriteaofCommand
,1,REDIS_CMD_INLINE
,0,0,0},
743 {"shutdown",shutdownCommand
,1,REDIS_CMD_INLINE
,0,0,0},
744 {"lastsave",lastsaveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
745 {"type",typeCommand
,2,REDIS_CMD_INLINE
,1,1,1},
746 {"multi",multiCommand
,1,REDIS_CMD_INLINE
,0,0,0},
747 {"exec",execCommand
,1,REDIS_CMD_INLINE
,0,0,0},
748 {"discard",discardCommand
,1,REDIS_CMD_INLINE
,0,0,0},
749 {"sync",syncCommand
,1,REDIS_CMD_INLINE
,0,0,0},
750 {"flushdb",flushdbCommand
,1,REDIS_CMD_INLINE
,0,0,0},
751 {"flushall",flushallCommand
,1,REDIS_CMD_INLINE
,0,0,0},
752 {"sort",sortCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
753 {"info",infoCommand
,1,REDIS_CMD_INLINE
,0,0,0},
754 {"monitor",monitorCommand
,1,REDIS_CMD_INLINE
,0,0,0},
755 {"ttl",ttlCommand
,2,REDIS_CMD_INLINE
,1,1,1},
756 {"slaveof",slaveofCommand
,3,REDIS_CMD_INLINE
,0,0,0},
757 {"debug",debugCommand
,-2,REDIS_CMD_INLINE
,0,0,0},
758 {NULL
,NULL
,0,0,0,0,0}
761 /*============================ Utility functions ============================ */
763 /* Glob-style pattern matching. */
764 int stringmatchlen(const char *pattern
, int patternLen
,
765 const char *string
, int stringLen
, int nocase
)
770 while (pattern
[1] == '*') {
775 return 1; /* match */
777 if (stringmatchlen(pattern
+1, patternLen
-1,
778 string
, stringLen
, nocase
))
779 return 1; /* match */
783 return 0; /* no match */
787 return 0; /* no match */
797 not = pattern
[0] == '^';
804 if (pattern
[0] == '\\') {
807 if (pattern
[0] == string
[0])
809 } else if (pattern
[0] == ']') {
811 } else if (patternLen
== 0) {
815 } else if (pattern
[1] == '-' && patternLen
>= 3) {
816 int start
= pattern
[0];
817 int end
= pattern
[2];
825 start
= tolower(start
);
831 if (c
>= start
&& c
<= end
)
835 if (pattern
[0] == string
[0])
838 if (tolower((int)pattern
[0]) == tolower((int)string
[0]))
848 return 0; /* no match */
854 if (patternLen
>= 2) {
861 if (pattern
[0] != string
[0])
862 return 0; /* no match */
864 if (tolower((int)pattern
[0]) != tolower((int)string
[0]))
865 return 0; /* no match */
873 if (stringLen
== 0) {
874 while(*pattern
== '*') {
881 if (patternLen
== 0 && stringLen
== 0)
886 static void redisLog(int level
, const char *fmt
, ...) {
890 fp
= (server
.logfile
== NULL
) ? stdout
: fopen(server
.logfile
,"a");
894 if (level
>= server
.verbosity
) {
900 strftime(buf
,64,"%d %b %H:%M:%S",localtime(&now
));
901 fprintf(fp
,"[%d] %s %c ",(int)getpid(),buf
,c
[level
]);
902 vfprintf(fp
, fmt
, ap
);
908 if (server
.logfile
) fclose(fp
);
911 /*====================== Hash table type implementation ==================== */
913 /* This is a hash table type that uses the SDS dynamic strings library as
914 * keys and redis objects as values (objects can hold SDS strings,
917 static void dictVanillaFree(void *privdata
, void *val
)
919 DICT_NOTUSED(privdata
);
923 static void dictListDestructor(void *privdata
, void *val
)
925 DICT_NOTUSED(privdata
);
926 listRelease((list
*)val
);
929 static int sdsDictKeyCompare(void *privdata
, const void *key1
,
933 DICT_NOTUSED(privdata
);
935 l1
= sdslen((sds
)key1
);
936 l2
= sdslen((sds
)key2
);
937 if (l1
!= l2
) return 0;
938 return memcmp(key1
, key2
, l1
) == 0;
941 static void dictRedisObjectDestructor(void *privdata
, void *val
)
943 DICT_NOTUSED(privdata
);
945 if (val
== NULL
) return; /* Values of swapped out keys as set to NULL */
949 static int dictObjKeyCompare(void *privdata
, const void *key1
,
952 const robj
*o1
= key1
, *o2
= key2
;
953 return sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
956 static unsigned int dictObjHash(const void *key
) {
958 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
961 static int dictEncObjKeyCompare(void *privdata
, const void *key1
,
964 robj
*o1
= (robj
*) key1
, *o2
= (robj
*) key2
;
967 o1
= getDecodedObject(o1
);
968 o2
= getDecodedObject(o2
);
969 cmp
= sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
975 static unsigned int dictEncObjHash(const void *key
) {
976 robj
*o
= (robj
*) key
;
978 if (o
->encoding
== REDIS_ENCODING_RAW
) {
979 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
981 if (o
->encoding
== REDIS_ENCODING_INT
) {
985 len
= snprintf(buf
,32,"%ld",(long)o
->ptr
);
986 return dictGenHashFunction((unsigned char*)buf
, len
);
990 o
= getDecodedObject(o
);
991 hash
= dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
998 /* Sets type and expires */
999 static dictType setDictType
= {
1000 dictEncObjHash
, /* hash function */
1003 dictEncObjKeyCompare
, /* key compare */
1004 dictRedisObjectDestructor
, /* key destructor */
1005 NULL
/* val destructor */
1008 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1009 static dictType zsetDictType
= {
1010 dictEncObjHash
, /* hash function */
1013 dictEncObjKeyCompare
, /* key compare */
1014 dictRedisObjectDestructor
, /* key destructor */
1015 dictVanillaFree
/* val destructor of malloc(sizeof(double)) */
1019 static dictType dbDictType
= {
1020 dictObjHash
, /* hash function */
1023 dictObjKeyCompare
, /* key compare */
1024 dictRedisObjectDestructor
, /* key destructor */
1025 dictRedisObjectDestructor
/* val destructor */
1029 static dictType keyptrDictType
= {
1030 dictObjHash
, /* hash function */
1033 dictObjKeyCompare
, /* key compare */
1034 dictRedisObjectDestructor
, /* key destructor */
1035 NULL
/* val destructor */
1038 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1039 static dictType hashDictType
= {
1040 dictEncObjHash
, /* hash function */
1043 dictEncObjKeyCompare
, /* key compare */
1044 dictRedisObjectDestructor
, /* key destructor */
1045 dictRedisObjectDestructor
/* val destructor */
1048 /* Keylist hash table type has unencoded redis objects as keys and
1049 * lists as values. It's used for blocking operations (BLPOP) and to
1050 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1051 static dictType keylistDictType
= {
1052 dictObjHash
, /* hash function */
1055 dictObjKeyCompare
, /* key compare */
1056 dictRedisObjectDestructor
, /* key destructor */
1057 dictListDestructor
/* val destructor */
1060 /* ========================= Random utility functions ======================= */
1062 /* Redis generally does not try to recover from out of memory conditions
1063 * when allocating objects or strings, it is not clear if it will be possible
1064 * to report this condition to the client since the networking layer itself
1065 * is based on heap allocation for send buffers, so we simply abort.
1066 * At least the code will be simpler to read... */
1067 static void oom(const char *msg
) {
1068 redisLog(REDIS_WARNING
, "%s: Out of memory\n",msg
);
1073 /* ====================== Redis server networking stuff ===================== */
1074 static void closeTimedoutClients(void) {
1077 time_t now
= time(NULL
);
1080 listRewind(server
.clients
,&li
);
1081 while ((ln
= listNext(&li
)) != NULL
) {
1082 c
= listNodeValue(ln
);
1083 if (server
.maxidletime
&&
1084 !(c
->flags
& REDIS_SLAVE
) && /* no timeout for slaves */
1085 !(c
->flags
& REDIS_MASTER
) && /* no timeout for masters */
1086 (now
- c
->lastinteraction
> server
.maxidletime
))
1088 redisLog(REDIS_VERBOSE
,"Closing idle client");
1090 } else if (c
->flags
& REDIS_BLOCKED
) {
1091 if (c
->blockingto
!= 0 && c
->blockingto
< now
) {
1092 addReply(c
,shared
.nullmultibulk
);
1093 unblockClientWaitingData(c
);
1099 static int htNeedsResize(dict
*dict
) {
1100 long long size
, used
;
1102 size
= dictSlots(dict
);
1103 used
= dictSize(dict
);
1104 return (size
&& used
&& size
> DICT_HT_INITIAL_SIZE
&&
1105 (used
*100/size
< REDIS_HT_MINFILL
));
1108 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1109 * we resize the hash table to save memory */
1110 static void tryResizeHashTables(void) {
1113 for (j
= 0; j
< server
.dbnum
; j
++) {
1114 if (htNeedsResize(server
.db
[j
].dict
)) {
1115 redisLog(REDIS_VERBOSE
,"The hash table %d is too sparse, resize it...",j
);
1116 dictResize(server
.db
[j
].dict
);
1117 redisLog(REDIS_VERBOSE
,"Hash table %d resized.",j
);
1119 if (htNeedsResize(server
.db
[j
].expires
))
1120 dictResize(server
.db
[j
].expires
);
1124 /* A background saving child (BGSAVE) terminated its work. Handle this.
 *
 * 'statloc' is the wait status of the child, decoded below with
 * WEXITSTATUS() and WIFSIGNALED(). The function logs the outcome, updates
 * server.lastsave on success, calls rdbRemoveTempFile() in the branch that
 * reports termination by signal, resets server.bgsavechildpid to -1 and
 * finally notifies updateSlavesWaitingBgsave() with REDIS_OK or REDIS_ERR. */
1125 void backgroundSaveDoneHandler(int statloc
) {
/* NOTE(review): WEXITSTATUS() is evaluated unconditionally here; POSIX only
 * defines its value when the child exited normally (WIFEXITED) - confirm
 * that the value obtained for a signaled child is what is intended. */
1126 int exitcode
= WEXITSTATUS(statloc
);
1127 int bysignal
= WIFSIGNALED(statloc
);
/* Success: the child exited by itself with status 0. */
1129 if (!bysignal
&& exitcode
== 0) {
1130 redisLog(REDIS_NOTICE
,
1131 "Background saving terminated with success");
1133 server
.lastsave
= time(NULL
);
/* Failure: the child exited by itself with a non-zero status. */
1134 } else if (!bysignal
&& exitcode
!= 0) {
1135 redisLog(REDIS_WARNING
, "Background saving error");
/* Remaining case: the child was terminated by a signal. The temp file is
 * removed using the child pid, which is still stored at this point. */
1137 redisLog(REDIS_WARNING
,
1138 "Background saving terminated by signal");
1139 rdbRemoveTempFile(server
.bgsavechildpid
);
/* No background save is in progress anymore. */
1141 server
.bgsavechildpid
= -1;
1142 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1143 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
/* NOTE(review): only 'exitcode' is tested here, not 'bysignal'. If a child
 * killed by a signal decodes to exitcode == 0, the slaves would be told
 * REDIS_OK; consider (!bysignal && exitcode == 0) - confirm. */
1144 updateSlavesWaitingBgsave(exitcode
== 0 ? REDIS_OK
: REDIS_ERR
);
1147 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1149 void backgroundRewriteDoneHandler(int statloc
) {
1150 int exitcode
= WEXITSTATUS(statloc
);
1151 int bysignal
= WIFSIGNALED(statloc
);
1153 if (!bysignal
&& exitcode
== 0) {
1157 redisLog(REDIS_NOTICE
,
1158 "Background append only file rewriting terminated with success");
1159 /* Now it's time to flush the differences accumulated by the parent */
1160 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) server
.bgrewritechildpid
);
1161 fd
= open(tmpfile
,O_WRONLY
|O_APPEND
);
1163 redisLog(REDIS_WARNING
, "Not able to open the temp append only file produced by the child: %s", strerror(errno
));
1166 /* Flush our data... */
1167 if (write(fd
,server
.bgrewritebuf
,sdslen(server
.bgrewritebuf
)) !=
1168 (signed) sdslen(server
.bgrewritebuf
)) {
1169 redisLog(REDIS_WARNING
, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno
));
1173 redisLog(REDIS_NOTICE
,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server
.bgrewritebuf
));
1174 /* Now our work is to rename the temp file into the stable file. And
1175 * switch the file descriptor used by the server for append only. */
1176 if (rename(tmpfile
,server
.appendfilename
) == -1) {
1177 redisLog(REDIS_WARNING
,"Can't rename the temp append only file into the stable one: %s", strerror(errno
));
1181 /* Mission completed... almost */
1182 redisLog(REDIS_NOTICE
,"Append only file successfully rewritten.");
1183 if (server
.appendfd
!= -1) {
1184 /* If append only is actually enabled... */
1185 close(server
.appendfd
);
1186 server
.appendfd
= fd
;
1188 server
.appendseldb
= -1; /* Make sure it will issue SELECT */
1189 redisLog(REDIS_NOTICE
,"The new append only file was selected for future appends.");
1191 /* If append only is disabled we just generate a dump in this
1192 * format. Why not? */
1195 } else if (!bysignal
&& exitcode
!= 0) {
1196 redisLog(REDIS_WARNING
, "Background append only file rewriting error");
1198 redisLog(REDIS_WARNING
,
1199 "Background append only file rewriting terminated by signal");
1202 sdsfree(server
.bgrewritebuf
);
1203 server
.bgrewritebuf
= sdsempty();
1204 aofRemoveTempFile(server
.bgrewritechildpid
);
1205 server
.bgrewritechildpid
= -1;
1208 static int serverCron(struct aeEventLoop
*eventLoop
, long long id
, void *clientData
) {
1209 int j
, loops
= server
.cronloops
++;
1210 REDIS_NOTUSED(eventLoop
);
1212 REDIS_NOTUSED(clientData
);
1214 /* We take a cached value of the unix time in the global state because
1215 * with virtual memory and aging we need to store the current time
1216 * in objects at every object access, and accuracy is not needed.
1217 * Accessing a global var is faster than calling time(NULL) */
1218 server
.unixtime
= time(NULL
);
1220 /* Show some info about non-empty databases */
1221 for (j
= 0; j
< server
.dbnum
; j
++) {
1222 long long size
, used
, vkeys
;
1224 size
= dictSlots(server
.db
[j
].dict
);
1225 used
= dictSize(server
.db
[j
].dict
);
1226 vkeys
= dictSize(server
.db
[j
].expires
);
1227 if (!(loops
% 5) && (used
|| vkeys
)) {
1228 redisLog(REDIS_VERBOSE
,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j
,used
,vkeys
,size
);
1229 /* dictPrintStats(server.dict); */
1233 /* We don't want to resize the hash tables while a background saving
1234 * is in progress: the saving child is created using fork() that is
1235 * implemented with a copy-on-write semantic in most modern systems, so
1236 * if we resize the HT while there is the saving child at work actually
1237 * a lot of memory movements in the parent will cause a lot of pages
1239 if (server
.bgsavechildpid
== -1) tryResizeHashTables();
1241 /* Show information about connected clients */
1243 redisLog(REDIS_VERBOSE
,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1244 listLength(server
.clients
)-listLength(server
.slaves
),
1245 listLength(server
.slaves
),
1246 zmalloc_used_memory(),
1247 dictSize(server
.sharingpool
));
1250 /* Close connections of timedout clients */
1251 if ((server
.maxidletime
&& !(loops
% 10)) || server
.blpop_blocked_clients
)
1252 closeTimedoutClients();
1254 /* Check if a background saving or AOF rewrite in progress terminated */
1255 if (server
.bgsavechildpid
!= -1 || server
.bgrewritechildpid
!= -1) {
1259 if ((pid
= wait3(&statloc
,WNOHANG
,NULL
)) != 0) {
1260 if (pid
== server
.bgsavechildpid
) {
1261 backgroundSaveDoneHandler(statloc
);
1263 backgroundRewriteDoneHandler(statloc
);
1267 /* If there is not a background saving in progress check if
1268 * we have to save now */
1269 time_t now
= time(NULL
);
1270 for (j
= 0; j
< server
.saveparamslen
; j
++) {
1271 struct saveparam
*sp
= server
.saveparams
+j
;
1273 if (server
.dirty
>= sp
->changes
&&
1274 now
-server
.lastsave
> sp
->seconds
) {
1275 redisLog(REDIS_NOTICE
,"%d changes in %d seconds. Saving...",
1276 sp
->changes
, sp
->seconds
);
1277 rdbSaveBackground(server
.dbfilename
);
1283 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1284 * will use few CPU cycles if there are few expiring keys, otherwise
1285 * it will get more aggressive to avoid that too much memory is used by
1286 * keys that can be removed from the keyspace. */
1287 for (j
= 0; j
< server
.dbnum
; j
++) {
1289 redisDb
*db
= server
.db
+j
;
1291 /* Continue to expire if at the end of the cycle more than 25%
1292 * of the keys were expired. */
1294 long num
= dictSize(db
->expires
);
1295 time_t now
= time(NULL
);
1298 if (num
> REDIS_EXPIRELOOKUPS_PER_CRON
)
1299 num
= REDIS_EXPIRELOOKUPS_PER_CRON
;
1304 if ((de
= dictGetRandomKey(db
->expires
)) == NULL
) break;
1305 t
= (time_t) dictGetEntryVal(de
);
1307 deleteKey(db
,dictGetEntryKey(de
));
1311 } while (expired
> REDIS_EXPIRELOOKUPS_PER_CRON
/4);
1314 /* Swap a few keys on disk if we are over the memory limit and VM
1315 * is enabled. Try to free objects from the free list first. */
1316 if (vmCanSwapOut()) {
1317 while (server
.vm_enabled
&& zmalloc_used_memory() >
1318 server
.vm_max_memory
)
1322 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
1323 retval
= (server
.vm_max_threads
== 0) ?
1324 vmSwapOneObjectBlocking() :
1325 vmSwapOneObjectThreaded();
1326 if (retval
== REDIS_ERR
&& (loops
% 30) == 0 &&
1327 zmalloc_used_memory() >
1328 (server
.vm_max_memory
+server
.vm_max_memory
/10))
1330 redisLog(REDIS_WARNING
,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1332 /* Note that when using threaded I/O we free just one object,
1333 * because anyway when the I/O thread in charge to swap this
1334 * object out will finish, the handler of completed jobs
1335 * will try to swap more objects if we are still out of memory. */
1336 if (retval
== REDIS_ERR
|| server
.vm_max_threads
> 0) break;
1340 /* Check if we should connect to a MASTER */
1341 if (server
.replstate
== REDIS_REPL_CONNECT
) {
1342 redisLog(REDIS_NOTICE
,"Connecting to MASTER...");
1343 if (syncWithMaster() == REDIS_OK
) {
1344 redisLog(REDIS_NOTICE
,"MASTER <-> SLAVE sync succeeded");
1350 /* This function gets called every time Redis is entering the
1351 * main loop of the event driven library, that is, before sleeping
1352 * while waiting for ready file descriptors. */
1353 static void beforeSleep(struct aeEventLoop
*eventLoop
) {
1354 REDIS_NOTUSED(eventLoop
);
1356 if (server
.vm_enabled
&& listLength(server
.io_ready_clients
)) {
1360 listRewind(server
.io_ready_clients
,&li
);
1361 while((ln
= listNext(&li
))) {
1362 redisClient
*c
= ln
->value
;
1363 struct redisCommand
*cmd
;
1365 /* Resume the client. */
1366 listDelNode(server
.io_ready_clients
,ln
);
1367 c
->flags
&= (~REDIS_IO_WAIT
);
1368 server
.vm_blocked_clients
--;
1369 aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
1370 readQueryFromClient
, c
);
1371 cmd
= lookupCommand(c
->argv
[0]->ptr
);
1372 assert(cmd
!= NULL
);
1375 /* There may be more data to process in the input buffer. */
1376 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0)
1377 processInputBuffer(c
);
1382 static void createSharedObjects(void) {
1383 shared
.crlf
= createObject(REDIS_STRING
,sdsnew("\r\n"));
1384 shared
.ok
= createObject(REDIS_STRING
,sdsnew("+OK\r\n"));
1385 shared
.err
= createObject(REDIS_STRING
,sdsnew("-ERR\r\n"));
1386 shared
.emptybulk
= createObject(REDIS_STRING
,sdsnew("$0\r\n\r\n"));
1387 shared
.czero
= createObject(REDIS_STRING
,sdsnew(":0\r\n"));
1388 shared
.cone
= createObject(REDIS_STRING
,sdsnew(":1\r\n"));
1389 shared
.nullbulk
= createObject(REDIS_STRING
,sdsnew("$-1\r\n"));
1390 shared
.nullmultibulk
= createObject(REDIS_STRING
,sdsnew("*-1\r\n"));
1391 shared
.emptymultibulk
= createObject(REDIS_STRING
,sdsnew("*0\r\n"));
1392 shared
.pong
= createObject(REDIS_STRING
,sdsnew("+PONG\r\n"));
1393 shared
.queued
= createObject(REDIS_STRING
,sdsnew("+QUEUED\r\n"));
1394 shared
.wrongtypeerr
= createObject(REDIS_STRING
,sdsnew(
1395 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1396 shared
.nokeyerr
= createObject(REDIS_STRING
,sdsnew(
1397 "-ERR no such key\r\n"));
1398 shared
.syntaxerr
= createObject(REDIS_STRING
,sdsnew(
1399 "-ERR syntax error\r\n"));
1400 shared
.sameobjecterr
= createObject(REDIS_STRING
,sdsnew(
1401 "-ERR source and destination objects are the same\r\n"));
1402 shared
.outofrangeerr
= createObject(REDIS_STRING
,sdsnew(
1403 "-ERR index out of range\r\n"));
1404 shared
.space
= createObject(REDIS_STRING
,sdsnew(" "));
1405 shared
.colon
= createObject(REDIS_STRING
,sdsnew(":"));
1406 shared
.plus
= createObject(REDIS_STRING
,sdsnew("+"));
1407 shared
.select0
= createStringObject("select 0\r\n",10);
1408 shared
.select1
= createStringObject("select 1\r\n",10);
1409 shared
.select2
= createStringObject("select 2\r\n",10);
1410 shared
.select3
= createStringObject("select 3\r\n",10);
1411 shared
.select4
= createStringObject("select 4\r\n",10);
1412 shared
.select5
= createStringObject("select 5\r\n",10);
1413 shared
.select6
= createStringObject("select 6\r\n",10);
1414 shared
.select7
= createStringObject("select 7\r\n",10);
1415 shared
.select8
= createStringObject("select 8\r\n",10);
1416 shared
.select9
= createStringObject("select 9\r\n",10);
1419 static void appendServerSaveParams(time_t seconds
, int changes
) {
1420 server
.saveparams
= zrealloc(server
.saveparams
,sizeof(struct saveparam
)*(server
.saveparamslen
+1));
1421 server
.saveparams
[server
.saveparamslen
].seconds
= seconds
;
1422 server
.saveparams
[server
.saveparamslen
].changes
= changes
;
1423 server
.saveparamslen
++;
1426 static void resetServerSaveParams() {
1427 zfree(server
.saveparams
);
1428 server
.saveparams
= NULL
;
1429 server
.saveparamslen
= 0;
1432 static void initServerConfig() {
1433 server
.dbnum
= REDIS_DEFAULT_DBNUM
;
1434 server
.port
= REDIS_SERVERPORT
;
1435 server
.verbosity
= REDIS_VERBOSE
;
1436 server
.maxidletime
= REDIS_MAXIDLETIME
;
1437 server
.saveparams
= NULL
;
1438 server
.logfile
= NULL
; /* NULL = log on standard output */
1439 server
.bindaddr
= NULL
;
1440 server
.glueoutputbuf
= 1;
1441 server
.daemonize
= 0;
1442 server
.appendonly
= 0;
1443 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1444 server
.lastfsync
= time(NULL
);
1445 server
.appendfd
= -1;
1446 server
.appendseldb
= -1; /* Make sure the first time will not match */
1447 server
.pidfile
= "/var/run/redis.pid";
1448 server
.dbfilename
= "dump.rdb";
1449 server
.appendfilename
= "appendonly.aof";
1450 server
.requirepass
= NULL
;
1451 server
.shareobjects
= 0;
1452 server
.rdbcompression
= 1;
1453 server
.sharingpoolsize
= 1024;
1454 server
.maxclients
= 0;
1455 server
.blpop_blocked_clients
= 0;
1456 server
.maxmemory
= 0;
1457 server
.vm_enabled
= 0;
1458 server
.vm_swap_file
= zstrdup("/tmp/redis-%p.vm");
1459 server
.vm_page_size
= 256; /* 256 bytes per page */
1460 server
.vm_pages
= 1024*1024*100; /* 104 millions of pages */
1461 server
.vm_max_memory
= 1024LL*1024*1024*1; /* 1 GB of RAM */
1462 server
.vm_max_threads
= 4;
1463 server
.vm_blocked_clients
= 0;
1465 resetServerSaveParams();
1467 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1468 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1469 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1470 /* Replication related */
1472 server
.masterauth
= NULL
;
1473 server
.masterhost
= NULL
;
1474 server
.masterport
= 6379;
1475 server
.master
= NULL
;
1476 server
.replstate
= REDIS_REPL_NONE
;
1478 /* Double constants initialization */
1480 R_PosInf
= 1.0/R_Zero
;
1481 R_NegInf
= -1.0/R_Zero
;
1482 R_Nan
= R_Zero
/R_Zero
;
1485 static void initServer() {
1488 signal(SIGHUP
, SIG_IGN
);
1489 signal(SIGPIPE
, SIG_IGN
);
1490 setupSigSegvAction();
1492 server
.devnull
= fopen("/dev/null","w");
1493 if (server
.devnull
== NULL
) {
1494 redisLog(REDIS_WARNING
, "Can't open /dev/null: %s", server
.neterr
);
1497 server
.clients
= listCreate();
1498 server
.slaves
= listCreate();
1499 server
.monitors
= listCreate();
1500 server
.objfreelist
= listCreate();
1501 createSharedObjects();
1502 server
.el
= aeCreateEventLoop();
1503 server
.db
= zmalloc(sizeof(redisDb
)*server
.dbnum
);
1504 server
.sharingpool
= dictCreate(&setDictType
,NULL
);
1505 server
.fd
= anetTcpServer(server
.neterr
, server
.port
, server
.bindaddr
);
1506 if (server
.fd
== -1) {
1507 redisLog(REDIS_WARNING
, "Opening TCP port: %s", server
.neterr
);
1510 for (j
= 0; j
< server
.dbnum
; j
++) {
1511 server
.db
[j
].dict
= dictCreate(&dbDictType
,NULL
);
1512 server
.db
[j
].expires
= dictCreate(&keyptrDictType
,NULL
);
1513 server
.db
[j
].blockingkeys
= dictCreate(&keylistDictType
,NULL
);
1514 if (server
.vm_enabled
)
1515 server
.db
[j
].io_keys
= dictCreate(&keylistDictType
,NULL
);
1516 server
.db
[j
].id
= j
;
1518 server
.cronloops
= 0;
1519 server
.bgsavechildpid
= -1;
1520 server
.bgrewritechildpid
= -1;
1521 server
.bgrewritebuf
= sdsempty();
1522 server
.lastsave
= time(NULL
);
1524 server
.stat_numcommands
= 0;
1525 server
.stat_numconnections
= 0;
1526 server
.stat_starttime
= time(NULL
);
1527 server
.unixtime
= time(NULL
);
1528 aeCreateTimeEvent(server
.el
, 1, serverCron
, NULL
, NULL
);
1529 if (aeCreateFileEvent(server
.el
, server
.fd
, AE_READABLE
,
1530 acceptHandler
, NULL
) == AE_ERR
) oom("creating file event");
1532 if (server
.appendonly
) {
1533 server
.appendfd
= open(server
.appendfilename
,O_WRONLY
|O_APPEND
|O_CREAT
,0644);
1534 if (server
.appendfd
== -1) {
1535 redisLog(REDIS_WARNING
, "Can't open the append-only file: %s",
1541 if (server
.vm_enabled
) vmInit();
1544 /* Empty the whole database */
1545 static long long emptyDb() {
1547 long long removed
= 0;
1549 for (j
= 0; j
< server
.dbnum
; j
++) {
1550 removed
+= dictSize(server
.db
[j
].dict
);
1551 dictEmpty(server
.db
[j
].dict
);
1552 dictEmpty(server
.db
[j
].expires
);
/* Convert a "yes" / "no" string (matched case-insensitively) to 1 / 0.
 * Any other string yields -1, the value callers test for to report an
 * invalid argument. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1563 /* I agree, this is a very rudimentary way to load a configuration...
1564 It will be improved later if the config gets more complex */
1565 static void loadServerConfig(char *filename
) {
1567 char buf
[REDIS_CONFIGLINE_MAX
+1], *err
= NULL
;
1571 if (filename
[0] == '-' && filename
[1] == '\0')
1574 if ((fp
= fopen(filename
,"r")) == NULL
) {
1575 redisLog(REDIS_WARNING
,"Fatal error, can't open config file");
1580 while(fgets(buf
,REDIS_CONFIGLINE_MAX
+1,fp
) != NULL
) {
1586 line
= sdstrim(line
," \t\r\n");
1588 /* Skip comments and blank lines*/
1589 if (line
[0] == '#' || line
[0] == '\0') {
1594 /* Split into arguments */
1595 argv
= sdssplitlen(line
,sdslen(line
)," ",1,&argc
);
1596 sdstolower(argv
[0]);
1598 /* Execute config directives */
1599 if (!strcasecmp(argv
[0],"timeout") && argc
== 2) {
1600 server
.maxidletime
= atoi(argv
[1]);
1601 if (server
.maxidletime
< 0) {
1602 err
= "Invalid timeout value"; goto loaderr
;
1604 } else if (!strcasecmp(argv
[0],"port") && argc
== 2) {
1605 server
.port
= atoi(argv
[1]);
1606 if (server
.port
< 1 || server
.port
> 65535) {
1607 err
= "Invalid port"; goto loaderr
;
1609 } else if (!strcasecmp(argv
[0],"bind") && argc
== 2) {
1610 server
.bindaddr
= zstrdup(argv
[1]);
1611 } else if (!strcasecmp(argv
[0],"save") && argc
== 3) {
1612 int seconds
= atoi(argv
[1]);
1613 int changes
= atoi(argv
[2]);
1614 if (seconds
< 1 || changes
< 0) {
1615 err
= "Invalid save parameters"; goto loaderr
;
1617 appendServerSaveParams(seconds
,changes
);
1618 } else if (!strcasecmp(argv
[0],"dir") && argc
== 2) {
1619 if (chdir(argv
[1]) == -1) {
1620 redisLog(REDIS_WARNING
,"Can't chdir to '%s': %s",
1621 argv
[1], strerror(errno
));
1624 } else if (!strcasecmp(argv
[0],"loglevel") && argc
== 2) {
1625 if (!strcasecmp(argv
[1],"debug")) server
.verbosity
= REDIS_DEBUG
;
1626 else if (!strcasecmp(argv
[1],"verbose")) server
.verbosity
= REDIS_VERBOSE
;
1627 else if (!strcasecmp(argv
[1],"notice")) server
.verbosity
= REDIS_NOTICE
;
1628 else if (!strcasecmp(argv
[1],"warning")) server
.verbosity
= REDIS_WARNING
;
1630 err
= "Invalid log level. Must be one of debug, notice, warning";
1633 } else if (!strcasecmp(argv
[0],"logfile") && argc
== 2) {
1636 server
.logfile
= zstrdup(argv
[1]);
1637 if (!strcasecmp(server
.logfile
,"stdout")) {
1638 zfree(server
.logfile
);
1639 server
.logfile
= NULL
;
1641 if (server
.logfile
) {
1642 /* Test if we are able to open the file. The server will not
1643 * be able to abort just for this problem later... */
1644 logfp
= fopen(server
.logfile
,"a");
1645 if (logfp
== NULL
) {
1646 err
= sdscatprintf(sdsempty(),
1647 "Can't open the log file: %s", strerror(errno
));
1652 } else if (!strcasecmp(argv
[0],"databases") && argc
== 2) {
1653 server
.dbnum
= atoi(argv
[1]);
1654 if (server
.dbnum
< 1) {
1655 err
= "Invalid number of databases"; goto loaderr
;
1657 } else if (!strcasecmp(argv
[0],"maxclients") && argc
== 2) {
1658 server
.maxclients
= atoi(argv
[1]);
1659 } else if (!strcasecmp(argv
[0],"maxmemory") && argc
== 2) {
1660 server
.maxmemory
= strtoll(argv
[1], NULL
, 10);
1661 } else if (!strcasecmp(argv
[0],"slaveof") && argc
== 3) {
1662 server
.masterhost
= sdsnew(argv
[1]);
1663 server
.masterport
= atoi(argv
[2]);
1664 server
.replstate
= REDIS_REPL_CONNECT
;
1665 } else if (!strcasecmp(argv
[0],"masterauth") && argc
== 2) {
1666 server
.masterauth
= zstrdup(argv
[1]);
1667 } else if (!strcasecmp(argv
[0],"glueoutputbuf") && argc
== 2) {
1668 if ((server
.glueoutputbuf
= yesnotoi(argv
[1])) == -1) {
1669 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1671 } else if (!strcasecmp(argv
[0],"shareobjects") && argc
== 2) {
1672 if ((server
.shareobjects
= yesnotoi(argv
[1])) == -1) {
1673 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1675 } else if (!strcasecmp(argv
[0],"rdbcompression") && argc
== 2) {
1676 if ((server
.rdbcompression
= yesnotoi(argv
[1])) == -1) {
1677 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1679 } else if (!strcasecmp(argv
[0],"shareobjectspoolsize") && argc
== 2) {
1680 server
.sharingpoolsize
= atoi(argv
[1]);
1681 if (server
.sharingpoolsize
< 1) {
1682 err
= "invalid object sharing pool size"; goto loaderr
;
1684 } else if (!strcasecmp(argv
[0],"daemonize") && argc
== 2) {
1685 if ((server
.daemonize
= yesnotoi(argv
[1])) == -1) {
1686 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1688 } else if (!strcasecmp(argv
[0],"appendonly") && argc
== 2) {
1689 if ((server
.appendonly
= yesnotoi(argv
[1])) == -1) {
1690 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1692 } else if (!strcasecmp(argv
[0],"appendfsync") && argc
== 2) {
1693 if (!strcasecmp(argv
[1],"no")) {
1694 server
.appendfsync
= APPENDFSYNC_NO
;
1695 } else if (!strcasecmp(argv
[1],"always")) {
1696 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1697 } else if (!strcasecmp(argv
[1],"everysec")) {
1698 server
.appendfsync
= APPENDFSYNC_EVERYSEC
;
1700 err
= "argument must be 'no', 'always' or 'everysec'";
1703 } else if (!strcasecmp(argv
[0],"requirepass") && argc
== 2) {
1704 server
.requirepass
= zstrdup(argv
[1]);
1705 } else if (!strcasecmp(argv
[0],"pidfile") && argc
== 2) {
1706 server
.pidfile
= zstrdup(argv
[1]);
1707 } else if (!strcasecmp(argv
[0],"dbfilename") && argc
== 2) {
1708 server
.dbfilename
= zstrdup(argv
[1]);
1709 } else if (!strcasecmp(argv
[0],"vm-enabled") && argc
== 2) {
1710 if ((server
.vm_enabled
= yesnotoi(argv
[1])) == -1) {
1711 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1713 } else if (!strcasecmp(argv
[0],"vm-swap-file") && argc
== 2) {
1714 zfree(server
.vm_swap_file
);
1715 server
.vm_swap_file
= zstrdup(argv
[1]);
1716 } else if (!strcasecmp(argv
[0],"vm-max-memory") && argc
== 2) {
1717 server
.vm_max_memory
= strtoll(argv
[1], NULL
, 10);
1718 } else if (!strcasecmp(argv
[0],"vm-page-size") && argc
== 2) {
1719 server
.vm_page_size
= strtoll(argv
[1], NULL
, 10);
1720 } else if (!strcasecmp(argv
[0],"vm-pages") && argc
== 2) {
1721 server
.vm_pages
= strtoll(argv
[1], NULL
, 10);
1722 } else if (!strcasecmp(argv
[0],"vm-max-threads") && argc
== 2) {
1723 server
.vm_max_threads
= strtoll(argv
[1], NULL
, 10);
1725 err
= "Bad directive or wrong number of arguments"; goto loaderr
;
1727 for (j
= 0; j
< argc
; j
++)
1732 if (fp
!= stdin
) fclose(fp
);
1736 fprintf(stderr
, "\n*** FATAL CONFIG FILE ERROR ***\n");
1737 fprintf(stderr
, "Reading the configuration file, at line %d\n", linenum
);
1738 fprintf(stderr
, ">>> '%s'\n", line
);
1739 fprintf(stderr
, "%s\n", err
);
1743 static void freeClientArgv(redisClient
*c
) {
1746 for (j
= 0; j
< c
->argc
; j
++)
1747 decrRefCount(c
->argv
[j
]);
1748 for (j
= 0; j
< c
->mbargc
; j
++)
1749 decrRefCount(c
->mbargv
[j
]);
1754 static void freeClient(redisClient
*c
) {
1757 /* Note that if the client we are freeing is blocked into a blocking
1758 * call, we have to set querybuf to NULL *before* calling
1759 * unblockClientWaitingData(), so that processInputBuffer() will not get
1760 * called. Also it is important to remove the file events after
1761 * this, because this call adds the READABLE event. */
1762 sdsfree(c
->querybuf
);
1764 if (c
->flags
& REDIS_BLOCKED
)
1765 unblockClientWaitingData(c
);
1767 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
1768 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1769 listRelease(c
->reply
);
1772 /* Remove from the list of clients */
1773 ln
= listSearchKey(server
.clients
,c
);
1774 redisAssert(ln
!= NULL
);
1775 listDelNode(server
.clients
,ln
);
1776 /* Remove from the list of clients waiting for swapped keys */
1777 if (c
->flags
& REDIS_IO_WAIT
&& listLength(c
->io_keys
) == 0) {
1778 ln
= listSearchKey(server
.io_ready_clients
,c
);
1780 listDelNode(server
.io_ready_clients
,ln
);
1781 server
.vm_blocked_clients
--;
1784 while (server
.vm_enabled
&& listLength(c
->io_keys
)) {
1785 ln
= listFirst(c
->io_keys
);
1786 dontWaitForSwappedKey(c
,ln
->value
);
1788 listRelease(c
->io_keys
);
1790 if (c
->flags
& REDIS_SLAVE
) {
1791 if (c
->replstate
== REDIS_REPL_SEND_BULK
&& c
->repldbfd
!= -1)
1793 list
*l
= (c
->flags
& REDIS_MONITOR
) ? server
.monitors
: server
.slaves
;
1794 ln
= listSearchKey(l
,c
);
1795 redisAssert(ln
!= NULL
);
1798 if (c
->flags
& REDIS_MASTER
) {
1799 server
.master
= NULL
;
1800 server
.replstate
= REDIS_REPL_CONNECT
;
1804 freeClientMultiState(c
);
1808 #define GLUEREPLY_UP_TO (1024)
1809 static void glueReplyBuffersIfNeeded(redisClient
*c
) {
1811 char buf
[GLUEREPLY_UP_TO
];
1816 listRewind(c
->reply
,&li
);
1817 while((ln
= listNext(&li
))) {
1821 objlen
= sdslen(o
->ptr
);
1822 if (copylen
+ objlen
<= GLUEREPLY_UP_TO
) {
1823 memcpy(buf
+copylen
,o
->ptr
,objlen
);
1825 listDelNode(c
->reply
,ln
);
1827 if (copylen
== 0) return;
1831 /* Now the output buffer is empty, add the new single element */
1832 o
= createObject(REDIS_STRING
,sdsnewlen(buf
,copylen
));
1833 listAddNodeHead(c
->reply
,o
);
1836 static void sendReplyToClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
1837 redisClient
*c
= privdata
;
1838 int nwritten
= 0, totwritten
= 0, objlen
;
1841 REDIS_NOTUSED(mask
);
1843 /* Use writev() if we have enough buffers to send */
1844 if (!server
.glueoutputbuf
&&
1845 listLength(c
->reply
) > REDIS_WRITEV_THRESHOLD
&&
1846 !(c
->flags
& REDIS_MASTER
))
1848 sendReplyToClientWritev(el
, fd
, privdata
, mask
);
1852 while(listLength(c
->reply
)) {
1853 if (server
.glueoutputbuf
&& listLength(c
->reply
) > 1)
1854 glueReplyBuffersIfNeeded(c
);
1856 o
= listNodeValue(listFirst(c
->reply
));
1857 objlen
= sdslen(o
->ptr
);
1860 listDelNode(c
->reply
,listFirst(c
->reply
));
1864 if (c
->flags
& REDIS_MASTER
) {
1865 /* Don't reply to a master */
1866 nwritten
= objlen
- c
->sentlen
;
1868 nwritten
= write(fd
, ((char*)o
->ptr
)+c
->sentlen
, objlen
- c
->sentlen
);
1869 if (nwritten
<= 0) break;
1871 c
->sentlen
+= nwritten
;
1872 totwritten
+= nwritten
;
1873 /* If we fully sent the object on head go to the next one */
1874 if (c
->sentlen
== objlen
) {
1875 listDelNode(c
->reply
,listFirst(c
->reply
));
1878 /* Note that we avoid sending more than REDIS_MAX_WRITE_PER_EVENT
1879 * bytes: in a single threaded server it's a good idea to serve
1880 * other clients as well, even if a very large request comes from a
1881 * super fast link that is always able to accept data (in a real world
1882 * scenario think about 'KEYS *' against the loopback interface) */
1883 if (totwritten
> REDIS_MAX_WRITE_PER_EVENT
) break;
1885 if (nwritten
== -1) {
1886 if (errno
== EAGAIN
) {
1889 redisLog(REDIS_VERBOSE
,
1890 "Error writing to client: %s", strerror(errno
));
1895 if (totwritten
> 0) c
->lastinteraction
= time(NULL
);
1896 if (listLength(c
->reply
) == 0) {
1898 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1902 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
)
1904 redisClient
*c
= privdata
;
1905 int nwritten
= 0, totwritten
= 0, objlen
, willwrite
;
1907 struct iovec iov
[REDIS_WRITEV_IOVEC_COUNT
];
1908 int offset
, ion
= 0;
1910 REDIS_NOTUSED(mask
);
1913 while (listLength(c
->reply
)) {
1914 offset
= c
->sentlen
;
1918 /* fill-in the iov[] array */
1919 for(node
= listFirst(c
->reply
); node
; node
= listNextNode(node
)) {
1920 o
= listNodeValue(node
);
1921 objlen
= sdslen(o
->ptr
);
1923 if (totwritten
+ objlen
- offset
> REDIS_MAX_WRITE_PER_EVENT
)
1926 if(ion
== REDIS_WRITEV_IOVEC_COUNT
)
1927 break; /* no more iovecs */
1929 iov
[ion
].iov_base
= ((char*)o
->ptr
) + offset
;
1930 iov
[ion
].iov_len
= objlen
- offset
;
1931 willwrite
+= objlen
- offset
;
1932 offset
= 0; /* just for the first item */
1939 /* write all collected blocks at once */
1940 if((nwritten
= writev(fd
, iov
, ion
)) < 0) {
1941 if (errno
!= EAGAIN
) {
1942 redisLog(REDIS_VERBOSE
,
1943 "Error writing to client: %s", strerror(errno
));
1950 totwritten
+= nwritten
;
1951 offset
= c
->sentlen
;
1953 /* remove written robjs from c->reply */
1954 while (nwritten
&& listLength(c
->reply
)) {
1955 o
= listNodeValue(listFirst(c
->reply
));
1956 objlen
= sdslen(o
->ptr
);
1958 if(nwritten
>= objlen
- offset
) {
1959 listDelNode(c
->reply
, listFirst(c
->reply
));
1960 nwritten
-= objlen
- offset
;
1964 c
->sentlen
+= nwritten
;
1972 c
->lastinteraction
= time(NULL
);
1974 if (listLength(c
->reply
) == 0) {
1976 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1980 static struct redisCommand
*lookupCommand(char *name
) {
1982 while(cmdTable
[j
].name
!= NULL
) {
1983 if (!strcasecmp(name
,cmdTable
[j
].name
)) return &cmdTable
[j
];
1989 /* resetClient prepare the client to process the next command */
1990 static void resetClient(redisClient
*c
) {
1996 /* Call() is the core of Redis execution of a command */
1997 static void call(redisClient
*c
, struct redisCommand
*cmd
) {
2000 dirty
= server
.dirty
;
2002 if (server
.appendonly
&& server
.dirty
-dirty
)
2003 feedAppendOnlyFile(cmd
,c
->db
->id
,c
->argv
,c
->argc
);
2004 if (server
.dirty
-dirty
&& listLength(server
.slaves
))
2005 replicationFeedSlaves(server
.slaves
,cmd
,c
->db
->id
,c
->argv
,c
->argc
);
2006 if (listLength(server
.monitors
))
2007 replicationFeedSlaves(server
.monitors
,cmd
,c
->db
->id
,c
->argv
,c
->argc
);
2008 server
.stat_numcommands
++;
2011 /* If this function gets called we already read a whole
2012 * command, arguments are in the client argv/argc fields.
2013 * processCommand() executes the command or prepares the
2014 * server for a bulk read from the client.
2016 * If 1 is returned the client is still alive and valid and
2017 * other operations can be performed by the caller. Otherwise
2018 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2019 static int processCommand(redisClient
*c
) {
2020 struct redisCommand
*cmd
;
2022 /* Free some memory if needed (maxmemory setting) */
2023 if (server
.maxmemory
) freeMemoryIfNeeded();
2025 /* Handle the multi bulk command type. This is an alternative protocol
2026 * supported by Redis in order to receive commands that are composed of
2027 * multiple binary-safe "bulk" arguments. The latency of processing is
2028 * a bit higher but this allows things like multi-sets, so if this
2029 * protocol is used only for MSET and similar commands this is a big win. */
2030 if (c
->multibulk
== 0 && c
->argc
== 1 && ((char*)(c
->argv
[0]->ptr
))[0] == '*') {
2031 c
->multibulk
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2032 if (c
->multibulk
<= 0) {
2036 decrRefCount(c
->argv
[c
->argc
-1]);
2040 } else if (c
->multibulk
) {
2041 if (c
->bulklen
== -1) {
2042 if (((char*)c
->argv
[0]->ptr
)[0] != '$') {
2043 addReplySds(c
,sdsnew("-ERR multi bulk protocol error\r\n"));
2047 int bulklen
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2048 decrRefCount(c
->argv
[0]);
2049 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2051 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2056 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2060 c
->mbargv
= zrealloc(c
->mbargv
,(sizeof(robj
*))*(c
->mbargc
+1));
2061 c
->mbargv
[c
->mbargc
] = c
->argv
[0];
2065 if (c
->multibulk
== 0) {
2069 /* Here we need to swap the multi-bulk argc/argv with the
2070 * normal argc/argv of the client structure. */
2072 c
->argv
= c
->mbargv
;
2073 c
->mbargv
= auxargv
;
2076 c
->argc
= c
->mbargc
;
2077 c
->mbargc
= auxargc
;
2079 /* We need to set bulklen to something different than -1
2080 * in order for the code below to process the command without
2081 * trying to read the last argument of a bulk command as
2082 * a special argument. */
2084 /* continue below and process the command */
2091 /* -- end of multi bulk commands processing -- */
2093 /* The QUIT command is handled as a special case. Normal command
2094 * procs are unable to close the client connection safely */
2095 if (!strcasecmp(c
->argv
[0]->ptr
,"quit")) {
2100 /* Now lookup the command and check ASAP about trivial error conditions
2101 * such wrong arity, bad command name and so forth. */
2102 cmd
= lookupCommand(c
->argv
[0]->ptr
);
2105 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2106 (char*)c
->argv
[0]->ptr
));
2109 } else if ((cmd
->arity
> 0 && cmd
->arity
!= c
->argc
) ||
2110 (c
->argc
< -cmd
->arity
)) {
2112 sdscatprintf(sdsempty(),
2113 "-ERR wrong number of arguments for '%s' command\r\n",
2117 } else if (server
.maxmemory
&& cmd
->flags
& REDIS_CMD_DENYOOM
&& zmalloc_used_memory() > server
.maxmemory
) {
2118 addReplySds(c
,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2121 } else if (cmd
->flags
& REDIS_CMD_BULK
&& c
->bulklen
== -1) {
2122 /* This is a bulk command, we have to read the last argument yet. */
2123 int bulklen
= atoi(c
->argv
[c
->argc
-1]->ptr
);
2125 decrRefCount(c
->argv
[c
->argc
-1]);
2126 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2128 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2133 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2134 /* It is possible that the bulk read is already in the
2135 * buffer. Check this condition and handle it accordingly.
2136 * This is just a fast path, alternative to call processInputBuffer().
2137 * It's a good idea since the code is small and this condition
2138 * happens most of the times. */
2139 if ((signed)sdslen(c
->querybuf
) >= c
->bulklen
) {
2140 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2142 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2144 /* Otherwise return... we still have to read the last argument
2145 * from the socket. */
2149 /* Let's try to share objects on the command arguments vector */
2150 if (server
.shareobjects
) {
2152 for(j
= 1; j
< c
->argc
; j
++)
2153 c
->argv
[j
] = tryObjectSharing(c
->argv
[j
]);
2155 /* Let's try to encode the bulk object to save space. */
2156 if (cmd
->flags
& REDIS_CMD_BULK
)
2157 tryObjectEncoding(c
->argv
[c
->argc
-1]);
2159 /* Check if the user is authenticated */
2160 if (server
.requirepass
&& !c
->authenticated
&& cmd
->proc
!= authCommand
) {
2161 addReplySds(c
,sdsnew("-ERR operation not permitted\r\n"));
2166 /* Exec the command */
2167 if (c
->flags
& REDIS_MULTI
&& cmd
->proc
!= execCommand
&& cmd
->proc
!= discardCommand
) {
2168 queueMultiCommand(c
,cmd
);
2169 addReply(c
,shared
.queued
);
2171 if (server
.vm_enabled
&& server
.vm_max_threads
> 0 &&
2172 blockClientOnSwappedKeys(cmd
,c
)) return 1;
2176 /* Prepare the client for the next command */
2181 static void replicationFeedSlaves(list
*slaves
, struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
2186 /* (args*2)+1 is enough room for args, spaces, newlines */
2187 robj
*static_outv
[REDIS_STATIC_ARGS
*2+1];
2189 if (argc
<= REDIS_STATIC_ARGS
) {
2192 outv
= zmalloc(sizeof(robj
*)*(argc
*2+1));
2195 for (j
= 0; j
< argc
; j
++) {
2196 if (j
!= 0) outv
[outc
++] = shared
.space
;
2197 if ((cmd
->flags
& REDIS_CMD_BULK
) && j
== argc
-1) {
2200 lenobj
= createObject(REDIS_STRING
,
2201 sdscatprintf(sdsempty(),"%lu\r\n",
2202 (unsigned long) stringObjectLen(argv
[j
])));
2203 lenobj
->refcount
= 0;
2204 outv
[outc
++] = lenobj
;
2206 outv
[outc
++] = argv
[j
];
2208 outv
[outc
++] = shared
.crlf
;
2210 /* Increment all the refcounts at start and decrement at end in order to
2211 * be sure to free objects if there is no slave in a replication state
2212 * able to be fed with commands */
2213 for (j
= 0; j
< outc
; j
++) incrRefCount(outv
[j
]);
2214 listRewind(slaves
,&li
);
2215 while((ln
= listNext(&li
))) {
2216 redisClient
*slave
= ln
->value
;
2218 /* Don't feed slaves that are still waiting for BGSAVE to start */
2219 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) continue;
2221 /* Feed all the other slaves, MONITORs and so on */
2222 if (slave
->slaveseldb
!= dictid
) {
2226 case 0: selectcmd
= shared
.select0
; break;
2227 case 1: selectcmd
= shared
.select1
; break;
2228 case 2: selectcmd
= shared
.select2
; break;
2229 case 3: selectcmd
= shared
.select3
; break;
2230 case 4: selectcmd
= shared
.select4
; break;
2231 case 5: selectcmd
= shared
.select5
; break;
2232 case 6: selectcmd
= shared
.select6
; break;
2233 case 7: selectcmd
= shared
.select7
; break;
2234 case 8: selectcmd
= shared
.select8
; break;
2235 case 9: selectcmd
= shared
.select9
; break;
2237 selectcmd
= createObject(REDIS_STRING
,
2238 sdscatprintf(sdsempty(),"select %d\r\n",dictid
));
2239 selectcmd
->refcount
= 0;
2242 addReply(slave
,selectcmd
);
2243 slave
->slaveseldb
= dictid
;
2245 for (j
= 0; j
< outc
; j
++) addReply(slave
,outv
[j
]);
2247 for (j
= 0; j
< outc
; j
++) decrRefCount(outv
[j
]);
2248 if (outv
!= static_outv
) zfree(outv
);
2251 static void processInputBuffer(redisClient
*c
) {
2253 /* Before processing the input buffer, make sure the client is not
2254 * waiting for a blocking operation such as BLPOP. Note that on the first
2255 * iteration the client is never blocked, otherwise processInputBuffer
2256 * would not be called at all, but after the execution of the first commands
2257 * in the input buffer the client may be blocked, and the "goto again"
2258 * will try to reiterate. The following line will make it return asap. */
2259 if (c
->flags
& REDIS_BLOCKED
|| c
->flags
& REDIS_IO_WAIT
) return;
2260 if (c
->bulklen
== -1) {
2261 /* Read the first line of the query */
2262 char *p
= strchr(c
->querybuf
,'\n');
2269 query
= c
->querybuf
;
2270 c
->querybuf
= sdsempty();
2271 querylen
= 1+(p
-(query
));
2272 if (sdslen(query
) > querylen
) {
2273 /* leave data after the first line of the query in the buffer */
2274 c
->querybuf
= sdscatlen(c
->querybuf
,query
+querylen
,sdslen(query
)-querylen
);
2276 *p
= '\0'; /* remove "\n" */
2277 if (*(p
-1) == '\r') *(p
-1) = '\0'; /* and "\r" if any */
2278 sdsupdatelen(query
);
2280 /* Now we can split the query in arguments */
2281 argv
= sdssplitlen(query
,sdslen(query
)," ",1,&argc
);
2284 if (c
->argv
) zfree(c
->argv
);
2285 c
->argv
= zmalloc(sizeof(robj
*)*argc
);
2287 for (j
= 0; j
< argc
; j
++) {
2288 if (sdslen(argv
[j
])) {
2289 c
->argv
[c
->argc
] = createObject(REDIS_STRING
,argv
[j
]);
2297 /* Execute the command. If the client is still valid
2298 * after processCommand() return and there is something
2299 * on the query buffer try to process the next command. */
2300 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2302 /* Nothing to process, argc == 0. Just process the query
2303 * buffer if it's not empty or return to the caller */
2304 if (sdslen(c
->querybuf
)) goto again
;
2307 } else if (sdslen(c
->querybuf
) >= REDIS_REQUEST_MAX_SIZE
) {
2308 redisLog(REDIS_VERBOSE
, "Client protocol error");
2313 /* Bulk read handling. Note that if we are at this point
2314 the client already sent a command terminated with a newline,
2315 we are reading the bulk data that is actually the last
2316 argument of the command. */
2317 int qbl
= sdslen(c
->querybuf
);
2319 if (c
->bulklen
<= qbl
) {
2320 /* Copy everything but the final CRLF as final argument */
2321 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2323 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2324 /* Process the command. If the client is still valid after
2325 * the processing and there is more data in the buffer
2326 * try to parse it. */
2327 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2333 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2334 redisClient
*c
= (redisClient
*) privdata
;
2335 char buf
[REDIS_IOBUF_LEN
];
2338 REDIS_NOTUSED(mask
);
2340 nread
= read(fd
, buf
, REDIS_IOBUF_LEN
);
2342 if (errno
== EAGAIN
) {
2345 redisLog(REDIS_VERBOSE
, "Reading from client: %s",strerror(errno
));
2349 } else if (nread
== 0) {
2350 redisLog(REDIS_VERBOSE
, "Client closed connection");
2355 c
->querybuf
= sdscatlen(c
->querybuf
, buf
, nread
);
2356 c
->lastinteraction
= time(NULL
);
2360 if (!(c
->flags
& REDIS_BLOCKED
))
2361 processInputBuffer(c
);
2364 static int selectDb(redisClient
*c
, int id
) {
2365 if (id
< 0 || id
>= server
.dbnum
)
2367 c
->db
= &server
.db
[id
];
2371 static void *dupClientReplyValue(void *o
) {
2372 incrRefCount((robj
*)o
);
2376 static redisClient
*createClient(int fd
) {
2377 redisClient
*c
= zmalloc(sizeof(*c
));
2379 anetNonBlock(NULL
,fd
);
2380 anetTcpNoDelay(NULL
,fd
);
2381 if (!c
) return NULL
;
2384 c
->querybuf
= sdsempty();
2393 c
->lastinteraction
= time(NULL
);
2394 c
->authenticated
= 0;
2395 c
->replstate
= REDIS_REPL_NONE
;
2396 c
->reply
= listCreate();
2397 listSetFreeMethod(c
->reply
,decrRefCount
);
2398 listSetDupMethod(c
->reply
,dupClientReplyValue
);
2399 c
->blockingkeys
= NULL
;
2400 c
->blockingkeysnum
= 0;
2401 c
->io_keys
= listCreate();
2402 listSetFreeMethod(c
->io_keys
,decrRefCount
);
2403 if (aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
2404 readQueryFromClient
, c
) == AE_ERR
) {
2408 listAddNodeTail(server
.clients
,c
);
2409 initClientMultiState(c
);
2413 static void addReply(redisClient
*c
, robj
*obj
) {
2414 if (listLength(c
->reply
) == 0 &&
2415 (c
->replstate
== REDIS_REPL_NONE
||
2416 c
->replstate
== REDIS_REPL_ONLINE
) &&
2417 aeCreateFileEvent(server
.el
, c
->fd
, AE_WRITABLE
,
2418 sendReplyToClient
, c
) == AE_ERR
) return;
2420 if (server
.vm_enabled
&& obj
->storage
!= REDIS_VM_MEMORY
) {
2421 obj
= dupStringObject(obj
);
2422 obj
->refcount
= 0; /* getDecodedObject() will increment the refcount */
2424 listAddNodeTail(c
->reply
,getDecodedObject(obj
));
2427 static void addReplySds(redisClient
*c
, sds s
) {
2428 robj
*o
= createObject(REDIS_STRING
,s
);
2433 static void addReplyDouble(redisClient
*c
, double d
) {
2436 snprintf(buf
,sizeof(buf
),"%.17g",d
);
2437 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2438 (unsigned long) strlen(buf
),buf
));
2441 static void addReplyLong(redisClient
*c
, long l
) {
2445 len
= snprintf(buf
,sizeof(buf
),":%ld\r\n",l
);
2446 addReplySds(c
,sdsnewlen(buf
,len
));
2449 static void addReplyBulkLen(redisClient
*c
, robj
*obj
) {
2452 if (obj
->encoding
== REDIS_ENCODING_RAW
) {
2453 len
= sdslen(obj
->ptr
);
2455 long n
= (long)obj
->ptr
;
2457 /* Compute how many bytes will take this integer as a radix 10 string */
2463 while((n
= n
/10) != 0) {
2467 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len
));
2470 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2475 REDIS_NOTUSED(mask
);
2476 REDIS_NOTUSED(privdata
);
2478 cfd
= anetAccept(server
.neterr
, fd
, cip
, &cport
);
2479 if (cfd
== AE_ERR
) {
2480 redisLog(REDIS_VERBOSE
,"Accepting client connection: %s", server
.neterr
);
2483 redisLog(REDIS_VERBOSE
,"Accepted %s:%d", cip
, cport
);
2484 if ((c
= createClient(cfd
)) == NULL
) {
2485 redisLog(REDIS_WARNING
,"Error allocating resoures for the client");
2486 close(cfd
); /* May be already closed, just ingore errors */
2489 /* If maxclient directive is set and this is one client more... close the
2490 * connection. Note that we create the client instead of checking for this
2491 * condition beforehand, since now the socket is already set in nonblocking
2492 * mode and we can send an error for free using the Kernel I/O */
2493 if (server
.maxclients
&& listLength(server
.clients
) > server
.maxclients
) {
2494 char *err
= "-ERR max number of clients reached\r\n";
2496 /* That's a best effort error message, don't check write errors */
2497 if (write(c
->fd
,err
,strlen(err
)) == -1) {
2498 /* Nothing to do, Just to avoid the warning... */
2503 server
.stat_numconnections
++;
2506 /* ======================= Redis objects implementation ===================== */
2508 static robj
*createObject(int type
, void *ptr
) {
2511 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2512 if (listLength(server
.objfreelist
)) {
2513 listNode
*head
= listFirst(server
.objfreelist
);
2514 o
= listNodeValue(head
);
2515 listDelNode(server
.objfreelist
,head
);
2516 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2518 if (server
.vm_enabled
) {
2519 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2520 o
= zmalloc(sizeof(*o
));
2522 o
= zmalloc(sizeof(*o
)-sizeof(struct redisObjectVM
));
2526 o
->encoding
= REDIS_ENCODING_RAW
;
2529 if (server
.vm_enabled
) {
2530 /* Note that this code may run in the context of an I/O thread
2531 * and accessing to server.unixtime in theory is an error
2532 * (no locks). But in practice this is safe, and even if we read
2533 * garbage Redis will not fail, as it's just a statistical info */
2534 o
->vm
.atime
= server
.unixtime
;
2535 o
->storage
= REDIS_VM_MEMORY
;
2540 static robj
*createStringObject(char *ptr
, size_t len
) {
2541 return createObject(REDIS_STRING
,sdsnewlen(ptr
,len
));
2544 static robj
*dupStringObject(robj
*o
) {
2545 assert(o
->encoding
== REDIS_ENCODING_RAW
);
2546 return createStringObject(o
->ptr
,sdslen(o
->ptr
));
2549 static robj
*createListObject(void) {
2550 list
*l
= listCreate();
2552 listSetFreeMethod(l
,decrRefCount
);
2553 return createObject(REDIS_LIST
,l
);
2556 static robj
*createSetObject(void) {
2557 dict
*d
= dictCreate(&setDictType
,NULL
);
2558 return createObject(REDIS_SET
,d
);
2561 static robj
*createHashObject(void) {
2562 /* All the Hashes start as zipmaps. Will be automatically converted
2563 * into hash tables if there are enough elements or big elements
2565 unsigned char *zm
= zipmapNew();
2566 robj
*o
= createObject(REDIS_HASH
,zm
);
2567 o
->encoding
= REDIS_ENCODING_ZIPMAP
;
2571 static robj
*createZsetObject(void) {
2572 zset
*zs
= zmalloc(sizeof(*zs
));
2574 zs
->dict
= dictCreate(&zsetDictType
,NULL
);
2575 zs
->zsl
= zslCreate();
2576 return createObject(REDIS_ZSET
,zs
);
2579 static void freeStringObject(robj
*o
) {
2580 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2585 static void freeListObject(robj
*o
) {
2586 listRelease((list
*) o
->ptr
);
2589 static void freeSetObject(robj
*o
) {
2590 dictRelease((dict
*) o
->ptr
);
2593 static void freeZsetObject(robj
*o
) {
2596 dictRelease(zs
->dict
);
2601 static void freeHashObject(robj
*o
) {
2602 dictRelease((dict
*) o
->ptr
);
2605 static void incrRefCount(robj
*o
) {
2606 redisAssert(!server
.vm_enabled
|| o
->storage
== REDIS_VM_MEMORY
);
2610 static void decrRefCount(void *obj
) {
2613 /* Object is a key of a swapped out value, or in the process of being
2615 if (server
.vm_enabled
&&
2616 (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
))
2618 if (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
) {
2619 redisAssert(o
->refcount
== 1);
2621 if (o
->storage
== REDIS_VM_LOADING
) vmCancelThreadedIOJob(obj
);
2622 redisAssert(o
->type
== REDIS_STRING
);
2623 freeStringObject(o
);
2624 vmMarkPagesFree(o
->vm
.page
,o
->vm
.usedpages
);
2625 pthread_mutex_lock(&server
.obj_freelist_mutex
);
2626 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2627 !listAddNodeHead(server
.objfreelist
,o
))
2629 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2630 server
.vm_stats_swapped_objects
--;
2633 /* Object is in memory, or in the process of being swapped out. */
2634 if (--(o
->refcount
) == 0) {
2635 if (server
.vm_enabled
&& o
->storage
== REDIS_VM_SWAPPING
)
2636 vmCancelThreadedIOJob(obj
);
2638 case REDIS_STRING
: freeStringObject(o
); break;
2639 case REDIS_LIST
: freeListObject(o
); break;
2640 case REDIS_SET
: freeSetObject(o
); break;
2641 case REDIS_ZSET
: freeZsetObject(o
); break;
2642 case REDIS_HASH
: freeHashObject(o
); break;
2643 default: redisAssert(0 != 0); break;
2645 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2646 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2647 !listAddNodeHead(server
.objfreelist
,o
))
2649 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2653 static robj
*lookupKey(redisDb
*db
, robj
*key
) {
2654 dictEntry
*de
= dictFind(db
->dict
,key
);
2656 robj
*key
= dictGetEntryKey(de
);
2657 robj
*val
= dictGetEntryVal(de
);
2659 if (server
.vm_enabled
) {
2660 if (key
->storage
== REDIS_VM_MEMORY
||
2661 key
->storage
== REDIS_VM_SWAPPING
)
2663 /* If we were swapping the object out, stop it, this key
2665 if (key
->storage
== REDIS_VM_SWAPPING
)
2666 vmCancelThreadedIOJob(key
);
2667 /* Update the access time of the key for the aging algorithm. */
2668 key
->vm
.atime
= server
.unixtime
;
2670 int notify
= (key
->storage
== REDIS_VM_LOADING
);
2672 /* Our value was swapped on disk. Bring it at home. */
2673 redisAssert(val
== NULL
);
2674 val
= vmLoadObject(key
);
2675 dictGetEntryVal(de
) = val
;
2677 /* Clients blocked by the VM subsystem may be waiting for
2679 if (notify
) handleClientsBlockedOnSwappedKey(db
,key
);
2688 static robj
*lookupKeyRead(redisDb
*db
, robj
*key
) {
2689 expireIfNeeded(db
,key
);
2690 return lookupKey(db
,key
);
2693 static robj
*lookupKeyWrite(redisDb
*db
, robj
*key
) {
2694 deleteIfVolatile(db
,key
);
2695 return lookupKey(db
,key
);
2698 static int deleteKey(redisDb
*db
, robj
*key
) {
2701 /* We need to protect key from destruction: after the first dictDelete()
2702 * it may happen that 'key' is no longer valid if we don't increment
2703 * its count. This may happen when we get the object reference directly
2704 * from the hash table with dictRandomKey() or dict iterators */
2706 if (dictSize(db
->expires
)) dictDelete(db
->expires
,key
);
2707 retval
= dictDelete(db
->dict
,key
);
2710 return retval
== DICT_OK
;
2713 /* Try to share an object against the shared objects pool */
2714 static robj
*tryObjectSharing(robj
*o
) {
2715 struct dictEntry
*de
;
2718 if (o
== NULL
|| server
.shareobjects
== 0) return o
;
2720 redisAssert(o
->type
== REDIS_STRING
);
2721 de
= dictFind(server
.sharingpool
,o
);
2723 robj
*shared
= dictGetEntryKey(de
);
2725 c
= ((unsigned long) dictGetEntryVal(de
))+1;
2726 dictGetEntryVal(de
) = (void*) c
;
2727 incrRefCount(shared
);
2731 /* Here we are using a stream algorithm: every time an object is
2732 * shared we increment its count, every time there is a miss we
2733 * decrement the counter of a random object. If this object reaches
2734 * zero we remove the object and put the current object instead. */
2735 if (dictSize(server
.sharingpool
) >=
2736 server
.sharingpoolsize
) {
2737 de
= dictGetRandomKey(server
.sharingpool
);
2738 redisAssert(de
!= NULL
);
2739 c
= ((unsigned long) dictGetEntryVal(de
))-1;
2740 dictGetEntryVal(de
) = (void*) c
;
2742 dictDelete(server
.sharingpool
,de
->key
);
2745 c
= 0; /* If the pool is empty we want to add this object */
2750 retval
= dictAdd(server
.sharingpool
,o
,(void*)1);
2751 redisAssert(retval
== DICT_OK
);
2758 /* Check if the nul-terminated string 's' can be represented by a long
2759 * (that is, is a number that fits into long without any other space or
2760 * character before or after the digits).
2762 * If so, the function returns REDIS_OK and *longval is set to the value
2763 * of the number. Otherwise REDIS_ERR is returned */
2764 static int isStringRepresentableAsLong(sds s
, long *longval
) {
2765 char buf
[32], *endptr
;
2769 value
= strtol(s
, &endptr
, 10);
2770 if (endptr
[0] != '\0') return REDIS_ERR
;
2771 slen
= snprintf(buf
,32,"%ld",value
);
2773 /* If the number converted back into a string is not identical
2774 * then it's not possible to encode the string as integer */
2775 if (sdslen(s
) != (unsigned)slen
|| memcmp(buf
,s
,slen
)) return REDIS_ERR
;
2776 if (longval
) *longval
= value
;
2780 /* Try to encode a string object in order to save space */
2781 static int tryObjectEncoding(robj
*o
) {
2785 if (o
->encoding
!= REDIS_ENCODING_RAW
)
2786 return REDIS_ERR
; /* Already encoded */
2788 /* It's not safe to encode shared objects: shared objects can be shared
2789 * everywhere in the "object space" of Redis. Encoded objects can only
2790 * appear as "values" (and not, for instance, as keys) */
2791 if (o
->refcount
> 1) return REDIS_ERR
;
2793 /* Currently we try to encode only strings */
2794 redisAssert(o
->type
== REDIS_STRING
);
2796 /* Check if we can represent this string as a long integer */
2797 if (isStringRepresentableAsLong(s
,&value
) == REDIS_ERR
) return REDIS_ERR
;
2799 /* Ok, this object can be encoded */
2800 o
->encoding
= REDIS_ENCODING_INT
;
2802 o
->ptr
= (void*) value
;
2806 /* Get a decoded version of an encoded object (returned as a new object).
2807 * If the object is already raw-encoded just increment the ref count. */
2808 static robj
*getDecodedObject(robj
*o
) {
2811 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2815 if (o
->type
== REDIS_STRING
&& o
->encoding
== REDIS_ENCODING_INT
) {
2818 snprintf(buf
,32,"%ld",(long)o
->ptr
);
2819 dec
= createStringObject(buf
,strlen(buf
));
2822 redisAssert(1 != 1);
2826 /* Compare two string objects via strcmp() or alike.
2827 * Note that the objects may be integer-encoded. In such a case we
2828 * use snprintf() to get a string representation of the numbers on the stack
2829 * and compare the strings, it's much faster than calling getDecodedObject().
2831 * Important note: if objects are not integer encoded, but binary-safe strings,
2832 * sdscmp() from sds.c will apply memcmp() so this function can be considered
2834 static int compareStringObjects(robj
*a
, robj
*b
) {
2835 redisAssert(a
->type
== REDIS_STRING
&& b
->type
== REDIS_STRING
);
2836 char bufa
[128], bufb
[128], *astr
, *bstr
;
2839 if (a
== b
) return 0;
2840 if (a
->encoding
!= REDIS_ENCODING_RAW
) {
2841 snprintf(bufa
,sizeof(bufa
),"%ld",(long) a
->ptr
);
2847 if (b
->encoding
!= REDIS_ENCODING_RAW
) {
2848 snprintf(bufb
,sizeof(bufb
),"%ld",(long) b
->ptr
);
2854 return bothsds
? sdscmp(astr
,bstr
) : strcmp(astr
,bstr
);
2857 static size_t stringObjectLen(robj
*o
) {
2858 redisAssert(o
->type
== REDIS_STRING
);
2859 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2860 return sdslen(o
->ptr
);
2864 return snprintf(buf
,32,"%ld",(long)o
->ptr
);
2868 /*============================ RDB saving/loading =========================== */
2870 static int rdbSaveType(FILE *fp
, unsigned char type
) {
2871 if (fwrite(&type
,1,1,fp
) == 0) return -1;
2875 static int rdbSaveTime(FILE *fp
, time_t t
) {
2876 int32_t t32
= (int32_t) t
;
2877 if (fwrite(&t32
,4,1,fp
) == 0) return -1;
2881 /* check rdbLoadLen() comments for more info */
2882 static int rdbSaveLen(FILE *fp
, uint32_t len
) {
2883 unsigned char buf
[2];
2886 /* Save a 6 bit len */
2887 buf
[0] = (len
&0xFF)|(REDIS_RDB_6BITLEN
<<6);
2888 if (fwrite(buf
,1,1,fp
) == 0) return -1;
2889 } else if (len
< (1<<14)) {
2890 /* Save a 14 bit len */
2891 buf
[0] = ((len
>>8)&0xFF)|(REDIS_RDB_14BITLEN
<<6);
2893 if (fwrite(buf
,2,1,fp
) == 0) return -1;
2895 /* Save a 32 bit len */
2896 buf
[0] = (REDIS_RDB_32BITLEN
<<6);
2897 if (fwrite(buf
,1,1,fp
) == 0) return -1;
2899 if (fwrite(&len
,4,1,fp
) == 0) return -1;
2904 /* String objects in the form "2391" "-100" without any space and with a
2905 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2906 * encoded as integers to save space */
2907 static int rdbTryIntegerEncoding(sds s
, unsigned char *enc
) {
2909 char *endptr
, buf
[32];
2911 /* Check if it's possible to encode this value as a number */
2912 value
= strtoll(s
, &endptr
, 10);
2913 if (endptr
[0] != '\0') return 0;
2914 snprintf(buf
,32,"%lld",value
);
2916 /* If the number converted back into a string is not identical
2917 * then it's not possible to encode the string as integer */
2918 if (strlen(buf
) != sdslen(s
) || memcmp(buf
,s
,sdslen(s
))) return 0;
2920 /* Finally check if it fits in our ranges */
2921 if (value
>= -(1<<7) && value
<= (1<<7)-1) {
2922 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT8
;
2923 enc
[1] = value
&0xFF;
2925 } else if (value
>= -(1<<15) && value
<= (1<<15)-1) {
2926 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT16
;
2927 enc
[1] = value
&0xFF;
2928 enc
[2] = (value
>>8)&0xFF;
2930 } else if (value
>= -((long long)1<<31) && value
<= ((long long)1<<31)-1) {
2931 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT32
;
2932 enc
[1] = value
&0xFF;
2933 enc
[2] = (value
>>8)&0xFF;
2934 enc
[3] = (value
>>16)&0xFF;
2935 enc
[4] = (value
>>24)&0xFF;
2942 static int rdbSaveLzfStringObject(FILE *fp
, robj
*obj
) {
2943 unsigned int comprlen
, outlen
;
2947 /* We require at least four bytes compression for this to be worth it */
2948 outlen
= sdslen(obj
->ptr
)-4;
2949 if (outlen
<= 0) return 0;
2950 if ((out
= zmalloc(outlen
+1)) == NULL
) return 0;
2951 comprlen
= lzf_compress(obj
->ptr
, sdslen(obj
->ptr
), out
, outlen
);
2952 if (comprlen
== 0) {
2956 /* Data compressed! Let's save it on disk */
2957 byte
= (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_LZF
;
2958 if (fwrite(&byte
,1,1,fp
) == 0) goto writeerr
;
2959 if (rdbSaveLen(fp
,comprlen
) == -1) goto writeerr
;
2960 if (rdbSaveLen(fp
,sdslen(obj
->ptr
)) == -1) goto writeerr
;
2961 if (fwrite(out
,comprlen
,1,fp
) == 0) goto writeerr
;
2970 /* Save a string object as [len][data] on disk. If the object is a string
2971 * representation of an integer value we try to save it in a special form */
2972 static int rdbSaveStringObjectRaw(FILE *fp
, robj
*obj
) {
2976 len
= sdslen(obj
->ptr
);
2978 /* Try integer encoding */
2980 unsigned char buf
[5];
2981 if ((enclen
= rdbTryIntegerEncoding(obj
->ptr
,buf
)) > 0) {
2982 if (fwrite(buf
,enclen
,1,fp
) == 0) return -1;
2987 /* Try LZF compression - under 20 bytes it's unable to compress even
2988 * aaaaaaaaaaaaaaaaaa so skip it */
2989 if (server
.rdbcompression
&& len
> 20) {
2992 retval
= rdbSaveLzfStringObject(fp
,obj
);
2993 if (retval
== -1) return -1;
2994 if (retval
> 0) return 0;
2995 /* retval == 0 means data can't be compressed, save the old way */
2998 /* Store verbatim */
2999 if (rdbSaveLen(fp
,len
) == -1) return -1;
3000 if (len
&& fwrite(obj
->ptr
,len
,1,fp
) == 0) return -1;
3004 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3005 static int rdbSaveStringObject(FILE *fp
, robj
*obj
) {
3008 /* Avoid incr/decr ref count business when possible.
3009 * This plays well with copy-on-write given that we are probably
3010 * in a child process (BGSAVE). Also this makes sure key objects
3011 * of swapped objects are not incRefCount-ed (an assert does not allow
3012 * this in order to avoid bugs) */
3013 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
3014 obj
= getDecodedObject(obj
);
3015 retval
= rdbSaveStringObjectRaw(fp
,obj
);
3018 retval
= rdbSaveStringObjectRaw(fp
,obj
);
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* the prefix byte is always written */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len += buf[0];
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3050 /* Save a Redis object. */
3051 static int rdbSaveObject(FILE *fp
, robj
*o
) {
3052 if (o
->type
== REDIS_STRING
) {
3053 /* Save a string value */
3054 if (rdbSaveStringObject(fp
,o
) == -1) return -1;
3055 } else if (o
->type
== REDIS_LIST
) {
3056 /* Save a list value */
3057 list
*list
= o
->ptr
;
3061 if (rdbSaveLen(fp
,listLength(list
)) == -1) return -1;
3062 listRewind(list
,&li
);
3063 while((ln
= listNext(&li
))) {
3064 robj
*eleobj
= listNodeValue(ln
);
3066 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3068 } else if (o
->type
== REDIS_SET
) {
3069 /* Save a set value */
3071 dictIterator
*di
= dictGetIterator(set
);
3074 if (rdbSaveLen(fp
,dictSize(set
)) == -1) return -1;
3075 while((de
= dictNext(di
)) != NULL
) {
3076 robj
*eleobj
= dictGetEntryKey(de
);
3078 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3080 dictReleaseIterator(di
);
3081 } else if (o
->type
== REDIS_ZSET
) {
3082 /* Save a set value */
3084 dictIterator
*di
= dictGetIterator(zs
->dict
);
3087 if (rdbSaveLen(fp
,dictSize(zs
->dict
)) == -1) return -1;
3088 while((de
= dictNext(di
)) != NULL
) {
3089 robj
*eleobj
= dictGetEntryKey(de
);
3090 double *score
= dictGetEntryVal(de
);
3092 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3093 if (rdbSaveDoubleValue(fp
,*score
) == -1) return -1;
3095 dictReleaseIterator(di
);
3097 redisAssert(0 != 0);
3102 /* Return the length the object will have on disk if saved with
3103 * the rdbSaveObject() function. Currently we use a trick to get
3104 * this length with very little changes to the code. In the future
3105 * we could switch to a faster solution. */
3106 static off_t
rdbSavedObjectLen(robj
*o
, FILE *fp
) {
3107 if (fp
== NULL
) fp
= server
.devnull
;
3109 assert(rdbSaveObject(fp
,o
) != 1);
3113 /* Return the number of pages required to save this object in the swap file */
3114 static off_t
rdbSavedObjectPages(robj
*o
, FILE *fp
) {
3115 off_t bytes
= rdbSavedObjectLen(o
,fp
);
3117 return (bytes
+(server
.vm_page_size
-1))/server
.vm_page_size
;
3120 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3121 static int rdbSave(char *filename
) {
3122 dictIterator
*di
= NULL
;
3127 time_t now
= time(NULL
);
3129 /* Wait for I/O therads to terminate, just in case this is a
3130 * foreground-saving, to avoid seeking the swap file descriptor at the
3132 if (server
.vm_enabled
)
3133 waitEmptyIOJobsQueue();
3135 snprintf(tmpfile
,256,"temp-%d.rdb", (int) getpid());
3136 fp
= fopen(tmpfile
,"w");
3138 redisLog(REDIS_WARNING
, "Failed saving the DB: %s", strerror(errno
));
3141 if (fwrite("REDIS0001",9,1,fp
) == 0) goto werr
;
3142 for (j
= 0; j
< server
.dbnum
; j
++) {
3143 redisDb
*db
= server
.db
+j
;
3145 if (dictSize(d
) == 0) continue;
3146 di
= dictGetIterator(d
);
3152 /* Write the SELECT DB opcode */
3153 if (rdbSaveType(fp
,REDIS_SELECTDB
) == -1) goto werr
;
3154 if (rdbSaveLen(fp
,j
) == -1) goto werr
;
3156 /* Iterate this DB writing every entry */
3157 while((de
= dictNext(di
)) != NULL
) {
3158 robj
*key
= dictGetEntryKey(de
);
3159 robj
*o
= dictGetEntryVal(de
);
3160 time_t expiretime
= getExpire(db
,key
);
3162 /* Save the expire time */
3163 if (expiretime
!= -1) {
3164 /* If this key is already expired skip it */
3165 if (expiretime
< now
) continue;
3166 if (rdbSaveType(fp
,REDIS_EXPIRETIME
) == -1) goto werr
;
3167 if (rdbSaveTime(fp
,expiretime
) == -1) goto werr
;
3169 /* Save the key and associated value. This requires special
3170 * handling if the value is swapped out. */
3171 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
3172 key
->storage
== REDIS_VM_SWAPPING
) {
3173 /* Save type, key, value */
3174 if (rdbSaveType(fp
,o
->type
) == -1) goto werr
;
3175 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3176 if (rdbSaveObject(fp
,o
) == -1) goto werr
;
3178 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3180 /* Get a preview of the object in memory */
3181 po
= vmPreviewObject(key
);
3182 /* Save type, key, value */
3183 if (rdbSaveType(fp
,key
->vtype
) == -1) goto werr
;
3184 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3185 if (rdbSaveObject(fp
,po
) == -1) goto werr
;
3186 /* Remove the loaded object from memory */
3190 dictReleaseIterator(di
);
3193 if (rdbSaveType(fp
,REDIS_EOF
) == -1) goto werr
;
3195 /* Make sure data will not remain on the OS's output buffers */
3200 /* Use RENAME to make sure the DB file is changed atomically only
3201 * if the generate DB file is ok. */
3202 if (rename(tmpfile
,filename
) == -1) {
3203 redisLog(REDIS_WARNING
,"Error moving temp DB file on the final destination: %s", strerror(errno
));
3207 redisLog(REDIS_NOTICE
,"DB saved on disk");
3209 server
.lastsave
= time(NULL
);
3215 redisLog(REDIS_WARNING
,"Write error saving DB on disk: %s", strerror(errno
));
3216 if (di
) dictReleaseIterator(di
);
3220 static int rdbSaveBackground(char *filename
) {
3223 if (server
.bgsavechildpid
!= -1) return REDIS_ERR
;
3224 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
3225 if ((childpid
= fork()) == 0) {
3227 if (server
.vm_enabled
) vmReopenSwapFile();
3229 if (rdbSave(filename
) == REDIS_OK
) {
3236 if (childpid
== -1) {
3237 redisLog(REDIS_WARNING
,"Can't save in background: fork: %s",
3241 redisLog(REDIS_NOTICE
,"Background saving started by pid %d",childpid
);
3242 server
.bgsavechildpid
= childpid
;
3245 return REDIS_OK
; /* unreached */
/* Remove the temporary dump file "temp-<pid>.rdb" belonging to the saving
 * process 'childpid'. The result of unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
/* Read a one byte type/opcode from 'fp'.
 * Returns the byte as a value in the 0-255 range, or -1 if it could not
 * be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    if (fread(&type,1,1,fp) != 1) return -1;
    return type;
}
/* Read a time stored as a 32 bit signed integer (as laid out in memory by
 * the writer) from 'fp'. Returns the time, or -1 if the four bytes could
 * not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t t32;

    if (fread(&t32,sizeof(t32),1,fp) != 1) return -1;
    return (time_t) t32;
}
3267 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3268 * of this file for a description of how this are stored on disk.
3270 * isencoded is set to 1 if the readed length is not actually a length but
3271 * an "encoding type", check the above comments for more info */
3272 static uint32_t rdbLoadLen(FILE *fp
, int *isencoded
) {
3273 unsigned char buf
[2];
3277 if (isencoded
) *isencoded
= 0;
3278 if (fread(buf
,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3279 type
= (buf
[0]&0xC0)>>6;
3280 if (type
== REDIS_RDB_6BITLEN
) {
3281 /* Read a 6 bit len */
3283 } else if (type
== REDIS_RDB_ENCVAL
) {
3284 /* Read a 6 bit len encoding type */
3285 if (isencoded
) *isencoded
= 1;
3287 } else if (type
== REDIS_RDB_14BITLEN
) {
3288 /* Read a 14 bit len */
3289 if (fread(buf
+1,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3290 return ((buf
[0]&0x3F)<<8)|buf
[1];
3292 /* Read a 32 bit len */
3293 if (fread(&len
,4,1,fp
) == 0) return REDIS_RDB_LENERR
;
3298 static robj
*rdbLoadIntegerObject(FILE *fp
, int enctype
) {
3299 unsigned char enc
[4];
3302 if (enctype
== REDIS_RDB_ENC_INT8
) {
3303 if (fread(enc
,1,1,fp
) == 0) return NULL
;
3304 val
= (signed char)enc
[0];
3305 } else if (enctype
== REDIS_RDB_ENC_INT16
) {
3307 if (fread(enc
,2,1,fp
) == 0) return NULL
;
3308 v
= enc
[0]|(enc
[1]<<8);
3310 } else if (enctype
== REDIS_RDB_ENC_INT32
) {
3312 if (fread(enc
,4,1,fp
) == 0) return NULL
;
3313 v
= enc
[0]|(enc
[1]<<8)|(enc
[2]<<16)|(enc
[3]<<24);
3316 val
= 0; /* anti-warning */
3319 return createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",val
));
3322 static robj
*rdbLoadLzfStringObject(FILE*fp
) {
3323 unsigned int len
, clen
;
3324 unsigned char *c
= NULL
;
3327 if ((clen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3328 if ((len
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3329 if ((c
= zmalloc(clen
)) == NULL
) goto err
;
3330 if ((val
= sdsnewlen(NULL
,len
)) == NULL
) goto err
;
3331 if (fread(c
,clen
,1,fp
) == 0) goto err
;
3332 if (lzf_decompress(c
,clen
,val
,len
) == 0) goto err
;
3334 return createObject(REDIS_STRING
,val
);
3341 static robj
*rdbLoadStringObject(FILE*fp
) {
3346 len
= rdbLoadLen(fp
,&isencoded
);
3349 case REDIS_RDB_ENC_INT8
:
3350 case REDIS_RDB_ENC_INT16
:
3351 case REDIS_RDB_ENC_INT32
:
3352 return tryObjectSharing(rdbLoadIntegerObject(fp
,len
));
3353 case REDIS_RDB_ENC_LZF
:
3354 return tryObjectSharing(rdbLoadLzfStringObject(fp
));
3360 if (len
== REDIS_RDB_LENERR
) return NULL
;
3361 val
= sdsnewlen(NULL
,len
);
3362 if (len
&& fread(val
,len
,1,fp
) == 0) {
3366 return tryObjectSharing(createObject(REDIS_STRING
,val
));
3369 /* For information about double serialization check rdbSaveDoubleValue() */
3370 static int rdbLoadDoubleValue(FILE *fp
, double *val
) {
3374 if (fread(&len
,1,1,fp
) == 0) return -1;
3376 case 255: *val
= R_NegInf
; return 0;
3377 case 254: *val
= R_PosInf
; return 0;
3378 case 253: *val
= R_Nan
; return 0;
3380 if (fread(buf
,len
,1,fp
) == 0) return -1;
3382 sscanf(buf
, "%lg", val
);
3387 /* Load a Redis object of the specified type from the specified file.
3388 * On success a newly allocated object is returned, otherwise NULL. */
3389 static robj
*rdbLoadObject(int type
, FILE *fp
) {
3392 if (type
== REDIS_STRING
) {
3393 /* Read string value */
3394 if ((o
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3395 tryObjectEncoding(o
);
3396 } else if (type
== REDIS_LIST
|| type
== REDIS_SET
) {
3397 /* Read list/set value */
3400 if ((listlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3401 o
= (type
== REDIS_LIST
) ? createListObject() : createSetObject();
3402 /* It's faster to expand the dict to the right size asap in order
3403 * to avoid rehashing */
3404 if (type
== REDIS_SET
&& listlen
> DICT_HT_INITIAL_SIZE
)
3405 dictExpand(o
->ptr
,listlen
);
3406 /* Load every single element of the list/set */
3410 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3411 tryObjectEncoding(ele
);
3412 if (type
== REDIS_LIST
) {
3413 listAddNodeTail((list
*)o
->ptr
,ele
);
3415 dictAdd((dict
*)o
->ptr
,ele
,NULL
);
3418 } else if (type
== REDIS_ZSET
) {
3419 /* Read list/set value */
3423 if ((zsetlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3424 o
= createZsetObject();
3426 /* Load every single element of the list/set */
3429 double *score
= zmalloc(sizeof(double));
3431 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3432 tryObjectEncoding(ele
);
3433 if (rdbLoadDoubleValue(fp
,score
) == -1) return NULL
;
3434 dictAdd(zs
->dict
,ele
,score
);
3435 zslInsert(zs
->zsl
,*score
,ele
);
3436 incrRefCount(ele
); /* added to skiplist */
3439 redisAssert(0 != 0);
3444 static int rdbLoad(char *filename
) {
3446 robj
*keyobj
= NULL
;
3448 int type
, retval
, rdbver
;
3449 dict
*d
= server
.db
[0].dict
;
3450 redisDb
*db
= server
.db
+0;
3452 time_t expiretime
= -1, now
= time(NULL
);
3453 long long loadedkeys
= 0;
3455 fp
= fopen(filename
,"r");
3456 if (!fp
) return REDIS_ERR
;
3457 if (fread(buf
,9,1,fp
) == 0) goto eoferr
;
3459 if (memcmp(buf
,"REDIS",5) != 0) {
3461 redisLog(REDIS_WARNING
,"Wrong signature trying to load DB from file");
3464 rdbver
= atoi(buf
+5);
3467 redisLog(REDIS_WARNING
,"Can't handle RDB format version %d",rdbver
);
3474 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3475 if (type
== REDIS_EXPIRETIME
) {
3476 if ((expiretime
= rdbLoadTime(fp
)) == -1) goto eoferr
;
3477 /* We read the time so we need to read the object type again */
3478 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3480 if (type
== REDIS_EOF
) break;
3481 /* Handle SELECT DB opcode as a special case */
3482 if (type
== REDIS_SELECTDB
) {
3483 if ((dbid
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
)
3485 if (dbid
>= (unsigned)server
.dbnum
) {
3486 redisLog(REDIS_WARNING
,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server
.dbnum
);
3489 db
= server
.db
+dbid
;
3494 if ((keyobj
= rdbLoadStringObject(fp
)) == NULL
) goto eoferr
;
3496 if ((o
= rdbLoadObject(type
,fp
)) == NULL
) goto eoferr
;
3497 /* Add the new object in the hash table */
3498 retval
= dictAdd(d
,keyobj
,o
);
3499 if (retval
== DICT_ERR
) {
3500 redisLog(REDIS_WARNING
,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj
->ptr
);
3503 /* Set the expire time if needed */
3504 if (expiretime
!= -1) {
3505 setExpire(db
,keyobj
,expiretime
);
3506 /* Delete this key if already expired */
3507 if (expiretime
< now
) deleteKey(db
,keyobj
);
3511 /* Handle swapping while loading big datasets when VM is on */
3513 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
3514 while (zmalloc_used_memory() > server
.vm_max_memory
) {
3515 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
3522 eoferr
: /* unexpected end of file is handled here with a fatal exit */
3523 if (keyobj
) decrRefCount(keyobj
);
3524 redisLog(REDIS_WARNING
,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3526 return REDIS_ERR
; /* Just to avoid warning */
3529 /*================================== Commands =============================== */
3531 static void authCommand(redisClient
*c
) {
3532 if (!server
.requirepass
|| !strcmp(c
->argv
[1]->ptr
, server
.requirepass
)) {
3533 c
->authenticated
= 1;
3534 addReply(c
,shared
.ok
);
3536 c
->authenticated
= 0;
3537 addReplySds(c
,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3541 static void pingCommand(redisClient
*c
) {
3542 addReply(c
,shared
.pong
);
3545 static void echoCommand(redisClient
*c
) {
3546 addReplyBulkLen(c
,c
->argv
[1]);
3547 addReply(c
,c
->argv
[1]);
3548 addReply(c
,shared
.crlf
);
3551 /*=================================== Strings =============================== */
3553 static void setGenericCommand(redisClient
*c
, int nx
) {
3556 if (nx
) deleteIfVolatile(c
->db
,c
->argv
[1]);
3557 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3558 if (retval
== DICT_ERR
) {
3560 /* If the key is about a swapped value, we want a new key object
3561 * to overwrite the old. So we delete the old key in the database.
3562 * This will also make sure that swap pages about the old object
3563 * will be marked as free. */
3564 if (server
.vm_enabled
&& deleteIfSwapped(c
->db
,c
->argv
[1]))
3565 incrRefCount(c
->argv
[1]);
3566 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3567 incrRefCount(c
->argv
[2]);
3569 addReply(c
,shared
.czero
);
3573 incrRefCount(c
->argv
[1]);
3574 incrRefCount(c
->argv
[2]);
3577 removeExpire(c
->db
,c
->argv
[1]);
3578 addReply(c
, nx
? shared
.cone
: shared
.ok
);
3581 static void setCommand(redisClient
*c
) {
3582 setGenericCommand(c
,0);
3585 static void setnxCommand(redisClient
*c
) {
3586 setGenericCommand(c
,1);
3589 static int getGenericCommand(redisClient
*c
) {
3590 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[1]);
3593 addReply(c
,shared
.nullbulk
);
3596 if (o
->type
!= REDIS_STRING
) {
3597 addReply(c
,shared
.wrongtypeerr
);
3600 addReplyBulkLen(c
,o
);
3602 addReply(c
,shared
.crlf
);
3608 static void getCommand(redisClient
*c
) {
3609 getGenericCommand(c
);
3612 static void getsetCommand(redisClient
*c
) {
3613 if (getGenericCommand(c
) == REDIS_ERR
) return;
3614 if (dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]) == DICT_ERR
) {
3615 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3617 incrRefCount(c
->argv
[1]);
3619 incrRefCount(c
->argv
[2]);
3621 removeExpire(c
->db
,c
->argv
[1]);
3624 static void mgetCommand(redisClient
*c
) {
3627 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-1));
3628 for (j
= 1; j
< c
->argc
; j
++) {
3629 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[j
]);
3631 addReply(c
,shared
.nullbulk
);
3633 if (o
->type
!= REDIS_STRING
) {
3634 addReply(c
,shared
.nullbulk
);
3636 addReplyBulkLen(c
,o
);
3638 addReply(c
,shared
.crlf
);
3644 static void msetGenericCommand(redisClient
*c
, int nx
) {
3645 int j
, busykeys
= 0;
3647 if ((c
->argc
% 2) == 0) {
3648 addReplySds(c
,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3651 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3652 * set nothing at all if at least one already key exists. */
3654 for (j
= 1; j
< c
->argc
; j
+= 2) {
3655 if (lookupKeyWrite(c
->db
,c
->argv
[j
]) != NULL
) {
3661 addReply(c
, shared
.czero
);
3665 for (j
= 1; j
< c
->argc
; j
+= 2) {
3668 tryObjectEncoding(c
->argv
[j
+1]);
3669 retval
= dictAdd(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
3670 if (retval
== DICT_ERR
) {
3671 dictReplace(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
3672 incrRefCount(c
->argv
[j
+1]);
3674 incrRefCount(c
->argv
[j
]);
3675 incrRefCount(c
->argv
[j
+1]);
3677 removeExpire(c
->db
,c
->argv
[j
]);
3679 server
.dirty
+= (c
->argc
-1)/2;
3680 addReply(c
, nx
? shared
.cone
: shared
.ok
);
3683 static void msetCommand(redisClient
*c
) {
3684 msetGenericCommand(c
,0);
3687 static void msetnxCommand(redisClient
*c
) {
3688 msetGenericCommand(c
,1);
3691 static void incrDecrCommand(redisClient
*c
, long long incr
) {
3696 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3700 if (o
->type
!= REDIS_STRING
) {
3705 if (o
->encoding
== REDIS_ENCODING_RAW
)
3706 value
= strtoll(o
->ptr
, &eptr
, 10);
3707 else if (o
->encoding
== REDIS_ENCODING_INT
)
3708 value
= (long)o
->ptr
;
3710 redisAssert(1 != 1);
3715 o
= createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",value
));
3716 tryObjectEncoding(o
);
3717 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],o
);
3718 if (retval
== DICT_ERR
) {
3719 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
3720 removeExpire(c
->db
,c
->argv
[1]);
3722 incrRefCount(c
->argv
[1]);
3725 addReply(c
,shared
.colon
);
3727 addReply(c
,shared
.crlf
);
3730 static void incrCommand(redisClient
*c
) {
3731 incrDecrCommand(c
,1);
3734 static void decrCommand(redisClient
*c
) {
3735 incrDecrCommand(c
,-1);
3738 static void incrbyCommand(redisClient
*c
) {
3739 long long incr
= strtoll(c
->argv
[2]->ptr
, NULL
, 10);
3740 incrDecrCommand(c
,incr
);
3743 static void decrbyCommand(redisClient
*c
) {
3744 long long incr
= strtoll(c
->argv
[2]->ptr
, NULL
, 10);
3745 incrDecrCommand(c
,-incr
);
3748 static void appendCommand(redisClient
*c
) {
3753 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3755 /* Create the key */
3756 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3757 incrRefCount(c
->argv
[1]);
3758 incrRefCount(c
->argv
[2]);
3759 totlen
= stringObjectLen(c
->argv
[2]);
3763 de
= dictFind(c
->db
->dict
,c
->argv
[1]);
3766 o
= dictGetEntryVal(de
);
3767 if (o
->type
!= REDIS_STRING
) {
3768 addReply(c
,shared
.wrongtypeerr
);
3771 /* If the object is specially encoded or shared we have to make
3773 if (o
->refcount
!= 1 || o
->encoding
!= REDIS_ENCODING_RAW
) {
3774 robj
*decoded
= getDecodedObject(o
);
3776 o
= createStringObject(decoded
->ptr
, sdslen(decoded
->ptr
));
3777 decrRefCount(decoded
);
3778 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
3781 if (c
->argv
[2]->encoding
== REDIS_ENCODING_RAW
) {
3782 o
->ptr
= sdscatlen(o
->ptr
,
3783 c
->argv
[2]->ptr
, sdslen(c
->argv
[2]->ptr
));
3785 o
->ptr
= sdscatprintf(o
->ptr
, "%ld",
3786 (unsigned long) c
->argv
[2]->ptr
);
3788 totlen
= sdslen(o
->ptr
);
3791 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen
));
3794 static void substrCommand(redisClient
*c
) {
3796 long start
= atoi(c
->argv
[2]->ptr
);
3797 long end
= atoi(c
->argv
[3]->ptr
);
3799 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
3801 addReply(c
,shared
.nullbulk
);
3803 if (o
->type
!= REDIS_STRING
) {
3804 addReply(c
,shared
.wrongtypeerr
);
3806 size_t rangelen
, strlen
;
3809 o
= getDecodedObject(o
);
3810 strlen
= sdslen(o
->ptr
);
3812 /* convert negative indexes */
3813 if (start
< 0) start
= strlen
+start
;
3814 if (end
< 0) end
= strlen
+end
;
3815 if (start
< 0) start
= 0;
3816 if (end
< 0) end
= 0;
3818 /* indexes sanity checks */
3819 if (start
> end
|| (size_t)start
>= strlen
) {
3820 /* Out of range start or start > end result in null reply */
3821 addReply(c
,shared
.nullbulk
);
3825 if ((size_t)end
>= strlen
) end
= strlen
-1;
3826 rangelen
= (end
-start
)+1;
3828 /* Return the result */
3829 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",rangelen
));
3830 range
= sdsnewlen((char*)o
->ptr
+start
,rangelen
);
3831 addReplySds(c
,range
);
3832 addReply(c
,shared
.crlf
);
3838 /* ========================= Type agnostic commands ========================= */
3840 static void delCommand(redisClient
*c
) {
3843 for (j
= 1; j
< c
->argc
; j
++) {
3844 if (deleteKey(c
->db
,c
->argv
[j
])) {
3851 addReply(c
,shared
.czero
);
3854 addReply(c
,shared
.cone
);
3857 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",deleted
));
3862 static void existsCommand(redisClient
*c
) {
3863 addReply(c
,lookupKeyRead(c
->db
,c
->argv
[1]) ? shared
.cone
: shared
.czero
);
3866 static void selectCommand(redisClient
*c
) {
3867 int id
= atoi(c
->argv
[1]->ptr
);
3869 if (selectDb(c
,id
) == REDIS_ERR
) {
3870 addReplySds(c
,sdsnew("-ERR invalid DB index\r\n"));
3872 addReply(c
,shared
.ok
);
3876 static void randomkeyCommand(redisClient
*c
) {
3880 de
= dictGetRandomKey(c
->db
->dict
);
3881 if (!de
|| expireIfNeeded(c
->db
,dictGetEntryKey(de
)) == 0) break;
3884 addReply(c
,shared
.plus
);
3885 addReply(c
,shared
.crlf
);
3887 addReply(c
,shared
.plus
);
3888 addReply(c
,dictGetEntryKey(de
));
3889 addReply(c
,shared
.crlf
);
3893 static void keysCommand(redisClient
*c
) {
3896 sds pattern
= c
->argv
[1]->ptr
;
3897 int plen
= sdslen(pattern
);
3898 unsigned long numkeys
= 0;
3899 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
3901 di
= dictGetIterator(c
->db
->dict
);
3903 decrRefCount(lenobj
);
3904 while((de
= dictNext(di
)) != NULL
) {
3905 robj
*keyobj
= dictGetEntryKey(de
);
3907 sds key
= keyobj
->ptr
;
3908 if ((pattern
[0] == '*' && pattern
[1] == '\0') ||
3909 stringmatchlen(pattern
,plen
,key
,sdslen(key
),0)) {
3910 if (expireIfNeeded(c
->db
,keyobj
) == 0) {
3911 addReplyBulkLen(c
,keyobj
);
3913 addReply(c
,shared
.crlf
);
3918 dictReleaseIterator(di
);
3919 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",numkeys
);
3922 static void dbsizeCommand(redisClient
*c
) {
3924 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c
->db
->dict
)));
3927 static void lastsaveCommand(redisClient
*c
) {
3929 sdscatprintf(sdsempty(),":%lu\r\n",server
.lastsave
));
3932 static void typeCommand(redisClient
*c
) {
3936 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
3941 case REDIS_STRING
: type
= "+string"; break;
3942 case REDIS_LIST
: type
= "+list"; break;
3943 case REDIS_SET
: type
= "+set"; break;
3944 case REDIS_ZSET
: type
= "+zset"; break;
3945 default: type
= "unknown"; break;
3948 addReplySds(c
,sdsnew(type
));
3949 addReply(c
,shared
.crlf
);
3952 static void saveCommand(redisClient
*c
) {
3953 if (server
.bgsavechildpid
!= -1) {
3954 addReplySds(c
,sdsnew("-ERR background save in progress\r\n"));
3957 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
3958 addReply(c
,shared
.ok
);
3960 addReply(c
,shared
.err
);
3964 static void bgsaveCommand(redisClient
*c
) {
3965 if (server
.bgsavechildpid
!= -1) {
3966 addReplySds(c
,sdsnew("-ERR background save already in progress\r\n"));
3969 if (rdbSaveBackground(server
.dbfilename
) == REDIS_OK
) {
3970 char *status
= "+Background saving started\r\n";
3971 addReplySds(c
,sdsnew(status
));
3973 addReply(c
,shared
.err
);
3977 static void shutdownCommand(redisClient
*c
) {
3978 redisLog(REDIS_WARNING
,"User requested shutdown, saving DB...");
3979 /* Kill the saving child if there is a background saving in progress.
3980 We want to avoid race conditions, for instance our saving child may
3981 overwrite the synchronous saving did by SHUTDOWN. */
3982 if (server
.bgsavechildpid
!= -1) {
3983 redisLog(REDIS_WARNING
,"There is a live saving child. Killing it!");
3984 kill(server
.bgsavechildpid
,SIGKILL
);
3985 rdbRemoveTempFile(server
.bgsavechildpid
);
3987 if (server
.appendonly
) {
3988 /* Append only file: fsync() the AOF and exit */
3989 fsync(server
.appendfd
);
3990 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
3993 /* Snapshotting. Perform a SYNC SAVE and exit */
3994 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
3995 if (server
.daemonize
)
3996 unlink(server
.pidfile
);
3997 redisLog(REDIS_WARNING
,"%zu bytes used at exit",zmalloc_used_memory());
3998 redisLog(REDIS_WARNING
,"Server exit now, bye bye...");
3999 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
4002 /* Ooops.. error saving! The best we can do is to continue operating.
4003 * Note that if there was a background saving process, in the next
4004 * cron() Redis will be notified that the background saving aborted,
4005 * handling special stuff like slaves pending for synchronization... */
4006 redisLog(REDIS_WARNING
,"Error trying to save the DB, can't exit");
4007 addReplySds(c
,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4012 static void renameGenericCommand(redisClient
*c
, int nx
) {
4015 /* To use the same key as src and dst is probably an error */
4016 if (sdscmp(c
->argv
[1]->ptr
,c
->argv
[2]->ptr
) == 0) {
4017 addReply(c
,shared
.sameobjecterr
);
4021 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4023 addReply(c
,shared
.nokeyerr
);
4027 deleteIfVolatile(c
->db
,c
->argv
[2]);
4028 if (dictAdd(c
->db
->dict
,c
->argv
[2],o
) == DICT_ERR
) {
4031 addReply(c
,shared
.czero
);
4034 dictReplace(c
->db
->dict
,c
->argv
[2],o
);
4036 incrRefCount(c
->argv
[2]);
4038 deleteKey(c
->db
,c
->argv
[1]);
4040 addReply(c
,nx
? shared
.cone
: shared
.ok
);
4043 static void renameCommand(redisClient
*c
) {
4044 renameGenericCommand(c
,0);
4047 static void renamenxCommand(redisClient
*c
) {
4048 renameGenericCommand(c
,1);
4051 static void moveCommand(redisClient
*c
) {
4056 /* Obtain source and target DB pointers */
4059 if (selectDb(c
,atoi(c
->argv
[2]->ptr
)) == REDIS_ERR
) {
4060 addReply(c
,shared
.outofrangeerr
);
4064 selectDb(c
,srcid
); /* Back to the source DB */
4066 /* If the user is moving using as target the same
4067 * DB as the source DB it is probably an error. */
4069 addReply(c
,shared
.sameobjecterr
);
4073 /* Check if the element exists and get a reference */
4074 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4076 addReply(c
,shared
.czero
);
4080 /* Try to add the element to the target DB */
4081 deleteIfVolatile(dst
,c
->argv
[1]);
4082 if (dictAdd(dst
->dict
,c
->argv
[1],o
) == DICT_ERR
) {
4083 addReply(c
,shared
.czero
);
4086 incrRefCount(c
->argv
[1]);
4089 /* OK! key moved, free the entry in the source DB */
4090 deleteKey(src
,c
->argv
[1]);
4092 addReply(c
,shared
.cone
);
4095 /* =================================== Lists ================================ */
4096 static void pushGenericCommand(redisClient
*c
, int where
) {
4100 lobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4102 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4103 addReply(c
,shared
.cone
);
4106 lobj
= createListObject();
4108 if (where
== REDIS_HEAD
) {
4109 listAddNodeHead(list
,c
->argv
[2]);
4111 listAddNodeTail(list
,c
->argv
[2]);
4113 dictAdd(c
->db
->dict
,c
->argv
[1],lobj
);
4114 incrRefCount(c
->argv
[1]);
4115 incrRefCount(c
->argv
[2]);
4117 if (lobj
->type
!= REDIS_LIST
) {
4118 addReply(c
,shared
.wrongtypeerr
);
4121 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4122 addReply(c
,shared
.cone
);
4126 if (where
== REDIS_HEAD
) {
4127 listAddNodeHead(list
,c
->argv
[2]);
4129 listAddNodeTail(list
,c
->argv
[2]);
4131 incrRefCount(c
->argv
[2]);
4134 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",listLength(list
)));
4137 static void lpushCommand(redisClient
*c
) {
4138 pushGenericCommand(c
,REDIS_HEAD
);
4141 static void rpushCommand(redisClient
*c
) {
4142 pushGenericCommand(c
,REDIS_TAIL
);
4145 static void llenCommand(redisClient
*c
) {
4149 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4151 addReply(c
,shared
.czero
);
4154 if (o
->type
!= REDIS_LIST
) {
4155 addReply(c
,shared
.wrongtypeerr
);
4158 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",listLength(l
)));
4163 static void lindexCommand(redisClient
*c
) {
4165 int index
= atoi(c
->argv
[2]->ptr
);
4167 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4169 addReply(c
,shared
.nullbulk
);
4171 if (o
->type
!= REDIS_LIST
) {
4172 addReply(c
,shared
.wrongtypeerr
);
4174 list
*list
= o
->ptr
;
4177 ln
= listIndex(list
, index
);
4179 addReply(c
,shared
.nullbulk
);
4181 robj
*ele
= listNodeValue(ln
);
4182 addReplyBulkLen(c
,ele
);
4184 addReply(c
,shared
.crlf
);
4190 static void lsetCommand(redisClient
*c
) {
4192 int index
= atoi(c
->argv
[2]->ptr
);
4194 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4196 addReply(c
,shared
.nokeyerr
);
4198 if (o
->type
!= REDIS_LIST
) {
4199 addReply(c
,shared
.wrongtypeerr
);
4201 list
*list
= o
->ptr
;
4204 ln
= listIndex(list
, index
);
4206 addReply(c
,shared
.outofrangeerr
);
4208 robj
*ele
= listNodeValue(ln
);
4211 listNodeValue(ln
) = c
->argv
[3];
4212 incrRefCount(c
->argv
[3]);
4213 addReply(c
,shared
.ok
);
4220 static void popGenericCommand(redisClient
*c
, int where
) {
4223 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4225 addReply(c
,shared
.nullbulk
);
4227 if (o
->type
!= REDIS_LIST
) {
4228 addReply(c
,shared
.wrongtypeerr
);
4230 list
*list
= o
->ptr
;
4233 if (where
== REDIS_HEAD
)
4234 ln
= listFirst(list
);
4236 ln
= listLast(list
);
4239 addReply(c
,shared
.nullbulk
);
4241 robj
*ele
= listNodeValue(ln
);
4242 addReplyBulkLen(c
,ele
);
4244 addReply(c
,shared
.crlf
);
4245 listDelNode(list
,ln
);
4252 static void lpopCommand(redisClient
*c
) {
4253 popGenericCommand(c
,REDIS_HEAD
);
4256 static void rpopCommand(redisClient
*c
) {
4257 popGenericCommand(c
,REDIS_TAIL
);
4260 static void lrangeCommand(redisClient
*c
) {
4262 int start
= atoi(c
->argv
[2]->ptr
);
4263 int end
= atoi(c
->argv
[3]->ptr
);
4265 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4267 addReply(c
,shared
.nullmultibulk
);
4269 if (o
->type
!= REDIS_LIST
) {
4270 addReply(c
,shared
.wrongtypeerr
);
4272 list
*list
= o
->ptr
;
4274 int llen
= listLength(list
);
4278 /* convert negative indexes */
4279 if (start
< 0) start
= llen
+start
;
4280 if (end
< 0) end
= llen
+end
;
4281 if (start
< 0) start
= 0;
4282 if (end
< 0) end
= 0;
4284 /* indexes sanity checks */
4285 if (start
> end
|| start
>= llen
) {
4286 /* Out of range start or start > end result in empty list */
4287 addReply(c
,shared
.emptymultibulk
);
4290 if (end
>= llen
) end
= llen
-1;
4291 rangelen
= (end
-start
)+1;
4293 /* Return the result in form of a multi-bulk reply */
4294 ln
= listIndex(list
, start
);
4295 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",rangelen
));
4296 for (j
= 0; j
< rangelen
; j
++) {
4297 ele
= listNodeValue(ln
);
4298 addReplyBulkLen(c
,ele
);
4300 addReply(c
,shared
.crlf
);
4307 static void ltrimCommand(redisClient
*c
) {
4309 int start
= atoi(c
->argv
[2]->ptr
);
4310 int end
= atoi(c
->argv
[3]->ptr
);
4312 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4314 addReply(c
,shared
.ok
);
4316 if (o
->type
!= REDIS_LIST
) {
4317 addReply(c
,shared
.wrongtypeerr
);
4319 list
*list
= o
->ptr
;
4321 int llen
= listLength(list
);
4322 int j
, ltrim
, rtrim
;
4324 /* convert negative indexes */
4325 if (start
< 0) start
= llen
+start
;
4326 if (end
< 0) end
= llen
+end
;
4327 if (start
< 0) start
= 0;
4328 if (end
< 0) end
= 0;
4330 /* indexes sanity checks */
4331 if (start
> end
|| start
>= llen
) {
4332 /* Out of range start or start > end result in empty list */
4336 if (end
>= llen
) end
= llen
-1;
4341 /* Remove list elements to perform the trim */
4342 for (j
= 0; j
< ltrim
; j
++) {
4343 ln
= listFirst(list
);
4344 listDelNode(list
,ln
);
4346 for (j
= 0; j
< rtrim
; j
++) {
4347 ln
= listLast(list
);
4348 listDelNode(list
,ln
);
4351 addReply(c
,shared
.ok
);
4356 static void lremCommand(redisClient
*c
) {
4359 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4361 addReply(c
,shared
.czero
);
4363 if (o
->type
!= REDIS_LIST
) {
4364 addReply(c
,shared
.wrongtypeerr
);
4366 list
*list
= o
->ptr
;
4367 listNode
*ln
, *next
;
4368 int toremove
= atoi(c
->argv
[2]->ptr
);
4373 toremove
= -toremove
;
4376 ln
= fromtail
? list
->tail
: list
->head
;
4378 robj
*ele
= listNodeValue(ln
);
4380 next
= fromtail
? ln
->prev
: ln
->next
;
4381 if (compareStringObjects(ele
,c
->argv
[3]) == 0) {
4382 listDelNode(list
,ln
);
4385 if (toremove
&& removed
== toremove
) break;
4389 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",removed
));
4394 /* This is the semantic of this command:
4395 * RPOPLPUSH srclist dstlist:
4396 * IF LLEN(srclist) > 0
4397 * element = RPOP srclist
4398 * LPUSH dstlist element
4405 * The idea is to be able to get an element from a list in a reliable way
4406 * since the element is not just returned but pushed against another list
4407 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4409 static void rpoplpushcommand(redisClient
*c
) {
4412 sobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4414 addReply(c
,shared
.nullbulk
);
4416 if (sobj
->type
!= REDIS_LIST
) {
4417 addReply(c
,shared
.wrongtypeerr
);
4419 list
*srclist
= sobj
->ptr
;
4420 listNode
*ln
= listLast(srclist
);
4423 addReply(c
,shared
.nullbulk
);
4425 robj
*dobj
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4426 robj
*ele
= listNodeValue(ln
);
4429 if (dobj
&& dobj
->type
!= REDIS_LIST
) {
4430 addReply(c
,shared
.wrongtypeerr
);
4434 /* Add the element to the target list (unless it's directly
4435 * passed to some BLPOP-ing client) */
4436 if (!handleClientsWaitingListPush(c
,c
->argv
[2],ele
)) {
4438 /* Create the list if the key does not exist */
4439 dobj
= createListObject();
4440 dictAdd(c
->db
->dict
,c
->argv
[2],dobj
);
4441 incrRefCount(c
->argv
[2]);
4443 dstlist
= dobj
->ptr
;
4444 listAddNodeHead(dstlist
,ele
);
4448 /* Send the element to the client as reply as well */
4449 addReplyBulkLen(c
,ele
);
4451 addReply(c
,shared
.crlf
);
4453 /* Finally remove the element from the source list */
4454 listDelNode(srclist
,ln
);
4462 /* ==================================== Sets ================================ */
4464 static void saddCommand(redisClient
*c
) {
4467 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4469 set
= createSetObject();
4470 dictAdd(c
->db
->dict
,c
->argv
[1],set
);
4471 incrRefCount(c
->argv
[1]);
4473 if (set
->type
!= REDIS_SET
) {
4474 addReply(c
,shared
.wrongtypeerr
);
4478 if (dictAdd(set
->ptr
,c
->argv
[2],NULL
) == DICT_OK
) {
4479 incrRefCount(c
->argv
[2]);
4481 addReply(c
,shared
.cone
);
4483 addReply(c
,shared
.czero
);
4487 static void sremCommand(redisClient
*c
) {
4490 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4492 addReply(c
,shared
.czero
);
4494 if (set
->type
!= REDIS_SET
) {
4495 addReply(c
,shared
.wrongtypeerr
);
4498 if (dictDelete(set
->ptr
,c
->argv
[2]) == DICT_OK
) {
4500 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4501 addReply(c
,shared
.cone
);
4503 addReply(c
,shared
.czero
);
4508 static void smoveCommand(redisClient
*c
) {
4509 robj
*srcset
, *dstset
;
4511 srcset
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4512 dstset
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4514 /* If the source key does not exist return 0, if it's of the wrong type
4516 if (srcset
== NULL
|| srcset
->type
!= REDIS_SET
) {
4517 addReply(c
, srcset
? shared
.wrongtypeerr
: shared
.czero
);
4520 /* Error if the destination key is not a set as well */
4521 if (dstset
&& dstset
->type
!= REDIS_SET
) {
4522 addReply(c
,shared
.wrongtypeerr
);
4525 /* Remove the element from the source set */
4526 if (dictDelete(srcset
->ptr
,c
->argv
[3]) == DICT_ERR
) {
4527 /* Key not found in the src set! return zero */
4528 addReply(c
,shared
.czero
);
4532 /* Add the element to the destination set */
4534 dstset
= createSetObject();
4535 dictAdd(c
->db
->dict
,c
->argv
[2],dstset
);
4536 incrRefCount(c
->argv
[2]);
4538 if (dictAdd(dstset
->ptr
,c
->argv
[3],NULL
) == DICT_OK
)
4539 incrRefCount(c
->argv
[3]);
4540 addReply(c
,shared
.cone
);
4543 static void sismemberCommand(redisClient
*c
) {
4546 set
= lookupKeyRead(c
->db
,c
->argv
[1]);
4548 addReply(c
,shared
.czero
);
4550 if (set
->type
!= REDIS_SET
) {
4551 addReply(c
,shared
.wrongtypeerr
);
4554 if (dictFind(set
->ptr
,c
->argv
[2]))
4555 addReply(c
,shared
.cone
);
4557 addReply(c
,shared
.czero
);
4561 static void scardCommand(redisClient
*c
) {
4565 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4567 addReply(c
,shared
.czero
);
4570 if (o
->type
!= REDIS_SET
) {
4571 addReply(c
,shared
.wrongtypeerr
);
4574 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4580 static void spopCommand(redisClient
*c
) {
4584 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4586 addReply(c
,shared
.nullbulk
);
4588 if (set
->type
!= REDIS_SET
) {
4589 addReply(c
,shared
.wrongtypeerr
);
4592 de
= dictGetRandomKey(set
->ptr
);
4594 addReply(c
,shared
.nullbulk
);
4596 robj
*ele
= dictGetEntryKey(de
);
4598 addReplyBulkLen(c
,ele
);
4600 addReply(c
,shared
.crlf
);
4601 dictDelete(set
->ptr
,ele
);
4602 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4608 static void srandmemberCommand(redisClient
*c
) {
4612 set
= lookupKeyRead(c
->db
,c
->argv
[1]);
4614 addReply(c
,shared
.nullbulk
);
4616 if (set
->type
!= REDIS_SET
) {
4617 addReply(c
,shared
.wrongtypeerr
);
4620 de
= dictGetRandomKey(set
->ptr
);
4622 addReply(c
,shared
.nullbulk
);
4624 robj
*ele
= dictGetEntryKey(de
);
4626 addReplyBulkLen(c
,ele
);
4628 addReply(c
,shared
.crlf
);
4633 static int qsortCompareSetsByCardinality(const void *s1
, const void *s2
) {
4634 dict
**d1
= (void*) s1
, **d2
= (void*) s2
;
4636 return dictSize(*d1
)-dictSize(*d2
);
4639 static void sinterGenericCommand(redisClient
*c
, robj
**setskeys
, unsigned long setsnum
, robj
*dstkey
) {
4640 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
4643 robj
*lenobj
= NULL
, *dstset
= NULL
;
4644 unsigned long j
, cardinality
= 0;
4646 for (j
= 0; j
< setsnum
; j
++) {
4650 lookupKeyWrite(c
->db
,setskeys
[j
]) :
4651 lookupKeyRead(c
->db
,setskeys
[j
]);
4655 if (deleteKey(c
->db
,dstkey
))
4657 addReply(c
,shared
.czero
);
4659 addReply(c
,shared
.nullmultibulk
);
4663 if (setobj
->type
!= REDIS_SET
) {
4665 addReply(c
,shared
.wrongtypeerr
);
4668 dv
[j
] = setobj
->ptr
;
4670 /* Sort sets from the smallest to largest, this will improve our
4671 * algorithm's performance */
4672 qsort(dv
,setsnum
,sizeof(dict
*),qsortCompareSetsByCardinality
);
4674 /* The first thing we should output is the total number of elements...
4675 * since this is a multi-bulk write, but at this stage we don't know
4676 * the intersection set size, so we use a trick, append an empty object
4677 * to the output list and save the pointer to later modify it with the
4680 lenobj
= createObject(REDIS_STRING
,NULL
);
4682 decrRefCount(lenobj
);
4684 /* If we have a target key where to store the resulting set
4685 * create this key with an empty set inside */
4686 dstset
= createSetObject();
4689 /* Iterate all the elements of the first (smallest) set, and test
4690 * the element against all the other sets, if at least one set does
4691 * not include the element it is discarded */
4692 di
= dictGetIterator(dv
[0]);
4694 while((de
= dictNext(di
)) != NULL
) {
4697 for (j
= 1; j
< setsnum
; j
++)
4698 if (dictFind(dv
[j
],dictGetEntryKey(de
)) == NULL
) break;
4700 continue; /* at least one set does not contain the member */
4701 ele
= dictGetEntryKey(de
);
4703 addReplyBulkLen(c
,ele
);
4705 addReply(c
,shared
.crlf
);
4708 dictAdd(dstset
->ptr
,ele
,NULL
);
4712 dictReleaseIterator(di
);
4715 /* Store the resulting set into the target */
4716 deleteKey(c
->db
,dstkey
);
4717 dictAdd(c
->db
->dict
,dstkey
,dstset
);
4718 incrRefCount(dstkey
);
4722 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",cardinality
);
4724 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4725 dictSize((dict
*)dstset
->ptr
)));
4731 static void sinterCommand(redisClient
*c
) {
4732 sinterGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
);
4735 static void sinterstoreCommand(redisClient
*c
) {
4736 sinterGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1]);
4739 #define REDIS_OP_UNION 0
4740 #define REDIS_OP_DIFF 1
4742 static void sunionDiffGenericCommand(redisClient
*c
, robj
**setskeys
, int setsnum
, robj
*dstkey
, int op
) {
4743 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
4746 robj
*dstset
= NULL
;
4747 int j
, cardinality
= 0;
4749 for (j
= 0; j
< setsnum
; j
++) {
4753 lookupKeyWrite(c
->db
,setskeys
[j
]) :
4754 lookupKeyRead(c
->db
,setskeys
[j
]);
4759 if (setobj
->type
!= REDIS_SET
) {
4761 addReply(c
,shared
.wrongtypeerr
);
4764 dv
[j
] = setobj
->ptr
;
4767 /* We need a temp set object to store our union. If the dstkey
4768 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4769 * this set object will be the resulting object to set into the target key*/
4770 dstset
= createSetObject();
4772 /* Iterate all the elements of all the sets, add every element a single
4773 * time to the result set */
4774 for (j
= 0; j
< setsnum
; j
++) {
4775 if (op
== REDIS_OP_DIFF
&& j
== 0 && !dv
[j
]) break; /* result set is empty */
4776 if (!dv
[j
]) continue; /* non existing keys are like empty sets */
4778 di
= dictGetIterator(dv
[j
]);
4780 while((de
= dictNext(di
)) != NULL
) {
4783 /* dictAdd will not add the same element multiple times */
4784 ele
= dictGetEntryKey(de
);
4785 if (op
== REDIS_OP_UNION
|| j
== 0) {
4786 if (dictAdd(dstset
->ptr
,ele
,NULL
) == DICT_OK
) {
4790 } else if (op
== REDIS_OP_DIFF
) {
4791 if (dictDelete(dstset
->ptr
,ele
) == DICT_OK
) {
4796 dictReleaseIterator(di
);
4798 if (op
== REDIS_OP_DIFF
&& cardinality
== 0) break; /* result set is empty */
4801 /* Output the content of the resulting set, if not in STORE mode */
4803 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",cardinality
));
4804 di
= dictGetIterator(dstset
->ptr
);
4805 while((de
= dictNext(di
)) != NULL
) {
4808 ele
= dictGetEntryKey(de
);
4809 addReplyBulkLen(c
,ele
);
4811 addReply(c
,shared
.crlf
);
4813 dictReleaseIterator(di
);
4815 /* If we have a target key where to store the resulting set
4816 * create this key with the result set inside */
4817 deleteKey(c
->db
,dstkey
);
4818 dictAdd(c
->db
->dict
,dstkey
,dstset
);
4819 incrRefCount(dstkey
);
4824 decrRefCount(dstset
);
4826 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4827 dictSize((dict
*)dstset
->ptr
)));
4833 static void sunionCommand(redisClient
*c
) {
4834 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_UNION
);
4837 static void sunionstoreCommand(redisClient
*c
) {
4838 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_UNION
);
4841 static void sdiffCommand(redisClient
*c
) {
4842 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_DIFF
);
4845 static void sdiffstoreCommand(redisClient
*c
) {
4846 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_DIFF
);
4849 /* ==================================== ZSets =============================== */
4851 /* ZSETs are ordered sets using two data structures to hold the same elements
4852 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4855 * The elements are added to a hash table mapping Redis objects to scores.
4856 * At the same time the elements are added to a skip list mapping scores
4857 * to Redis objects (so objects are sorted by scores in this "view"). */
4859 /* This skiplist implementation is almost a C translation of the original
4860 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4861 * Alternative to Balanced Trees", modified in three ways:
4862 * a) this implementation allows for repeated values.
4863 * b) the comparison is not just by key (our 'score') but by satellite data.
4864 * c) there is a back pointer, so it's a doubly linked list with the back
4865 * pointers being only at "level 1". This allows to traverse the list
4866 * from tail to head, useful for ZREVRANGE. */
4868 static zskiplistNode
*zslCreateNode(int level
, double score
, robj
*obj
) {
4869 zskiplistNode
*zn
= zmalloc(sizeof(*zn
));
4871 zn
->forward
= zmalloc(sizeof(zskiplistNode
*) * level
);
4873 zn
->span
= zmalloc(sizeof(unsigned int) * (level
- 1));
4879 static zskiplist
*zslCreate(void) {
4883 zsl
= zmalloc(sizeof(*zsl
));
4886 zsl
->header
= zslCreateNode(ZSKIPLIST_MAXLEVEL
,0,NULL
);
4887 for (j
= 0; j
< ZSKIPLIST_MAXLEVEL
; j
++) {
4888 zsl
->header
->forward
[j
] = NULL
;
4890 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4891 if (j
< ZSKIPLIST_MAXLEVEL
-1)
4892 zsl
->header
->span
[j
] = 0;
4894 zsl
->header
->backward
= NULL
;
4899 static void zslFreeNode(zskiplistNode
*node
) {
4900 decrRefCount(node
->obj
);
4901 zfree(node
->forward
);
4906 static void zslFree(zskiplist
*zsl
) {
4907 zskiplistNode
*node
= zsl
->header
->forward
[0], *next
;
4909 zfree(zsl
->header
->forward
);
4910 zfree(zsl
->header
->span
);
4913 next
= node
->forward
[0];
4920 static int zslRandomLevel(void) {
4922 while ((random()&0xFFFF) < (ZSKIPLIST_P
* 0xFFFF))
4927 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
) {
4928 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
4929 unsigned int rank
[ZSKIPLIST_MAXLEVEL
];
4933 for (i
= zsl
->level
-1; i
>= 0; i
--) {
4934 /* store rank that is crossed to reach the insert position */
4935 rank
[i
] = i
== (zsl
->level
-1) ? 0 : rank
[i
+1];
4937 while (x
->forward
[i
] &&
4938 (x
->forward
[i
]->score
< score
||
4939 (x
->forward
[i
]->score
== score
&&
4940 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0))) {
4941 rank
[i
] += i
> 0 ? x
->span
[i
-1] : 1;
4946 /* we assume the key is not already inside, since we allow duplicated
4947 * scores, and the re-insertion of score and redis object should never
4948 * happen since the caller of zslInsert() should test in the hash table
4949 * if the element is already inside or not. */
4950 level
= zslRandomLevel();
4951 if (level
> zsl
->level
) {
4952 for (i
= zsl
->level
; i
< level
; i
++) {
4954 update
[i
] = zsl
->header
;
4955 update
[i
]->span
[i
-1] = zsl
->length
;
4959 x
= zslCreateNode(level
,score
,obj
);
4960 for (i
= 0; i
< level
; i
++) {
4961 x
->forward
[i
] = update
[i
]->forward
[i
];
4962 update
[i
]->forward
[i
] = x
;
4964 /* update span covered by update[i] as x is inserted here */
4966 x
->span
[i
-1] = update
[i
]->span
[i
-1] - (rank
[0] - rank
[i
]);
4967 update
[i
]->span
[i
-1] = (rank
[0] - rank
[i
]) + 1;
4971 /* increment span for untouched levels */
4972 for (i
= level
; i
< zsl
->level
; i
++) {
4973 update
[i
]->span
[i
-1]++;
4976 x
->backward
= (update
[0] == zsl
->header
) ? NULL
: update
[0];
4978 x
->forward
[0]->backward
= x
;
4984 /* Delete an element with matching score/object from the skiplist. */
4985 static int zslDelete(zskiplist
*zsl
, double score
, robj
*obj
) {
4986 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
4990 for (i
= zsl
->level
-1; i
>= 0; i
--) {
4991 while (x
->forward
[i
] &&
4992 (x
->forward
[i
]->score
< score
||
4993 (x
->forward
[i
]->score
== score
&&
4994 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0)))
4998 /* We may have multiple elements with the same score, what we need
4999 * is to find the element with both the right score and object. */
5001 if (x
&& score
== x
->score
&& compareStringObjects(x
->obj
,obj
) == 0) {
5002 for (i
= 0; i
< zsl
->level
; i
++) {
5003 if (update
[i
]->forward
[i
] == x
) {
5005 update
[i
]->span
[i
-1] += x
->span
[i
-1] - 1;
5007 update
[i
]->forward
[i
] = x
->forward
[i
];
5009 /* invariant: i > 0, because update[0]->forward[0]
5010 * is always equal to x */
5011 update
[i
]->span
[i
-1] -= 1;
5014 if (x
->forward
[0]) {
5015 x
->forward
[0]->backward
= x
->backward
;
5017 zsl
->tail
= x
->backward
;
5020 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
5025 return 0; /* not found */
5027 return 0; /* not found */
5030 /* Delete all the elements with score between min and max from the skiplist.
5031 * Min and max are inclusive, so a score >= min && score <= max is deleted.
5032 * Note that this function takes the reference to the hash table view of the
5033 * sorted set, in order to remove the elements from the hash table too. */
5034 static unsigned long zslDeleteRange(zskiplist
*zsl
, double min
, double max
, dict
*dict
) {
5035 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5036 unsigned long removed
= 0;
5040 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5041 while (x
->forward
[i
] && x
->forward
[i
]->score
< min
)
5045 /* We may have multiple elements with the same score, what we need
5046 * is to find the element with both the right score and object. */
5048 while (x
&& x
->score
<= max
) {
5049 zskiplistNode
*next
;
5051 for (i
= 0; i
< zsl
->level
; i
++) {
5052 if (update
[i
]->forward
[i
] == x
) {
5054 update
[i
]->span
[i
-1] += x
->span
[i
-1] - 1;
5056 update
[i
]->forward
[i
] = x
->forward
[i
];
5058 /* invariant: i > 0, because update[0]->forward[0]
5059 * is always equal to x */
5060 update
[i
]->span
[i
-1] -= 1;
5063 if (x
->forward
[0]) {
5064 x
->forward
[0]->backward
= x
->backward
;
5066 zsl
->tail
= x
->backward
;
5068 next
= x
->forward
[0];
5069 dictDelete(dict
,x
->obj
);
5071 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
5077 return removed
; /* not found */
5080 /* Find the first node having a score equal or greater than the specified one.
5081 * Returns NULL if there is no match. */
5082 static zskiplistNode
*zslFirstWithScore(zskiplist
*zsl
, double score
) {
5087 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5088 while (x
->forward
[i
] && x
->forward
[i
]->score
< score
)
5091 /* We may have multiple elements with the same score, what we need
5092 * is to find the element with both the right score and object. */
5093 return x
->forward
[0];
5096 /* Find the rank for an element by both score and key.
5097 * Returns 0 when the element cannot be found, rank otherwise.
5098 * Note that the rank is 1-based due to the span of zsl->header to the
5100 static unsigned long zslGetRank(zskiplist
*zsl
, double score
, robj
*o
) {
5102 unsigned long rank
= 0;
5106 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5107 while (x
->forward
[i
] &&
5108 (x
->forward
[i
]->score
< score
||
5109 (x
->forward
[i
]->score
== score
&&
5110 compareStringObjects(x
->forward
[i
]->obj
,o
) <= 0))) {
5111 rank
+= i
> 0 ? x
->span
[i
-1] : 1;
5115 /* x might be equal to zsl->header, so test if obj is non-NULL */
5116 if (x
->obj
&& compareStringObjects(x
->obj
,o
) == 0) {
5123 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5124 zskiplistNode
* zslGetElementByRank(zskiplist
*zsl
, unsigned long rank
) {
5126 unsigned long traversed
= 0;
5130 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5131 while (x
->forward
[i
] && (traversed
+ (i
> 0 ? x
->span
[i
-1] : 1)) <= rank
) {
5132 traversed
+= i
> 0 ? x
->span
[i
-1] : 1;
5136 if (traversed
== rank
) {
5143 /* The actual Z-commands implementations */
5145 /* This generic command implements both ZADD and ZINCRBY.
5146 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5147 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5148 static void zaddGenericCommand(redisClient
*c
, robj
*key
, robj
*ele
, double scoreval
, int doincrement
) {
5153 zsetobj
= lookupKeyWrite(c
->db
,key
);
5154 if (zsetobj
== NULL
) {
5155 zsetobj
= createZsetObject();
5156 dictAdd(c
->db
->dict
,key
,zsetobj
);
5159 if (zsetobj
->type
!= REDIS_ZSET
) {
5160 addReply(c
,shared
.wrongtypeerr
);
5166 /* Ok now since we implement both ZADD and ZINCRBY here the code
5167 * needs to handle the two different conditions. It's all about setting
5168 * '*score', that is, the new score to set, to the right value. */
5169 score
= zmalloc(sizeof(double));
5173 /* Read the old score. If the element was not present starts from 0 */
5174 de
= dictFind(zs
->dict
,ele
);
5176 double *oldscore
= dictGetEntryVal(de
);
5177 *score
= *oldscore
+ scoreval
;
5185 /* What follows is a simple remove and re-insert operation that is common
5186 * to both ZADD and ZINCRBY... */
5187 if (dictAdd(zs
->dict
,ele
,score
) == DICT_OK
) {
5188 /* case 1: New element */
5189 incrRefCount(ele
); /* added to hash */
5190 zslInsert(zs
->zsl
,*score
,ele
);
5191 incrRefCount(ele
); /* added to skiplist */
5194 addReplyDouble(c
,*score
);
5196 addReply(c
,shared
.cone
);
5201 /* case 2: Score update operation */
5202 de
= dictFind(zs
->dict
,ele
);
5203 redisAssert(de
!= NULL
);
5204 oldscore
= dictGetEntryVal(de
);
5205 if (*score
!= *oldscore
) {
5208 /* Remove and insert the element in the skip list with new score */
5209 deleted
= zslDelete(zs
->zsl
,*oldscore
,ele
);
5210 redisAssert(deleted
!= 0);
5211 zslInsert(zs
->zsl
,*score
,ele
);
5213 /* Update the score in the hash table */
5214 dictReplace(zs
->dict
,ele
,score
);
5220 addReplyDouble(c
,*score
);
5222 addReply(c
,shared
.czero
);
5226 static void zaddCommand(redisClient
*c
) {
5229 scoreval
= strtod(c
->argv
[2]->ptr
,NULL
);
5230 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,0);
5233 static void zincrbyCommand(redisClient
*c
) {
5236 scoreval
= strtod(c
->argv
[2]->ptr
,NULL
);
5237 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,1);
5240 static void zremCommand(redisClient
*c
) {
5244 zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5245 if (zsetobj
== NULL
) {
5246 addReply(c
,shared
.czero
);
5252 if (zsetobj
->type
!= REDIS_ZSET
) {
5253 addReply(c
,shared
.wrongtypeerr
);
5257 de
= dictFind(zs
->dict
,c
->argv
[2]);
5259 addReply(c
,shared
.czero
);
5262 /* Delete from the skiplist */
5263 oldscore
= dictGetEntryVal(de
);
5264 deleted
= zslDelete(zs
->zsl
,*oldscore
,c
->argv
[2]);
5265 redisAssert(deleted
!= 0);
5267 /* Delete from the hash table */
5268 dictDelete(zs
->dict
,c
->argv
[2]);
5269 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5271 addReply(c
,shared
.cone
);
5275 static void zremrangebyscoreCommand(redisClient
*c
) {
5276 double min
= strtod(c
->argv
[2]->ptr
,NULL
);
5277 double max
= strtod(c
->argv
[3]->ptr
,NULL
);
5281 zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5282 if (zsetobj
== NULL
) {
5283 addReply(c
,shared
.czero
);
5287 if (zsetobj
->type
!= REDIS_ZSET
) {
5288 addReply(c
,shared
.wrongtypeerr
);
5292 deleted
= zslDeleteRange(zs
->zsl
,min
,max
,zs
->dict
);
5293 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5294 server
.dirty
+= deleted
;
5295 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",deleted
));
5299 static void zrangeGenericCommand(redisClient
*c
, int reverse
) {
5301 int start
= atoi(c
->argv
[2]->ptr
);
5302 int end
= atoi(c
->argv
[3]->ptr
);
5305 if (c
->argc
== 5 && !strcasecmp(c
->argv
[4]->ptr
,"withscores")) {
5307 } else if (c
->argc
>= 5) {
5308 addReply(c
,shared
.syntaxerr
);
5312 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5314 addReply(c
,shared
.nullmultibulk
);
5316 if (o
->type
!= REDIS_ZSET
) {
5317 addReply(c
,shared
.wrongtypeerr
);
5319 zset
*zsetobj
= o
->ptr
;
5320 zskiplist
*zsl
= zsetobj
->zsl
;
5323 int llen
= zsl
->length
;
5327 /* convert negative indexes */
5328 if (start
< 0) start
= llen
+start
;
5329 if (end
< 0) end
= llen
+end
;
5330 if (start
< 0) start
= 0;
5331 if (end
< 0) end
= 0;
5333 /* indexes sanity checks */
5334 if (start
> end
|| start
>= llen
) {
5335 /* Out of range start or start > end result in empty list */
5336 addReply(c
,shared
.emptymultibulk
);
5339 if (end
>= llen
) end
= llen
-1;
5340 rangelen
= (end
-start
)+1;
5342 /* check if starting point is trivial, before searching
5343 * the element in log(N) time */
5345 ln
= start
== 0 ? zsl
->tail
: zslGetElementByRank(zsl
, llen
- start
);
5347 ln
= start
== 0 ? zsl
->header
->forward
[0] : zslGetElementByRank(zsl
, start
+ 1);
5350 /* Return the result in form of a multi-bulk reply */
5351 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",
5352 withscores
? (rangelen
*2) : rangelen
));
5353 for (j
= 0; j
< rangelen
; j
++) {
5355 addReplyBulkLen(c
,ele
);
5357 addReply(c
,shared
.crlf
);
5359 addReplyDouble(c
,ln
->score
);
5360 ln
= reverse
? ln
->backward
: ln
->forward
[0];
5366 static void zrangeCommand(redisClient
*c
) {
5367 zrangeGenericCommand(c
,0);
5370 static void zrevrangeCommand(redisClient
*c
) {
5371 zrangeGenericCommand(c
,1);
5374 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5375 * If justcount is non-zero, just the count is returned. */
5376 static void genericZrangebyscoreCommand(redisClient
*c
, int justcount
) {
5379 int minex
= 0, maxex
= 0; /* are min or max exclusive? */
5380 int offset
= 0, limit
= -1;
5384 /* Parse the min-max interval. If one of the values is prefixed
5385 * by the "(" character, it's considered "open". For instance
5386 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5387 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5388 if (((char*)c
->argv
[2]->ptr
)[0] == '(') {
5389 min
= strtod((char*)c
->argv
[2]->ptr
+1,NULL
);
5392 min
= strtod(c
->argv
[2]->ptr
,NULL
);
5394 if (((char*)c
->argv
[3]->ptr
)[0] == '(') {
5395 max
= strtod((char*)c
->argv
[3]->ptr
+1,NULL
);
5398 max
= strtod(c
->argv
[3]->ptr
,NULL
);
5401 /* Parse "WITHSCORES": note that if the command was called with
5402 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5403 * enter the following paths to parse WITHSCORES and LIMIT. */
5404 if (c
->argc
== 5 || c
->argc
== 8) {
5405 if (strcasecmp(c
->argv
[c
->argc
-1]->ptr
,"withscores") == 0)
5410 if (c
->argc
!= (4 + withscores
) && c
->argc
!= (7 + withscores
))
5414 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5419 if (c
->argc
== (7 + withscores
) && strcasecmp(c
->argv
[4]->ptr
,"limit")) {
5420 addReply(c
,shared
.syntaxerr
);
5422 } else if (c
->argc
== (7 + withscores
)) {
5423 offset
= atoi(c
->argv
[5]->ptr
);
5424 limit
= atoi(c
->argv
[6]->ptr
);
5425 if (offset
< 0) offset
= 0;
5428 /* Ok, lookup the key and get the range */
5429 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5431 addReply(c
,justcount
? shared
.czero
: shared
.nullmultibulk
);
5433 if (o
->type
!= REDIS_ZSET
) {
5434 addReply(c
,shared
.wrongtypeerr
);
5436 zset
*zsetobj
= o
->ptr
;
5437 zskiplist
*zsl
= zsetobj
->zsl
;
5439 robj
*ele
, *lenobj
= NULL
;
5440 unsigned long rangelen
= 0;
5442 /* Get the first node with the score >= min, or with
5443 * score > min if 'minex' is true. */
5444 ln
= zslFirstWithScore(zsl
,min
);
5445 while (minex
&& ln
&& ln
->score
== min
) ln
= ln
->forward
[0];
5448 /* No element matching the specified interval */
5449 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
5453 /* We don't know in advance how many matching elements there
5454 * are in the list, so we push this object that will represent
5455 * the multi-bulk length in the output buffer, and will "fix"
5458 lenobj
= createObject(REDIS_STRING
,NULL
);
5460 decrRefCount(lenobj
);
5463 while(ln
&& (maxex
? (ln
->score
< max
) : (ln
->score
<= max
))) {
5466 ln
= ln
->forward
[0];
5469 if (limit
== 0) break;
5472 addReplyBulkLen(c
,ele
);
5474 addReply(c
,shared
.crlf
);
5476 addReplyDouble(c
,ln
->score
);
5478 ln
= ln
->forward
[0];
5480 if (limit
> 0) limit
--;
5483 addReplyLong(c
,(long)rangelen
);
5485 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",
5486 withscores
? (rangelen
*2) : rangelen
);
5492 static void zrangebyscoreCommand(redisClient
*c
) {
5493 genericZrangebyscoreCommand(c
,0);
5496 static void zcountCommand(redisClient
*c
) {
5497 genericZrangebyscoreCommand(c
,1);
5500 static void zcardCommand(redisClient
*c
) {
5504 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5506 addReply(c
,shared
.czero
);
5509 if (o
->type
!= REDIS_ZSET
) {
5510 addReply(c
,shared
.wrongtypeerr
);
5513 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",zs
->zsl
->length
));
5518 static void zscoreCommand(redisClient
*c
) {
5522 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5524 addReply(c
,shared
.nullbulk
);
5527 if (o
->type
!= REDIS_ZSET
) {
5528 addReply(c
,shared
.wrongtypeerr
);
5533 de
= dictFind(zs
->dict
,c
->argv
[2]);
5535 addReply(c
,shared
.nullbulk
);
5537 double *score
= dictGetEntryVal(de
);
5539 addReplyDouble(c
,*score
);
5545 static void zrankCommand(redisClient
*c
) {
5547 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5549 addReply(c
,shared
.nullbulk
);
5552 if (o
->type
!= REDIS_ZSET
) {
5553 addReply(c
,shared
.wrongtypeerr
);
5556 zskiplist
*zsl
= zs
->zsl
;
5560 de
= dictFind(zs
->dict
,c
->argv
[2]);
5562 addReply(c
,shared
.nullbulk
);
5566 double *score
= dictGetEntryVal(de
);
5567 rank
= zslGetRank(zsl
, *score
, c
->argv
[2]);
5569 addReplyLong(c
, rank
-1);
5571 addReply(c
,shared
.nullbulk
);
5576 /* ========================= Non type-specific commands ==================== */
5578 static void flushdbCommand(redisClient
*c
) {
5579 server
.dirty
+= dictSize(c
->db
->dict
);
5580 dictEmpty(c
->db
->dict
);
5581 dictEmpty(c
->db
->expires
);
5582 addReply(c
,shared
.ok
);
5585 static void flushallCommand(redisClient
*c
) {
5586 server
.dirty
+= emptyDb();
5587 addReply(c
,shared
.ok
);
5588 rdbSave(server
.dbfilename
);
5592 static redisSortOperation
*createSortOperation(int type
, robj
*pattern
) {
5593 redisSortOperation
*so
= zmalloc(sizeof(*so
));
5595 so
->pattern
= pattern
;
5599 /* Return the value associated to the key with a name obtained
5600 * substituting the first occurrence of '*' in 'pattern' with 'subst' */
5601 static robj
*lookupKeyByPattern(redisDb
*db
, robj
*pattern
, robj
*subst
) {
5605 int prefixlen
, sublen
, postfixlen
;
5606 /* Exploit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5610 char buf
[REDIS_SORTKEY_MAX
+1];
5613 /* If the pattern is "#" return the substitution object itself in order
5614 * to implement the "SORT ... GET #" feature. */
5615 spat
= pattern
->ptr
;
5616 if (spat
[0] == '#' && spat
[1] == '\0') {
5620 /* The substitution object may be specially encoded. If so we create
5621 * a decoded object on the fly. Otherwise getDecodedObject will just
5622 * increment the ref count, that we'll decrement later. */
5623 subst
= getDecodedObject(subst
);
5626 if (sdslen(spat
)+sdslen(ssub
)-1 > REDIS_SORTKEY_MAX
) return NULL
;
5627 p
= strchr(spat
,'*');
5629 decrRefCount(subst
);
5634 sublen
= sdslen(ssub
);
5635 postfixlen
= sdslen(spat
)-(prefixlen
+1);
5636 memcpy(keyname
.buf
,spat
,prefixlen
);
5637 memcpy(keyname
.buf
+prefixlen
,ssub
,sublen
);
5638 memcpy(keyname
.buf
+prefixlen
+sublen
,p
+1,postfixlen
);
5639 keyname
.buf
[prefixlen
+sublen
+postfixlen
] = '\0';
5640 keyname
.len
= prefixlen
+sublen
+postfixlen
;
5642 initStaticStringObject(keyobj
,((char*)&keyname
)+(sizeof(long)*2))
5643 decrRefCount(subst
);
5645 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
5646 return lookupKeyRead(db
,&keyobj
);
5649 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5650 * the additional parameter is not standard but BSD-specific, we have to
5651 * pass sorting parameters via the global 'server' structure */
5652 static int sortCompare(const void *s1
, const void *s2
) {
5653 const redisSortObject
*so1
= s1
, *so2
= s2
;
5656 if (!server
.sort_alpha
) {
5657 /* Numeric sorting. Here it's trivial as we precomputed scores */
5658 if (so1
->u
.score
> so2
->u
.score
) {
5660 } else if (so1
->u
.score
< so2
->u
.score
) {
5666 /* Alphanumeric sorting */
5667 if (server
.sort_bypattern
) {
5668 if (!so1
->u
.cmpobj
|| !so2
->u
.cmpobj
) {
5669 /* At least one compare object is NULL */
5670 if (so1
->u
.cmpobj
== so2
->u
.cmpobj
)
5672 else if (so1
->u
.cmpobj
== NULL
)
5677 /* We have both the objects, use strcoll */
5678 cmp
= strcoll(so1
->u
.cmpobj
->ptr
,so2
->u
.cmpobj
->ptr
);
5681 /* Compare elements directly */
5684 dec1
= getDecodedObject(so1
->obj
);
5685 dec2
= getDecodedObject(so2
->obj
);
5686 cmp
= strcoll(dec1
->ptr
,dec2
->ptr
);
5691 return server
.sort_desc
? -cmp
: cmp
;
5694 /* The SORT command is the most complex command in Redis. Warning: this code
5695 * is optimized for speed and a bit less for readability */
5696 static void sortCommand(redisClient
*c
) {
5699 int desc
= 0, alpha
= 0;
5700 int limit_start
= 0, limit_count
= -1, start
, end
;
5701 int j
, dontsort
= 0, vectorlen
;
5702 int getop
= 0; /* GET operation counter */
5703 robj
*sortval
, *sortby
= NULL
, *storekey
= NULL
;
5704 redisSortObject
*vector
; /* Resulting vector to sort */
5706 /* Lookup the key to sort. It must be of the right types */
5707 sortval
= lookupKeyRead(c
->db
,c
->argv
[1]);
5708 if (sortval
== NULL
) {
5709 addReply(c
,shared
.nullmultibulk
);
5712 if (sortval
->type
!= REDIS_SET
&& sortval
->type
!= REDIS_LIST
&&
5713 sortval
->type
!= REDIS_ZSET
)
5715 addReply(c
,shared
.wrongtypeerr
);
5719 /* Create a list of operations to perform for every sorted element.
5720 * Operations can be GET/DEL/INCR/DECR */
5721 operations
= listCreate();
5722 listSetFreeMethod(operations
,zfree
);
5725 /* Now we need to protect sortval incrementing its count, in the future
5726 * SORT may have options able to overwrite/delete keys during the sorting
5727 * and the sorted key itself may get destroyed */
5728 incrRefCount(sortval
);
5730 /* The SORT command has an SQL-alike syntax, parse it */
5731 while(j
< c
->argc
) {
5732 int leftargs
= c
->argc
-j
-1;
5733 if (!strcasecmp(c
->argv
[j
]->ptr
,"asc")) {
5735 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"desc")) {
5737 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"alpha")) {
5739 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"limit") && leftargs
>= 2) {
5740 limit_start
= atoi(c
->argv
[j
+1]->ptr
);
5741 limit_count
= atoi(c
->argv
[j
+2]->ptr
);
5743 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"store") && leftargs
>= 1) {
5744 storekey
= c
->argv
[j
+1];
5746 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"by") && leftargs
>= 1) {
5747 sortby
= c
->argv
[j
+1];
5748 /* If the BY pattern does not contain '*', i.e. it is constant,
5749 * we don't need to sort nor to lookup the weight keys. */
5750 if (strchr(c
->argv
[j
+1]->ptr
,'*') == NULL
) dontsort
= 1;
5752 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"get") && leftargs
>= 1) {
5753 listAddNodeTail(operations
,createSortOperation(
5754 REDIS_SORT_GET
,c
->argv
[j
+1]));
5758 decrRefCount(sortval
);
5759 listRelease(operations
);
5760 addReply(c
,shared
.syntaxerr
);
5766 /* Load the sorting vector with all the objects to sort */
5767 switch(sortval
->type
) {
5768 case REDIS_LIST
: vectorlen
= listLength((list
*)sortval
->ptr
); break;
5769 case REDIS_SET
: vectorlen
= dictSize((dict
*)sortval
->ptr
); break;
5770 case REDIS_ZSET
: vectorlen
= dictSize(((zset
*)sortval
->ptr
)->dict
); break;
5771 default: vectorlen
= 0; redisAssert(0); /* Avoid GCC warning */
5773 vector
= zmalloc(sizeof(redisSortObject
)*vectorlen
);
5776 if (sortval
->type
== REDIS_LIST
) {
5777 list
*list
= sortval
->ptr
;
5781 listRewind(list
,&li
);
5782 while((ln
= listNext(&li
))) {
5783 robj
*ele
= ln
->value
;
5784 vector
[j
].obj
= ele
;
5785 vector
[j
].u
.score
= 0;
5786 vector
[j
].u
.cmpobj
= NULL
;
5794 if (sortval
->type
== REDIS_SET
) {
5797 zset
*zs
= sortval
->ptr
;
5801 di
= dictGetIterator(set
);
5802 while((setele
= dictNext(di
)) != NULL
) {
5803 vector
[j
].obj
= dictGetEntryKey(setele
);
5804 vector
[j
].u
.score
= 0;
5805 vector
[j
].u
.cmpobj
= NULL
;
5808 dictReleaseIterator(di
);
5810 redisAssert(j
== vectorlen
);
5812 /* Now it's time to load the right scores in the sorting vector */
5813 if (dontsort
== 0) {
5814 for (j
= 0; j
< vectorlen
; j
++) {
5818 byval
= lookupKeyByPattern(c
->db
,sortby
,vector
[j
].obj
);
5819 if (!byval
|| byval
->type
!= REDIS_STRING
) continue;
5821 vector
[j
].u
.cmpobj
= getDecodedObject(byval
);
5823 if (byval
->encoding
== REDIS_ENCODING_RAW
) {
5824 vector
[j
].u
.score
= strtod(byval
->ptr
,NULL
);
5826 /* Don't need to decode the object if it's
5827 * integer-encoded (the only encoding supported) so
5828 * far. We can just cast it */
5829 if (byval
->encoding
== REDIS_ENCODING_INT
) {
5830 vector
[j
].u
.score
= (long)byval
->ptr
;
5832 redisAssert(1 != 1);
5837 if (vector
[j
].obj
->encoding
== REDIS_ENCODING_RAW
)
5838 vector
[j
].u
.score
= strtod(vector
[j
].obj
->ptr
,NULL
);
5840 if (vector
[j
].obj
->encoding
== REDIS_ENCODING_INT
)
5841 vector
[j
].u
.score
= (long) vector
[j
].obj
->ptr
;
5843 redisAssert(1 != 1);
5850 /* We are ready to sort the vector... perform a bit of sanity check
5851 * on the LIMIT option too. We'll use a partial version of quicksort. */
5852 start
= (limit_start
< 0) ? 0 : limit_start
;
5853 end
= (limit_count
< 0) ? vectorlen
-1 : start
+limit_count
-1;
5854 if (start
>= vectorlen
) {
5855 start
= vectorlen
-1;
5858 if (end
>= vectorlen
) end
= vectorlen
-1;
5860 if (dontsort
== 0) {
5861 server
.sort_desc
= desc
;
5862 server
.sort_alpha
= alpha
;
5863 server
.sort_bypattern
= sortby
? 1 : 0;
5864 if (sortby
&& (start
!= 0 || end
!= vectorlen
-1))
5865 pqsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
, start
,end
);
5867 qsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
);
5870 /* Send command output to the output buffer, performing the specified
5871 * GET/DEL/INCR/DECR operations if any. */
5872 outputlen
= getop
? getop
*(end
-start
+1) : end
-start
+1;
5873 if (storekey
== NULL
) {
5874 /* STORE option not specified, sent the sorting result to client */
5875 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",outputlen
));
5876 for (j
= start
; j
<= end
; j
++) {
5881 addReplyBulkLen(c
,vector
[j
].obj
);
5882 addReply(c
,vector
[j
].obj
);
5883 addReply(c
,shared
.crlf
);
5885 listRewind(operations
,&li
);
5886 while((ln
= listNext(&li
))) {
5887 redisSortOperation
*sop
= ln
->value
;
5888 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
5891 if (sop
->type
== REDIS_SORT_GET
) {
5892 if (!val
|| val
->type
!= REDIS_STRING
) {
5893 addReply(c
,shared
.nullbulk
);
5895 addReplyBulkLen(c
,val
);
5897 addReply(c
,shared
.crlf
);
5900 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
5905 robj
*listObject
= createListObject();
5906 list
*listPtr
= (list
*) listObject
->ptr
;
5908 /* STORE option specified, set the sorting result as a List object */
5909 for (j
= start
; j
<= end
; j
++) {
5914 listAddNodeTail(listPtr
,vector
[j
].obj
);
5915 incrRefCount(vector
[j
].obj
);
5917 listRewind(operations
,&li
);
5918 while((ln
= listNext(&li
))) {
5919 redisSortOperation
*sop
= ln
->value
;
5920 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
5923 if (sop
->type
== REDIS_SORT_GET
) {
5924 if (!val
|| val
->type
!= REDIS_STRING
) {
5925 listAddNodeTail(listPtr
,createStringObject("",0));
5927 listAddNodeTail(listPtr
,val
);
5931 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
5935 if (dictReplace(c
->db
->dict
,storekey
,listObject
)) {
5936 incrRefCount(storekey
);
5938 /* Note: we add 1 because the DB is dirty anyway since even if the
5939 * SORT result is empty a new key is set and maybe the old content
5941 server
.dirty
+= 1+outputlen
;
5942 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",outputlen
));
5946 decrRefCount(sortval
);
5947 listRelease(operations
);
5948 for (j
= 0; j
< vectorlen
; j
++) {
5949 if (sortby
&& alpha
&& vector
[j
].u
.cmpobj
)
5950 decrRefCount(vector
[j
].u
.cmpobj
);
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and 'n' the number of bytes to describe.
 * Values below 1024 are printed as an integer followed by 'B'; larger
 * values are scaled by powers of 1024 and printed with two decimals
 * followed by K, M, G or T.
 *
 * The function takes no buffer size: the caller must provide enough room
 * for the longest output (an unsigned long long expressed in terabytes,
 * plus the unit and the terminating NUL byte).
 *
 * Amounts of one terabyte or more are handled by the final branch, so 's'
 * is always written and the caller never reads an uninitialized buffer. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        /* Kilobytes */
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        /* Megabytes */
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        /* Gigabytes */
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Terabytes and above: without this branch 's' was left untouched. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
5976 /* Create the string returned by the INFO command. This is decoupled
5977 * from the INFO command itself as we need to report the same information
5978 * on memory corruption problems. */
5979 static sds
genRedisInfoString(void) {
5981 time_t uptime
= time(NULL
)-server
.stat_starttime
;
5985 bytesToHuman(hmem
,zmalloc_used_memory());
5986 info
= sdscatprintf(sdsempty(),
5987 "redis_version:%s\r\n"
5989 "multiplexing_api:%s\r\n"
5990 "process_id:%ld\r\n"
5991 "uptime_in_seconds:%ld\r\n"
5992 "uptime_in_days:%ld\r\n"
5993 "connected_clients:%d\r\n"
5994 "connected_slaves:%d\r\n"
5995 "blocked_clients:%d\r\n"
5996 "used_memory:%zu\r\n"
5997 "used_memory_human:%s\r\n"
5998 "changes_since_last_save:%lld\r\n"
5999 "bgsave_in_progress:%d\r\n"
6000 "last_save_time:%ld\r\n"
6001 "bgrewriteaof_in_progress:%d\r\n"
6002 "total_connections_received:%lld\r\n"
6003 "total_commands_processed:%lld\r\n"
6007 (sizeof(long) == 8) ? "64" : "32",
6012 listLength(server
.clients
)-listLength(server
.slaves
),
6013 listLength(server
.slaves
),
6014 server
.blpop_blocked_clients
,
6015 zmalloc_used_memory(),
6018 server
.bgsavechildpid
!= -1,
6020 server
.bgrewritechildpid
!= -1,
6021 server
.stat_numconnections
,
6022 server
.stat_numcommands
,
6023 server
.vm_enabled
!= 0,
6024 server
.masterhost
== NULL
? "master" : "slave"
6026 if (server
.masterhost
) {
6027 info
= sdscatprintf(info
,
6028 "master_host:%s\r\n"
6029 "master_port:%d\r\n"
6030 "master_link_status:%s\r\n"
6031 "master_last_io_seconds_ago:%d\r\n"
6034 (server
.replstate
== REDIS_REPL_CONNECTED
) ?
6036 server
.master
? ((int)(time(NULL
)-server
.master
->lastinteraction
)) : -1
6039 if (server
.vm_enabled
) {
6041 info
= sdscatprintf(info
,
6042 "vm_conf_max_memory:%llu\r\n"
6043 "vm_conf_page_size:%llu\r\n"
6044 "vm_conf_pages:%llu\r\n"
6045 "vm_stats_used_pages:%llu\r\n"
6046 "vm_stats_swapped_objects:%llu\r\n"
6047 "vm_stats_swappin_count:%llu\r\n"
6048 "vm_stats_swappout_count:%llu\r\n"
6049 "vm_stats_io_newjobs_len:%lu\r\n"
6050 "vm_stats_io_processing_len:%lu\r\n"
6051 "vm_stats_io_processed_len:%lu\r\n"
6052 "vm_stats_io_active_threads:%lu\r\n"
6053 "vm_stats_blocked_clients:%lu\r\n"
6054 ,(unsigned long long) server
.vm_max_memory
,
6055 (unsigned long long) server
.vm_page_size
,
6056 (unsigned long long) server
.vm_pages
,
6057 (unsigned long long) server
.vm_stats_used_pages
,
6058 (unsigned long long) server
.vm_stats_swapped_objects
,
6059 (unsigned long long) server
.vm_stats_swapins
,
6060 (unsigned long long) server
.vm_stats_swapouts
,
6061 (unsigned long) listLength(server
.io_newjobs
),
6062 (unsigned long) listLength(server
.io_processing
),
6063 (unsigned long) listLength(server
.io_processed
),
6064 (unsigned long) server
.io_active_threads
,
6065 (unsigned long) server
.vm_blocked_clients
6069 for (j
= 0; j
< server
.dbnum
; j
++) {
6070 long long keys
, vkeys
;
6072 keys
= dictSize(server
.db
[j
].dict
);
6073 vkeys
= dictSize(server
.db
[j
].expires
);
6074 if (keys
|| vkeys
) {
6075 info
= sdscatprintf(info
, "db%d:keys=%lld,expires=%lld\r\n",
6082 static void infoCommand(redisClient
*c
) {
6083 sds info
= genRedisInfoString();
6084 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",
6085 (unsigned long)sdslen(info
)));
6086 addReplySds(c
,info
);
6087 addReply(c
,shared
.crlf
);
6090 static void monitorCommand(redisClient
*c
) {
6091 /* ignore MONITOR if already slave or in monitor mode */
6092 if (c
->flags
& REDIS_SLAVE
) return;
6094 c
->flags
|= (REDIS_SLAVE
|REDIS_MONITOR
);
6096 listAddNodeTail(server
.monitors
,c
);
6097 addReply(c
,shared
.ok
);
6100 /* ================================= Expire ================================= */
6101 static int removeExpire(redisDb
*db
, robj
*key
) {
6102 if (dictDelete(db
->expires
,key
) == DICT_OK
) {
6109 static int setExpire(redisDb
*db
, robj
*key
, time_t when
) {
6110 if (dictAdd(db
->expires
,key
,(void*)when
) == DICT_ERR
) {
6118 /* Return the expire time of the specified key, or -1 if no expire
6119 * is associated with this key (i.e. the key is non volatile) */
6120 static time_t getExpire(redisDb
*db
, robj
*key
) {
6123 /* No expire? return ASAP */
6124 if (dictSize(db
->expires
) == 0 ||
6125 (de
= dictFind(db
->expires
,key
)) == NULL
) return -1;
6127 return (time_t) dictGetEntryVal(de
);
6130 static int expireIfNeeded(redisDb
*db
, robj
*key
) {
6134 /* No expire? return ASAP */
6135 if (dictSize(db
->expires
) == 0 ||
6136 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
6138 /* Lookup the expire */
6139 when
= (time_t) dictGetEntryVal(de
);
6140 if (time(NULL
) <= when
) return 0;
6142 /* Delete the key */
6143 dictDelete(db
->expires
,key
);
6144 return dictDelete(db
->dict
,key
) == DICT_OK
;
6147 static int deleteIfVolatile(redisDb
*db
, robj
*key
) {
6150 /* No expire? return ASAP */
6151 if (dictSize(db
->expires
) == 0 ||
6152 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
6154 /* Delete the key */
6156 dictDelete(db
->expires
,key
);
6157 return dictDelete(db
->dict
,key
) == DICT_OK
;
6160 static void expireGenericCommand(redisClient
*c
, robj
*key
, time_t seconds
) {
6163 de
= dictFind(c
->db
->dict
,key
);
6165 addReply(c
,shared
.czero
);
6169 if (deleteKey(c
->db
,key
)) server
.dirty
++;
6170 addReply(c
, shared
.cone
);
6173 time_t when
= time(NULL
)+seconds
;
6174 if (setExpire(c
->db
,key
,when
)) {
6175 addReply(c
,shared
.cone
);
6178 addReply(c
,shared
.czero
);
6184 static void expireCommand(redisClient
*c
) {
6185 expireGenericCommand(c
,c
->argv
[1],strtol(c
->argv
[2]->ptr
,NULL
,10));
6188 static void expireatCommand(redisClient
*c
) {
6189 expireGenericCommand(c
,c
->argv
[1],strtol(c
->argv
[2]->ptr
,NULL
,10)-time(NULL
));
6192 static void ttlCommand(redisClient
*c
) {
6196 expire
= getExpire(c
->db
,c
->argv
[1]);
6198 ttl
= (int) (expire
-time(NULL
));
6199 if (ttl
< 0) ttl
= -1;
6201 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",ttl
));
6204 /* ================================ MULTI/EXEC ============================== */
6206 /* Client state initialization for MULTI/EXEC */
6207 static void initClientMultiState(redisClient
*c
) {
6208 c
->mstate
.commands
= NULL
;
6209 c
->mstate
.count
= 0;
6212 /* Release all the resources associated with MULTI/EXEC state */
6213 static void freeClientMultiState(redisClient
*c
) {
6216 for (j
= 0; j
< c
->mstate
.count
; j
++) {
6218 multiCmd
*mc
= c
->mstate
.commands
+j
;
6220 for (i
= 0; i
< mc
->argc
; i
++)
6221 decrRefCount(mc
->argv
[i
]);
6224 zfree(c
->mstate
.commands
);
6227 /* Add a new command into the MULTI commands queue */
6228 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
) {
6232 c
->mstate
.commands
= zrealloc(c
->mstate
.commands
,
6233 sizeof(multiCmd
)*(c
->mstate
.count
+1));
6234 mc
= c
->mstate
.commands
+c
->mstate
.count
;
6237 mc
->argv
= zmalloc(sizeof(robj
*)*c
->argc
);
6238 memcpy(mc
->argv
,c
->argv
,sizeof(robj
*)*c
->argc
);
6239 for (j
= 0; j
< c
->argc
; j
++)
6240 incrRefCount(mc
->argv
[j
]);
6244 static void multiCommand(redisClient
*c
) {
6245 c
->flags
|= REDIS_MULTI
;
6246 addReply(c
,shared
.ok
);
6249 static void discardCommand(redisClient
*c
) {
6250 if (!(c
->flags
& REDIS_MULTI
)) {
6251 addReplySds(c
,sdsnew("-ERR DISCARD without MULTI\r\n"));
6255 freeClientMultiState(c
);
6256 initClientMultiState(c
);
6257 c
->flags
&= (~REDIS_MULTI
);
6258 addReply(c
,shared
.ok
);
6261 static void execCommand(redisClient
*c
) {
6266 if (!(c
->flags
& REDIS_MULTI
)) {
6267 addReplySds(c
,sdsnew("-ERR EXEC without MULTI\r\n"));
6271 orig_argv
= c
->argv
;
6272 orig_argc
= c
->argc
;
6273 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->mstate
.count
));
6274 for (j
= 0; j
< c
->mstate
.count
; j
++) {
6275 c
->argc
= c
->mstate
.commands
[j
].argc
;
6276 c
->argv
= c
->mstate
.commands
[j
].argv
;
6277 call(c
,c
->mstate
.commands
[j
].cmd
);
6279 c
->argv
= orig_argv
;
6280 c
->argc
= orig_argc
;
6281 freeClientMultiState(c
);
6282 initClientMultiState(c
);
6283 c
->flags
&= (~REDIS_MULTI
);
6286 /* =========================== Blocking Operations ========================= */
6288 /* Currently Redis blocking operations support is limited to list POP ops,
6289 * so the current implementation is not fully generic, but it is also not
6290 * completely specific so it will not require a rewrite to support new
6291 * kind of blocking operations in the future.
6293 * Still it's important to note that list blocking operations can be already
6294 * used as a notification mechanism in order to implement other blocking
6295 * operations at application level, so there must be a very strong evidence
6296 * of usefulness and generality before new blocking operations are implemented.
6298 * This is how the current blocking POP works, we use BLPOP as example:
6299 * - If the user calls BLPOP and the key exists and contains a non empty list
6300 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6301 * if there is no need to block.
6302 * - If instead BLPOP is called and the key does not exist or the list is
6303 * empty we need to block. In order to do so we remove the notification for
6304 * new data to read in the client socket (so that we'll not serve new
6305 * requests if the blocking request is not served). Also we put the client
6306 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
6307 * blocking for these keys.
6308 * - If a PUSH operation against a key with blocked clients waiting is
6309 * performed, we serve the first in the list: basically instead of pushing
6310 * the new element inside the list we return it to the (first / oldest)
6311 * blocking client, unblock the client, and remove it from the list.
6313 * The above comment and the source code should be enough in order to understand
6314 * the implementation and modify / fix it later.
6317 /* Set a client in blocking mode for the specified key, with the specified
6319 static void blockForKeys(redisClient
*c
, robj
**keys
, int numkeys
, time_t timeout
) {
6324 c
->blockingkeys
= zmalloc(sizeof(robj
*)*numkeys
);
6325 c
->blockingkeysnum
= numkeys
;
6326 c
->blockingto
= timeout
;
6327 for (j
= 0; j
< numkeys
; j
++) {
6328 /* Add the key in the client structure, to map clients -> keys */
6329 c
->blockingkeys
[j
] = keys
[j
];
6330 incrRefCount(keys
[j
]);
6332 /* And in the other "side", to map keys -> clients */
6333 de
= dictFind(c
->db
->blockingkeys
,keys
[j
]);
6337 /* For every key we take a list of clients blocked for it */
6339 retval
= dictAdd(c
->db
->blockingkeys
,keys
[j
],l
);
6340 incrRefCount(keys
[j
]);
6341 assert(retval
== DICT_OK
);
6343 l
= dictGetEntryVal(de
);
6345 listAddNodeTail(l
,c
);
6347 /* Mark the client as a blocked client */
6348 c
->flags
|= REDIS_BLOCKED
;
6349 server
.blpop_blocked_clients
++;
6352 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6353 static void unblockClientWaitingData(redisClient
*c
) {
6358 assert(c
->blockingkeys
!= NULL
);
6359 /* The client may wait for multiple keys, so unblock it for every key. */
6360 for (j
= 0; j
< c
->blockingkeysnum
; j
++) {
6361 /* Remove this client from the list of clients waiting for this key. */
6362 de
= dictFind(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
6364 l
= dictGetEntryVal(de
);
6365 listDelNode(l
,listSearchKey(l
,c
));
6366 /* If the list is empty we need to remove it to avoid wasting memory */
6367 if (listLength(l
) == 0)
6368 dictDelete(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
6369 decrRefCount(c
->blockingkeys
[j
]);
6371 /* Cleanup the client structure */
6372 zfree(c
->blockingkeys
);
6373 c
->blockingkeys
= NULL
;
6374 c
->flags
&= (~REDIS_BLOCKED
);
6375 server
.blpop_blocked_clients
--;
6376 /* We want to process data if there is some command waiting
6377 * in the input buffer. Note that this is safe even if
6378 * unblockClientWaitingData() gets called from freeClient() because
6379 * freeClient() will be smart enough to call this function
6380 * *after* c->querybuf was set to NULL. */
6381 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0) processInputBuffer(c
);
6384 /* This should be called from any function PUSHing into lists.
6385 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6386 * 'ele' is the element pushed.
6388 * If the function returns 0 there was no client waiting for a list push
6391 * If the function returns 1 there was a client waiting for a list push
6392 * against this key, the element was passed to this client thus it's not
6393 * needed to actually add it to the list and the caller should return asap. */
6394 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
) {
6395 struct dictEntry
*de
;
6396 redisClient
*receiver
;
6400 de
= dictFind(c
->db
->blockingkeys
,key
);
6401 if (de
== NULL
) return 0;
6402 l
= dictGetEntryVal(de
);
6405 receiver
= ln
->value
;
6407 addReplySds(receiver
,sdsnew("*2\r\n"));
6408 addReplyBulkLen(receiver
,key
);
6409 addReply(receiver
,key
);
6410 addReply(receiver
,shared
.crlf
);
6411 addReplyBulkLen(receiver
,ele
);
6412 addReply(receiver
,ele
);
6413 addReply(receiver
,shared
.crlf
);
6414 unblockClientWaitingData(receiver
);
6418 /* Blocking RPOP/LPOP */
6419 static void blockingPopGenericCommand(redisClient
*c
, int where
) {
6424 for (j
= 1; j
< c
->argc
-1; j
++) {
6425 o
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
6427 if (o
->type
!= REDIS_LIST
) {
6428 addReply(c
,shared
.wrongtypeerr
);
6431 list
*list
= o
->ptr
;
6432 if (listLength(list
) != 0) {
6433 /* If the list contains elements fall back to the usual
6434 * non-blocking POP operation */
6435 robj
*argv
[2], **orig_argv
;
6438 /* We need to alter the command arguments before to call
6439 * popGenericCommand() as the command takes a single key. */
6440 orig_argv
= c
->argv
;
6441 orig_argc
= c
->argc
;
6442 argv
[1] = c
->argv
[j
];
6446 /* Also the return value is different, we need to output
6447 * the multi bulk reply header and the key name. The
6448 * "real" command will add the last element (the value)
6449 * for us. If this sounds like a hack to you it's just
6450 * because it is... */
6451 addReplySds(c
,sdsnew("*2\r\n"));
6452 addReplyBulkLen(c
,argv
[1]);
6453 addReply(c
,argv
[1]);
6454 addReply(c
,shared
.crlf
);
6455 popGenericCommand(c
,where
);
6457 /* Fix the client structure with the original stuff */
6458 c
->argv
= orig_argv
;
6459 c
->argc
= orig_argc
;
6465 /* If the list is empty or the key does not exists we must block */
6466 timeout
= strtol(c
->argv
[c
->argc
-1]->ptr
,NULL
,10);
6467 if (timeout
> 0) timeout
+= time(NULL
);
6468 blockForKeys(c
,c
->argv
+1,c
->argc
-2,timeout
);
6471 static void blpopCommand(redisClient
*c
) {
6472 blockingPopGenericCommand(c
,REDIS_HEAD
);
6475 static void brpopCommand(redisClient
*c
) {
6476 blockingPopGenericCommand(c
,REDIS_TAIL
);
6479 /* =============================== Replication ============================= */
6481 static int syncWrite(int fd
, char *ptr
, ssize_t size
, int timeout
) {
6482 ssize_t nwritten
, ret
= size
;
6483 time_t start
= time(NULL
);
6487 if (aeWait(fd
,AE_WRITABLE
,1000) & AE_WRITABLE
) {
6488 nwritten
= write(fd
,ptr
,size
);
6489 if (nwritten
== -1) return -1;
6493 if ((time(NULL
)-start
) > timeout
) {
6501 static int syncRead(int fd
, char *ptr
, ssize_t size
, int timeout
) {
6502 ssize_t nread
, totread
= 0;
6503 time_t start
= time(NULL
);
6507 if (aeWait(fd
,AE_READABLE
,1000) & AE_READABLE
) {
6508 nread
= read(fd
,ptr
,size
);
6509 if (nread
== -1) return -1;
6514 if ((time(NULL
)-start
) > timeout
) {
6522 static int syncReadLine(int fd
, char *ptr
, ssize_t size
, int timeout
) {
6529 if (syncRead(fd
,&c
,1,timeout
) == -1) return -1;
6532 if (nread
&& *(ptr
-1) == '\r') *(ptr
-1) = '\0';
6543 static void syncCommand(redisClient
*c
) {
6544 /* ignore SYNC if already slave or in monitor mode */
6545 if (c
->flags
& REDIS_SLAVE
) return;
6547 /* SYNC can't be issued when the server has pending data to send to
6548 * the client about already issued commands. We need a fresh reply
6549 * buffer registering the differences between the BGSAVE and the current
6550 * dataset, so that we can copy to other slaves if needed. */
6551 if (listLength(c
->reply
) != 0) {
6552 addReplySds(c
,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6556 redisLog(REDIS_NOTICE
,"Slave ask for synchronization");
6557 /* Here we need to check if there is a background saving operation
6558 * in progress, or if it is required to start one */
6559 if (server
.bgsavechildpid
!= -1) {
6560 /* Ok a background save is in progress. Let's check if it is a good
6561 * one for replication, i.e. if there is another slave that is
6562 * registering differences since the server forked to save */
6567 listRewind(server
.slaves
,&li
);
6568 while((ln
= listNext(&li
))) {
6570 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) break;
6573 /* Perfect, the server is already registering differences for
6574 * another slave. Set the right state, and copy the buffer. */
6575 listRelease(c
->reply
);
6576 c
->reply
= listDup(slave
->reply
);
6577 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
6578 redisLog(REDIS_NOTICE
,"Waiting for end of BGSAVE for SYNC");
6580 /* No way, we need to wait for the next BGSAVE in order to
6581 * register differences */
6582 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
6583 redisLog(REDIS_NOTICE
,"Waiting for next BGSAVE for SYNC");
6586 /* Ok we don't have a BGSAVE in progress, let's start one */
6587 redisLog(REDIS_NOTICE
,"Starting BGSAVE for SYNC");
6588 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
6589 redisLog(REDIS_NOTICE
,"Replication failed, can't BGSAVE");
6590 addReplySds(c
,sdsnew("-ERR Unalbe to perform background save\r\n"));
6593 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
6596 c
->flags
|= REDIS_SLAVE
;
6598 listAddNodeTail(server
.slaves
,c
);
6602 static void sendBulkToSlave(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
6603 redisClient
*slave
= privdata
;
6605 REDIS_NOTUSED(mask
);
6606 char buf
[REDIS_IOBUF_LEN
];
6607 ssize_t nwritten
, buflen
;
6609 if (slave
->repldboff
== 0) {
6610 /* Write the bulk write count before to transfer the DB. In theory here
6611 * we don't know how much room there is in the output buffer of the
6612 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6613 * operations) will never be smaller than the few bytes we need. */
6616 bulkcount
= sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6618 if (write(fd
,bulkcount
,sdslen(bulkcount
)) != (signed)sdslen(bulkcount
))
6626 lseek(slave
->repldbfd
,slave
->repldboff
,SEEK_SET
);
6627 buflen
= read(slave
->repldbfd
,buf
,REDIS_IOBUF_LEN
);
6629 redisLog(REDIS_WARNING
,"Read error sending DB to slave: %s",
6630 (buflen
== 0) ? "premature EOF" : strerror(errno
));
6634 if ((nwritten
= write(fd
,buf
,buflen
)) == -1) {
6635 redisLog(REDIS_VERBOSE
,"Write error sending DB to slave: %s",
6640 slave
->repldboff
+= nwritten
;
6641 if (slave
->repldboff
== slave
->repldbsize
) {
6642 close(slave
->repldbfd
);
6643 slave
->repldbfd
= -1;
6644 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
6645 slave
->replstate
= REDIS_REPL_ONLINE
;
6646 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
,
6647 sendReplyToClient
, slave
) == AE_ERR
) {
6651 addReplySds(slave
,sdsempty());
6652 redisLog(REDIS_NOTICE
,"Synchronization with slave succeeded");
6656 /* This function is called at the end of every background saving.
6657 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6658 * otherwise REDIS_ERR is passed to the function.
6660 * The goal of this function is to handle slaves waiting for a successful
6661 * background saving in order to perform non-blocking synchronization. */
6662 static void updateSlavesWaitingBgsave(int bgsaveerr
) {
6664 int startbgsave
= 0;
6667 listRewind(server
.slaves
,&li
);
6668 while((ln
= listNext(&li
))) {
6669 redisClient
*slave
= ln
->value
;
6671 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) {
6673 slave
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
6674 } else if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) {
6675 struct redis_stat buf
;
6677 if (bgsaveerr
!= REDIS_OK
) {
6679 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE child returned an error");
6682 if ((slave
->repldbfd
= open(server
.dbfilename
,O_RDONLY
)) == -1 ||
6683 redis_fstat(slave
->repldbfd
,&buf
) == -1) {
6685 redisLog(REDIS_WARNING
,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno
));
6688 slave
->repldboff
= 0;
6689 slave
->repldbsize
= buf
.st_size
;
6690 slave
->replstate
= REDIS_REPL_SEND_BULK
;
6691 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
6692 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
, sendBulkToSlave
, slave
) == AE_ERR
) {
6699 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
6702 listRewind(server
.slaves
,&li
);
6703 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE failed");
6704 while((ln
= listNext(&li
))) {
6705 redisClient
*slave
= ln
->value
;
6707 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
)
6714 static int syncWithMaster(void) {
6715 char buf
[1024], tmpfile
[256], authcmd
[1024];
6717 int fd
= anetTcpConnect(NULL
,server
.masterhost
,server
.masterport
);
6721 redisLog(REDIS_WARNING
,"Unable to connect to MASTER: %s",
6726 /* AUTH with the master if required. */
6727 if(server
.masterauth
) {
6728 snprintf(authcmd
, 1024, "AUTH %s\r\n", server
.masterauth
);
6729 if (syncWrite(fd
, authcmd
, strlen(server
.masterauth
)+7, 5) == -1) {
6731 redisLog(REDIS_WARNING
,"Unable to AUTH to MASTER: %s",
6735 /* Read the AUTH result. */
6736 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
6738 redisLog(REDIS_WARNING
,"I/O error reading auth result from MASTER: %s",
6742 if (buf
[0] != '+') {
6744 redisLog(REDIS_WARNING
,"Cannot AUTH to MASTER, is the masterauth password correct?");
6749 /* Issue the SYNC command */
6750 if (syncWrite(fd
,"SYNC \r\n",7,5) == -1) {
6752 redisLog(REDIS_WARNING
,"I/O error writing to MASTER: %s",
6756 /* Read the bulk write count */
6757 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
6759 redisLog(REDIS_WARNING
,"I/O error reading bulk count from MASTER: %s",
6763 if (buf
[0] != '$') {
6765 redisLog(REDIS_WARNING
,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6768 dumpsize
= atoi(buf
+1);
6769 redisLog(REDIS_NOTICE
,"Receiving %d bytes data dump from MASTER",dumpsize
);
6770 /* Read the bulk write data on a temp file */
6771 snprintf(tmpfile
,256,"temp-%d.%ld.rdb",(int)time(NULL
),(long int)random());
6772 dfd
= open(tmpfile
,O_CREAT
|O_WRONLY
,0644);
6775 redisLog(REDIS_WARNING
,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno
));
6779 int nread
, nwritten
;
6781 nread
= read(fd
,buf
,(dumpsize
< 1024)?dumpsize
:1024);
6783 redisLog(REDIS_WARNING
,"I/O error trying to sync with MASTER: %s",
6789 nwritten
= write(dfd
,buf
,nread
);
6790 if (nwritten
== -1) {
6791 redisLog(REDIS_WARNING
,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno
));
6799 if (rename(tmpfile
,server
.dbfilename
) == -1) {
6800 redisLog(REDIS_WARNING
,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno
));
6806 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
6807 redisLog(REDIS_WARNING
,"Failed trying to load the MASTER synchronization DB from disk");
6811 server
.master
= createClient(fd
);
6812 server
.master
->flags
|= REDIS_MASTER
;
6813 server
.master
->authenticated
= 1;
6814 server
.replstate
= REDIS_REPL_CONNECTED
;
6818 static void slaveofCommand(redisClient
*c
) {
6819 if (!strcasecmp(c
->argv
[1]->ptr
,"no") &&
6820 !strcasecmp(c
->argv
[2]->ptr
,"one")) {
6821 if (server
.masterhost
) {
6822 sdsfree(server
.masterhost
);
6823 server
.masterhost
= NULL
;
6824 if (server
.master
) freeClient(server
.master
);
6825 server
.replstate
= REDIS_REPL_NONE
;
6826 redisLog(REDIS_NOTICE
,"MASTER MODE enabled (user request)");
6829 sdsfree(server
.masterhost
);
6830 server
.masterhost
= sdsdup(c
->argv
[1]->ptr
);
6831 server
.masterport
= atoi(c
->argv
[2]->ptr
);
6832 if (server
.master
) freeClient(server
.master
);
6833 server
.replstate
= REDIS_REPL_CONNECT
;
6834 redisLog(REDIS_NOTICE
,"SLAVE OF %s:%d enabled (user request)",
6835 server
.masterhost
, server
.masterport
);
6837 addReply(c
,shared
.ok
);
6840 /* ============================ Maxmemory directive ======================== */
6842 /* Try to free one object form the pre-allocated objects free list.
6843 * This is useful under low mem conditions as by default we take 1 million
6844 * free objects allocated. On success REDIS_OK is returned, otherwise
6846 static int tryFreeOneObjectFromFreelist(void) {
6849 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
6850 if (listLength(server
.objfreelist
)) {
6851 listNode
*head
= listFirst(server
.objfreelist
);
6852 o
= listNodeValue(head
);
6853 listDelNode(server
.objfreelist
,head
);
6854 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
6858 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
6863 /* This function gets called when 'maxmemory' is set on the config file to limit
6864 * the max memory used by the server, and we are out of memory.
6865 * This function will try to, in order:
6867 * - Free objects from the free list
6868 * - Try to remove keys with an EXPIRE set
6870 * It is not possible to free enough memory to reach used-memory < maxmemory
6871 * the server will start refusing commands that will enlarge even more the
6874 static void freeMemoryIfNeeded(void) {
6875 while (server
.maxmemory
&& zmalloc_used_memory() > server
.maxmemory
) {
6876 int j
, k
, freed
= 0;
6878 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
6879 for (j
= 0; j
< server
.dbnum
; j
++) {
6881 robj
*minkey
= NULL
;
6882 struct dictEntry
*de
;
6884 if (dictSize(server
.db
[j
].expires
)) {
6886 /* From a sample of three keys drop the one nearest to
6887 * the natural expire */
6888 for (k
= 0; k
< 3; k
++) {
6891 de
= dictGetRandomKey(server
.db
[j
].expires
);
6892 t
= (time_t) dictGetEntryVal(de
);
6893 if (minttl
== -1 || t
< minttl
) {
6894 minkey
= dictGetEntryKey(de
);
6898 deleteKey(server
.db
+j
,minkey
);
6901 if (!freed
) return; /* nothing to free... */
6905 /* ============================== Append Only file ========================== */
6907 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
6908 sds buf
= sdsempty();
6914 /* The DB this command was targetting is not the same as the last command
6915 * we appendend. To issue a SELECT command is needed. */
6916 if (dictid
!= server
.appendseldb
) {
6919 snprintf(seldb
,sizeof(seldb
),"%d",dictid
);
6920 buf
= sdscatprintf(buf
,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
6921 (unsigned long)strlen(seldb
),seldb
);
6922 server
.appendseldb
= dictid
;
6925 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6926 * EXPIREs into EXPIREATs calls */
6927 if (cmd
->proc
== expireCommand
) {
6930 tmpargv
[0] = createStringObject("EXPIREAT",8);
6931 tmpargv
[1] = argv
[1];
6932 incrRefCount(argv
[1]);
6933 when
= time(NULL
)+strtol(argv
[2]->ptr
,NULL
,10);
6934 tmpargv
[2] = createObject(REDIS_STRING
,
6935 sdscatprintf(sdsempty(),"%ld",when
));
6939 /* Append the actual command */
6940 buf
= sdscatprintf(buf
,"*%d\r\n",argc
);
6941 for (j
= 0; j
< argc
; j
++) {
6944 o
= getDecodedObject(o
);
6945 buf
= sdscatprintf(buf
,"$%lu\r\n",(unsigned long)sdslen(o
->ptr
));
6946 buf
= sdscatlen(buf
,o
->ptr
,sdslen(o
->ptr
));
6947 buf
= sdscatlen(buf
,"\r\n",2);
6951 /* Free the objects from the modified argv for EXPIREAT */
6952 if (cmd
->proc
== expireCommand
) {
6953 for (j
= 0; j
< 3; j
++)
6954 decrRefCount(argv
[j
]);
6957 /* We want to perform a single write. This should be guaranteed atomic
6958 * at least if the filesystem we are writing is a real physical one.
6959 * While this will save us against the server being killed I don't think
6960 * there is much to do about the whole server stopping for power problems
6962 nwritten
= write(server
.appendfd
,buf
,sdslen(buf
));
6963 if (nwritten
!= (signed)sdslen(buf
)) {
6964 /* Ooops, we are in troubles. The best thing to do for now is
6965 * to simply exit instead to give the illusion that everything is
6966 * working as expected. */
6967 if (nwritten
== -1) {
6968 redisLog(REDIS_WARNING
,"Exiting on error writing to the append-only file: %s",strerror(errno
));
6970 redisLog(REDIS_WARNING
,"Exiting on short write while writing to the append-only file: %s",strerror(errno
));
6974 /* If a background append only file rewriting is in progress we want to
6975 * accumulate the differences between the child DB and the current one
6976 * in a buffer, so that when the child process will do its work we
6977 * can append the differences to the new append only file. */
6978 if (server
.bgrewritechildpid
!= -1)
6979 server
.bgrewritebuf
= sdscatlen(server
.bgrewritebuf
,buf
,sdslen(buf
));
6983 if (server
.appendfsync
== APPENDFSYNC_ALWAYS
||
6984 (server
.appendfsync
== APPENDFSYNC_EVERYSEC
&&
6985 now
-server
.lastfsync
> 1))
6987 fsync(server
.appendfd
); /* Let's try to get this data on the disk */
6988 server
.lastfsync
= now
;
6992 /* In Redis commands are always executed in the context of a client, so in
6993 * order to load the append only file we need to create a fake client. */
6994 static struct redisClient
*createFakeClient(void) {
6995 struct redisClient
*c
= zmalloc(sizeof(*c
));
6999 c
->querybuf
= sdsempty();
7003 /* We set the fake client as a slave waiting for the synchronization
7004 * so that Redis will not try to send replies to this client. */
7005 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
7006 c
->reply
= listCreate();
7007 listSetFreeMethod(c
->reply
,decrRefCount
);
7008 listSetDupMethod(c
->reply
,dupClientReplyValue
);
7012 static void freeFakeClient(struct redisClient
*c
) {
7013 sdsfree(c
->querybuf
);
7014 listRelease(c
->reply
);
7018 /* Replay the append log file. On success REDIS_OK is returned. On non fatal
7019 * error (the append only file is zero-length) REDIS_ERR is returned. On
7020 * fatal error an error message is logged and the program exits. */
7021 int loadAppendOnlyFile(char *filename
) {
7022 struct redisClient
*fakeClient
;
7023 FILE *fp
= fopen(filename
,"r");
7024 struct redis_stat sb
;
7025 unsigned long long loadedkeys
= 0;
7027 if (redis_fstat(fileno(fp
),&sb
) != -1 && sb
.st_size
== 0)
7031 redisLog(REDIS_WARNING
,"Fatal error: can't open the append log file for reading: %s",strerror(errno
));
7035 fakeClient
= createFakeClient();
7042 struct redisCommand
*cmd
;
7044 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) {
7050 if (buf
[0] != '*') goto fmterr
;
7052 argv
= zmalloc(sizeof(robj
*)*argc
);
7053 for (j
= 0; j
< argc
; j
++) {
7054 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) goto readerr
;
7055 if (buf
[0] != '$') goto fmterr
;
7056 len
= strtol(buf
+1,NULL
,10);
7057 argsds
= sdsnewlen(NULL
,len
);
7058 if (len
&& fread(argsds
,len
,1,fp
) == 0) goto fmterr
;
7059 argv
[j
] = createObject(REDIS_STRING
,argsds
);
7060 if (fread(buf
,2,1,fp
) == 0) goto fmterr
; /* discard CRLF */
7063 /* Command lookup */
7064 cmd
= lookupCommand(argv
[0]->ptr
);
7066 redisLog(REDIS_WARNING
,"Unknown command '%s' reading the append only file", argv
[0]->ptr
);
7069 /* Try object sharing and encoding */
7070 if (server
.shareobjects
) {
7072 for(j
= 1; j
< argc
; j
++)
7073 argv
[j
] = tryObjectSharing(argv
[j
]);
7075 if (cmd
->flags
& REDIS_CMD_BULK
)
7076 tryObjectEncoding(argv
[argc
-1]);
7077 /* Run the command in the context of a fake client */
7078 fakeClient
->argc
= argc
;
7079 fakeClient
->argv
= argv
;
7080 cmd
->proc(fakeClient
);
7081 /* Discard the reply objects list from the fake client */
7082 while(listLength(fakeClient
->reply
))
7083 listDelNode(fakeClient
->reply
,listFirst(fakeClient
->reply
));
7084 /* Clean up, ready for the next command */
7085 for (j
= 0; j
< argc
; j
++) decrRefCount(argv
[j
]);
7087 /* Handle swapping while loading big datasets when VM is on */
7089 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
7090 while (zmalloc_used_memory() > server
.vm_max_memory
) {
7091 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
7096 freeFakeClient(fakeClient
);
7101 redisLog(REDIS_WARNING
,"Unexpected end of file reading the append only file");
7103 redisLog(REDIS_WARNING
,"Unrecoverable error reading the append only file: %s", strerror(errno
));
7107 redisLog(REDIS_WARNING
,"Bad file format reading the append only file");
7111 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7112 static int fwriteBulk(FILE *fp
, robj
*obj
) {
7116 /* Avoid the incr/decr ref count business if possible to help
7117 * copy-on-write (we are often in a child process when this function
7119 * Also makes sure that key objects don't get incrRefCount-ed when VM
7121 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
7122 obj
= getDecodedObject(obj
);
7125 snprintf(buf
,sizeof(buf
),"$%ld\r\n",(long)sdslen(obj
->ptr
));
7126 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) goto err
;
7127 if (sdslen(obj
->ptr
) && fwrite(obj
->ptr
,sdslen(obj
->ptr
),1,fp
) == 0)
7129 if (fwrite("\r\n",2,1,fp
) == 0) goto err
;
7130 if (decrrc
) decrRefCount(obj
);
7133 if (decrrc
) decrRefCount(obj
);
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the value formatted with "%.17g". Returns 1 when both the
 * header and the payload were written, 0 as soon as one fwrite() fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char buf[128], dbuf[128];

    snprintf(dbuf,sizeof(dbuf),"%.17g\r\n",d);
    /* The advertised count does not include the payload's trailing CRLF. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(dbuf)-2);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (fwrite(dbuf,strlen(dbuf),1,fp) == 0) return 0;
    return 1;
}
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the value formatted with "%ld". Returns 1 when both the
 * header and the payload were written, 0 as soon as one fwrite() fails. */
static int fwriteBulkLong(FILE *fp, long l) {
    char buf[128], lbuf[128];

    snprintf(lbuf,sizeof(lbuf),"%ld\r\n",l);
    /* The advertised count does not include the payload's trailing CRLF. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(lbuf)-2);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (fwrite(lbuf,strlen(lbuf),1,fp) == 0) return 0;
    return 1;
}
7159 /* Write a sequence of commands able to fully rebuild the dataset into
7160 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7161 static int rewriteAppendOnlyFile(char *filename
) {
7162 dictIterator
*di
= NULL
;
7167 time_t now
= time(NULL
);
7169 /* Note that we have to use a different temp name here compared to the
7170 * one used by rewriteAppendOnlyFileBackground() function. */
7171 snprintf(tmpfile
,256,"temp-rewriteaof-%d.aof", (int) getpid());
7172 fp
= fopen(tmpfile
,"w");
7174 redisLog(REDIS_WARNING
, "Failed rewriting the append only file: %s", strerror(errno
));
7177 for (j
= 0; j
< server
.dbnum
; j
++) {
7178 char selectcmd
[] = "*2\r\n$6\r\nSELECT\r\n";
7179 redisDb
*db
= server
.db
+j
;
7181 if (dictSize(d
) == 0) continue;
7182 di
= dictGetIterator(d
);
7188 /* SELECT the new DB */
7189 if (fwrite(selectcmd
,sizeof(selectcmd
)-1,1,fp
) == 0) goto werr
;
7190 if (fwriteBulkLong(fp
,j
) == 0) goto werr
;
7192 /* Iterate this DB writing every entry */
7193 while((de
= dictNext(di
)) != NULL
) {
7198 key
= dictGetEntryKey(de
);
7199 /* If the value for this key is swapped, load a preview in memory.
7200 * We use a "swapped" flag to remember if we need to free the
7201 * value object instead to just increment the ref count anyway
7202 * in order to avoid copy-on-write of pages if we are forked() */
7203 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
7204 key
->storage
== REDIS_VM_SWAPPING
) {
7205 o
= dictGetEntryVal(de
);
7208 o
= vmPreviewObject(key
);
7211 expiretime
= getExpire(db
,key
);
7213 /* Save the key and associated value */
7214 if (o
->type
== REDIS_STRING
) {
7215 /* Emit a SET command */
7216 char cmd
[]="*3\r\n$3\r\nSET\r\n";
7217 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7219 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7220 if (fwriteBulk(fp
,o
) == 0) goto werr
;
7221 } else if (o
->type
== REDIS_LIST
) {
7222 /* Emit the RPUSHes needed to rebuild the list */
7223 list
*list
= o
->ptr
;
7227 listRewind(list
,&li
);
7228 while((ln
= listNext(&li
))) {
7229 char cmd
[]="*3\r\n$5\r\nRPUSH\r\n";
7230 robj
*eleobj
= listNodeValue(ln
);
7232 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7233 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7234 if (fwriteBulk(fp
,eleobj
) == 0) goto werr
;
7236 } else if (o
->type
== REDIS_SET
) {
7237 /* Emit the SADDs needed to rebuild the set */
7239 dictIterator
*di
= dictGetIterator(set
);
7242 while((de
= dictNext(di
)) != NULL
) {
7243 char cmd
[]="*3\r\n$4\r\nSADD\r\n";
7244 robj
*eleobj
= dictGetEntryKey(de
);
7246 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7247 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7248 if (fwriteBulk(fp
,eleobj
) == 0) goto werr
;
7250 dictReleaseIterator(di
);
7251 } else if (o
->type
== REDIS_ZSET
) {
7252 /* Emit the ZADDs needed to rebuild the sorted set */
7254 dictIterator
*di
= dictGetIterator(zs
->dict
);
7257 while((de
= dictNext(di
)) != NULL
) {
7258 char cmd
[]="*4\r\n$4\r\nZADD\r\n";
7259 robj
*eleobj
= dictGetEntryKey(de
);
7260 double *score
= dictGetEntryVal(de
);
7262 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7263 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7264 if (fwriteBulkDouble(fp
,*score
) == 0) goto werr
;
7265 if (fwriteBulk(fp
,eleobj
) == 0) goto werr
;
7267 dictReleaseIterator(di
);
7269 redisAssert(0 != 0);
7271 /* Save the expire time */
7272 if (expiretime
!= -1) {
7273 char cmd
[]="*3\r\n$8\r\nEXPIREAT\r\n";
7274 /* If this key is already expired skip it */
7275 if (expiretime
< now
) continue;
7276 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7277 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7278 if (fwriteBulkLong(fp
,expiretime
) == 0) goto werr
;
7280 if (swapped
) decrRefCount(o
);
7282 dictReleaseIterator(di
);
7285 /* Make sure data will not remain on the OS's output buffers */
7290 /* Use RENAME to make sure the DB file is changed atomically only
7291 * if the generate DB file is ok. */
7292 if (rename(tmpfile
,filename
) == -1) {
7293 redisLog(REDIS_WARNING
,"Error moving temp append only file on the final destination: %s", strerror(errno
));
7297 redisLog(REDIS_NOTICE
,"SYNC append only file rewrite performed");
7303 redisLog(REDIS_WARNING
,"Write error writing append only file on disk: %s", strerror(errno
));
7304 if (di
) dictReleaseIterator(di
);
7308 /* This is how rewriting of the append only file in background works:
7310 * 1) The user calls BGREWRITEAOF
7311 * 2) Redis calls this function, that forks():
7312 * 2a) the child rewrites the append only file in a temp file.
7313 * 2b) the parent accumulates differences in server.bgrewritebuf.
7314 * 3) When the child finishes '2a' it exits.
7315 * 4) The parent will trap the exit code, if it's OK, will append the
7316 * data accumulated into server.bgrewritebuf into the temp file, and
7317 * finally will rename(2) the temp file in the actual file name.
7318 * Then the new file is reopened as the new append only file. Profit!
7320 static int rewriteAppendOnlyFileBackground(void) {
7323 if (server
.bgrewritechildpid
!= -1) return REDIS_ERR
;
7324 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
7325 if ((childpid
= fork()) == 0) {
7329 if (server
.vm_enabled
) vmReopenSwapFile();
7331 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7332 if (rewriteAppendOnlyFile(tmpfile
) == REDIS_OK
) {
7339 if (childpid
== -1) {
7340 redisLog(REDIS_WARNING
,
7341 "Can't rewrite append only file in background: fork: %s",
7345 redisLog(REDIS_NOTICE
,
7346 "Background append only file rewriting started by pid %d",childpid
);
7347 server
.bgrewritechildpid
= childpid
;
7348 /* We set appendseldb to -1 in order to force the next call to the
7349 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7350 * accumulated by the parent into server.bgrewritebuf will start
7351 * with a SELECT statement and it will be safe to merge. */
7352 server
.appendseldb
= -1;
7355 return REDIS_OK
; /* unreached */
7358 static void bgrewriteaofCommand(redisClient
*c
) {
7359 if (server
.bgrewritechildpid
!= -1) {
7360 addReplySds(c
,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7363 if (rewriteAppendOnlyFileBackground() == REDIS_OK
) {
7364 char *status
= "+Background append only file rewriting started\r\n";
7365 addReplySds(c
,sdsnew(status
));
7367 addReply(c
,shared
.err
);
/* Remove the temporary file "temp-rewriteaof-bg-<childpid>.aof" (relative to
 * the current working directory). The result of unlink() is not checked:
 * removal is best-effort. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
7378 /* Virtual Memory is composed mainly of two subsystems:
7379 * - Blocking Virtual Memory
7380 * - Threaded Virtual Memory I/O
7381 * The two parts are not fully decoupled, but functions are split among two
7382 * different sections of the source code (delimited by comments) in order to
7383 * make more clear what functionality is about the blocking VM and what about
7384 * the threaded (not blocking) VM.
7388 * Redis VM is a blocking VM (one that blocks reading swapped values from
7389 * disk into memory when a value swapped out is needed in memory) that is made
7390 * unblocking by trying to examine the command argument vector in order to
7391 * load in background values that will likely be needed in order to exec
7392 * the command. The command is executed only once all the relevant keys
7393 * are loaded into memory.
7395 * This basically is almost as simple as a blocking VM, but almost as parallel
7396 * as a fully non-blocking VM.
7399 /* =================== Virtual Memory - Blocking Side ====================== */
7401 /* substitute the first occurrence of '%p' with the process pid in the
7402 * swap file name. */
7403 static void expandVmSwapFilename(void) {
7404 char *p
= strstr(server
.vm_swap_file
,"%p");
7410 new = sdscat(new,server
.vm_swap_file
);
7411 new = sdscatprintf(new,"%ld",(long) getpid());
7412 new = sdscat(new,p
+2);
7413 zfree(server
.vm_swap_file
);
7414 server
.vm_swap_file
= new;
7417 static void vmInit(void) {
7422 if (server
.vm_max_threads
!= 0)
7423 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7425 expandVmSwapFilename();
7426 redisLog(REDIS_NOTICE
,"Using '%s' as swap file",server
.vm_swap_file
);
7427 if ((server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b")) == NULL
) {
7428 server
.vm_fp
= fopen(server
.vm_swap_file
,"w+b");
7430 if (server
.vm_fp
== NULL
) {
7431 redisLog(REDIS_WARNING
,
7432 "Impossible to open the swap file: %s. Exiting.",
7436 server
.vm_fd
= fileno(server
.vm_fp
);
7437 server
.vm_next_page
= 0;
7438 server
.vm_near_pages
= 0;
7439 server
.vm_stats_used_pages
= 0;
7440 server
.vm_stats_swapped_objects
= 0;
7441 server
.vm_stats_swapouts
= 0;
7442 server
.vm_stats_swapins
= 0;
7443 totsize
= server
.vm_pages
*server
.vm_page_size
;
7444 redisLog(REDIS_NOTICE
,"Allocating %lld bytes of swap file",totsize
);
7445 if (ftruncate(server
.vm_fd
,totsize
) == -1) {
7446 redisLog(REDIS_WARNING
,"Can't ftruncate swap file: %s. Exiting.",
7450 redisLog(REDIS_NOTICE
,"Swap file allocated with success");
7452 server
.vm_bitmap
= zmalloc((server
.vm_pages
+7)/8);
7453 redisLog(REDIS_VERBOSE
,"Allocated %lld bytes page table for %lld pages",
7454 (long long) (server
.vm_pages
+7)/8, server
.vm_pages
);
7455 memset(server
.vm_bitmap
,0,(server
.vm_pages
+7)/8);
7457 /* Initialize threaded I/O (used by Virtual Memory) */
7458 server
.io_newjobs
= listCreate();
7459 server
.io_processing
= listCreate();
7460 server
.io_processed
= listCreate();
7461 server
.io_ready_clients
= listCreate();
7462 pthread_mutex_init(&server
.io_mutex
,NULL
);
7463 pthread_mutex_init(&server
.obj_freelist_mutex
,NULL
);
7464 pthread_mutex_init(&server
.io_swapfile_mutex
,NULL
);
7465 server
.io_active_threads
= 0;
7466 if (pipe(pipefds
) == -1) {
7467 redisLog(REDIS_WARNING
,"Unable to intialized VM: pipe(2): %s. Exiting."
7471 server
.io_ready_pipe_read
= pipefds
[0];
7472 server
.io_ready_pipe_write
= pipefds
[1];
7473 redisAssert(anetNonBlock(NULL
,server
.io_ready_pipe_read
) != ANET_ERR
);
7474 /* LZF requires a lot of stack */
7475 pthread_attr_init(&server
.io_threads_attr
);
7476 pthread_attr_getstacksize(&server
.io_threads_attr
, &stacksize
);
7477 while (stacksize
< REDIS_THREAD_STACK_SIZE
) stacksize
*= 2;
7478 pthread_attr_setstacksize(&server
.io_threads_attr
, stacksize
);
7479 /* Listen for events in the threaded I/O pipe */
7480 if (aeCreateFileEvent(server
.el
, server
.io_ready_pipe_read
, AE_READABLE
,
7481 vmThreadedIOCompletedJob
, NULL
) == AE_ERR
)
7482 oom("creating file event");
7485 /* Mark the page as used */
7486 static void vmMarkPageUsed(off_t page
) {
7487 off_t byte
= page
/8;
7489 redisAssert(vmFreePage(page
) == 1);
7490 server
.vm_bitmap
[byte
] |= 1<<bit
;
7493 /* Mark N contiguous pages as used, with 'page' being the first. */
7494 static void vmMarkPagesUsed(off_t page
, off_t count
) {
7497 for (j
= 0; j
< count
; j
++)
7498 vmMarkPageUsed(page
+j
);
7499 server
.vm_stats_used_pages
+= count
;
7500 redisLog(REDIS_DEBUG
,"Mark USED pages: %lld pages at %lld\n",
7501 (long long)count
, (long long)page
);
7504 /* Mark the page as free */
7505 static void vmMarkPageFree(off_t page
) {
7506 off_t byte
= page
/8;
7508 redisAssert(vmFreePage(page
) == 0);
7509 server
.vm_bitmap
[byte
] &= ~(1<<bit
);
7512 /* Mark N contiguous pages as free, with 'page' being the first. */
7513 static void vmMarkPagesFree(off_t page
, off_t count
) {
7516 for (j
= 0; j
< count
; j
++)
7517 vmMarkPageFree(page
+j
);
7518 server
.vm_stats_used_pages
-= count
;
7519 redisLog(REDIS_DEBUG
,"Mark FREE pages: %lld pages at %lld\n",
7520 (long long)count
, (long long)page
);
7523 /* Test if the page is free */
7524 static int vmFreePage(off_t page
) {
7525 off_t byte
= page
/8;
7527 return (server
.vm_bitmap
[byte
] & (1<<bit
)) == 0;
7530 /* Find N contiguous free pages storing the first page of the cluster in *first.
7531 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7532 * REDIS_ERR is returned.
7534 * This function uses a simple algorithm: we try to allocate
7535 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7536 * again from the start of the swap file searching for free spaces.
7538 * If it looks pretty clear that there are no free pages near our offset
7539 * we try to find less populated places doing a forward jump of
7540 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7541 * without hurry, and then we jump again and so forth...
7543 * This function can be improved using a free list to avoid to guess
7544 * too much, since we could collect data about freed pages.
7546 * note: I implemented this function just after watching an episode of
7547 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7549 static int vmFindContiguousPages(off_t
*first
, off_t n
) {
7550 off_t base
, offset
= 0, since_jump
= 0, numfree
= 0;
7552 if (server
.vm_near_pages
== REDIS_VM_MAX_NEAR_PAGES
) {
7553 server
.vm_near_pages
= 0;
7554 server
.vm_next_page
= 0;
7556 server
.vm_near_pages
++; /* Yet another try for pages near to the old ones */
7557 base
= server
.vm_next_page
;
7559 while(offset
< server
.vm_pages
) {
7560 off_t
this = base
+offset
;
7562 /* If we overflow, restart from page zero */
7563 if (this >= server
.vm_pages
) {
7564 this -= server
.vm_pages
;
7566 /* Just overflowed, what we found on tail is no longer
7567 * interesting, as it's no longer contiguous. */
7571 if (vmFreePage(this)) {
7572 /* This is a free page */
7574 /* Already got N free pages? Return to the caller, with success */
7576 *first
= this-(n
-1);
7577 server
.vm_next_page
= this+1;
7578 redisLog(REDIS_DEBUG
, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n
, (long long) *first
);
7582 /* The current one is not a free page */
7586 /* Fast-forward if the current page is not free and we already
7587 * searched enough near this place. */
7589 if (!numfree
&& since_jump
>= REDIS_VM_MAX_RANDOM_JUMP
/4) {
7590 offset
+= random() % REDIS_VM_MAX_RANDOM_JUMP
;
7592 /* Note that even if we rewind after the jump, we are don't need
7593 * to make sure numfree is set to zero as we only jump *if* it
7594 * is set to zero. */
7596 /* Otherwise just check the next page */
7603 /* Write the specified object at the specified page of the swap file */
7604 static int vmWriteObjectOnSwap(robj
*o
, off_t page
) {
7605 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
7606 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
7607 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
7608 redisLog(REDIS_WARNING
,
7609 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
7613 rdbSaveObject(server
.vm_fp
,o
);
7614 fflush(server
.vm_fp
);
7615 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
7619 /* Swap the 'val' object relative to 'key' into disk. Store all the information
7620 * needed to later retrieve the object into the key object.
7621 * If we can't find enough contiguous empty pages to swap the object on disk
7622 * REDIS_ERR is returned. */
7623 static int vmSwapObjectBlocking(robj
*key
, robj
*val
) {
7624 off_t pages
= rdbSavedObjectPages(val
,NULL
);
7627 assert(key
->storage
== REDIS_VM_MEMORY
);
7628 assert(key
->refcount
== 1);
7629 if (vmFindContiguousPages(&page
,pages
) == REDIS_ERR
) return REDIS_ERR
;
7630 if (vmWriteObjectOnSwap(val
,page
) == REDIS_ERR
) return REDIS_ERR
;
7631 key
->vm
.page
= page
;
7632 key
->vm
.usedpages
= pages
;
7633 key
->storage
= REDIS_VM_SWAPPED
;
7634 key
->vtype
= val
->type
;
7635 decrRefCount(val
); /* Deallocate the object from memory. */
7636 vmMarkPagesUsed(page
,pages
);
7637 redisLog(REDIS_DEBUG
,"VM: object %s swapped out at %lld (%lld pages)",
7638 (unsigned char*) key
->ptr
,
7639 (unsigned long long) page
, (unsigned long long) pages
);
7640 server
.vm_stats_swapped_objects
++;
7641 server
.vm_stats_swapouts
++;
7645 static robj
*vmReadObjectFromSwap(off_t page
, int type
) {
7648 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
7649 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
7650 redisLog(REDIS_WARNING
,
7651 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
7655 o
= rdbLoadObject(type
,server
.vm_fp
);
7657 redisLog(REDIS_WARNING
, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno
));
7660 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
7664 /* Load the value object relative to the 'key' object from swap to memory.
7665 * The newly allocated object is returned.
7667 * If preview is true the unserialized object is returned to the caller but
7668 * no changes are made to the key object, nor the pages are marked as freed */
7669 static robj
*vmGenericLoadObject(robj
*key
, int preview
) {
7672 redisAssert(key
->storage
== REDIS_VM_SWAPPED
|| key
->storage
== REDIS_VM_LOADING
);
7673 val
= vmReadObjectFromSwap(key
->vm
.page
,key
->vtype
);
7675 key
->storage
= REDIS_VM_MEMORY
;
7676 key
->vm
.atime
= server
.unixtime
;
7677 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
7678 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk",
7679 (unsigned char*) key
->ptr
);
7680 server
.vm_stats_swapped_objects
--;
7682 redisLog(REDIS_DEBUG
, "VM: object %s previewed from disk",
7683 (unsigned char*) key
->ptr
);
7685 server
.vm_stats_swapins
++;
7689 /* Plain object loading, from swap to memory */
7690 static robj
*vmLoadObject(robj
*key
) {
7691 /* If we are loading the object in background, stop it, we
7692 * need to load this object synchronously ASAP. */
7693 if (key
->storage
== REDIS_VM_LOADING
)
7694 vmCancelThreadedIOJob(key
);
7695 return vmGenericLoadObject(key
,0);
7698 /* Just load the value on disk, without to modify the key.
7699 * This is useful when we want to perform some operation on the value
7700 * without to really bring it from swap to memory, like while saving the
7701 * dataset or rewriting the append only log. */
7702 static robj
*vmPreviewObject(robj
*key
) {
7703 return vmGenericLoadObject(key
,1);
7706 /* How good a candidate is this object for swapping?
7707 * The better candidate it is, the greater the returned value.
7709 * Currently we try to perform a fast estimation of the object size in
7710 * memory, and combine it with aging information.
7712 * Basically swappability = idle-time * log(estimated size)
7714 * Bigger objects are preferred over smaller objects, but not
7715 * proportionally, this is why we use the logarithm. This algorithm is
7716 * just a first try and will probably be tuned later. */
7717 static double computeObjectSwappability(robj
*o
) {
7718 time_t age
= server
.unixtime
- o
->vm
.atime
;
7722 struct dictEntry
*de
;
7725 if (age
<= 0) return 0;
7728 if (o
->encoding
!= REDIS_ENCODING_RAW
) {
7731 asize
= sdslen(o
->ptr
)+sizeof(*o
)+sizeof(long)*2;
7736 listNode
*ln
= listFirst(l
);
7738 asize
= sizeof(list
);
7740 robj
*ele
= ln
->value
;
7743 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
7744 (sizeof(*o
)+sdslen(ele
->ptr
)) :
7746 asize
+= (sizeof(listNode
)+elesize
)*listLength(l
);
7751 z
= (o
->type
== REDIS_ZSET
);
7752 d
= z
? ((zset
*)o
->ptr
)->dict
: o
->ptr
;
7754 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
7755 if (z
) asize
+= sizeof(zset
)-sizeof(dict
);
7760 de
= dictGetRandomKey(d
);
7761 ele
= dictGetEntryKey(de
);
7762 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
7763 (sizeof(*o
)+sdslen(ele
->ptr
)) :
7765 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
7766 if (z
) asize
+= sizeof(zskiplistNode
)*dictSize(d
);
7770 return (double)age
*log(1+asize
);
7773 /* Try to swap an object that's a good candidate for swapping.
7774 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
7775 * to swap any object at all.
7777 * If 'usethreads' is true, Redis will try to swap the object in background
7778 * using I/O threads. */
7779 static int vmSwapOneObject(int usethreads
) {
/* Overview: for every database, sample random keys and remember the entry
 * with the highest computeObjectSwappability() score. If nothing qualifies
 * REDIS_ERR is returned; otherwise the key is unshared when needed and the
 * key/value pair is handed to vmSwapObjectThreaded() or
 * vmSwapObjectBlocking().
 * NOTE(review): the test on 'usethreads' that selects between those two
 * calls is not visible in this view -- confirm against the full file. */
7781 struct dictEntry
*best
= NULL
;
7782 double best_swappability
= 0;
7783 redisDb
*best_db
= NULL
;
7786 for (j
= 0; j
< server
.dbnum
; j
++) {
7787 redisDb
*db
= server
.db
+j
;
7788 /* Why maxtries is set to 100?
7789 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7790 * are swappable objects */
/* Nothing to sample in an empty database. */
7793 if (dictSize(db
->dict
) == 0) continue;
/* Up to 5 counted samples per database; a rejected sample is not counted
 * as long as 'maxtries' is still non-zero (see the i-- below). */
7794 for (i
= 0; i
< 5; i
++) {
7796 double swappability
;
7798 if (maxtries
) maxtries
--;
7799 de
= dictGetRandomKey(db
->dict
);
7800 key
= dictGetEntryKey(de
);
7801 val
= dictGetEntryVal(de
);
7802 /* Only swap objects that are currently in memory.
7804 * Also don't swap shared objects if threaded VM is on, as we
7805 * try to ensure that the main thread does not touch the
7806 * object while the I/O thread is using it, but we can't
7807 * control other keys without adding additional mutex. */
7808 if (key
->storage
!= REDIS_VM_MEMORY
||
7809 (server
.vm_max_threads
!= 0 && val
->refcount
!= 1)) {
7810 if (maxtries
) i
--; /* don't count this try */
7813 swappability
= computeObjectSwappability(val
);
/* Keep the first candidate seen, or any candidate scoring strictly higher. */
7814 if (!best
|| swappability
> best_swappability
) {
7816 best_swappability
= swappability
;
/* No swappable entry was found in any database. */
7821 if (best
== NULL
) return REDIS_ERR
;
7822 key
= dictGetEntryKey(best
);
7823 val
= dictGetEntryVal(best
);
7825 redisLog(REDIS_DEBUG
,"Key with best swappability: %s, %f",
7826 key
->ptr
, best_swappability
);
7828 /* Unshare the key if needed */
7829 if (key
->refcount
> 1) {
7830 robj
*newkey
= dupStringObject(key
);
/* Store the private copy back into the dict entry and use it from now on. */
7832 key
= dictGetEntryKey(best
) = newkey
;
7836 vmSwapObjectThreaded(key
,val
,best_db
);
/* Blocking swap succeeded: the entry no longer references an in-memory value. */
7839 if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
7840 dictGetEntryVal(best
) = NULL
;
/* Swap out one object synchronously.
 *
 * Thin wrapper that calls vmSwapOneObject() with usethreads == 0 and
 * returns its result unchanged (REDIS_OK if an object was swapped,
 * REDIS_ERR if no object could be swapped).
 *
 * The parameter list is spelled (void) so that the compiler rejects calls
 * passing arguments, matching the other (void) definitions in this file. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
/* Swap out one object using the threaded I/O path.
 *
 * Thin wrapper that calls vmSwapOneObject() with usethreads == 1 and
 * returns its result unchanged (REDIS_OK if an object was swapped,
 * REDIS_ERR if no object could be swapped).
 *
 * The parameter list is spelled (void) so that the compiler rejects calls
 * passing arguments, matching the other (void) definitions in this file. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
7856 /* Return true if it's safe to swap out objects in a given moment.
7857 * Basically we don't want to swap objects out while there is a BGSAVE
7858 * or a BGREWRITEAOF running in background. */
7859 static int vmCanSwapOut(void) {
7860 return (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1);
7863 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
7864 * and was deleted. Otherwise 0 is returned. */
7865 static int deleteIfSwapped(redisDb
*db
, robj
*key
) {
/* Key not present in db->dict: report "not deleted". */
7869 if ((de
= dictFind(db
->dict
,key
)) == NULL
) return 0;
/* The storage state is read from the key object stored in the dict entry,
 * not from the 'key' argument used for the lookup. */
7870 foundkey
= dictGetEntryKey(de
);
/* Key is resident in memory, i.e. not swapped: leave it alone.
 * NOTE(review): the statements that delete the key and return 1 are not
 * visible in this view. */
7871 if (foundkey
->storage
== REDIS_VM_MEMORY
) return 0;
7876 /* =================== Virtual Memory - Threaded I/O ======================= */
7878 static void freeIOJob(iojob
*j
) {
/* Drop the references held by the job: the value reference only for the
 * three job types tested below and only when j->val is set, then the key
 * reference.
 * NOTE(review): the release of the job structure itself is not visible in
 * this view. */
7879 if ((j
->type
== REDIS_IOJOB_PREPARE_SWAP
||
7880 j
->type
== REDIS_IOJOB_DO_SWAP
||
7881 j
->type
== REDIS_IOJOB_LOAD
) && j
->val
!= NULL
)
7882 decrRefCount(j
->val
);
7883 decrRefCount(j
->key
);
7887 /* Every time a thread finished a Job, it writes a byte into the write side
7888 * of an unix pipe in order to "awake" the main thread, and this function
7890 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
,
7894 int retval
, processed
= 0, toprocess
= -1, trytoswap
= 1;
7896 REDIS_NOTUSED(mask
);
7897 REDIS_NOTUSED(privdata
);
7899 /* For every byte we read in the read side of the pipe, there is one
7900 * I/O job completed to process. */
/* Each iteration takes the first node of server.io_processed, looks the
 * job's key up in j->db->dict and post-processes it according to j->type
 * (LOAD, PREPARE_SWAP or DO_SWAP). The loop ends when read() stops
 * returning 1 or when 'processed' reaches 'toprocess'. */
7901 while((retval
= read(fd
,buf
,1)) == 1) {
7905 struct dictEntry
*de
;
7907 redisLog(REDIS_DEBUG
,"Processing I/O completed job");
7909 /* Get the processed element (the oldest one) */
7911 assert(listLength(server
.io_processed
) != 0);
/* 'toprocess' starts at -1, so the quota is computed once per call:
 * REDIS_MAX_COMPLETED_JOBS_PROCESSED percent of the current queue length,
 * with a minimum of one job. */
7912 if (toprocess
== -1) {
7913 toprocess
= (listLength(server
.io_processed
)*REDIS_MAX_COMPLETED_JOBS_PROCESSED
)/100;
7914 if (toprocess
<= 0) toprocess
= 1;
7916 ln
= listFirst(server
.io_processed
);
7918 listDelNode(server
.io_processed
,ln
);
7920 /* If this job is marked as canceled, just ignore it */
7925 /* Post process it in the main thread, as there are things we
7926 * can do just here to avoid race conditions and/or invasive locks */
7927 redisLog(REDIS_DEBUG
,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j
, j
->type
, (void*)j
->key
, (char*)j
->key
->ptr
, j
->key
->refcount
);
7928 de
= dictFind(j
->db
->dict
,j
->key
);
/* From here on 'key' is the key object stored in the dict, not j->key. */
7930 key
= dictGetEntryKey(de
);
7931 if (j
->type
== REDIS_IOJOB_LOAD
) {
7934 /* Key loaded, bring it at home */
7935 key
->storage
= REDIS_VM_MEMORY
;
7936 key
->vm
.atime
= server
.unixtime
;
7937 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
7938 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk (threaded)",
7939 (unsigned char*) key
->ptr
);
7940 server
.vm_stats_swapped_objects
--;
7941 server
.vm_stats_swapins
++;
/* The dict entry takes its own reference to the loaded value. */
7942 dictGetEntryVal(de
) = j
->val
;
7943 incrRefCount(j
->val
);
7946 /* Handle clients waiting for this key to be loaded. */
7947 handleClientsBlockedOnSwappedKey(db
,key
);
7948 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
7949 /* Now we know the amount of pages required to swap this object.
7950 * Let's find some space for it, and queue this task again
7951 * rebranded as REDIS_IOJOB_DO_SWAP. */
7952 if (!vmCanSwapOut() ||
7953 vmFindContiguousPages(&j
->page
,j
->pages
) == REDIS_ERR
)
7955 /* Ooops... no space or we can't swap as there is
7956 * a fork()ed Redis trying to save stuff on disk. */
7958 key
->storage
= REDIS_VM_MEMORY
; /* undo operation */
7960 /* Note that we need to mark this pages as used now,
7961 * if the job will be canceled, we'll mark them as freed
7963 vmMarkPagesUsed(j
->page
,j
->pages
);
7964 j
->type
= REDIS_IOJOB_DO_SWAP
;
7969 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
7972 /* Key swapped. We can finally free some memory. */
/* Unexpected storage state: dump diagnostics to stdout before the
 * redisAssert() below is evaluated. */
7973 if (key
->storage
!= REDIS_VM_SWAPPING
) {
7974 printf("key->storage: %d\n",key
->storage
);
7975 printf("key->name: %s\n",(char*)key
->ptr
);
7976 printf("key->refcount: %d\n",key
->refcount
);
7977 printf("val: %p\n",(void*)j
->val
);
7978 printf("val->type: %d\n",j
->val
->type
);
7979 printf("val->ptr: %s\n",(char*)j
->val
->ptr
);
7981 redisAssert(key
->storage
== REDIS_VM_SWAPPING
);
7982 val
= dictGetEntryVal(de
);
/* Record where the object lives on swap and its type in the key itself. */
7983 key
->vm
.page
= j
->page
;
7984 key
->vm
.usedpages
= j
->pages
;
7985 key
->storage
= REDIS_VM_SWAPPED
;
7986 key
->vtype
= j
->val
->type
;
7987 decrRefCount(val
); /* Deallocate the object from memory. */
7988 dictGetEntryVal(de
) = NULL
;
7989 redisLog(REDIS_DEBUG
,
7990 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
7991 (unsigned char*) key
->ptr
,
7992 (unsigned long long) j
->page
, (unsigned long long) j
->pages
);
7993 server
.vm_stats_swapped_objects
++;
7994 server
.vm_stats_swapouts
++;
7996 /* Put a few more swap requests in queue if we are still
7998 if (trytoswap
&& vmCanSwapOut() &&
7999 zmalloc_used_memory() > server
.vm_max_memory
)
8004 more
= listLength(server
.io_newjobs
) <
8005 (unsigned) server
.vm_max_threads
;
8007 /* Don't waste CPU time if swappable objects are rare. */
8008 if (vmSwapOneObjectThreaded() == REDIS_ERR
) {
/* Per-call quota reached: stop even if more completed jobs are pending. */
8016 if (processed
== toprocess
) return;
/* A negative read() result with errno other than EAGAIN is logged as a
 * warning. */
8018 if (retval
< 0 && errno
!= EAGAIN
) {
8019 redisLog(REDIS_WARNING
,
8020 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8025 static void lockThreadedIO(void) {
8026 pthread_mutex_lock(&server
.io_mutex
);
8029 static void unlockThreadedIO(void) {
8030 pthread_mutex_unlock(&server
.io_mutex
);
8033 /* Remove the specified object from the threaded I/O queue if still not
8034 * processed, otherwise make sure to flag it as canceled. */
8035 static void vmCancelThreadedIOJob(robj
*o
) {
/* The three job queues are searched in the order below; the numeric comments
 * are the index 'i' used by the loop and by the switch cases further down. */
8037 server
.io_newjobs
, /* 0 */
8038 server
.io_processing
, /* 1 */
8039 server
.io_processed
/* 2 */
/* Only keys with a job in flight (being loaded or being swapped) can be
 * canceled. */
8043 assert(o
->storage
== REDIS_VM_LOADING
|| o
->storage
== REDIS_VM_SWAPPING
);
8046 /* Search for a matching key in one of the queues */
8047 for (i
= 0; i
< 3; i
++) {
8051 listRewind(lists
[i
],&li
);
8052 while ((ln
= listNext(&li
)) != NULL
) {
8053 iojob
*job
= ln
->value
;
8055 if (job
->canceled
) continue; /* Skip this, already canceled. */
/* Jobs are matched by key content via compareStringObjects(). */
8056 if (compareStringObjects(job
->key
,o
) == 0) {
8057 redisLog(REDIS_DEBUG
,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8058 (void*)job
, (char*)o
->ptr
, job
->type
, i
);
8059 /* Mark the pages as free since the swap didn't happen
8060 * or happened but is now discarded. */
8061 if (i
!= 1 && job
->type
== REDIS_IOJOB_DO_SWAP
)
8062 vmMarkPagesFree(job
->page
,job
->pages
);
8063 /* Cancel the job. It depends on the list the job is
8066 case 0: /* io_newjobs */
8067 /* If the job was yet not processed the best thing to do
8068 * is to remove it from the queue at all */
8070 listDelNode(lists
[i
],ln
);
8072 case 1: /* io_processing */
8073 /* Oh Shi- the thread is messing with the Job:
8075 * Probably it's accessing the object if this is a
8076 * PREPARE_SWAP or DO_SWAP job.
8077 * If it's a LOAD job it may be reading from disk and
8078 * if we don't wait for the job to terminate before to
8079 * cancel it, maybe in a few microseconds data can be
8080 * corrupted in this pages. So the short story is:
8082 * Better to wait for the job to move into the
8083 * next queue (processed)... */
8085 /* We try again and again until the job is completed. */
8087 /* But let's wait some time for the I/O thread
8088 * to finish with this job. After all this condition
8089 * should be very rare. */
8092 case 2: /* io_processed */
8093 /* The job was already processed, that's easy...
8094 * just mark it as canceled so that we'll ignore it
8095 * when processing completed jobs. */
8099 /* Finally we have to adjust the storage type of the object
8100 * in order to "UNDO" the operation. */
8101 if (o
->storage
== REDIS_VM_LOADING
)
8102 o
->storage
= REDIS_VM_SWAPPED
;
8103 else if (o
->storage
== REDIS_VM_SWAPPING
)
8104 o
->storage
= REDIS_VM_MEMORY
;
8111 assert(1 != 1); /* We should never reach this */
/* Body of an I/O thread. As far as this view shows, each pass takes the first
 * job from server.io_newjobs, moves it to server.io_processing, executes it
 * according to j->type, moves it to server.io_processed and writes one byte
 * to server.io_ready_pipe_write. When io_newjobs is empty the thread
 * decrements server.io_active_threads (the existing comment says it then
 * exits). NOTE(review): the locking calls and the enclosing loop are not
 * visible in this view. */
8114 static void *IOThreadEntryPoint(void *arg
) {
8119 pthread_detach(pthread_self());
8121 /* Get a new job to process */
8123 if (listLength(server
.io_newjobs
) == 0) {
8124 /* No new jobs in queue, exit. */
8125 redisLog(REDIS_DEBUG
,"Thread %ld exiting, nothing to do",
8126 (long) pthread_self());
8127 server
.io_active_threads
--;
8131 ln
= listFirst(server
.io_newjobs
);
8133 listDelNode(server
.io_newjobs
,ln
);
8134 /* Add the job in the processing queue */
8135 j
->thread
= pthread_self();
8136 listAddNodeTail(server
.io_processing
,j
);
8137 ln
= listLast(server
.io_processing
); /* We use ln later to remove it */
8139 redisLog(REDIS_DEBUG
,"Thread %ld got a new job (type %d): %p about key '%s'",
8140 (long) pthread_self(), j
->type
, (void*)j
, (char*)j
->key
->ptr
);
8142 /* Process the Job */
8143 if (j
->type
== REDIS_IOJOB_LOAD
) {
/* LOAD: read the value back from swap; the value type comes from the key. */
8144 j
->val
= vmReadObjectFromSwap(j
->page
,j
->key
->vtype
);
8145 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
/* PREPARE_SWAP: only the page count is computed; /dev/null is opened as the
 * stream passed to rdbSavedObjectPages().
 * NOTE(review): the fopen() result is used without a visible NULL check. */
8146 FILE *fp
= fopen("/dev/null","w+");
8147 j
->pages
= rdbSavedObjectPages(j
->val
,fp
);
8149 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
/* DO_SWAP: write the value at the page chosen for it. */
8150 if (vmWriteObjectOnSwap(j
->val
,j
->page
) == REDIS_ERR
)
8154 /* Done: insert the job into the processed queue */
8155 redisLog(REDIS_DEBUG
,"Thread %ld completed the job: %p (key %s)",
8156 (long) pthread_self(), (void*)j
, (char*)j
->key
->ptr
);
8158 listDelNode(server
.io_processing
,ln
);
8159 listAddNodeTail(server
.io_processed
,j
);
8162 /* Signal the main thread there is new stuff to process */
/* NOTE(review): the write() is an argument of assert(); if the file is ever
 * built with NDEBUG the call disappears and the main thread is never woken. */
8163 assert(write(server
.io_ready_pipe_write
,"x",1) == 1);
8165 return NULL
; /* never reached */
/* Start one I/O thread running IOThreadEntryPoint() with the attributes in
 * server.io_threads_attr, and count it in server.io_active_threads. */
8168 static void spawnIOThread(void) {
8170 sigset_t mask
, omask
;
/* SIGCHLD, SIGHUP and SIGPIPE are added to 'mask', which is installed as the
 * calling thread's signal mask around pthread_create() and replaced by the
 * saved mask 'omask' right after.
 * NOTE(review): the initialization of 'mask' is not visible in this view. */
8173 sigaddset(&mask
,SIGCHLD
);
8174 sigaddset(&mask
,SIGHUP
);
8175 sigaddset(&mask
,SIGPIPE
);
8176 pthread_sigmask(SIG_SETMASK
, &mask
, &omask
);
/* The pthread_create() result is not checked; the counter below is
 * incremented unconditionally. */
8177 pthread_create(&thread
,&server
.io_threads_attr
,IOThreadEntryPoint
,NULL
);
8178 pthread_sigmask(SIG_SETMASK
, &omask
, NULL
);
8179 server
.io_active_threads
++;
8182 /* We need to wait for the last thread to exit before we are able to
8183 * fork() in order to BGSAVE or BGREWRITEAOF. */
/* The condition tested below is: no new jobs, no jobs being processed and no
 * active I/O threads. While it does not hold, completed jobs are
 * post-processed via vmThreadedIOCompletedJob() and the caller sleeps for a
 * short time. NOTE(review): the loop and locking statements are not visible
 * in this view. */
8184 static void waitEmptyIOJobsQueue(void) {
8186 int io_processed_len
;
8189 if (listLength(server
.io_newjobs
) == 0 &&
8190 listLength(server
.io_processing
) == 0 &&
8191 server
.io_active_threads
== 0)
8196 /* While waiting for empty jobs queue condition we post-process some
8197 * finished job, as I/O threads may be hanging trying to write against
8198 * the io_ready_pipe_write FD but there are so much pending jobs that
8200 io_processed_len
= listLength(server
.io_processed
);
8202 if (io_processed_len
) {
8203 vmThreadedIOCompletedJob(NULL
,server
.io_ready_pipe_read
,NULL
,0);
8204 usleep(1000); /* 1 millisecond */
8206 usleep(10000); /* 10 milliseconds */
/* Re-open server.vm_swap_file in "r+b" mode and refresh both server.vm_fp
 * and server.vm_fd from the new stream. A failed fopen() is logged as a
 * warning. NOTE(review): the statement that follows the log call (the
 * message says "Exiting.") is not visible in this view. */
8211 static void vmReopenSwapFile(void) {
8212 /* Note: we don't close the old one as we are in the child process
8213 * and don't want to mess at all with the original file object. */
8214 server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b");
8215 if (server
.vm_fp
== NULL
) {
8216 redisLog(REDIS_WARNING
,"Can't re-open the VM swap file: %s. Exiting.",
8217 server
.vm_swap_file
);
8220 server
.vm_fd
= fileno(server
.vm_fp
);
8223 /* This function must be called with threaded I/O locked. */
8224 static void queueIOJob(iojob
*j
) {
8225 redisLog(REDIS_DEBUG
,"Queued IO Job %p type %d about key '%s'\n",
8226 (void*)j
, j
->type
, (char*)j
->key
->ptr
);
/* Append the job at the tail of the new-jobs queue. */
8227 listAddNodeTail(server
.io_newjobs
,j
);
/* Fewer active threads than the configured maximum.
 * NOTE(review): the guarded statement is not visible in this view;
 * presumably it starts another I/O thread -- confirm. */
8228 if (server
.io_active_threads
< server
.vm_max_threads
)
/* Create a REDIS_IOJOB_PREPARE_SWAP job for 'key' and mark the key as
 * REDIS_VM_SWAPPING. The job gets its own copy of the key via
 * dupStringObject(). NOTE(review): the remaining job fields, the queueing
 * call and the return statement are not visible in this view. */
8232 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
) {
/* Preconditions: the key is resident in memory and is not shared. */
8235 assert(key
->storage
== REDIS_VM_MEMORY
);
8236 assert(key
->refcount
== 1);
8238 j
= zmalloc(sizeof(*j
));
8239 j
->type
= REDIS_IOJOB_PREPARE_SWAP
;
8241 j
->key
= dupStringObject(key
);
/* -1 cast to pthread_t: placeholder stored before any thread owns the job
 * (IOThreadEntryPoint() overwrites j->thread with pthread_self()). */
8245 j
->thread
= (pthread_t
) -1;
8246 key
->storage
= REDIS_VM_SWAPPING
;
8254 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8256 /* This function makes the client 'c' wait for the key 'key' to be loaded.
8257 * If there is not already a job loading the key, it is created.
8258 * The key is added to the io_keys list in the client structure, and also
8259 * in the hash table mapping swapped keys to waiting clients, that is,
8260 * server.io_waited_keys. */
8261 static int waitForSwappedKey(redisClient
*c
, robj
*key
) {
8262 struct dictEntry
*de
;
8266 /* If the key does not exist or is already in RAM we don't need to
8267 * block the client at all. */
8268 de
= dictFind(c
->db
->dict
,key
);
8269 if (de
== NULL
) return 0;
/* 'o' is the key object stored in the dict; it carries the storage state. */
8270 o
= dictGetEntryKey(de
);
8271 if (o
->storage
== REDIS_VM_MEMORY
) {
8273 } else if (o
->storage
== REDIS_VM_SWAPPING
) {
8274 /* We were swapping the key, undo it! */
8275 vmCancelThreadedIOJob(o
);
8279 /* OK: the key is either swapped, or being loaded just now. */
8281 /* Add the key to the list of keys this client is waiting for.
8282 * This maps clients to keys they are waiting for. */
8283 listAddNodeTail(c
->io_keys
,key
);
8286 /* Add the client to the swapped keys => clients waiting map. */
8287 de
= dictFind(c
->db
->io_keys
,key
);
8291 /* For every key we take a list of clients blocked for it */
8293 retval
= dictAdd(c
->db
->io_keys
,key
,l
);
8295 assert(retval
== DICT_OK
);
8297 l
= dictGetEntryVal(de
);
8299 listAddNodeTail(l
,c
);
8301 /* Are we already loading the key from disk? If not create a job */
8302 if (o
->storage
== REDIS_VM_SWAPPED
) {
/* Build a REDIS_IOJOB_LOAD job: the job key is a copy of 'key' that also
 * carries the value type, and the page is taken from the stored key. */
8305 o
->storage
= REDIS_VM_LOADING
;
8306 j
= zmalloc(sizeof(*j
));
8307 j
->type
= REDIS_IOJOB_LOAD
;
8309 j
->key
= dupStringObject(key
);
8310 j
->key
->vtype
= o
->vtype
;
8311 j
->page
= o
->vm
.page
;
8314 j
->thread
= (pthread_t
) -1;
8322 /* Is this client attempting to run a command against swapped keys?
8323 * If so, block it ASAP, load the keys in background, then resume it.
8325 * The important idea about this function is that it can fail! If keys will
8326 * still be swapped when the client is resumed, this key lookups will
8327 * just block loading keys from disk. In practical terms this should only
8328 * happen with SORT BY command or if there is a bug in this function.
8330 * Return 1 if the client is marked as blocked, 0 if the client can
8331 * continue as the keys it is going to access appear to be in memory. */
8332 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
) {
/* A vm_firstkey of 0 means there is no key argument to check. */
8335 if (cmd
->vm_firstkey
== 0) return 0;
/* A negative vm_lastkey is taken relative to the end of the argument
 * vector (argc + last). */
8336 last
= cmd
->vm_lastkey
;
8337 if (last
< 0) last
= c
->argc
+last
;
/* Visit every key argument, stepping by vm_keystep. */
8338 for (j
= cmd
->vm_firstkey
; j
<= last
; j
+= cmd
->vm_keystep
)
8339 waitForSwappedKey(c
,c
->argv
[j
]);
8340 /* If the client was blocked for at least one key, mark it as blocked. */
8341 if (listLength(c
->io_keys
)) {
/* Flag the client, stop watching its socket for readability and count it
 * among the VM-blocked clients. */
8342 c
->flags
|= REDIS_IO_WAIT
;
8343 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
8344 server
.vm_blocked_clients
++;
8351 /* Remove the 'key' from the list of blocked keys for a given client.
8353 * The function returns 1 when there are no longer blocking keys after
8354 * the current one was removed (and the client can be unblocked). */
8355 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
) {
8359 struct dictEntry
*de
;
8361 /* Remove the key from the list of keys this client is waiting for. */
8362 listRewind(c
->io_keys
,&li
);
8363 while ((ln
= listNext(&li
)) != NULL
) {
/* Keys are matched by content via compareStringObjects(). */
8364 if (compareStringObjects(ln
->value
,key
) == 0) {
8365 listDelNode(c
->io_keys
,ln
);
8371 /* Remove the client from the key => waiting clients map. */
8372 de
= dictFind(c
->db
->io_keys
,key
);
8374 l
= dictGetEntryVal(de
);
8375 ln
= listSearchKey(l
,c
);
/* When no client is left waiting for this key, drop the key from the map. */
8378 if (listLength(l
) == 0)
8379 dictDelete(c
->db
->io_keys
,key
);
/* Non-zero when the client has no more keys to wait for. */
8381 return listLength(c
->io_keys
) == 0;
/* For the list of clients registered in db->io_keys under 'key', call
 * dontWaitForSwappedKey() on each client and append the clients that no
 * longer wait for any key to server.io_ready_clients.
 * NOTE(review): the loop header and the handling of a failed lookup are not
 * visible in this view. */
8384 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
) {
8385 struct dictEntry
*de
;
8390 de
= dictFind(db
->io_keys
,key
);
8393 l
= dictGetEntryVal(de
);
/* The length is captured up front; see the note below for the reason. */
8394 len
= listLength(l
);
8395 /* Note: we can't use something like while(listLength(l)) as the list
8396 * can be freed by the calling function when we remove the last element. */
8399 redisClient
*c
= ln
->value
;
8401 if (dontWaitForSwappedKey(c
,key
)) {
8402 /* Put the client in the list of clients ready to go as we
8403 * loaded all the keys about it. */
8404 listAddNodeTail(server
.io_ready_clients
,c
);
8409 /* ================================= Debugging ============================== */
/* DEBUG command dispatcher. The subcommand is argv[1], compared
 * case-insensitively; the branches visible here are "segfault", "reload",
 * "loadaof", "object" (argc == 3) and "swapout" (argc == 3). Anything else
 * gets the syntax error reply at the end. */
8411 static void debugCommand(redisClient
*c
) {
8412 if (!strcasecmp(c
->argv
[1]->ptr
,"segfault")) {
8414 } else if (!strcasecmp(c
->argv
[1]->ptr
,"reload")) {
/* RELOAD: save to server.dbfilename, then load it back; either failure
 * replies with shared.err. */
8415 if (rdbSave(server
.dbfilename
) != REDIS_OK
) {
8416 addReply(c
,shared
.err
);
8420 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
8421 addReply(c
,shared
.err
);
8424 redisLog(REDIS_WARNING
,"DB reloaded by DEBUG RELOAD");
8425 addReply(c
,shared
.ok
);
8426 } else if (!strcasecmp(c
->argv
[1]->ptr
,"loadaof")) {
8428 if (loadAppendOnlyFile(server
.appendfilename
) != REDIS_OK
) {
8429 addReply(c
,shared
.err
);
8432 redisLog(REDIS_WARNING
,"Append Only File loaded by DEBUG LOADAOF");
8433 addReply(c
,shared
.ok
);
8434 } else if (!strcasecmp(c
->argv
[1]->ptr
,"object") && c
->argc
== 3) {
8435 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
8439 addReply(c
,shared
.nokeyerr
);
8442 key
= dictGetEntryKey(de
);
8443 val
= dictGetEntryVal(de
);
/* With VM disabled, or with the key in MEMORY or SWAPPING state, the reply
 * describes the value object; otherwise it describes the swap location. */
8444 if (!server
.vm_enabled
|| (key
->storage
== REDIS_VM_MEMORY
||
8445 key
->storage
== REDIS_VM_SWAPPING
)) {
8446 addReplySds(c
,sdscatprintf(sdsempty(),
8447 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8448 "encoding:%d serializedlength:%lld\r\n",
8449 (void*)key
, key
->refcount
, (void*)val
, val
->refcount
,
8450 val
->encoding
, (long long) rdbSavedObjectLen(val
,NULL
)));
8452 addReplySds(c
,sdscatprintf(sdsempty(),
8453 "+Key at:%p refcount:%d, value swapped at: page %llu "
8454 "using %llu pages\r\n",
8455 (void*)key
, key
->refcount
, (unsigned long long) key
->vm
.page
,
8456 (unsigned long long) key
->vm
.usedpages
));
8458 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapout") && c
->argc
== 3) {
8459 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
8462 if (!server
.vm_enabled
) {
8463 addReplySds(c
,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8467 addReply(c
,shared
.nokeyerr
);
8470 key
= dictGetEntryKey(de
);
8471 val
= dictGetEntryVal(de
);
8472 /* If the key is shared we want to create a copy */
8473 if (key
->refcount
> 1) {
8474 robj
*newkey
= dupStringObject(key
);
8476 key
= dictGetEntryKey(de
) = newkey
;
/* Only keys currently in memory can be swapped out; on a successful
 * blocking swap the dict entry no longer references a value. */
8479 if (key
->storage
!= REDIS_VM_MEMORY
) {
8480 addReplySds(c
,sdsnew("-ERR This key is not in memory\r\n"));
8481 } else if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
8482 dictGetEntryVal(de
) = NULL
;
8483 addReply(c
,shared
.ok
);
8485 addReply(c
,shared
.err
);
/* NOTE(review): the usage text below does not mention the LOADAOF
 * subcommand handled above. */
8488 addReplySds(c
,sdsnew(
8489 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
/* Log a failed assertion at REDIS_WARNING level: a banner line, then the
 * source file, line number and the text of the failed expression. When
 * HAVE_BACKTRACE is defined an extra line announces that SIGSEGV is being
 * forced to obtain a stack trace. NOTE(review): the statement that actually
 * raises the signal is not visible in this view. */
8493 static void _redisAssert(char *estr
, char *file
, int line
) {
8494 redisLog(REDIS_WARNING
,"=== ASSERTION FAILED ===");
8495 redisLog(REDIS_WARNING
,"==> %s:%d '%s' is not true\n",file
,line
,estr
);
8496 #ifdef HAVE_BACKTRACE
8497 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
8502 /* =================================== Main! ================================ */
/* Read the first line (at most 63 characters) of
 * /proc/sys/vm/overcommit_memory. NOTE(review): the handling of fopen() and
 * fgets() failures and the conversion of the line to the returned int are
 * not visible in this view. */
8505 int linuxOvercommitMemoryValue(void) {
8506 FILE *fp
= fopen("/proc/sys/vm/overcommit_memory","r");
8510 if (fgets(buf
,64,fp
) == NULL
) {
8519 void linuxOvercommitMemoryWarning(void) {
8520 if (linuxOvercommitMemoryValue() == 0) {
8521 redisLog(REDIS_WARNING
,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8524 #endif /* __linux__ */
/* Detach from the launching process: the parent exits after fork(), the
 * child starts a new session, points stdin/stdout/stderr at /dev/null and
 * writes its pid to server.pidfile. */
8526 static void daemonize(void) {
/* Any non-zero fork() result exits here, which includes the -1 returned on
 * failure. */
8530 if (fork() != 0) exit(0); /* parent exits */
8531 setsid(); /* create a new session */
8533 /* Every output goes to /dev/null. If Redis is daemonized but
8534 * the 'logfile' is set to 'stdout' in the configuration file
8535 * it will not log at all. */
8536 if ((fd
= open("/dev/null", O_RDWR
, 0)) != -1) {
8537 dup2(fd
, STDIN_FILENO
);
8538 dup2(fd
, STDOUT_FILENO
);
8539 dup2(fd
, STDERR_FILENO
);
/* Close the temporary descriptor unless it is one of the standard three. */
8540 if (fd
> STDERR_FILENO
) close(fd
);
8542 /* Try to write the pid file */
8543 fp
= fopen(server
.pidfile
,"w");
8545 fprintf(fp
,"%d\n",getpid());
/* Program entry point. Visible steps: load the configuration file named by
 * argv[1] (or print the usage text when more than two arguments are given,
 * or warn that the default configuration is used), daemonize if configured,
 * log the version, load the dataset from the append only file or from the
 * RDB file depending on server.appendonly, then install beforeSleep() and
 * finally delete the event loop.
 * NOTE(review): several statements, including server initialization and the
 * call that runs the event loop, are not visible in this view. */
8550 int main(int argc
, char **argv
) {
8555 resetServerSaveParams();
8556 loadServerConfig(argv
[1]);
8557 } else if (argc
> 2) {
8558 fprintf(stderr
,"Usage: ./redis-server [/path/to/redis.conf]\n");
8561 redisLog(REDIS_WARNING
,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8563 if (server
.daemonize
) daemonize();
8565 redisLog(REDIS_NOTICE
,"Server started, Redis version " REDIS_VERSION
);
8567 linuxOvercommitMemoryWarning();
/* NOTE(review): both messages below print time(NULL)-start with %ld; this
 * assumes the difference has type long -- confirm the type of 'start'. */
8570 if (server
.appendonly
) {
8571 if (loadAppendOnlyFile(server
.appendfilename
) == REDIS_OK
)
8572 redisLog(REDIS_NOTICE
,"DB loaded from append only file: %ld seconds",time(NULL
)-start
);
8574 if (rdbLoad(server
.dbfilename
) == REDIS_OK
)
8575 redisLog(REDIS_NOTICE
,"DB loaded from disk: %ld seconds",time(NULL
)-start
);
8577 redisLog(REDIS_NOTICE
,"The server is now ready to accept connections on port %d", server
.port
);
8578 aeSetBeforeSleepProc(server
.el
,beforeSleep
);
8580 aeDeleteEventLoop(server
.el
);
8584 /* ============================= Backtrace support ========================= */
8586 #ifdef HAVE_BACKTRACE
8587 static char *findFuncName(void *pointer
, unsigned long *offset
);
/* Return, as a void pointer, the instruction-pointer register saved in the
 * machine context of 'uc'. The field that holds it differs per platform, so
 * each preprocessor arm below reads the platform-specific member.
 * NOTE(review): some inner #if/#else/#endif lines and the fallback arm are
 * not visible in this view. */
8589 static void *getMcontextEip(ucontext_t
*uc
) {
8590 #if defined(__FreeBSD__)
8591 return (void*) uc
->uc_mcontext
.mc_eip
;
8592 #elif defined(__dietlibc__)
8593 return (void*) uc
->uc_mcontext
.eip
;
8594 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
8596 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
8598 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
8600 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
8601 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
8602 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
8604 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
8606 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
8607 return (void*) uc
->uc_mcontext
.gregs
[REG_EIP
]; /* Linux 32/64 bit */
8608 #elif defined(__ia64__) /* Linux IA64 */
8609 return (void*) uc
->uc_mcontext
.sc_ip
;
/* Signal handler installed with SA_SIGINFO: logs a banner with the Redis
 * version and signal number, the string produced by genRedisInfoString(),
 * and then one line per stack frame. 'secret' is interpreted as the
 * ucontext_t of the interrupted code; 'info' is unused. */
8615 static void segvHandler(int sig
, siginfo_t
*info
, void *secret
) {
8617 char **messages
= NULL
;
8618 int i
, trace_size
= 0;
8619 unsigned long offset
=0;
8620 ucontext_t
*uc
= (ucontext_t
*) secret
;
8622 REDIS_NOTUSED(info
);
8624 redisLog(REDIS_WARNING
,
8625 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION
, sig
);
8626 infostring
= genRedisInfoString();
8627 redisLog(REDIS_WARNING
, "%s",infostring
);
8628 /* It's not safe to sdsfree() the returned string under memory
8629 * corruption conditions. Let it leak as we are going to abort */
/* Capture at most 100 return addresses. */
8631 trace_size
= backtrace(trace
, 100);
8632 /* overwrite sigaction with caller's address */
8633 if (getMcontextEip(uc
) != NULL
) {
8634 trace
[1] = getMcontextEip(uc
);
8636 messages
= backtrace_symbols(trace
, trace_size
);
/* Frame 0 is skipped. For each remaining frame the backtrace_symbols() text
 * is logged when findFuncName() finds nothing, or when the decimal number
 * after '+' in that text is smaller than the offset findFuncName() computed;
 * otherwise the name and offset from findFuncName() are logged. */
8638 for (i
=1; i
<trace_size
; ++i
) {
8639 char *fn
= findFuncName(trace
[i
], &offset
), *p
;
8641 p
= strchr(messages
[i
],'+');
8642 if (!fn
|| (p
&& ((unsigned long)strtol(p
+1,NULL
,10)) < offset
)) {
8643 redisLog(REDIS_WARNING
,"%s", messages
[i
]);
8645 redisLog(REDIS_WARNING
,"%d redis-server %p %s + %d", i
, trace
[i
], fn
, (unsigned int)offset
);
8648 /* free(messages); Don't call free() with possibly corrupted memory. */
8652 static void setupSigSegvAction(void) {
8653 struct sigaction act
;
8655 sigemptyset (&act
.sa_mask
);
8656 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8657 * is used. Otherwise, sa_handler is used */
8658 act
.sa_flags
= SA_NODEFER
| SA_ONSTACK
| SA_RESETHAND
| SA_SIGINFO
;
8659 act
.sa_sigaction
= segvHandler
;
8660 sigaction (SIGSEGV
, &act
, NULL
);
8661 sigaction (SIGBUS
, &act
, NULL
);
8662 sigaction (SIGFPE
, &act
, NULL
);
8663 sigaction (SIGILL
, &act
, NULL
);
8664 sigaction (SIGBUS
, &act
, NULL
);
8668 #include "staticsymbols.h"
8669 /* This function tries to convert a pointer into a function name. It's used in
8670 * order to provide a backtrace under segmentation fault that's able to
8671 * display functions declared as static (otherwise the backtrace is useless). */
8672 static char *findFuncName(void *pointer
, unsigned long *offset
){
8674 unsigned long off
, minoff
= 0;
8676 /* Try to match against the Symbol with the smallest offset */
/* symsTable is scanned until an entry with a zero 'pointer' field. */
8677 for (i
=0; symsTable
[i
].pointer
; i
++) {
8678 unsigned long lp
= (unsigned long) pointer
;
/* Only symbols located at or below the address are candidates; the address
 * (unsigned long)-1 never matches. */
8680 if (lp
!= (unsigned long)-1 && lp
>= symsTable
[i
].pointer
) {
8681 off
=lp
-symsTable
[i
].pointer
;
/* First candidate, or a candidate closer to the address than the best so
 * far. NOTE(review): the statements updating 'ret' and 'minoff' are not
 * visible in this view. */
8682 if (ret
< 0 || off
< minoff
) {
/* No candidate found. */
8688 if (ret
== -1) return NULL
;
8690 return symsTable
[ret
].name
;
8692 #else /* HAVE_BACKTRACE */
/* Variant compiled when HAVE_BACKTRACE is not defined (it sits in the #else
 * arm). NOTE(review): the body is not visible in this view; presumably it is
 * empty, i.e. no crash handlers are installed -- confirm. */
8693 static void setupSigSegvAction(void) {
8695 #endif /* HAVE_BACKTRACE */