2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
30 #define REDIS_VERSION "1.3.4"
40 #define __USE_POSIX199309
47 #endif /* HAVE_BACKTRACE */
55 #include <arpa/inet.h>
59 #include <sys/resource.h>
66 #include "solarisfixes.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
83 /* Static server configuration */
84 #define REDIS_SERVERPORT 6379 /* TCP port */
85 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
86 #define REDIS_IOBUF_LEN 1024
87 #define REDIS_LOADBUF_LEN 1024
88 #define REDIS_STATIC_ARGS 4
89 #define REDIS_DEFAULT_DBNUM 16
90 #define REDIS_CONFIGLINE_MAX 1024
91 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
92 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
93 #define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
94 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
95 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
98 #define REDIS_WRITEV_THRESHOLD 3
99 /* Max number of iovecs used for each writev call */
100 #define REDIS_WRITEV_IOVEC_COUNT 256
102 /* Hash table parameters */
103 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
106 #define REDIS_CMD_BULK 1 /* Bulk write command */
107 #define REDIS_CMD_INLINE 2 /* Inline command */
108 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
109 this flag will return an error when the 'maxmemory' option is set in the
110 config file and the server is using more than maxmemory bytes of memory.
111 In short these commands are denied on low memory conditions. */
112 #define REDIS_CMD_DENYOOM 4
115 #define REDIS_STRING 0
121 /* Objects encoding */
122 #define REDIS_ENCODING_RAW 0 /* Raw representation */
123 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
125 /* Object types only used for dumping to disk */
126 #define REDIS_EXPIRETIME 253
127 #define REDIS_SELECTDB 254
128 #define REDIS_EOF 255
130 /* Defines related to the dump file format. To store 32 bits lengths for short
131 * keys requires a lot of space, so we check the most significant 2 bits of
132 * the first byte to interpret the length:
134 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
135 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
136 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
137 * 11|000000 this means: specially encoded object will follow. The six bits
138 * number specify the kind of object that follows.
139 * See the REDIS_RDB_ENC_* defines.
141 * Lengths up to 63 are stored using a single byte, most DB keys, and many
142 * values, will fit inside. */
143 #define REDIS_RDB_6BITLEN 0
144 #define REDIS_RDB_14BITLEN 1
145 #define REDIS_RDB_32BITLEN 2
146 #define REDIS_RDB_ENCVAL 3
147 #define REDIS_RDB_LENERR UINT_MAX
149 /* When a length of a string object stored on disk has the first two bits
150 * set, the remaining six bits specify a special encoding for the object
151 * according to the following defines: */
152 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
153 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
154 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
155 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
157 /* Virtual memory object->where field. */
158 #define REDIS_VM_MEMORY 0 /* The object is on memory */
159 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
160 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
161 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
163 /* Virtual memory static configuration stuff.
164 * Check vmFindContiguousPages() to know more about these magic numbers. */
165 #define REDIS_VM_MAX_NEAR_PAGES 65536
166 #define REDIS_VM_MAX_RANDOM_JUMP 4096
167 #define REDIS_VM_MAX_THREADS 32
168 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
169 /* The following is the *percentage* of completed I/O jobs to process when the
170 * handler is called. While Virtual Memory I/O operations are performed by
171 * threads, these operations must be processed by the main thread when completed
172 * in order to take effect. */
173 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
/* Client flags. Each constant is a distinct bit, so several of them can be
 * OR-ed together in the 'flags' field of a redisClient. */
176 #define REDIS_SLAVE 1 /* This client is a slave server */
177 #define REDIS_MASTER 2 /* This client is a master server */
178 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
179 #define REDIS_MULTI 8 /* This client is in a MULTI context */
180 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
181 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
183 /* Slave replication state - slave side */
184 #define REDIS_REPL_NONE 0 /* No active replication */
185 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
186 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
188 /* Slave replication state - from the point of view of master
189 * Note that in SEND_BULK and ONLINE state the slave receives new updates
190 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
191 * to start the next background saving in order to send updates to it. */
192 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
193 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
194 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
195 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
197 /* List related stuff */
201 /* Sort operations */
202 #define REDIS_SORT_GET 0
203 #define REDIS_SORT_ASC 1
204 #define REDIS_SORT_DESC 2
205 #define REDIS_SORTKEY_MAX 1024
/* Log levels, numbered from REDIS_DEBUG (0) up to REDIS_WARNING (3).
 * redisLog() receives one of these as its 'level' argument and tests it
 * against server.verbosity (level >= server.verbosity). */
208 #define REDIS_DEBUG 0
209 #define REDIS_VERBOSE 1
210 #define REDIS_NOTICE 2
211 #define REDIS_WARNING 3
213 /* Anti-warning macro... */
214 #define REDIS_NOTUSED(V) ((void) V)
216 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
217 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
219 /* Append only defines */
220 #define APPENDFSYNC_NO 0
221 #define APPENDFSYNC_ALWAYS 1
222 #define APPENDFSYNC_EVERYSEC 2
224 /* We can print the stacktrace, so our assert is defined this way: */
/* redisAssert(_e): if the expression _e is true this expands to a no-op
 * ((void)0). If it is false, _redisAssert() is called with the stringified
 * expression, __FILE__ and __LINE__, and then the process is terminated
 * with _exit(1). */
225 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
/* Forward declaration of the reporting helper invoked by redisAssert():
 * estr is the text of the failed expression, file and line locate the
 * assertion in the source. */
226 static void _redisAssert(char *estr
, char *file
, int line
);
228 /*================================= Data types ============================== */
230 /* A redis object, that is a type able to hold a string / list / set */
232 /* The VM object structure */
233 struct redisObjectVM
{
234 off_t page
; /* the page at which the object is stored on disk */
235 off_t usedpages
; /* number of pages used on disk */
236 time_t atime
; /* Last access time */
239 /* The actual Redis Object */
240 typedef struct redisObject
{
243 unsigned char encoding
;
244 unsigned char storage
; /* If this object is a key, where is the value?
245 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
246 unsigned char vtype
; /* If this object is a key, and value is swapped out,
247 * this is the type of the swapped out object. */
249 /* VM fields, these are only allocated if VM is active, otherwise the
250 * object allocation function will just allocate
251 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
252 * Redis without VM active will not have any overhead. */
253 struct redisObjectVM vm
;
256 /* Macro used to initialize a Redis object allocated on the stack.
257 * Note that this macro is kept near the structure definition to make sure
258 * we'll update it when the structure is changed, to avoid bugs like
259 * bug #85 introduced exactly in this way. */
260 #define initStaticStringObject(_var,_ptr) do { \
262 _var.type = REDIS_STRING; \
263 _var.encoding = REDIS_ENCODING_RAW; \
265 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
268 typedef struct redisDb
{
269 dict
*dict
; /* The keyspace for this DB */
270 dict
*expires
; /* Timeout of keys with a timeout set */
271 dict
*blockingkeys
; /* Keys with clients waiting for data (BLPOP) */
272 dict
*io_keys
; /* Keys with clients waiting for VM I/O */
276 /* Client MULTI/EXEC state */
277 typedef struct multiCmd
{
280 struct redisCommand
*cmd
;
283 typedef struct multiState
{
284 multiCmd
*commands
; /* Array of MULTI commands */
285 int count
; /* Total number of MULTI commands */
288 /* With multiplexing we need to keep per-client state.
289 * Clients are kept in a linked list. */
290 typedef struct redisClient
{
295 robj
**argv
, **mbargv
;
297 int bulklen
; /* bulk read len. -1 if not in bulk read mode */
298 int multibulk
; /* multi bulk command format active */
301 time_t lastinteraction
; /* time of the last interaction, used for timeout */
302 int flags
; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
303 int slaveseldb
; /* slave selected db, if this client is a slave */
304 int authenticated
; /* when requirepass is non-NULL */
305 int replstate
; /* replication state if this is a slave */
306 int repldbfd
; /* replication DB file descriptor */
307 long repldboff
; /* replication DB file offset */
308 off_t repldbsize
; /* replication DB file size */
309 multiState mstate
; /* MULTI/EXEC state */
310 robj
**blockingkeys
; /* The key we are waiting to terminate a blocking
311 * operation such as BLPOP. Otherwise NULL. */
312 int blockingkeysnum
; /* Number of blocking keys */
313 time_t blockingto
; /* Blocking operation timeout. If UNIX current time
314 * is >= blockingto then the operation timed out. */
315 list
*io_keys
; /* Keys this client is waiting to be loaded from the
316 * swap file in order to continue. */
324 /* Global server state structure */
329 dict
*sharingpool
; /* Pool used for object sharing */
330 unsigned int sharingpoolsize
;
331 long long dirty
; /* changes to DB from the last save */
333 list
*slaves
, *monitors
;
334 char neterr
[ANET_ERR_LEN
];
336 int cronloops
; /* number of times the cron function run */
337 list
*objfreelist
; /* A list of freed objects to avoid malloc() */
338 time_t lastsave
; /* Unix time of last successful save */
339 /* Fields used only for stats */
340 time_t stat_starttime
; /* server start time */
341 long long stat_numcommands
; /* number of processed commands */
342 long long stat_numconnections
; /* number of connections received */
355 pid_t bgsavechildpid
;
356 pid_t bgrewritechildpid
;
357 sds bgrewritebuf
; /* buffer taken by parent during append only rewrite */
358 struct saveparam
*saveparams
;
363 char *appendfilename
;
367 /* Replication related */
372 redisClient
*master
; /* client that is master for this slave */
374 unsigned int maxclients
;
375 unsigned long long maxmemory
;
376 unsigned int blpop_blocked_clients
;
377 unsigned int vm_blocked_clients
;
378 /* Sort parameters - qsort_r() is only available under BSD so we
379 * have to take this state global, in order to pass it to sortCompare() */
383 /* Virtual memory configuration */
388 unsigned long long vm_max_memory
;
389 /* Virtual memory state */
392 off_t vm_next_page
; /* Next probably empty page */
393 off_t vm_near_pages
; /* Number of pages allocated sequentially */
394 unsigned char *vm_bitmap
; /* Bitmap of free/used pages */
395 time_t unixtime
; /* Unix time sampled every second. */
396 /* Virtual memory I/O threads stuff */
397 /* An I/O thread process an element taken from the io_jobs queue and
398 * put the result of the operation in the io_done list. While the
399 * job is being processed, it's put on io_processing queue. */
400 list
*io_newjobs
; /* List of VM I/O jobs yet to be processed */
401 list
*io_processing
; /* List of VM I/O jobs being processed */
402 list
*io_processed
; /* List of VM I/O jobs already processed */
403 list
*io_ready_clients
; /* Clients ready to be unblocked. All keys loaded */
404 pthread_mutex_t io_mutex
; /* lock to access io_jobs/io_done/io_thread_job */
405 pthread_mutex_t obj_freelist_mutex
; /* safe redis objects creation/free */
406 pthread_mutex_t io_swapfile_mutex
; /* So we can lseek + write */
407 pthread_attr_t io_threads_attr
; /* attributes for threads creation */
408 int io_active_threads
; /* Number of running I/O threads */
409 int vm_max_threads
; /* Max number of I/O threads running at the same time */
410 /* Our main thread is blocked on the event loop, looking for sockets ready
411 * to be read or written, so when a threaded I/O operation is ready to be
412 * processed by the main thread, the I/O thread will use a unix pipe to
413 * awake the main thread. The following are the two pipe FDs. */
414 int io_ready_pipe_read
;
415 int io_ready_pipe_write
;
416 /* Virtual memory stats */
417 unsigned long long vm_stats_used_pages
;
418 unsigned long long vm_stats_swapped_objects
;
419 unsigned long long vm_stats_swapouts
;
420 unsigned long long vm_stats_swapins
;
424 typedef void redisCommandProc(redisClient
*c
);
425 struct redisCommand
{
427 redisCommandProc
*proc
;
430 /* What keys should be loaded in background when calling this command? */
431 int vm_firstkey
; /* The first argument that's a key (0 = no keys) */
432 int vm_lastkey
; /* The last argument that's a key */
433 int vm_keystep
; /* The step between first and last key */
436 struct redisFunctionSym
{
438 unsigned long pointer
;
441 typedef struct _redisSortObject
{
449 typedef struct _redisSortOperation
{
452 } redisSortOperation
;
454 /* ZSETs use a specialized version of Skiplists */
456 typedef struct zskiplistNode
{
457 struct zskiplistNode
**forward
;
458 struct zskiplistNode
*backward
;
464 typedef struct zskiplist
{
465 struct zskiplistNode
*header
, *tail
;
466 unsigned long length
;
470 typedef struct zset
{
475 /* Our shared "common" objects */
477 struct sharedObjectsStruct
{
478 robj
*crlf
, *ok
, *err
, *emptybulk
, *czero
, *cone
, *pong
, *space
,
479 *colon
, *nullbulk
, *nullmultibulk
, *queued
,
480 *emptymultibulk
, *wrongtypeerr
, *nokeyerr
, *syntaxerr
, *sameobjecterr
,
481 *outofrangeerr
, *plus
,
482 *select0
, *select1
, *select2
, *select3
, *select4
,
483 *select5
, *select6
, *select7
, *select8
, *select9
;
486 /* Global vars that are actually used as constants. The following double
487 * values are used for double on-disk serialization, and are initialized
488 * at runtime to avoid strange compiler optimizations. */
490 static double R_Zero
, R_PosInf
, R_NegInf
, R_Nan
;
492 /* VM threaded I/O request message */
493 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
494 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
495 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
496 typedef struct iojob
{
497 int type
; /* Request type, REDIS_IOJOB_* */
498 redisDb
*db
;/* Redis database */
499 robj
*key
; /* This I/O request is about swapping this key */
500 robj
*val
; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
501 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
502 off_t page
; /* Swap page where to read/write the object */
503 off_t pages
; /* Swap pages needed to save object. PREPARE_SWAP return val */
504 int canceled
; /* True if this command was canceled by blocking side of VM */
505 pthread_t thread
; /* ID of the thread processing this entry */
508 /*================================ Prototypes =============================== */
510 static void freeStringObject(robj
*o
);
511 static void freeListObject(robj
*o
);
512 static void freeSetObject(robj
*o
);
513 static void decrRefCount(void *o
);
514 static robj
*createObject(int type
, void *ptr
);
515 static void freeClient(redisClient
*c
);
516 static int rdbLoad(char *filename
);
517 static void addReply(redisClient
*c
, robj
*obj
);
518 static void addReplySds(redisClient
*c
, sds s
);
519 static void incrRefCount(robj
*o
);
520 static int rdbSaveBackground(char *filename
);
521 static robj
*createStringObject(char *ptr
, size_t len
);
522 static robj
*dupStringObject(robj
*o
);
523 static void replicationFeedSlaves(list
*slaves
, struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
524 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
525 static int syncWithMaster(void);
526 static robj
*tryObjectSharing(robj
*o
);
527 static int tryObjectEncoding(robj
*o
);
528 static robj
*getDecodedObject(robj
*o
);
529 static int removeExpire(redisDb
*db
, robj
*key
);
530 static int expireIfNeeded(redisDb
*db
, robj
*key
);
531 static int deleteIfVolatile(redisDb
*db
, robj
*key
);
532 static int deleteIfSwapped(redisDb
*db
, robj
*key
);
533 static int deleteKey(redisDb
*db
, robj
*key
);
534 static time_t getExpire(redisDb
*db
, robj
*key
);
535 static int setExpire(redisDb
*db
, robj
*key
, time_t when
);
536 static void updateSlavesWaitingBgsave(int bgsaveerr
);
537 static void freeMemoryIfNeeded(void);
538 static int processCommand(redisClient
*c
);
539 static void setupSigSegvAction(void);
540 static void rdbRemoveTempFile(pid_t childpid
);
541 static void aofRemoveTempFile(pid_t childpid
);
542 static size_t stringObjectLen(robj
*o
);
543 static void processInputBuffer(redisClient
*c
);
544 static zskiplist
*zslCreate(void);
545 static void zslFree(zskiplist
*zsl
);
546 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
);
547 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
548 static void initClientMultiState(redisClient
*c
);
549 static void freeClientMultiState(redisClient
*c
);
550 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
);
551 static void unblockClientWaitingData(redisClient
*c
);
552 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
);
553 static void vmInit(void);
554 static void vmMarkPagesFree(off_t page
, off_t count
);
555 static robj
*vmLoadObject(robj
*key
);
556 static robj
*vmPreviewObject(robj
*key
);
557 static int vmSwapOneObjectBlocking(void);
558 static int vmSwapOneObjectThreaded(void);
559 static int vmCanSwapOut(void);
560 static int tryFreeOneObjectFromFreelist(void);
561 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
562 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
563 static void vmCancelThreadedIOJob(robj
*o
);
564 static void lockThreadedIO(void);
565 static void unlockThreadedIO(void);
566 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
);
567 static void freeIOJob(iojob
*j
);
568 static void queueIOJob(iojob
*j
);
569 static int vmWriteObjectOnSwap(robj
*o
, off_t page
);
570 static robj
*vmReadObjectFromSwap(off_t page
, int type
);
571 static void waitEmptyIOJobsQueue(void);
572 static void vmReopenSwapFile(void);
573 static int vmFreePage(off_t page
);
574 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
);
575 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
);
576 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
);
577 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
578 static struct redisCommand
*lookupCommand(char *name
);
579 static void call(redisClient
*c
, struct redisCommand
*cmd
);
580 static void resetClient(redisClient
*c
);
582 static void authCommand(redisClient
*c
);
583 static void pingCommand(redisClient
*c
);
584 static void echoCommand(redisClient
*c
);
585 static void setCommand(redisClient
*c
);
586 static void setnxCommand(redisClient
*c
);
587 static void getCommand(redisClient
*c
);
588 static void delCommand(redisClient
*c
);
589 static void existsCommand(redisClient
*c
);
590 static void incrCommand(redisClient
*c
);
591 static void decrCommand(redisClient
*c
);
592 static void incrbyCommand(redisClient
*c
);
593 static void decrbyCommand(redisClient
*c
);
594 static void selectCommand(redisClient
*c
);
595 static void randomkeyCommand(redisClient
*c
);
596 static void keysCommand(redisClient
*c
);
597 static void dbsizeCommand(redisClient
*c
);
598 static void lastsaveCommand(redisClient
*c
);
599 static void saveCommand(redisClient
*c
);
600 static void bgsaveCommand(redisClient
*c
);
601 static void bgrewriteaofCommand(redisClient
*c
);
602 static void shutdownCommand(redisClient
*c
);
603 static void moveCommand(redisClient
*c
);
604 static void renameCommand(redisClient
*c
);
605 static void renamenxCommand(redisClient
*c
);
606 static void lpushCommand(redisClient
*c
);
607 static void rpushCommand(redisClient
*c
);
608 static void lpopCommand(redisClient
*c
);
609 static void rpopCommand(redisClient
*c
);
610 static void llenCommand(redisClient
*c
);
611 static void lindexCommand(redisClient
*c
);
612 static void lrangeCommand(redisClient
*c
);
613 static void ltrimCommand(redisClient
*c
);
614 static void typeCommand(redisClient
*c
);
615 static void lsetCommand(redisClient
*c
);
616 static void saddCommand(redisClient
*c
);
617 static void sremCommand(redisClient
*c
);
618 static void smoveCommand(redisClient
*c
);
619 static void sismemberCommand(redisClient
*c
);
620 static void scardCommand(redisClient
*c
);
621 static void spopCommand(redisClient
*c
);
622 static void srandmemberCommand(redisClient
*c
);
623 static void sinterCommand(redisClient
*c
);
624 static void sinterstoreCommand(redisClient
*c
);
625 static void sunionCommand(redisClient
*c
);
626 static void sunionstoreCommand(redisClient
*c
);
627 static void sdiffCommand(redisClient
*c
);
628 static void sdiffstoreCommand(redisClient
*c
);
629 static void syncCommand(redisClient
*c
);
630 static void flushdbCommand(redisClient
*c
);
631 static void flushallCommand(redisClient
*c
);
632 static void sortCommand(redisClient
*c
);
633 static void lremCommand(redisClient
*c
);
634 static void rpoplpushcommand(redisClient
*c
);
635 static void infoCommand(redisClient
*c
);
636 static void mgetCommand(redisClient
*c
);
637 static void monitorCommand(redisClient
*c
);
638 static void expireCommand(redisClient
*c
);
639 static void expireatCommand(redisClient
*c
);
640 static void getsetCommand(redisClient
*c
);
641 static void ttlCommand(redisClient
*c
);
642 static void slaveofCommand(redisClient
*c
);
643 static void debugCommand(redisClient
*c
);
644 static void msetCommand(redisClient
*c
);
645 static void msetnxCommand(redisClient
*c
);
646 static void zaddCommand(redisClient
*c
);
647 static void zincrbyCommand(redisClient
*c
);
648 static void zrangeCommand(redisClient
*c
);
649 static void zrangebyscoreCommand(redisClient
*c
);
650 static void zcountCommand(redisClient
*c
);
651 static void zrevrangeCommand(redisClient
*c
);
652 static void zcardCommand(redisClient
*c
);
653 static void zremCommand(redisClient
*c
);
654 static void zscoreCommand(redisClient
*c
);
655 static void zremrangebyscoreCommand(redisClient
*c
);
656 static void multiCommand(redisClient
*c
);
657 static void execCommand(redisClient
*c
);
658 static void discardCommand(redisClient
*c
);
659 static void blpopCommand(redisClient
*c
);
660 static void brpopCommand(redisClient
*c
);
661 static void appendCommand(redisClient
*c
);
662 static void zrankCommand(redisClient
*c
);
664 /*================================= Globals ================================= */
667 static struct redisServer server
; /* server global state */
668 static struct redisCommand cmdTable
[] = {
669 {"get",getCommand
,2,REDIS_CMD_INLINE
,1,1,1},
670 {"set",setCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,0,0,0},
671 {"setnx",setnxCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,0,0,0},
672 {"append",appendCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
673 {"del",delCommand
,-2,REDIS_CMD_INLINE
,0,0,0},
674 {"exists",existsCommand
,2,REDIS_CMD_INLINE
,1,1,1},
675 {"incr",incrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
676 {"decr",decrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
677 {"mget",mgetCommand
,-2,REDIS_CMD_INLINE
,1,-1,1},
678 {"rpush",rpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
679 {"lpush",lpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
680 {"rpop",rpopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
681 {"lpop",lpopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
682 {"brpop",brpopCommand
,-3,REDIS_CMD_INLINE
,1,1,1},
683 {"blpop",blpopCommand
,-3,REDIS_CMD_INLINE
,1,1,1},
684 {"llen",llenCommand
,2,REDIS_CMD_INLINE
,1,1,1},
685 {"lindex",lindexCommand
,3,REDIS_CMD_INLINE
,1,1,1},
686 {"lset",lsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
687 {"lrange",lrangeCommand
,4,REDIS_CMD_INLINE
,1,1,1},
688 {"ltrim",ltrimCommand
,4,REDIS_CMD_INLINE
,1,1,1},
689 {"lrem",lremCommand
,4,REDIS_CMD_BULK
,1,1,1},
690 {"rpoplpush",rpoplpushcommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,2,1},
691 {"sadd",saddCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
692 {"srem",sremCommand
,3,REDIS_CMD_BULK
,1,1,1},
693 {"smove",smoveCommand
,4,REDIS_CMD_BULK
,1,2,1},
694 {"sismember",sismemberCommand
,3,REDIS_CMD_BULK
,1,1,1},
695 {"scard",scardCommand
,2,REDIS_CMD_INLINE
,1,1,1},
696 {"spop",spopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
697 {"srandmember",srandmemberCommand
,2,REDIS_CMD_INLINE
,1,1,1},
698 {"sinter",sinterCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
699 {"sinterstore",sinterstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
700 {"sunion",sunionCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
701 {"sunionstore",sunionstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
702 {"sdiff",sdiffCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
703 {"sdiffstore",sdiffstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
704 {"smembers",sinterCommand
,2,REDIS_CMD_INLINE
,1,1,1},
705 {"zadd",zaddCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
706 {"zincrby",zincrbyCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
707 {"zrem",zremCommand
,3,REDIS_CMD_BULK
,1,1,1},
708 {"zremrangebyscore",zremrangebyscoreCommand
,4,REDIS_CMD_INLINE
,1,1,1},
709 {"zrange",zrangeCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
710 {"zrangebyscore",zrangebyscoreCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
711 {"zcount",zcountCommand
,4,REDIS_CMD_INLINE
,1,1,1},
712 {"zrevrange",zrevrangeCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
713 {"zcard",zcardCommand
,2,REDIS_CMD_INLINE
,1,1,1},
714 {"zscore",zscoreCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
715 {"zrank",zrankCommand
,3,REDIS_CMD_INLINE
,1,1,1},
716 {"incrby",incrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
717 {"decrby",decrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
718 {"getset",getsetCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
719 {"mset",msetCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,-1,2},
720 {"msetnx",msetnxCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,-1,2},
721 {"randomkey",randomkeyCommand
,1,REDIS_CMD_INLINE
,0,0,0},
722 {"select",selectCommand
,2,REDIS_CMD_INLINE
,0,0,0},
723 {"move",moveCommand
,3,REDIS_CMD_INLINE
,1,1,1},
724 {"rename",renameCommand
,3,REDIS_CMD_INLINE
,1,1,1},
725 {"renamenx",renamenxCommand
,3,REDIS_CMD_INLINE
,1,1,1},
726 {"expire",expireCommand
,3,REDIS_CMD_INLINE
,0,0,0},
727 {"expireat",expireatCommand
,3,REDIS_CMD_INLINE
,0,0,0},
728 {"keys",keysCommand
,2,REDIS_CMD_INLINE
,0,0,0},
729 {"dbsize",dbsizeCommand
,1,REDIS_CMD_INLINE
,0,0,0},
730 {"auth",authCommand
,2,REDIS_CMD_INLINE
,0,0,0},
731 {"ping",pingCommand
,1,REDIS_CMD_INLINE
,0,0,0},
732 {"echo",echoCommand
,2,REDIS_CMD_BULK
,0,0,0},
733 {"save",saveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
734 {"bgsave",bgsaveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
735 {"bgrewriteaof",bgrewriteaofCommand
,1,REDIS_CMD_INLINE
,0,0,0},
736 {"shutdown",shutdownCommand
,1,REDIS_CMD_INLINE
,0,0,0},
737 {"lastsave",lastsaveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
738 {"type",typeCommand
,2,REDIS_CMD_INLINE
,1,1,1},
739 {"multi",multiCommand
,1,REDIS_CMD_INLINE
,0,0,0},
740 {"exec",execCommand
,1,REDIS_CMD_INLINE
,0,0,0},
741 {"discard",discardCommand
,1,REDIS_CMD_INLINE
,0,0,0},
742 {"sync",syncCommand
,1,REDIS_CMD_INLINE
,0,0,0},
743 {"flushdb",flushdbCommand
,1,REDIS_CMD_INLINE
,0,0,0},
744 {"flushall",flushallCommand
,1,REDIS_CMD_INLINE
,0,0,0},
745 {"sort",sortCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
746 {"info",infoCommand
,1,REDIS_CMD_INLINE
,0,0,0},
747 {"monitor",monitorCommand
,1,REDIS_CMD_INLINE
,0,0,0},
748 {"ttl",ttlCommand
,2,REDIS_CMD_INLINE
,1,1,1},
749 {"slaveof",slaveofCommand
,3,REDIS_CMD_INLINE
,0,0,0},
750 {"debug",debugCommand
,-2,REDIS_CMD_INLINE
,0,0,0},
751 {NULL
,NULL
,0,0,0,0,0}
754 /*============================ Utility functions ============================ */
756 /* Glob-style pattern matching. */
757 int stringmatchlen(const char *pattern
, int patternLen
,
758 const char *string
, int stringLen
, int nocase
)
763 while (pattern
[1] == '*') {
768 return 1; /* match */
770 if (stringmatchlen(pattern
+1, patternLen
-1,
771 string
, stringLen
, nocase
))
772 return 1; /* match */
776 return 0; /* no match */
780 return 0; /* no match */
790 not = pattern
[0] == '^';
797 if (pattern
[0] == '\\') {
800 if (pattern
[0] == string
[0])
802 } else if (pattern
[0] == ']') {
804 } else if (patternLen
== 0) {
808 } else if (pattern
[1] == '-' && patternLen
>= 3) {
809 int start
= pattern
[0];
810 int end
= pattern
[2];
818 start
= tolower(start
);
824 if (c
>= start
&& c
<= end
)
828 if (pattern
[0] == string
[0])
831 if (tolower((int)pattern
[0]) == tolower((int)string
[0]))
841 return 0; /* no match */
847 if (patternLen
>= 2) {
854 if (pattern
[0] != string
[0])
855 return 0; /* no match */
857 if (tolower((int)pattern
[0]) != tolower((int)string
[0]))
858 return 0; /* no match */
866 if (stringLen
== 0) {
867 while(*pattern
== '*') {
874 if (patternLen
== 0 && stringLen
== 0)
/* printf-style logging helper.
 *
 * level - severity of the message; the message is emitted only when
 *         level >= server.verbosity, and level also indexes the array c to
 *         pick the one-character marker printed in the prefix.
 * fmt   - printf-style format string, expanded with the variadic arguments
 *         through vfprintf().
 *
 * The destination is stdout when server.logfile is NULL; otherwise the log
 * file is opened in append mode on every call and closed again at the end.
 * Each emitted line is prefixed with "[pid] <day month HH:MM:SS> <marker> ".
 * NOTE(review): the handling of a failed fopen() is not visible in this
 * excerpt -- confirm fp is checked before use. */
879 static void redisLog(int level
, const char *fmt
, ...) {
883 fp
= (server
.logfile
== NULL
) ? stdout
: fopen(server
.logfile
,"a");
887 if (level
>= server
.verbosity
) {
893 strftime(buf
,64,"%d %b %H:%M:%S",localtime(&now
));
894 fprintf(fp
,"[%d] %s %c ",(int)getpid(),buf
,c
[level
]);
895 vfprintf(fp
, fmt
, ap
);
/* Only close what was opened above: stdout must stay open. */
901 if (server
.logfile
) fclose(fp
);
904 /*====================== Hash table type implementation ==================== */
906 /* This is a hash table type that uses the SDS dynamic strings library as
907 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
/* Value destructor callback; zsetDictType installs it as its val destructor.
 * privdata is unused and is only handed to DICT_NOTUSED().
 * NOTE(review): the statement that releases val is not visible in this
 * excerpt -- presumably a plain free of the allocation; confirm. */
910 static void dictVanillaFree(void *privdata
, void *val
)
912 DICT_NOTUSED(privdata
);
/* Value destructor for dictionaries whose values are lists: val is cast to
 * list* and released with listRelease(). privdata is unused. */
916 static void dictListDestructor(void *privdata
, void *val
)
918 DICT_NOTUSED(privdata
);
919 listRelease((list
*)val
);
/* Key comparison callback for SDS string keys.
 *
 * Both keys are treated as sds values and measured with sdslen(). Keys of
 * different length compare as different (0) without looking at the bytes;
 * otherwise the result is 1 when memcmp() finds the first l1 bytes equal
 * and 0 when it does not. privdata is unused. */
922 static int sdsDictKeyCompare(void *privdata
, const void *key1
,
926 DICT_NOTUSED(privdata
);
928 l1
= sdslen((sds
)key1
);
929 l2
= sdslen((sds
)key2
);
/* Cheap length check first: unequal lengths can never match. */
930 if (l1
!= l2
) return 0;
931 return memcmp(key1
, key2
, l1
) == 0;
/* Destructor callback used for keys and/or values by the dictType tables
 * in this file. privdata is unused. A NULL val is accepted and ignored.
 * NOTE(review): the statement that actually releases a non-NULL val is not
 * visible in this excerpt -- confirm how the object is released. */
934 static void dictRedisObjectDestructor(void *privdata
, void *val
)
936 DICT_NOTUSED(privdata
);
938 if (val
== NULL
) return; /* Values of swapped out keys are set to NULL */
/* Key comparison callback for keys that are robj pointers: both keys are
 * viewed as const robj* and their ptr fields are compared with
 * sdsDictKeyCompare(), whose result is returned unchanged. privdata is
 * simply forwarded. */
942 static int dictObjKeyCompare(void *privdata
, const void *key1
,
945 const robj
*o1
= key1
, *o2
= key2
;
946 return sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
/* Hash callback for robj keys: hashes the bytes at o->ptr with
 * dictGenHashFunction(), using sdslen() of that same pointer as the length,
 * so o->ptr is treated as an sds string. */
949 static unsigned int dictObjHash(const void *key
) {
951 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
/* Key comparison callback for robj keys that may be stored in an encoded
 * form: both objects are first passed through getDecodedObject(), then the
 * ptr fields of the decoded objects are compared with sdsDictKeyCompare().
 * NOTE(review): what happens to the decoded objects after the comparison
 * (and the return statement) is not visible in this excerpt -- confirm. */
954 static int dictEncObjKeyCompare(void *privdata
, const void *key1
,
957 robj
*o1
= (robj
*) key1
, *o2
= (robj
*) key2
;
960 o1
= getDecodedObject(o1
);
961 o2
= getDecodedObject(o2
);
962 cmp
= sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
/* Hash callback for robj keys that may be stored in an encoded form.
 *
 * - REDIS_ENCODING_RAW: the sds bytes at o->ptr are hashed directly.
 * - REDIS_ENCODING_INT: o->ptr is cast to long, formatted as decimal text
 *   with snprintf() into buf (at most 32 bytes) and that text is hashed.
 * - The remaining visible path obtains a decoded object with
 *   getDecodedObject() and hashes its sds bytes.
 * NOTE(review): the control flow joining these paths is only partly visible
 * here; confirm when the getDecodedObject() path is taken. */
968 static unsigned int dictEncObjHash(const void *key
) {
969 robj
*o
= (robj
*) key
;
971 if (o
->encoding
== REDIS_ENCODING_RAW
) {
972 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
974 if (o
->encoding
== REDIS_ENCODING_INT
) {
/* Hash the decimal text of the integer rather than the pointer bits. */
978 len
= snprintf(buf
,32,"%ld",(long)o
->ptr
);
979 return dictGenHashFunction((unsigned char*)buf
, len
);
983 o
= getDecodedObject(o
);
984 hash
= dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
991 /* Sets type and expires */
992 static dictType setDictType
= {
993 dictEncObjHash
, /* hash function */
996 dictEncObjKeyCompare
, /* key compare */
997 dictRedisObjectDestructor
, /* key destructor */
998 NULL
/* val destructor */
1001 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1002 static dictType zsetDictType
= {
1003 dictEncObjHash
, /* hash function */
1006 dictEncObjKeyCompare
, /* key compare */
1007 dictRedisObjectDestructor
, /* key destructor */
1008 dictVanillaFree
/* val destructor of malloc(sizeof(double)) */
/* dictType that initServer() passes to dictCreate() for each database's
 * main dict. Keys are hashed with dictObjHash and compared with
 * dictObjKeyCompare; keys and values are both released through
 * dictRedisObjectDestructor. */
1012 static dictType hashDictType
= {
1013 dictObjHash
, /* hash function */
1016 dictObjKeyCompare
, /* key compare */
1017 dictRedisObjectDestructor
, /* key destructor */
1018 dictRedisObjectDestructor
/* val destructor */
/* dictType that initServer() passes to dictCreate() for each database's
 * expires dict. Keys are hashed with dictObjHash, compared with
 * dictObjKeyCompare and released through dictRedisObjectDestructor; there
 * is no value destructor (NULL). */
1022 static dictType keyptrDictType
= {
1023 dictObjHash
, /* hash function */
1026 dictObjKeyCompare
, /* key compare */
1027 dictRedisObjectDestructor
, /* key destructor */
1028 NULL
/* val destructor */
1031 /* Keylist hash table type has unencoded redis objects as keys and
1032 * lists as values. It's used for blocking operations (BLPOP) and to
1033 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1034 static dictType keylistDictType
= {
1035 dictObjHash
, /* hash function */
1038 dictObjKeyCompare
, /* key compare */
1039 dictRedisObjectDestructor
, /* key destructor */
1040 dictListDestructor
/* val destructor */
1043 /* ========================= Random utility functions ======================= */
1045 /* Redis generally does not try to recover from out of memory conditions
1046 * when allocating objects or strings, it is not clear if it will be possible
1047 * to report this condition to the client since the networking layer itself
1048 * is based on heap allocation for send buffers, so we simply abort.
1049 * At least the code will be simpler to read... */
1050 static void oom(const char *msg
) {
1051 redisLog(REDIS_WARNING
, "%s: Out of memory\n",msg
);
1056 /* ====================== Redis server networking stuff ===================== */
/* Walk server.clients once and handle two kinds of timeout, using a single
 * time(NULL) sample taken before the loop:
 *
 * 1. Idle clients: when server.maxidletime is non-zero, a client that is
 *    neither a slave nor a master and whose lastinteraction is more than
 *    maxidletime seconds old is reported with "Closing idle client".
 *    NOTE(review): the statement that actually closes the client is not
 *    visible in this excerpt -- confirm.
 * 2. Blocked clients: a client flagged REDIS_BLOCKED whose blockingto is
 *    non-zero and already in the past receives shared.nullmultibulk as its
 *    reply and is unblocked with unblockClientWaitingData(). */
1057 static void closeTimedoutClients(void) {
1060 time_t now
= time(NULL
);
1063 listRewind(server
.clients
,&li
);
1064 while ((ln
= listNext(&li
)) != NULL
) {
1065 c
= listNodeValue(ln
);
1066 if (server
.maxidletime
&&
1067 !(c
->flags
& REDIS_SLAVE
) && /* no timeout for slaves */
1068 !(c
->flags
& REDIS_MASTER
) && /* no timeout for masters */
1069 (now
- c
->lastinteraction
> server
.maxidletime
))
1071 redisLog(REDIS_VERBOSE
,"Closing idle client");
1073 } else if (c
->flags
& REDIS_BLOCKED
) {
/* blockingto == 0 is excluded here, so such clients never time out. */
1074 if (c
->blockingto
!= 0 && c
->blockingto
< now
) {
1075 addReply(c
,shared
.nullmultibulk
);
1076 unblockClientWaitingData(c
);
/* Return non-zero when the given hash table is sparse enough to be worth
 * shrinking. All of the following must hold:
 *   - the table has slots (size != 0) and holds elements (used != 0);
 *   - it has more slots than DICT_HT_INITIAL_SIZE;
 *   - the integer fill percentage used*100/size is below REDIS_HT_MINFILL.
 * size and used are held as long long, so used*100 is computed in that
 * type. */
1082 static int htNeedsResize(dict
*dict
) {
1083 long long size
, used
;
1085 size
= dictSlots(dict
);
1086 used
= dictSize(dict
);
1087 return (size
&& used
&& size
> DICT_HT_INITIAL_SIZE
&&
1088 (used
*100/size
< REDIS_HT_MINFILL
));
1091 /* If the percentage of used slots in a hash table drops below
1092 * REDIS_HT_MINFILL we resize the hash table to save memory.
 *
 * For each of the server.dbnum databases the main dict and the expires dict
 * are checked independently with htNeedsResize() and, when it reports true,
 * passed to dictResize(). Only the main dict resize is logged (verbose
 * level, before and after the call). */
1093 static void tryResizeHashTables(void) {
1096 for (j
= 0; j
< server
.dbnum
; j
++) {
1097 if (htNeedsResize(server
.db
[j
].dict
)) {
1098 redisLog(REDIS_VERBOSE
,"The hash table %d is too sparse, resize it...",j
);
1099 dictResize(server
.db
[j
].dict
);
1100 redisLog(REDIS_VERBOSE
,"Hash table %d resized.",j
);
1102 if (htNeedsResize(server
.db
[j
].expires
))
1103 dictResize(server
.db
[j
].expires
);
1107 /* A background saving child (BGSAVE) terminated its work. Handle this.
 *
 * statloc is the wait status of the child; it is decoded with WEXITSTATUS
 * and WIFSIGNALED. Three outcomes are distinguished:
 *   - normal exit with code 0: success is logged and server.lastsave is set
 *     to the current time;
 *   - normal exit with a non-zero code: a warning is logged;
 *   - termination by signal: a warning is logged and the child's temp file
 *     is removed with rdbRemoveTempFile().
 * In every case server.bgsavechildpid is reset to -1 and waiting slaves are
 * notified through updateSlavesWaitingBgsave().
 *
 * NOTE(review): the status handed to updateSlavesWaitingBgsave() depends on
 * exitcode alone; bysignal is not consulted there. Confirm that a child
 * killed by a signal cannot be reported to slaves as REDIS_OK. */
1108 void backgroundSaveDoneHandler(int statloc
) {
1109 int exitcode
= WEXITSTATUS(statloc
);
1110 int bysignal
= WIFSIGNALED(statloc
);
1112 if (!bysignal
&& exitcode
== 0) {
1113 redisLog(REDIS_NOTICE
,
1114 "Background saving terminated with success");
1116 server
.lastsave
= time(NULL
);
1117 } else if (!bysignal
&& exitcode
!= 0) {
1118 redisLog(REDIS_WARNING
, "Background saving error");
1120 redisLog(REDIS_WARNING
,
1121 "Background saving terminated by signal");
1122 rdbRemoveTempFile(server
.bgsavechildpid
);
1124 server
.bgsavechildpid
= -1;
1125 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1126 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1127 updateSlavesWaitingBgsave(exitcode
== 0 ? REDIS_OK
: REDIS_ERR
);
1130 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
 * Handle this. */
1132 void backgroundRewriteDoneHandler(int statloc
) {
1133 int exitcode
= WEXITSTATUS(statloc
);
1134 int bysignal
= WIFSIGNALED(statloc
);
1136 if (!bysignal
&& exitcode
== 0) {
1140 redisLog(REDIS_NOTICE
,
1141 "Background append only file rewriting terminated with success");
1142 /* Now it's time to flush the differences accumulated by the parent */
1143 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) server
.bgrewritechildpid
);
1144 fd
= open(tmpfile
,O_WRONLY
|O_APPEND
);
1146 redisLog(REDIS_WARNING
, "Not able to open the temp append only file produced by the child: %s", strerror(errno
));
1149 /* Flush our data... */
1150 if (write(fd
,server
.bgrewritebuf
,sdslen(server
.bgrewritebuf
)) !=
1151 (signed) sdslen(server
.bgrewritebuf
)) {
1152 redisLog(REDIS_WARNING
, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno
));
1156 redisLog(REDIS_NOTICE
,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server
.bgrewritebuf
));
1157 /* Now our work is to rename the temp file into the stable file. And
1158 * switch the file descriptor used by the server for append only. */
1159 if (rename(tmpfile
,server
.appendfilename
) == -1) {
1160 redisLog(REDIS_WARNING
,"Can't rename the temp append only file into the stable one: %s", strerror(errno
));
1164 /* Mission completed... almost */
1165 redisLog(REDIS_NOTICE
,"Append only file successfully rewritten.");
1166 if (server
.appendfd
!= -1) {
1167 /* If append only is actually enabled... */
1168 close(server
.appendfd
);
1169 server
.appendfd
= fd
;
1171 server
.appendseldb
= -1; /* Make sure it will issue SELECT */
1172 redisLog(REDIS_NOTICE
,"The new append only file was selected for future appends.");
1174 /* If append only is disabled we just generate a dump in this
1175 * format. Why not? */
1178 } else if (!bysignal
&& exitcode
!= 0) {
1179 redisLog(REDIS_WARNING
, "Background append only file rewriting error");
1181 redisLog(REDIS_WARNING
,
1182 "Background append only file rewriting terminated by signal");
1185 sdsfree(server
.bgrewritebuf
);
1186 server
.bgrewritebuf
= sdsempty();
1187 aofRemoveTempFile(server
.bgrewritechildpid
);
1188 server
.bgrewritechildpid
= -1;
1191 static int serverCron(struct aeEventLoop
*eventLoop
, long long id
, void *clientData
) {
1192 int j
, loops
= server
.cronloops
++;
1193 REDIS_NOTUSED(eventLoop
);
1195 REDIS_NOTUSED(clientData
);
1197 /* We take a cached value of the unix time in the global state because
1198 * with virtual memory and aging we need to store the current time
1199 * in objects at every object access, and accuracy is not needed.
1200 * Accessing a global var is faster than calling time(NULL) */
1201 server
.unixtime
= time(NULL
);
1203 /* Show some info about non-empty databases */
1204 for (j
= 0; j
< server
.dbnum
; j
++) {
1205 long long size
, used
, vkeys
;
1207 size
= dictSlots(server
.db
[j
].dict
);
1208 used
= dictSize(server
.db
[j
].dict
);
1209 vkeys
= dictSize(server
.db
[j
].expires
);
1210 if (!(loops
% 5) && (used
|| vkeys
)) {
1211 redisLog(REDIS_VERBOSE
,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j
,used
,vkeys
,size
);
1212 /* dictPrintStats(server.dict); */
1216 /* We don't want to resize the hash tables while a background saving
1217 * is in progress: the saving child is created using fork() that is
1218 * implemented with a copy-on-write semantic in most modern systems, so
1219 * if we resize the HT while there is the saving child at work actually
1220 * a lot of memory movements in the parent will cause a lot of pages
 * to be copied. */
1222 if (server
.bgsavechildpid
== -1) tryResizeHashTables();
1224 /* Show information about connected clients */
1226 redisLog(REDIS_VERBOSE
,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1227 listLength(server
.clients
)-listLength(server
.slaves
),
1228 listLength(server
.slaves
),
1229 zmalloc_used_memory(),
1230 dictSize(server
.sharingpool
));
1233 /* Close connections of timedout clients */
1234 if ((server
.maxidletime
&& !(loops
% 10)) || server
.blpop_blocked_clients
)
1235 closeTimedoutClients();
1237 /* Check if a background saving or AOF rewrite in progress terminated */
1238 if (server
.bgsavechildpid
!= -1 || server
.bgrewritechildpid
!= -1) {
1242 if ((pid
= wait3(&statloc
,WNOHANG
,NULL
)) != 0) {
1243 if (pid
== server
.bgsavechildpid
) {
1244 backgroundSaveDoneHandler(statloc
);
1246 backgroundRewriteDoneHandler(statloc
);
1250 /* If there is not a background saving in progress check if
1251 * we have to save now */
1252 time_t now
= time(NULL
);
1253 for (j
= 0; j
< server
.saveparamslen
; j
++) {
1254 struct saveparam
*sp
= server
.saveparams
+j
;
1256 if (server
.dirty
>= sp
->changes
&&
1257 now
-server
.lastsave
> sp
->seconds
) {
1258 redisLog(REDIS_NOTICE
,"%d changes in %d seconds. Saving...",
1259 sp
->changes
, sp
->seconds
);
1260 rdbSaveBackground(server
.dbfilename
);
1266 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1267 * will use few CPU cycles if there are few expiring keys, otherwise
1268 * it will get more aggressive to avoid that too much memory is used by
1269 * keys that can be removed from the keyspace. */
1270 for (j
= 0; j
< server
.dbnum
; j
++) {
1272 redisDb
*db
= server
.db
+j
;
1274 /* Continue to expire if at the end of the cycle more than 25%
1275 * of the keys were expired. */
1277 long num
= dictSize(db
->expires
);
1278 time_t now
= time(NULL
);
1281 if (num
> REDIS_EXPIRELOOKUPS_PER_CRON
)
1282 num
= REDIS_EXPIRELOOKUPS_PER_CRON
;
1287 if ((de
= dictGetRandomKey(db
->expires
)) == NULL
) break;
1288 t
= (time_t) dictGetEntryVal(de
);
1290 deleteKey(db
,dictGetEntryKey(de
));
1294 } while (expired
> REDIS_EXPIRELOOKUPS_PER_CRON
/4);
1297 /* Swap a few keys on disk if we are over the memory limit and VM
1298 * is enabled. Try to free objects from the free list first. */
1299 if (vmCanSwapOut()) {
1300 while (server
.vm_enabled
&& zmalloc_used_memory() >
1301 server
.vm_max_memory
)
1305 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
1306 retval
= (server
.vm_max_threads
== 0) ?
1307 vmSwapOneObjectBlocking() :
1308 vmSwapOneObjectThreaded();
1309 if (retval
== REDIS_ERR
&& (loops
% 30) == 0 &&
1310 zmalloc_used_memory() >
1311 (server
.vm_max_memory
+server
.vm_max_memory
/10))
1313 redisLog(REDIS_WARNING
,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1315 /* Note that when using threaded I/O we free just one object,
1316 * because anyway when the I/O thread in charge to swap this
1317 * object out will finish, the handler of completed jobs
1318 * will try to swap more objects if we are still out of memory. */
1319 if (retval
== REDIS_ERR
|| server
.vm_max_threads
> 0) break;
1323 /* Check if we should connect to a MASTER */
1324 if (server
.replstate
== REDIS_REPL_CONNECT
) {
1325 redisLog(REDIS_NOTICE
,"Connecting to MASTER...");
1326 if (syncWithMaster() == REDIS_OK
) {
1327 redisLog(REDIS_NOTICE
,"MASTER <-> SLAVE sync succeeded");
1333 /* This function gets called every time Redis is entering the
1334 * main loop of the event driven library, that is, before sleeping
1335 * for ready file descriptors.
 *
 * When VM is enabled and server.io_ready_clients is not empty, every client
 * on that list is resumed: it is removed from the list, its REDIS_IO_WAIT
 * flag is cleared, server.vm_blocked_clients is decremented, its readable
 * file event is re-installed with readQueryFromClient as the handler, and
 * the command named by c->argv[0] is looked up (it must exist). Any data
 * still pending in c->querybuf is then handed to processInputBuffer().
 * NOTE(review): the statement that executes the looked-up command is not
 * visible in this excerpt -- confirm. eventLoop is unused. */
1336 static void beforeSleep(struct aeEventLoop
*eventLoop
) {
1337 REDIS_NOTUSED(eventLoop
);
1339 if (server
.vm_enabled
&& listLength(server
.io_ready_clients
)) {
1343 listRewind(server
.io_ready_clients
,&li
);
1344 while((ln
= listNext(&li
))) {
1345 redisClient
*c
= ln
->value
;
1346 struct redisCommand
*cmd
;
1348 /* Resume the client. */
1349 listDelNode(server
.io_ready_clients
,ln
);
1350 c
->flags
&= (~REDIS_IO_WAIT
);
1351 server
.vm_blocked_clients
--;
1352 aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
1353 readQueryFromClient
, c
);
1354 cmd
= lookupCommand(c
->argv
[0]->ptr
);
1355 assert(cmd
!= NULL
);
1358 /* There may be more data to process in the input buffer. */
1359 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0)
1360 processInputBuffer(c
);
1365 static void createSharedObjects(void) {
1366 shared
.crlf
= createObject(REDIS_STRING
,sdsnew("\r\n"));
1367 shared
.ok
= createObject(REDIS_STRING
,sdsnew("+OK\r\n"));
1368 shared
.err
= createObject(REDIS_STRING
,sdsnew("-ERR\r\n"));
1369 shared
.emptybulk
= createObject(REDIS_STRING
,sdsnew("$0\r\n\r\n"));
1370 shared
.czero
= createObject(REDIS_STRING
,sdsnew(":0\r\n"));
1371 shared
.cone
= createObject(REDIS_STRING
,sdsnew(":1\r\n"));
1372 shared
.nullbulk
= createObject(REDIS_STRING
,sdsnew("$-1\r\n"));
1373 shared
.nullmultibulk
= createObject(REDIS_STRING
,sdsnew("*-1\r\n"));
1374 shared
.emptymultibulk
= createObject(REDIS_STRING
,sdsnew("*0\r\n"));
1375 shared
.pong
= createObject(REDIS_STRING
,sdsnew("+PONG\r\n"));
1376 shared
.queued
= createObject(REDIS_STRING
,sdsnew("+QUEUED\r\n"));
1377 shared
.wrongtypeerr
= createObject(REDIS_STRING
,sdsnew(
1378 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1379 shared
.nokeyerr
= createObject(REDIS_STRING
,sdsnew(
1380 "-ERR no such key\r\n"));
1381 shared
.syntaxerr
= createObject(REDIS_STRING
,sdsnew(
1382 "-ERR syntax error\r\n"));
1383 shared
.sameobjecterr
= createObject(REDIS_STRING
,sdsnew(
1384 "-ERR source and destination objects are the same\r\n"));
1385 shared
.outofrangeerr
= createObject(REDIS_STRING
,sdsnew(
1386 "-ERR index out of range\r\n"));
1387 shared
.space
= createObject(REDIS_STRING
,sdsnew(" "));
1388 shared
.colon
= createObject(REDIS_STRING
,sdsnew(":"));
1389 shared
.plus
= createObject(REDIS_STRING
,sdsnew("+"));
1390 shared
.select0
= createStringObject("select 0\r\n",10);
1391 shared
.select1
= createStringObject("select 1\r\n",10);
1392 shared
.select2
= createStringObject("select 2\r\n",10);
1393 shared
.select3
= createStringObject("select 3\r\n",10);
1394 shared
.select4
= createStringObject("select 4\r\n",10);
1395 shared
.select5
= createStringObject("select 5\r\n",10);
1396 shared
.select6
= createStringObject("select 6\r\n",10);
1397 shared
.select7
= createStringObject("select 7\r\n",10);
1398 shared
.select8
= createStringObject("select 8\r\n",10);
1399 shared
.select9
= createStringObject("select 9\r\n",10);
/* Append one save rule to server.saveparams.
 *
 * seconds - stored in the new entry's seconds field.
 * changes - stored in the new entry's changes field.
 *
 * The array is grown by one struct saveparam with zrealloc(), the new last
 * slot is filled in, and server.saveparamslen is incremented. serverCron()
 * compares server.dirty and the time since the last save against these two
 * fields. */
1402 static void appendServerSaveParams(time_t seconds
, int changes
) {
1403 server
.saveparams
= zrealloc(server
.saveparams
,sizeof(struct saveparam
)*(server
.saveparamslen
+1));
1404 server
.saveparams
[server
.saveparamslen
].seconds
= seconds
;
1405 server
.saveparams
[server
.saveparamslen
].changes
= changes
;
1406 server
.saveparamslen
++;
/* Drop every configured save rule: the saveparams array is released with
 * zfree(), the pointer is reset to NULL and the count to 0, so a later
 * appendServerSaveParams() starts from an empty array. */
1409 static void resetServerSaveParams() {
1410 zfree(server
.saveparams
);
1411 server
.saveparams
= NULL
;
1412 server
.saveparamslen
= 0;
1415 static void initServerConfig() {
1416 server
.dbnum
= REDIS_DEFAULT_DBNUM
;
1417 server
.port
= REDIS_SERVERPORT
;
1418 server
.verbosity
= REDIS_VERBOSE
;
1419 server
.maxidletime
= REDIS_MAXIDLETIME
;
1420 server
.saveparams
= NULL
;
1421 server
.logfile
= NULL
; /* NULL = log on standard output */
1422 server
.bindaddr
= NULL
;
1423 server
.glueoutputbuf
= 1;
1424 server
.daemonize
= 0;
1425 server
.appendonly
= 0;
1426 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1427 server
.lastfsync
= time(NULL
);
1428 server
.appendfd
= -1;
1429 server
.appendseldb
= -1; /* Make sure the first time will not match */
1430 server
.pidfile
= "/var/run/redis.pid";
1431 server
.dbfilename
= "dump.rdb";
1432 server
.appendfilename
= "appendonly.aof";
1433 server
.requirepass
= NULL
;
1434 server
.shareobjects
= 0;
1435 server
.rdbcompression
= 1;
1436 server
.sharingpoolsize
= 1024;
1437 server
.maxclients
= 0;
1438 server
.blpop_blocked_clients
= 0;
1439 server
.maxmemory
= 0;
1440 server
.vm_enabled
= 0;
1441 server
.vm_swap_file
= zstrdup("/tmp/redis-%p.vm");
1442 server
.vm_page_size
= 256; /* 256 bytes per page */
1443 server
.vm_pages
= 1024*1024*100; /* 104 millions of pages */
1444 server
.vm_max_memory
= 1024LL*1024*1024*1; /* 1 GB of RAM */
1445 server
.vm_max_threads
= 4;
1446 server
.vm_blocked_clients
= 0;
1448 resetServerSaveParams();
1450 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1451 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1452 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1453 /* Replication related */
1455 server
.masterauth
= NULL
;
1456 server
.masterhost
= NULL
;
1457 server
.masterport
= 6379;
1458 server
.master
= NULL
;
1459 server
.replstate
= REDIS_REPL_NONE
;
1461 /* Double constants initialization */
1463 R_PosInf
= 1.0/R_Zero
;
1464 R_NegInf
= -1.0/R_Zero
;
1465 R_Nan
= R_Zero
/R_Zero
;
1468 static void initServer() {
1471 signal(SIGHUP
, SIG_IGN
);
1472 signal(SIGPIPE
, SIG_IGN
);
1473 setupSigSegvAction();
1475 server
.devnull
= fopen("/dev/null","w");
1476 if (server
.devnull
== NULL
) {
1477 redisLog(REDIS_WARNING
, "Can't open /dev/null: %s", server
.neterr
);
1480 server
.clients
= listCreate();
1481 server
.slaves
= listCreate();
1482 server
.monitors
= listCreate();
1483 server
.objfreelist
= listCreate();
1484 createSharedObjects();
1485 server
.el
= aeCreateEventLoop();
1486 server
.db
= zmalloc(sizeof(redisDb
)*server
.dbnum
);
1487 server
.sharingpool
= dictCreate(&setDictType
,NULL
);
1488 server
.fd
= anetTcpServer(server
.neterr
, server
.port
, server
.bindaddr
);
1489 if (server
.fd
== -1) {
1490 redisLog(REDIS_WARNING
, "Opening TCP port: %s", server
.neterr
);
1493 for (j
= 0; j
< server
.dbnum
; j
++) {
1494 server
.db
[j
].dict
= dictCreate(&hashDictType
,NULL
);
1495 server
.db
[j
].expires
= dictCreate(&keyptrDictType
,NULL
);
1496 server
.db
[j
].blockingkeys
= dictCreate(&keylistDictType
,NULL
);
1497 if (server
.vm_enabled
)
1498 server
.db
[j
].io_keys
= dictCreate(&keylistDictType
,NULL
);
1499 server
.db
[j
].id
= j
;
1501 server
.cronloops
= 0;
1502 server
.bgsavechildpid
= -1;
1503 server
.bgrewritechildpid
= -1;
1504 server
.bgrewritebuf
= sdsempty();
1505 server
.lastsave
= time(NULL
);
1507 server
.stat_numcommands
= 0;
1508 server
.stat_numconnections
= 0;
1509 server
.stat_starttime
= time(NULL
);
1510 server
.unixtime
= time(NULL
);
1511 aeCreateTimeEvent(server
.el
, 1, serverCron
, NULL
, NULL
);
1512 if (aeCreateFileEvent(server
.el
, server
.fd
, AE_READABLE
,
1513 acceptHandler
, NULL
) == AE_ERR
) oom("creating file event");
1515 if (server
.appendonly
) {
1516 server
.appendfd
= open(server
.appendfilename
,O_WRONLY
|O_APPEND
|O_CREAT
,0644);
1517 if (server
.appendfd
== -1) {
1518 redisLog(REDIS_WARNING
, "Can't open the append-only file: %s",
1524 if (server
.vm_enabled
) vmInit();
1527 /* Empty the whole database: for each of the server.dbnum databases the
 * number of keys in the main dict is added to `removed`, then both the main
 * dict and the expires dict are cleared with dictEmpty().
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the accumulated `removed` count is returned -- confirm. */
1528 static long long emptyDb() {
1530 long long removed
= 0;
1532 for (j
= 0; j
< server
.dbnum
; j
++) {
/* Count before emptying: dictSize() is 0 afterwards. */
1533 removed
+= dictSize(server
.db
[j
].dict
);
1534 dictEmpty(server
.db
[j
].dict
);
1535 dictEmpty(server
.db
[j
].expires
);
/* Convert a yes/no configuration word to an integer: returns 1 when s is
 * "yes" and 0 when s is "no", compared case-insensitively with strcasecmp().
 * NOTE(review): the fallback for any other input is not visible in this
 * excerpt; callers in loadServerConfig() test the result against -1, so
 * that is presumably the "invalid" value -- confirm. */
1540 static int yesnotoi(char *s
) {
1541 if (!strcasecmp(s
,"yes")) return 1;
1542 else if (!strcasecmp(s
,"no")) return 0;
1546 /* I agree, this is a very rudimentary way to load a configuration...
1547 will improve later if the config gets more complex */
1548 static void loadServerConfig(char *filename
) {
1550 char buf
[REDIS_CONFIGLINE_MAX
+1], *err
= NULL
;
1554 if (filename
[0] == '-' && filename
[1] == '\0')
1557 if ((fp
= fopen(filename
,"r")) == NULL
) {
1558 redisLog(REDIS_WARNING
,"Fatal error, can't open config file");
1563 while(fgets(buf
,REDIS_CONFIGLINE_MAX
+1,fp
) != NULL
) {
1569 line
= sdstrim(line
," \t\r\n");
1571 /* Skip comments and blank lines*/
1572 if (line
[0] == '#' || line
[0] == '\0') {
1577 /* Split into arguments */
1578 argv
= sdssplitlen(line
,sdslen(line
)," ",1,&argc
);
1579 sdstolower(argv
[0]);
1581 /* Execute config directives */
1582 if (!strcasecmp(argv
[0],"timeout") && argc
== 2) {
1583 server
.maxidletime
= atoi(argv
[1]);
1584 if (server
.maxidletime
< 0) {
1585 err
= "Invalid timeout value"; goto loaderr
;
1587 } else if (!strcasecmp(argv
[0],"port") && argc
== 2) {
1588 server
.port
= atoi(argv
[1]);
1589 if (server
.port
< 1 || server
.port
> 65535) {
1590 err
= "Invalid port"; goto loaderr
;
1592 } else if (!strcasecmp(argv
[0],"bind") && argc
== 2) {
1593 server
.bindaddr
= zstrdup(argv
[1]);
1594 } else if (!strcasecmp(argv
[0],"save") && argc
== 3) {
1595 int seconds
= atoi(argv
[1]);
1596 int changes
= atoi(argv
[2]);
1597 if (seconds
< 1 || changes
< 0) {
1598 err
= "Invalid save parameters"; goto loaderr
;
1600 appendServerSaveParams(seconds
,changes
);
1601 } else if (!strcasecmp(argv
[0],"dir") && argc
== 2) {
1602 if (chdir(argv
[1]) == -1) {
1603 redisLog(REDIS_WARNING
,"Can't chdir to '%s': %s",
1604 argv
[1], strerror(errno
));
1607 } else if (!strcasecmp(argv
[0],"loglevel") && argc
== 2) {
1608 if (!strcasecmp(argv
[1],"debug")) server
.verbosity
= REDIS_DEBUG
;
1609 else if (!strcasecmp(argv
[1],"verbose")) server
.verbosity
= REDIS_VERBOSE
;
1610 else if (!strcasecmp(argv
[1],"notice")) server
.verbosity
= REDIS_NOTICE
;
1611 else if (!strcasecmp(argv
[1],"warning")) server
.verbosity
= REDIS_WARNING
;
1613 err
= "Invalid log level. Must be one of debug, notice, warning";
1616 } else if (!strcasecmp(argv
[0],"logfile") && argc
== 2) {
1619 server
.logfile
= zstrdup(argv
[1]);
1620 if (!strcasecmp(server
.logfile
,"stdout")) {
1621 zfree(server
.logfile
);
1622 server
.logfile
= NULL
;
1624 if (server
.logfile
) {
1625 /* Test if we are able to open the file. The server will not
1626 * be able to abort just for this problem later... */
1627 logfp
= fopen(server
.logfile
,"a");
1628 if (logfp
== NULL
) {
1629 err
= sdscatprintf(sdsempty(),
1630 "Can't open the log file: %s", strerror(errno
));
1635 } else if (!strcasecmp(argv
[0],"databases") && argc
== 2) {
1636 server
.dbnum
= atoi(argv
[1]);
1637 if (server
.dbnum
< 1) {
1638 err
= "Invalid number of databases"; goto loaderr
;
1640 } else if (!strcasecmp(argv
[0],"maxclients") && argc
== 2) {
1641 server
.maxclients
= atoi(argv
[1]);
1642 } else if (!strcasecmp(argv
[0],"maxmemory") && argc
== 2) {
1643 server
.maxmemory
= strtoll(argv
[1], NULL
, 10);
1644 } else if (!strcasecmp(argv
[0],"slaveof") && argc
== 3) {
1645 server
.masterhost
= sdsnew(argv
[1]);
1646 server
.masterport
= atoi(argv
[2]);
1647 server
.replstate
= REDIS_REPL_CONNECT
;
1648 } else if (!strcasecmp(argv
[0],"masterauth") && argc
== 2) {
1649 server
.masterauth
= zstrdup(argv
[1]);
1650 } else if (!strcasecmp(argv
[0],"glueoutputbuf") && argc
== 2) {
1651 if ((server
.glueoutputbuf
= yesnotoi(argv
[1])) == -1) {
1652 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1654 } else if (!strcasecmp(argv
[0],"shareobjects") && argc
== 2) {
1655 if ((server
.shareobjects
= yesnotoi(argv
[1])) == -1) {
1656 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1658 } else if (!strcasecmp(argv
[0],"rdbcompression") && argc
== 2) {
1659 if ((server
.rdbcompression
= yesnotoi(argv
[1])) == -1) {
1660 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1662 } else if (!strcasecmp(argv
[0],"shareobjectspoolsize") && argc
== 2) {
1663 server
.sharingpoolsize
= atoi(argv
[1]);
1664 if (server
.sharingpoolsize
< 1) {
1665 err
= "invalid object sharing pool size"; goto loaderr
;
1667 } else if (!strcasecmp(argv
[0],"daemonize") && argc
== 2) {
1668 if ((server
.daemonize
= yesnotoi(argv
[1])) == -1) {
1669 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1671 } else if (!strcasecmp(argv
[0],"appendonly") && argc
== 2) {
1672 if ((server
.appendonly
= yesnotoi(argv
[1])) == -1) {
1673 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1675 } else if (!strcasecmp(argv
[0],"appendfsync") && argc
== 2) {
1676 if (!strcasecmp(argv
[1],"no")) {
1677 server
.appendfsync
= APPENDFSYNC_NO
;
1678 } else if (!strcasecmp(argv
[1],"always")) {
1679 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1680 } else if (!strcasecmp(argv
[1],"everysec")) {
1681 server
.appendfsync
= APPENDFSYNC_EVERYSEC
;
1683 err
= "argument must be 'no', 'always' or 'everysec'";
1686 } else if (!strcasecmp(argv
[0],"requirepass") && argc
== 2) {
1687 server
.requirepass
= zstrdup(argv
[1]);
1688 } else if (!strcasecmp(argv
[0],"pidfile") && argc
== 2) {
1689 server
.pidfile
= zstrdup(argv
[1]);
1690 } else if (!strcasecmp(argv
[0],"dbfilename") && argc
== 2) {
1691 server
.dbfilename
= zstrdup(argv
[1]);
1692 } else if (!strcasecmp(argv
[0],"vm-enabled") && argc
== 2) {
1693 if ((server
.vm_enabled
= yesnotoi(argv
[1])) == -1) {
1694 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1696 } else if (!strcasecmp(argv
[0],"vm-swap-file") && argc
== 2) {
1697 zfree(server
.vm_swap_file
);
1698 server
.vm_swap_file
= zstrdup(argv
[1]);
1699 } else if (!strcasecmp(argv
[0],"vm-max-memory") && argc
== 2) {
1700 server
.vm_max_memory
= strtoll(argv
[1], NULL
, 10);
1701 } else if (!strcasecmp(argv
[0],"vm-page-size") && argc
== 2) {
1702 server
.vm_page_size
= strtoll(argv
[1], NULL
, 10);
1703 } else if (!strcasecmp(argv
[0],"vm-pages") && argc
== 2) {
1704 server
.vm_pages
= strtoll(argv
[1], NULL
, 10);
1705 } else if (!strcasecmp(argv
[0],"vm-max-threads") && argc
== 2) {
1706 server
.vm_max_threads
= strtoll(argv
[1], NULL
, 10);
1708 err
= "Bad directive or wrong number of arguments"; goto loaderr
;
1710 for (j
= 0; j
< argc
; j
++)
1715 if (fp
!= stdin
) fclose(fp
);
1719 fprintf(stderr
, "\n*** FATAL CONFIG FILE ERROR ***\n");
1720 fprintf(stderr
, "Reading the configuration file, at line %d\n", linenum
);
1721 fprintf(stderr
, ">>> '%s'\n", line
);
1722 fprintf(stderr
, "%s\n", err
);
/* Release the argument vectors held by a client: decrRefCount() is called
 * on each of the c->argc entries of c->argv and on each of the c->mbargc
 * entries of c->mbargv.
 * NOTE(review): whether argc/mbargc are reset afterwards is not visible in
 * this excerpt -- confirm before calling this twice on the same client. */
1726 static void freeClientArgv(redisClient
*c
) {
1729 for (j
= 0; j
< c
->argc
; j
++)
1730 decrRefCount(c
->argv
[j
]);
1731 for (j
= 0; j
< c
->mbargc
; j
++)
1732 decrRefCount(c
->mbargv
[j
]);
1737 static void freeClient(redisClient
*c
) {
1740 /* Note that if the client we are freeing is blocked in a blocking
1741 * call, we have to set querybuf to NULL *before* calling
1742 * unblockClientWaitingData(), so that processInputBuffer() does not get
1743 * called. It is also important to remove the file events after
1744 * this, because this call adds the READABLE event. */
1745 sdsfree(c
->querybuf
);
1747 if (c
->flags
& REDIS_BLOCKED
)
1748 unblockClientWaitingData(c
);
1750 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
1751 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1752 listRelease(c
->reply
);
1755 /* Remove from the list of clients */
1756 ln
= listSearchKey(server
.clients
,c
);
1757 redisAssert(ln
!= NULL
);
1758 listDelNode(server
.clients
,ln
);
1759 /* Remove from the list of clients waiting for swapped keys */
1760 if (c
->flags
& REDIS_IO_WAIT
&& listLength(c
->io_keys
) == 0) {
1761 ln
= listSearchKey(server
.io_ready_clients
,c
);
1763 listDelNode(server
.io_ready_clients
,ln
);
1764 server
.vm_blocked_clients
--;
1767 while (server
.vm_enabled
&& listLength(c
->io_keys
)) {
1768 ln
= listFirst(c
->io_keys
);
1769 dontWaitForSwappedKey(c
,ln
->value
);
1771 listRelease(c
->io_keys
);
1773 if (c
->flags
& REDIS_SLAVE
) {
1774 if (c
->replstate
== REDIS_REPL_SEND_BULK
&& c
->repldbfd
!= -1)
1776 list
*l
= (c
->flags
& REDIS_MONITOR
) ? server
.monitors
: server
.slaves
;
1777 ln
= listSearchKey(l
,c
);
1778 redisAssert(ln
!= NULL
);
1781 if (c
->flags
& REDIS_MASTER
) {
1782 server
.master
= NULL
;
1783 server
.replstate
= REDIS_REPL_CONNECT
;
1787 freeClientMultiState(c
);
/* Upper bound, in bytes, on the data glued into a single reply object. */
1791 #define GLUEREPLY_UP_TO (1024)
/* Merge leading reply objects of a client into one object.
 *
 * Walks c->reply from the head and copies each object's sds bytes into a
 * stack buffer for as long as the running total stays within
 * GLUEREPLY_UP_TO. Nodes are removed from the list, and if anything was
 * copied a new REDIS_STRING object holding the concatenated bytes is pushed
 * at the head of c->reply. Returns early, adding nothing, when copylen
 * is 0.
 * NOTE(review): the loop exit condition and which loop the listDelNode()
 * call belongs to are not fully visible in this excerpt -- confirm. */
1792 static void glueReplyBuffersIfNeeded(redisClient
*c
) {
1794 char buf
[GLUEREPLY_UP_TO
];
1799 listRewind(c
->reply
,&li
);
1800 while((ln
= listNext(&li
))) {
1804 objlen
= sdslen(o
->ptr
);
/* Only glue while the accumulated bytes still fit in buf. */
1805 if (copylen
+ objlen
<= GLUEREPLY_UP_TO
) {
1806 memcpy(buf
+copylen
,o
->ptr
,objlen
);
1808 listDelNode(c
->reply
,ln
);
1810 if (copylen
== 0) return;
1814 /* Now the output buffer is empty, add the new single element */
1815 o
= createObject(REDIS_STRING
,sdsnewlen(buf
,copylen
));
1816 listAddNodeHead(c
->reply
,o
);
1819 static void sendReplyToClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
1820 redisClient
*c
= privdata
;
1821 int nwritten
= 0, totwritten
= 0, objlen
;
1824 REDIS_NOTUSED(mask
);
1826 /* Use writev() if we have enough buffers to send */
1827 if (!server
.glueoutputbuf
&&
1828 listLength(c
->reply
) > REDIS_WRITEV_THRESHOLD
&&
1829 !(c
->flags
& REDIS_MASTER
))
1831 sendReplyToClientWritev(el
, fd
, privdata
, mask
);
1835 while(listLength(c
->reply
)) {
1836 if (server
.glueoutputbuf
&& listLength(c
->reply
) > 1)
1837 glueReplyBuffersIfNeeded(c
);
1839 o
= listNodeValue(listFirst(c
->reply
));
1840 objlen
= sdslen(o
->ptr
);
1843 listDelNode(c
->reply
,listFirst(c
->reply
));
1847 if (c
->flags
& REDIS_MASTER
) {
1848 /* Don't reply to a master */
1849 nwritten
= objlen
- c
->sentlen
;
1851 nwritten
= write(fd
, ((char*)o
->ptr
)+c
->sentlen
, objlen
- c
->sentlen
);
1852 if (nwritten
<= 0) break;
1854 c
->sentlen
+= nwritten
;
1855 totwritten
+= nwritten
;
1856 /* If we fully sent the object on head go to the next one */
1857 if (c
->sentlen
== objlen
) {
1858 listDelNode(c
->reply
,listFirst(c
->reply
));
1861 /* Note that we avoid sending more than REDIS_MAX_WRITE_PER_EVENT
1862 * bytes, in a single threaded server it's a good idea to serve
1863 * other clients as well, even if a very large request comes from
1864 * super fast link that is always able to accept data (in real world
1865 * scenario think about 'KEYS *' against the loopback interface) */
1866 if (totwritten
> REDIS_MAX_WRITE_PER_EVENT
) break;
1868 if (nwritten
== -1) {
1869 if (errno
== EAGAIN
) {
1872 redisLog(REDIS_VERBOSE
,
1873 "Error writing to client: %s", strerror(errno
));
1878 if (totwritten
> 0) c
->lastinteraction
= time(NULL
);
1879 if (listLength(c
->reply
) == 0) {
1881 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1885 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
)
1887 redisClient
*c
= privdata
;
1888 int nwritten
= 0, totwritten
= 0, objlen
, willwrite
;
1890 struct iovec iov
[REDIS_WRITEV_IOVEC_COUNT
];
1891 int offset
, ion
= 0;
1893 REDIS_NOTUSED(mask
);
1896 while (listLength(c
->reply
)) {
1897 offset
= c
->sentlen
;
1901 /* fill-in the iov[] array */
1902 for(node
= listFirst(c
->reply
); node
; node
= listNextNode(node
)) {
1903 o
= listNodeValue(node
);
1904 objlen
= sdslen(o
->ptr
);
1906 if (totwritten
+ objlen
- offset
> REDIS_MAX_WRITE_PER_EVENT
)
1909 if(ion
== REDIS_WRITEV_IOVEC_COUNT
)
1910 break; /* no more iovecs */
1912 iov
[ion
].iov_base
= ((char*)o
->ptr
) + offset
;
1913 iov
[ion
].iov_len
= objlen
- offset
;
1914 willwrite
+= objlen
- offset
;
1915 offset
= 0; /* just for the first item */
1922 /* write all collected blocks at once */
1923 if((nwritten
= writev(fd
, iov
, ion
)) < 0) {
1924 if (errno
!= EAGAIN
) {
1925 redisLog(REDIS_VERBOSE
,
1926 "Error writing to client: %s", strerror(errno
));
1933 totwritten
+= nwritten
;
1934 offset
= c
->sentlen
;
1936 /* remove written robjs from c->reply */
1937 while (nwritten
&& listLength(c
->reply
)) {
1938 o
= listNodeValue(listFirst(c
->reply
));
1939 objlen
= sdslen(o
->ptr
);
1941 if(nwritten
>= objlen
- offset
) {
1942 listDelNode(c
->reply
, listFirst(c
->reply
));
1943 nwritten
-= objlen
- offset
;
1947 c
->sentlen
+= nwritten
;
1955 c
->lastinteraction
= time(NULL
);
1957 if (listLength(c
->reply
) == 0) {
1959 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1963 static struct redisCommand
*lookupCommand(char *name
) {
1965 while(cmdTable
[j
].name
!= NULL
) {
1966 if (!strcasecmp(name
,cmdTable
[j
].name
)) return &cmdTable
[j
];
1972 /* resetClient prepare the client to process the next command */
1973 static void resetClient(redisClient
*c
) {
1979 /* Call() is the core of Redis execution of a command */
1980 static void call(redisClient
*c
, struct redisCommand
*cmd
) {
1983 dirty
= server
.dirty
;
1985 if (server
.appendonly
&& server
.dirty
-dirty
)
1986 feedAppendOnlyFile(cmd
,c
->db
->id
,c
->argv
,c
->argc
);
1987 if (server
.dirty
-dirty
&& listLength(server
.slaves
))
1988 replicationFeedSlaves(server
.slaves
,cmd
,c
->db
->id
,c
->argv
,c
->argc
);
1989 if (listLength(server
.monitors
))
1990 replicationFeedSlaves(server
.monitors
,cmd
,c
->db
->id
,c
->argv
,c
->argc
);
1991 server
.stat_numcommands
++;
1994 /* If this function gets called we already read a whole
1995 * command, arguments are in the client argv/argc fields.
1996 * processCommand() executes the command or prepares the
1997 * server for a bulk read from the client.
1999 * If 1 is returned the client is still alive and valid and
2000 * other operations can be performed by the caller. Otherwise
2001 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2002 static int processCommand(redisClient
*c
) {
2003 struct redisCommand
*cmd
;
2005 /* Free some memory if needed (maxmemory setting) */
2006 if (server
.maxmemory
) freeMemoryIfNeeded();
2008 /* Handle the multi bulk command type. This is an alternative protocol
2009 * supported by Redis in order to receive commands that are composed of
2010 * multiple binary-safe "bulk" arguments. The latency of processing is
2011 * a bit higher but this allows things like multi-sets, so if this
2012 * protocol is used only for MSET and similar commands this is a big win. */
2013 if (c
->multibulk
== 0 && c
->argc
== 1 && ((char*)(c
->argv
[0]->ptr
))[0] == '*') {
2014 c
->multibulk
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2015 if (c
->multibulk
<= 0) {
2019 decrRefCount(c
->argv
[c
->argc
-1]);
2023 } else if (c
->multibulk
) {
2024 if (c
->bulklen
== -1) {
2025 if (((char*)c
->argv
[0]->ptr
)[0] != '$') {
2026 addReplySds(c
,sdsnew("-ERR multi bulk protocol error\r\n"));
2030 int bulklen
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2031 decrRefCount(c
->argv
[0]);
2032 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2034 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2039 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2043 c
->mbargv
= zrealloc(c
->mbargv
,(sizeof(robj
*))*(c
->mbargc
+1));
2044 c
->mbargv
[c
->mbargc
] = c
->argv
[0];
2048 if (c
->multibulk
== 0) {
2052 /* Here we need to swap the multi-bulk argc/argv with the
2053 * normal argc/argv of the client structure. */
2055 c
->argv
= c
->mbargv
;
2056 c
->mbargv
= auxargv
;
2059 c
->argc
= c
->mbargc
;
2060 c
->mbargc
= auxargc
;
2062 /* We need to set bulklen to something different than -1
2063 * in order for the code below to process the command without
2064 * trying to read the last argument of a bulk command as
2065 * a special argument. */
2067 /* continue below and process the command */
2074 /* -- end of multi bulk commands processing -- */
2076 /* The QUIT command is handled as a special case. Normal command
2077 * procs are unable to close the client connection safely */
2078 if (!strcasecmp(c
->argv
[0]->ptr
,"quit")) {
2083 /* Now look up the command and check ASAP for trivial error conditions
2084 * such as wrong arity, bad command name and so forth. */
2085 cmd
= lookupCommand(c
->argv
[0]->ptr
);
2088 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2089 (char*)c
->argv
[0]->ptr
));
2092 } else if ((cmd
->arity
> 0 && cmd
->arity
!= c
->argc
) ||
2093 (c
->argc
< -cmd
->arity
)) {
2095 sdscatprintf(sdsempty(),
2096 "-ERR wrong number of arguments for '%s' command\r\n",
2100 } else if (server
.maxmemory
&& cmd
->flags
& REDIS_CMD_DENYOOM
&& zmalloc_used_memory() > server
.maxmemory
) {
2101 addReplySds(c
,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2104 } else if (cmd
->flags
& REDIS_CMD_BULK
&& c
->bulklen
== -1) {
2105 /* This is a bulk command, we have to read the last argument yet. */
2106 int bulklen
= atoi(c
->argv
[c
->argc
-1]->ptr
);
2108 decrRefCount(c
->argv
[c
->argc
-1]);
2109 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2111 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2116 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2117 /* It is possible that the bulk read is already in the
2118 * buffer. Check this condition and handle it accordingly.
2119 * This is just a fast path, an alternative to calling processInputBuffer().
2120 * It's a good idea since the code is small and this condition
2121 * happens most of the time. */
2122 if ((signed)sdslen(c
->querybuf
) >= c
->bulklen
) {
2123 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2125 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2127 /* Otherwise return... the last argument still has to be read
2128 * from the socket. */
2132 /* Let's try to share objects on the command arguments vector */
2133 if (server
.shareobjects
) {
2135 for(j
= 1; j
< c
->argc
; j
++)
2136 c
->argv
[j
] = tryObjectSharing(c
->argv
[j
]);
2138 /* Let's try to encode the bulk object to save space. */
2139 if (cmd
->flags
& REDIS_CMD_BULK
)
2140 tryObjectEncoding(c
->argv
[c
->argc
-1]);
2142 /* Check if the user is authenticated */
2143 if (server
.requirepass
&& !c
->authenticated
&& cmd
->proc
!= authCommand
) {
2144 addReplySds(c
,sdsnew("-ERR operation not permitted\r\n"));
2149 /* Exec the command */
2150 if (c
->flags
& REDIS_MULTI
&& cmd
->proc
!= execCommand
&& cmd
->proc
!= discardCommand
) {
2151 queueMultiCommand(c
,cmd
);
2152 addReply(c
,shared
.queued
);
2154 if (server
.vm_enabled
&& server
.vm_max_threads
> 0 &&
2155 blockClientOnSwappedKeys(cmd
,c
)) return 1;
2159 /* Prepare the client for the next command */
2164 static void replicationFeedSlaves(list
*slaves
, struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
2169 /* (args*2)+1 is enough room for args, spaces, newlines */
2170 robj
*static_outv
[REDIS_STATIC_ARGS
*2+1];
2172 if (argc
<= REDIS_STATIC_ARGS
) {
2175 outv
= zmalloc(sizeof(robj
*)*(argc
*2+1));
2178 for (j
= 0; j
< argc
; j
++) {
2179 if (j
!= 0) outv
[outc
++] = shared
.space
;
2180 if ((cmd
->flags
& REDIS_CMD_BULK
) && j
== argc
-1) {
2183 lenobj
= createObject(REDIS_STRING
,
2184 sdscatprintf(sdsempty(),"%lu\r\n",
2185 (unsigned long) stringObjectLen(argv
[j
])));
2186 lenobj
->refcount
= 0;
2187 outv
[outc
++] = lenobj
;
2189 outv
[outc
++] = argv
[j
];
2191 outv
[outc
++] = shared
.crlf
;
2193 /* Increment all the refcounts at start and decrement at end in order to
2194 * be sure to free objects if there is no slave in a replication state
2195 * able to be fed with commands */
2196 for (j
= 0; j
< outc
; j
++) incrRefCount(outv
[j
]);
2197 listRewind(slaves
,&li
);
2198 while((ln
= listNext(&li
))) {
2199 redisClient
*slave
= ln
->value
;
2201 /* Don't feed slaves that are still waiting for BGSAVE to start */
2202 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) continue;
2204 /* Feed all the other slaves, MONITORs and so on */
2205 if (slave
->slaveseldb
!= dictid
) {
2209 case 0: selectcmd
= shared
.select0
; break;
2210 case 1: selectcmd
= shared
.select1
; break;
2211 case 2: selectcmd
= shared
.select2
; break;
2212 case 3: selectcmd
= shared
.select3
; break;
2213 case 4: selectcmd
= shared
.select4
; break;
2214 case 5: selectcmd
= shared
.select5
; break;
2215 case 6: selectcmd
= shared
.select6
; break;
2216 case 7: selectcmd
= shared
.select7
; break;
2217 case 8: selectcmd
= shared
.select8
; break;
2218 case 9: selectcmd
= shared
.select9
; break;
2220 selectcmd
= createObject(REDIS_STRING
,
2221 sdscatprintf(sdsempty(),"select %d\r\n",dictid
));
2222 selectcmd
->refcount
= 0;
2225 addReply(slave
,selectcmd
);
2226 slave
->slaveseldb
= dictid
;
2228 for (j
= 0; j
< outc
; j
++) addReply(slave
,outv
[j
]);
2230 for (j
= 0; j
< outc
; j
++) decrRefCount(outv
[j
]);
2231 if (outv
!= static_outv
) zfree(outv
);
2234 static void processInputBuffer(redisClient
*c
) {
2236 /* Before processing the input buffer, make sure the client is not
2237 * waiting for a blocking operation such as BLPOP. Note that on the first
2238 * iteration the client is never blocked, otherwise processInputBuffer()
2239 * would not be called at all, but after the execution of the first commands
2240 * in the input buffer the client may be blocked, and the "goto again"
2241 * will try to reiterate. The following line will make it return asap. */
2242 if (c
->flags
& REDIS_BLOCKED
|| c
->flags
& REDIS_IO_WAIT
) return;
2243 if (c
->bulklen
== -1) {
2244 /* Read the first line of the query */
2245 char *p
= strchr(c
->querybuf
,'\n');
2252 query
= c
->querybuf
;
2253 c
->querybuf
= sdsempty();
2254 querylen
= 1+(p
-(query
));
2255 if (sdslen(query
) > querylen
) {
2256 /* leave data after the first line of the query in the buffer */
2257 c
->querybuf
= sdscatlen(c
->querybuf
,query
+querylen
,sdslen(query
)-querylen
);
2259 *p
= '\0'; /* remove "\n" */
2260 if (*(p
-1) == '\r') *(p
-1) = '\0'; /* and "\r" if any */
2261 sdsupdatelen(query
);
2263 /* Now we can split the query in arguments */
2264 argv
= sdssplitlen(query
,sdslen(query
)," ",1,&argc
);
2267 if (c
->argv
) zfree(c
->argv
);
2268 c
->argv
= zmalloc(sizeof(robj
*)*argc
);
2270 for (j
= 0; j
< argc
; j
++) {
2271 if (sdslen(argv
[j
])) {
2272 c
->argv
[c
->argc
] = createObject(REDIS_STRING
,argv
[j
]);
2280 /* Execute the command. If the client is still valid
2281 * after processCommand() return and there is something
2282 * on the query buffer try to process the next command. */
2283 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2285 /* Nothing to process, argc == 0. Just process the query
2286 * buffer if it's not empty or return to the caller */
2287 if (sdslen(c
->querybuf
)) goto again
;
2290 } else if (sdslen(c
->querybuf
) >= REDIS_REQUEST_MAX_SIZE
) {
2291 redisLog(REDIS_VERBOSE
, "Client protocol error");
2296 /* Bulk read handling. Note that if we are at this point
2297 the client already sent a command terminated with a newline,
2298 we are reading the bulk data that is actually the last
2299 argument of the command. */
2300 int qbl
= sdslen(c
->querybuf
);
2302 if (c
->bulklen
<= qbl
) {
2303 /* Copy everything but the final CRLF as final argument */
2304 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2306 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2307 /* Process the command. If the client is still valid after
2308 * the processing and there is more data in the buffer
2309 * try to parse it. */
2310 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2316 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2317 redisClient
*c
= (redisClient
*) privdata
;
2318 char buf
[REDIS_IOBUF_LEN
];
2321 REDIS_NOTUSED(mask
);
2323 nread
= read(fd
, buf
, REDIS_IOBUF_LEN
);
2325 if (errno
== EAGAIN
) {
2328 redisLog(REDIS_VERBOSE
, "Reading from client: %s",strerror(errno
));
2332 } else if (nread
== 0) {
2333 redisLog(REDIS_VERBOSE
, "Client closed connection");
2338 c
->querybuf
= sdscatlen(c
->querybuf
, buf
, nread
);
2339 c
->lastinteraction
= time(NULL
);
2343 if (!(c
->flags
& REDIS_BLOCKED
))
2344 processInputBuffer(c
);
2347 static int selectDb(redisClient
*c
, int id
) {
2348 if (id
< 0 || id
>= server
.dbnum
)
2350 c
->db
= &server
.db
[id
];
2354 static void *dupClientReplyValue(void *o
) {
2355 incrRefCount((robj
*)o
);
2359 static redisClient
*createClient(int fd
) {
2360 redisClient
*c
= zmalloc(sizeof(*c
));
2362 anetNonBlock(NULL
,fd
);
2363 anetTcpNoDelay(NULL
,fd
);
2364 if (!c
) return NULL
;
2367 c
->querybuf
= sdsempty();
2376 c
->lastinteraction
= time(NULL
);
2377 c
->authenticated
= 0;
2378 c
->replstate
= REDIS_REPL_NONE
;
2379 c
->reply
= listCreate();
2380 listSetFreeMethod(c
->reply
,decrRefCount
);
2381 listSetDupMethod(c
->reply
,dupClientReplyValue
);
2382 c
->blockingkeys
= NULL
;
2383 c
->blockingkeysnum
= 0;
2384 c
->io_keys
= listCreate();
2385 listSetFreeMethod(c
->io_keys
,decrRefCount
);
2386 if (aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
2387 readQueryFromClient
, c
) == AE_ERR
) {
2391 listAddNodeTail(server
.clients
,c
);
2392 initClientMultiState(c
);
2396 static void addReply(redisClient
*c
, robj
*obj
) {
2397 if (listLength(c
->reply
) == 0 &&
2398 (c
->replstate
== REDIS_REPL_NONE
||
2399 c
->replstate
== REDIS_REPL_ONLINE
) &&
2400 aeCreateFileEvent(server
.el
, c
->fd
, AE_WRITABLE
,
2401 sendReplyToClient
, c
) == AE_ERR
) return;
2403 if (server
.vm_enabled
&& obj
->storage
!= REDIS_VM_MEMORY
) {
2404 obj
= dupStringObject(obj
);
2405 obj
->refcount
= 0; /* getDecodedObject() will increment the refcount */
2407 listAddNodeTail(c
->reply
,getDecodedObject(obj
));
2410 static void addReplySds(redisClient
*c
, sds s
) {
2411 robj
*o
= createObject(REDIS_STRING
,s
);
2416 static void addReplyDouble(redisClient
*c
, double d
) {
2419 snprintf(buf
,sizeof(buf
),"%.17g",d
);
2420 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2421 (unsigned long) strlen(buf
),buf
));
2424 static void addReplyLong(redisClient
*c
, long l
) {
2428 len
= snprintf(buf
,sizeof(buf
),":%ld\r\n",l
);
2429 addReplySds(c
,sdsnewlen(buf
,len
));
2432 static void addReplyBulkLen(redisClient
*c
, robj
*obj
) {
2435 if (obj
->encoding
== REDIS_ENCODING_RAW
) {
2436 len
= sdslen(obj
->ptr
);
2438 long n
= (long)obj
->ptr
;
2440 /* Compute how many bytes will take this integer as a radix 10 string */
2446 while((n
= n
/10) != 0) {
2450 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len
));
2453 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2458 REDIS_NOTUSED(mask
);
2459 REDIS_NOTUSED(privdata
);
2461 cfd
= anetAccept(server
.neterr
, fd
, cip
, &cport
);
2462 if (cfd
== AE_ERR
) {
2463 redisLog(REDIS_VERBOSE
,"Accepting client connection: %s", server
.neterr
);
2466 redisLog(REDIS_VERBOSE
,"Accepted %s:%d", cip
, cport
);
2467 if ((c
= createClient(cfd
)) == NULL
) {
2468 redisLog(REDIS_WARNING
,"Error allocating resoures for the client");
2469 close(cfd
); /* May be already closed, just ingore errors */
2472 /* If the maxclients directive is set and this is one client more... close the
2473 * connection. Note that we create the client instead of checking for this
2474 * condition beforehand, since now the socket is already set in nonblocking
2475 * mode and we can send an error for free using the Kernel I/O */
2476 if (server
.maxclients
&& listLength(server
.clients
) > server
.maxclients
) {
2477 char *err
= "-ERR max number of clients reached\r\n";
2479 /* That's a best effort error message, don't check write errors */
2480 if (write(c
->fd
,err
,strlen(err
)) == -1) {
2481 /* Nothing to do, Just to avoid the warning... */
2486 server
.stat_numconnections
++;
2489 /* ======================= Redis objects implementation ===================== */
2491 static robj
*createObject(int type
, void *ptr
) {
2494 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2495 if (listLength(server
.objfreelist
)) {
2496 listNode
*head
= listFirst(server
.objfreelist
);
2497 o
= listNodeValue(head
);
2498 listDelNode(server
.objfreelist
,head
);
2499 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2501 if (server
.vm_enabled
) {
2502 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2503 o
= zmalloc(sizeof(*o
));
2505 o
= zmalloc(sizeof(*o
)-sizeof(struct redisObjectVM
));
2509 o
->encoding
= REDIS_ENCODING_RAW
;
2512 if (server
.vm_enabled
) {
2513 /* Note that this code may run in the context of an I/O thread
2514 * and accessing to server.unixtime in theory is an error
2515 * (no locks). But in practice this is safe, and even if we read
2516 * garbage Redis will not fail, as it's just a statistical info */
2517 o
->vm
.atime
= server
.unixtime
;
2518 o
->storage
= REDIS_VM_MEMORY
;
2523 static robj
*createStringObject(char *ptr
, size_t len
) {
2524 return createObject(REDIS_STRING
,sdsnewlen(ptr
,len
));
2527 static robj
*dupStringObject(robj
*o
) {
2528 assert(o
->encoding
== REDIS_ENCODING_RAW
);
2529 return createStringObject(o
->ptr
,sdslen(o
->ptr
));
2532 static robj
*createListObject(void) {
2533 list
*l
= listCreate();
2535 listSetFreeMethod(l
,decrRefCount
);
2536 return createObject(REDIS_LIST
,l
);
2539 static robj
*createSetObject(void) {
2540 dict
*d
= dictCreate(&setDictType
,NULL
);
2541 return createObject(REDIS_SET
,d
);
2544 static robj
*createZsetObject(void) {
2545 zset
*zs
= zmalloc(sizeof(*zs
));
2547 zs
->dict
= dictCreate(&zsetDictType
,NULL
);
2548 zs
->zsl
= zslCreate();
2549 return createObject(REDIS_ZSET
,zs
);
2552 static void freeStringObject(robj
*o
) {
2553 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2558 static void freeListObject(robj
*o
) {
2559 listRelease((list
*) o
->ptr
);
2562 static void freeSetObject(robj
*o
) {
2563 dictRelease((dict
*) o
->ptr
);
2566 static void freeZsetObject(robj
*o
) {
2569 dictRelease(zs
->dict
);
2574 static void freeHashObject(robj
*o
) {
2575 dictRelease((dict
*) o
->ptr
);
2578 static void incrRefCount(robj
*o
) {
2579 redisAssert(!server
.vm_enabled
|| o
->storage
== REDIS_VM_MEMORY
);
2583 static void decrRefCount(void *obj
) {
2586 /* Object is a key of a swapped out value, or in the process of being
2588 if (server
.vm_enabled
&&
2589 (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
))
2591 if (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
) {
2592 redisAssert(o
->refcount
== 1);
2594 if (o
->storage
== REDIS_VM_LOADING
) vmCancelThreadedIOJob(obj
);
2595 redisAssert(o
->type
== REDIS_STRING
);
2596 freeStringObject(o
);
2597 vmMarkPagesFree(o
->vm
.page
,o
->vm
.usedpages
);
2598 pthread_mutex_lock(&server
.obj_freelist_mutex
);
2599 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2600 !listAddNodeHead(server
.objfreelist
,o
))
2602 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2603 server
.vm_stats_swapped_objects
--;
2606 /* Object is in memory, or in the process of being swapped out. */
2607 if (--(o
->refcount
) == 0) {
2608 if (server
.vm_enabled
&& o
->storage
== REDIS_VM_SWAPPING
)
2609 vmCancelThreadedIOJob(obj
);
2611 case REDIS_STRING
: freeStringObject(o
); break;
2612 case REDIS_LIST
: freeListObject(o
); break;
2613 case REDIS_SET
: freeSetObject(o
); break;
2614 case REDIS_ZSET
: freeZsetObject(o
); break;
2615 case REDIS_HASH
: freeHashObject(o
); break;
2616 default: redisAssert(0 != 0); break;
2618 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2619 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2620 !listAddNodeHead(server
.objfreelist
,o
))
2622 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2626 static robj
*lookupKey(redisDb
*db
, robj
*key
) {
2627 dictEntry
*de
= dictFind(db
->dict
,key
);
2629 robj
*key
= dictGetEntryKey(de
);
2630 robj
*val
= dictGetEntryVal(de
);
2632 if (server
.vm_enabled
) {
2633 if (key
->storage
== REDIS_VM_MEMORY
||
2634 key
->storage
== REDIS_VM_SWAPPING
)
2636 /* If we were swapping the object out, stop it, this key
2638 if (key
->storage
== REDIS_VM_SWAPPING
)
2639 vmCancelThreadedIOJob(key
);
2640 /* Update the access time of the key for the aging algorithm. */
2641 key
->vm
.atime
= server
.unixtime
;
2643 int notify
= (key
->storage
== REDIS_VM_LOADING
);
2645 /* Our value was swapped on disk. Bring it at home. */
2646 redisAssert(val
== NULL
);
2647 val
= vmLoadObject(key
);
2648 dictGetEntryVal(de
) = val
;
2650 /* Clients blocked by the VM subsystem may be waiting for
2652 if (notify
) handleClientsBlockedOnSwappedKey(db
,key
);
2661 static robj
*lookupKeyRead(redisDb
*db
, robj
*key
) {
2662 expireIfNeeded(db
,key
);
2663 return lookupKey(db
,key
);
2666 static robj
*lookupKeyWrite(redisDb
*db
, robj
*key
) {
2667 deleteIfVolatile(db
,key
);
2668 return lookupKey(db
,key
);
2671 static int deleteKey(redisDb
*db
, robj
*key
) {
2674 /* We need to protect key from destruction: after the first dictDelete()
2675 * it may happen that 'key' is no longer valid if we don't increment
2676 * its count. This may happen when we get the object reference directly
2677 * from the hash table with dictRandomKey() or dict iterators */
2679 if (dictSize(db
->expires
)) dictDelete(db
->expires
,key
);
2680 retval
= dictDelete(db
->dict
,key
);
2683 return retval
== DICT_OK
;
2686 /* Try to share an object against the shared objects pool */
2687 static robj
*tryObjectSharing(robj
*o
) {
2688 struct dictEntry
*de
;
2691 if (o
== NULL
|| server
.shareobjects
== 0) return o
;
2693 redisAssert(o
->type
== REDIS_STRING
);
2694 de
= dictFind(server
.sharingpool
,o
);
2696 robj
*shared
= dictGetEntryKey(de
);
2698 c
= ((unsigned long) dictGetEntryVal(de
))+1;
2699 dictGetEntryVal(de
) = (void*) c
;
2700 incrRefCount(shared
);
2704 /* Here we are using a stream algorithm: every time an object is
2705 * shared we increment its count, every time there is a miss we
2706 * decrement the counter of a random object. If this object reaches
2707 * zero we remove the object and put the current object instead. */
2708 if (dictSize(server
.sharingpool
) >=
2709 server
.sharingpoolsize
) {
2710 de
= dictGetRandomKey(server
.sharingpool
);
2711 redisAssert(de
!= NULL
);
2712 c
= ((unsigned long) dictGetEntryVal(de
))-1;
2713 dictGetEntryVal(de
) = (void*) c
;
2715 dictDelete(server
.sharingpool
,de
->key
);
2718 c
= 0; /* If the pool is empty we want to add this object */
2723 retval
= dictAdd(server
.sharingpool
,o
,(void*)1);
2724 redisAssert(retval
== DICT_OK
);
2731 /* Check if the nul-terminated string 's' can be represented by a long
2732 * (that is, is a number that fits into long without any other space or
2733 * character before or after the digits).
2735 * If so, the function returns REDIS_OK and *longval is set to the value
2736 * of the number. Otherwise REDIS_ERR is returned */
2737 static int isStringRepresentableAsLong(sds s
, long *longval
) {
2738 char buf
[32], *endptr
;
2742 value
= strtol(s
, &endptr
, 10);
2743 if (endptr
[0] != '\0') return REDIS_ERR
;
2744 slen
= snprintf(buf
,32,"%ld",value
);
2746 /* If the number converted back into a string is not identical
2747 * then it's not possible to encode the string as integer */
2748 if (sdslen(s
) != (unsigned)slen
|| memcmp(buf
,s
,slen
)) return REDIS_ERR
;
2749 if (longval
) *longval
= value
;
2753 /* Try to encode a string object in order to save space */
2754 static int tryObjectEncoding(robj
*o
) {
2758 if (o
->encoding
!= REDIS_ENCODING_RAW
)
2759 return REDIS_ERR
; /* Already encoded */
2761 /* It's not safe to encode shared objects: shared objects can be shared
2762 * everywhere in the "object space" of Redis. Encoded objects can only
2763 * appear as "values" (and not, for instance, as keys) */
2764 if (o
->refcount
> 1) return REDIS_ERR
;
2766 /* Currently we try to encode only strings */
2767 redisAssert(o
->type
== REDIS_STRING
);
2769 /* Check if we can represent this string as a long integer */
2770 if (isStringRepresentableAsLong(s
,&value
) == REDIS_ERR
) return REDIS_ERR
;
2772 /* Ok, this object can be encoded */
2773 o
->encoding
= REDIS_ENCODING_INT
;
2775 o
->ptr
= (void*) value
;
2779 /* Get a decoded version of an encoded object (returned as a new object).
2780 * If the object is already raw-encoded just increment the ref count. */
2781 static robj
*getDecodedObject(robj
*o
) {
2784 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2788 if (o
->type
== REDIS_STRING
&& o
->encoding
== REDIS_ENCODING_INT
) {
2791 snprintf(buf
,32,"%ld",(long)o
->ptr
);
2792 dec
= createStringObject(buf
,strlen(buf
));
2795 redisAssert(1 != 1);
2799 /* Compare two string objects via strcmp() or alike.
2800 * Note that the objects may be integer-encoded. In such a case we
2801 * use snprintf() to get a string representation of the numbers on the stack
2802 * and compare the strings, it's much faster than calling getDecodedObject().
2804 * Important note: if objects are not integer encoded, but binary-safe strings,
2805 * sdscmp() from sds.c will apply memcmp() so this function can be considered
2807 static int compareStringObjects(robj
*a
, robj
*b
) {
2808 redisAssert(a
->type
== REDIS_STRING
&& b
->type
== REDIS_STRING
);
2809 char bufa
[128], bufb
[128], *astr
, *bstr
;
2812 if (a
== b
) return 0;
2813 if (a
->encoding
!= REDIS_ENCODING_RAW
) {
2814 snprintf(bufa
,sizeof(bufa
),"%ld",(long) a
->ptr
);
2820 if (b
->encoding
!= REDIS_ENCODING_RAW
) {
2821 snprintf(bufb
,sizeof(bufb
),"%ld",(long) b
->ptr
);
2827 return bothsds
? sdscmp(astr
,bstr
) : strcmp(astr
,bstr
);
2830 static size_t stringObjectLen(robj
*o
) {
2831 redisAssert(o
->type
== REDIS_STRING
);
2832 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2833 return sdslen(o
->ptr
);
2837 return snprintf(buf
,32,"%ld",(long)o
->ptr
);
2841 /*============================ RDB saving/loading =========================== */
2843 static int rdbSaveType(FILE *fp
, unsigned char type
) {
2844 if (fwrite(&type
,1,1,fp
) == 0) return -1;
2848 static int rdbSaveTime(FILE *fp
, time_t t
) {
2849 int32_t t32
= (int32_t) t
;
2850 if (fwrite(&t32
,4,1,fp
) == 0) return -1;
2854 /* check rdbLoadLen() comments for more info */
2855 static int rdbSaveLen(FILE *fp
, uint32_t len
) {
2856 unsigned char buf
[2];
2859 /* Save a 6 bit len */
2860 buf
[0] = (len
&0xFF)|(REDIS_RDB_6BITLEN
<<6);
2861 if (fwrite(buf
,1,1,fp
) == 0) return -1;
2862 } else if (len
< (1<<14)) {
2863 /* Save a 14 bit len */
2864 buf
[0] = ((len
>>8)&0xFF)|(REDIS_RDB_14BITLEN
<<6);
2866 if (fwrite(buf
,2,1,fp
) == 0) return -1;
2868 /* Save a 32 bit len */
2869 buf
[0] = (REDIS_RDB_32BITLEN
<<6);
2870 if (fwrite(buf
,1,1,fp
) == 0) return -1;
2872 if (fwrite(&len
,4,1,fp
) == 0) return -1;
2877 /* String objects in the form "2391" "-100" without any space and with a
2878 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2879 * encoded as integers to save space */
2880 static int rdbTryIntegerEncoding(sds s
, unsigned char *enc
) {
2882 char *endptr
, buf
[32];
2884 /* Check if it's possible to encode this value as a number */
2885 value
= strtoll(s
, &endptr
, 10);
2886 if (endptr
[0] != '\0') return 0;
2887 snprintf(buf
,32,"%lld",value
);
2889 /* If the number converted back into a string is not identical
2890 * then it's not possible to encode the string as integer */
2891 if (strlen(buf
) != sdslen(s
) || memcmp(buf
,s
,sdslen(s
))) return 0;
2893 /* Finally check if it fits in our ranges */
2894 if (value
>= -(1<<7) && value
<= (1<<7)-1) {
2895 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT8
;
2896 enc
[1] = value
&0xFF;
2898 } else if (value
>= -(1<<15) && value
<= (1<<15)-1) {
2899 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT16
;
2900 enc
[1] = value
&0xFF;
2901 enc
[2] = (value
>>8)&0xFF;
2903 } else if (value
>= -((long long)1<<31) && value
<= ((long long)1<<31)-1) {
2904 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT32
;
2905 enc
[1] = value
&0xFF;
2906 enc
[2] = (value
>>8)&0xFF;
2907 enc
[3] = (value
>>16)&0xFF;
2908 enc
[4] = (value
>>24)&0xFF;
2915 static int rdbSaveLzfStringObject(FILE *fp
, robj
*obj
) {
2916 unsigned int comprlen
, outlen
;
2920 /* We require at least four bytes compression for this to be worth it */
2921 outlen
= sdslen(obj
->ptr
)-4;
2922 if (outlen
<= 0) return 0;
2923 if ((out
= zmalloc(outlen
+1)) == NULL
) return 0;
2924 comprlen
= lzf_compress(obj
->ptr
, sdslen(obj
->ptr
), out
, outlen
);
2925 if (comprlen
== 0) {
2929 /* Data compressed! Let's save it on disk */
2930 byte
= (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_LZF
;
2931 if (fwrite(&byte
,1,1,fp
) == 0) goto writeerr
;
2932 if (rdbSaveLen(fp
,comprlen
) == -1) goto writeerr
;
2933 if (rdbSaveLen(fp
,sdslen(obj
->ptr
)) == -1) goto writeerr
;
2934 if (fwrite(out
,comprlen
,1,fp
) == 0) goto writeerr
;
2943 /* Save a string object as [len][data] on disk. If the object is a string
2944 * representation of an integer value we try to save it in a special form */
2945 static int rdbSaveStringObjectRaw(FILE *fp
, robj
*obj
) {
2949 len
= sdslen(obj
->ptr
);
2951 /* Try integer encoding */
2953 unsigned char buf
[5];
2954 if ((enclen
= rdbTryIntegerEncoding(obj
->ptr
,buf
)) > 0) {
2955 if (fwrite(buf
,enclen
,1,fp
) == 0) return -1;
2960 /* Try LZF compression - under 20 bytes it's unable to compress even
2961 * aaaaaaaaaaaaaaaaaa so skip it */
2962 if (server
.rdbcompression
&& len
> 20) {
2965 retval
= rdbSaveLzfStringObject(fp
,obj
);
2966 if (retval
== -1) return -1;
2967 if (retval
> 0) return 0;
2968 /* retval == 0 means data can't be compressed, save the old way */
2971 /* Store verbatim */
2972 if (rdbSaveLen(fp
,len
) == -1) return -1;
2973 if (len
&& fwrite(obj
->ptr
,len
,1,fp
) == 0) return -1;
2977 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
2978 static int rdbSaveStringObject(FILE *fp
, robj
*obj
) {
2981 /* Avoid incr/decr ref count business when possible.
2982 * This plays well with copy-on-write given that we are probably
2983 * in a child process (BGSAVE). Also this makes sure key objects
2984 * of swapped objects are not incRefCount-ed (an assert does not allow
2985 * this in order to avoid bugs) */
2986 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
2987 obj
= getDecodedObject(obj
);
2988 retval
= rdbSaveStringObjectRaw(fp
,obj
);
2991 retval
= rdbSaveStringObjectRaw(fp
,obj
);
2996 /* Save a double value. Doubles are saved as strings prefixed by an unsigned
2997 * 8 bit integer specifying the length of the representation.
2998 * This 8 bit integer has special values in order to specify the following
3004 static int rdbSaveDoubleValue(FILE *fp
, double val
) {
3005 unsigned char buf
[128];
3011 } else if (!isfinite(val
)) {
3013 buf
[0] = (val
< 0) ? 255 : 254;
3015 snprintf((char*)buf
+1,sizeof(buf
)-1,"%.17g",val
);
3016 buf
[0] = strlen((char*)buf
+1);
3019 if (fwrite(buf
,len
,1,fp
) == 0) return -1;
3023 /* Save a Redis object. */
3024 static int rdbSaveObject(FILE *fp
, robj
*o
) {
3025 if (o
->type
== REDIS_STRING
) {
3026 /* Save a string value */
3027 if (rdbSaveStringObject(fp
,o
) == -1) return -1;
3028 } else if (o
->type
== REDIS_LIST
) {
3029 /* Save a list value */
3030 list
*list
= o
->ptr
;
3034 if (rdbSaveLen(fp
,listLength(list
)) == -1) return -1;
3035 listRewind(list
,&li
);
3036 while((ln
= listNext(&li
))) {
3037 robj
*eleobj
= listNodeValue(ln
);
3039 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3041 } else if (o
->type
== REDIS_SET
) {
3042 /* Save a set value */
3044 dictIterator
*di
= dictGetIterator(set
);
3047 if (rdbSaveLen(fp
,dictSize(set
)) == -1) return -1;
3048 while((de
= dictNext(di
)) != NULL
) {
3049 robj
*eleobj
= dictGetEntryKey(de
);
3051 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3053 dictReleaseIterator(di
);
3054 } else if (o
->type
== REDIS_ZSET
) {
3055 /* Save a sorted set value */
3057 dictIterator
*di
= dictGetIterator(zs
->dict
);
3060 if (rdbSaveLen(fp
,dictSize(zs
->dict
)) == -1) return -1;
3061 while((de
= dictNext(di
)) != NULL
) {
3062 robj
*eleobj
= dictGetEntryKey(de
);
3063 double *score
= dictGetEntryVal(de
);
3065 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3066 if (rdbSaveDoubleValue(fp
,*score
) == -1) return -1;
3068 dictReleaseIterator(di
);
3070 redisAssert(0 != 0);
3075 /* Return the length the object will have on disk if saved with
3076 * the rdbSaveObject() function. Currently we use a trick to get
3077 * this length with very little changes to the code. In the future
3078 * we could switch to a faster solution. */
3079 static off_t
rdbSavedObjectLen(robj
*o
, FILE *fp
) {
3080 if (fp
== NULL
) fp
= server
.devnull
;
3082 assert(rdbSaveObject(fp
,o
) != 1);
3086 /* Return the number of pages required to save this object in the swap file */
3087 static off_t
rdbSavedObjectPages(robj
*o
, FILE *fp
) {
3088 off_t bytes
= rdbSavedObjectLen(o
,fp
);
3090 return (bytes
+(server
.vm_page_size
-1))/server
.vm_page_size
;
3093 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3094 static int rdbSave(char *filename
) {
3095 dictIterator
*di
= NULL
;
3100 time_t now
= time(NULL
);
3102 /* Wait for I/O threads to terminate, just in case this is a
3103 * foreground-saving, to avoid seeking the swap file descriptor at the
3105 if (server
.vm_enabled
)
3106 waitEmptyIOJobsQueue();
3108 snprintf(tmpfile
,256,"temp-%d.rdb", (int) getpid());
3109 fp
= fopen(tmpfile
,"w");
3111 redisLog(REDIS_WARNING
, "Failed saving the DB: %s", strerror(errno
));
3114 if (fwrite("REDIS0001",9,1,fp
) == 0) goto werr
;
3115 for (j
= 0; j
< server
.dbnum
; j
++) {
3116 redisDb
*db
= server
.db
+j
;
3118 if (dictSize(d
) == 0) continue;
3119 di
= dictGetIterator(d
);
3125 /* Write the SELECT DB opcode */
3126 if (rdbSaveType(fp
,REDIS_SELECTDB
) == -1) goto werr
;
3127 if (rdbSaveLen(fp
,j
) == -1) goto werr
;
3129 /* Iterate this DB writing every entry */
3130 while((de
= dictNext(di
)) != NULL
) {
3131 robj
*key
= dictGetEntryKey(de
);
3132 robj
*o
= dictGetEntryVal(de
);
3133 time_t expiretime
= getExpire(db
,key
);
3135 /* Save the expire time */
3136 if (expiretime
!= -1) {
3137 /* If this key is already expired skip it */
3138 if (expiretime
< now
) continue;
3139 if (rdbSaveType(fp
,REDIS_EXPIRETIME
) == -1) goto werr
;
3140 if (rdbSaveTime(fp
,expiretime
) == -1) goto werr
;
3142 /* Save the key and associated value. This requires special
3143 * handling if the value is swapped out. */
3144 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
3145 key
->storage
== REDIS_VM_SWAPPING
) {
3146 /* Save type, key, value */
3147 if (rdbSaveType(fp
,o
->type
) == -1) goto werr
;
3148 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3149 if (rdbSaveObject(fp
,o
) == -1) goto werr
;
3151 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3153 /* Get a preview of the object in memory */
3154 po
= vmPreviewObject(key
);
3155 /* Save type, key, value */
3156 if (rdbSaveType(fp
,key
->vtype
) == -1) goto werr
;
3157 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3158 if (rdbSaveObject(fp
,po
) == -1) goto werr
;
3159 /* Remove the loaded object from memory */
3163 dictReleaseIterator(di
);
3166 if (rdbSaveType(fp
,REDIS_EOF
) == -1) goto werr
;
3168 /* Make sure data will not remain on the OS's output buffers */
3173 /* Use RENAME to make sure the DB file is changed atomically only
3174 * if the generated DB file is ok. */
3175 if (rename(tmpfile
,filename
) == -1) {
3176 redisLog(REDIS_WARNING
,"Error moving temp DB file on the final destination: %s", strerror(errno
));
3180 redisLog(REDIS_NOTICE
,"DB saved on disk");
3182 server
.lastsave
= time(NULL
);
3188 redisLog(REDIS_WARNING
,"Write error saving DB on disk: %s", strerror(errno
));
3189 if (di
) dictReleaseIterator(di
);
3193 static int rdbSaveBackground(char *filename
) {
3196 if (server
.bgsavechildpid
!= -1) return REDIS_ERR
;
3197 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
3198 if ((childpid
= fork()) == 0) {
3200 if (server
.vm_enabled
) vmReopenSwapFile();
3202 if (rdbSave(filename
) == REDIS_OK
) {
3209 if (childpid
== -1) {
3210 redisLog(REDIS_WARNING
,"Can't save in background: fork: %s",
3214 redisLog(REDIS_NOTICE
,"Background saving started by pid %d",childpid
);
3215 server
.bgsavechildpid
= childpid
;
3218 return REDIS_OK
; /* unreached */
3221 static void rdbRemoveTempFile(pid_t childpid
) {
3224 snprintf(tmpfile
,256,"temp-%d.rdb", (int) childpid
);
3228 static int rdbLoadType(FILE *fp
) {
3230 if (fread(&type
,1,1,fp
) == 0) return -1;
3234 static time_t rdbLoadTime(FILE *fp
) {
3236 if (fread(&t32
,4,1,fp
) == 0) return -1;
3237 return (time_t) t32
;
3240 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3241 * of this file for a description of how these are stored on disk.
3243 * isencoded is set to 1 if the value read is not actually a length but
3244 * an "encoding type", check the above comments for more info */
3245 static uint32_t rdbLoadLen(FILE *fp
, int *isencoded
) {
3246 unsigned char buf
[2];
3250 if (isencoded
) *isencoded
= 0;
3251 if (fread(buf
,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3252 type
= (buf
[0]&0xC0)>>6;
3253 if (type
== REDIS_RDB_6BITLEN
) {
3254 /* Read a 6 bit len */
3256 } else if (type
== REDIS_RDB_ENCVAL
) {
3257 /* Read a 6 bit len encoding type */
3258 if (isencoded
) *isencoded
= 1;
3260 } else if (type
== REDIS_RDB_14BITLEN
) {
3261 /* Read a 14 bit len */
3262 if (fread(buf
+1,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3263 return ((buf
[0]&0x3F)<<8)|buf
[1];
3265 /* Read a 32 bit len */
3266 if (fread(&len
,4,1,fp
) == 0) return REDIS_RDB_LENERR
;
3271 static robj
*rdbLoadIntegerObject(FILE *fp
, int enctype
) {
3272 unsigned char enc
[4];
3275 if (enctype
== REDIS_RDB_ENC_INT8
) {
3276 if (fread(enc
,1,1,fp
) == 0) return NULL
;
3277 val
= (signed char)enc
[0];
3278 } else if (enctype
== REDIS_RDB_ENC_INT16
) {
3280 if (fread(enc
,2,1,fp
) == 0) return NULL
;
3281 v
= enc
[0]|(enc
[1]<<8);
3283 } else if (enctype
== REDIS_RDB_ENC_INT32
) {
3285 if (fread(enc
,4,1,fp
) == 0) return NULL
;
3286 v
= enc
[0]|(enc
[1]<<8)|(enc
[2]<<16)|(enc
[3]<<24);
3289 val
= 0; /* anti-warning */
3292 return createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",val
));
3295 static robj
*rdbLoadLzfStringObject(FILE*fp
) {
3296 unsigned int len
, clen
;
3297 unsigned char *c
= NULL
;
3300 if ((clen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3301 if ((len
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3302 if ((c
= zmalloc(clen
)) == NULL
) goto err
;
3303 if ((val
= sdsnewlen(NULL
,len
)) == NULL
) goto err
;
3304 if (fread(c
,clen
,1,fp
) == 0) goto err
;
3305 if (lzf_decompress(c
,clen
,val
,len
) == 0) goto err
;
3307 return createObject(REDIS_STRING
,val
);
3314 static robj
*rdbLoadStringObject(FILE*fp
) {
3319 len
= rdbLoadLen(fp
,&isencoded
);
3322 case REDIS_RDB_ENC_INT8
:
3323 case REDIS_RDB_ENC_INT16
:
3324 case REDIS_RDB_ENC_INT32
:
3325 return tryObjectSharing(rdbLoadIntegerObject(fp
,len
));
3326 case REDIS_RDB_ENC_LZF
:
3327 return tryObjectSharing(rdbLoadLzfStringObject(fp
));
3333 if (len
== REDIS_RDB_LENERR
) return NULL
;
3334 val
= sdsnewlen(NULL
,len
);
3335 if (len
&& fread(val
,len
,1,fp
) == 0) {
3339 return tryObjectSharing(createObject(REDIS_STRING
,val
));
3342 /* For information about double serialization check rdbSaveDoubleValue() */
3343 static int rdbLoadDoubleValue(FILE *fp
, double *val
) {
3347 if (fread(&len
,1,1,fp
) == 0) return -1;
3349 case 255: *val
= R_NegInf
; return 0;
3350 case 254: *val
= R_PosInf
; return 0;
3351 case 253: *val
= R_Nan
; return 0;
3353 if (fread(buf
,len
,1,fp
) == 0) return -1;
3355 sscanf(buf
, "%lg", val
);
3360 /* Load a Redis object of the specified type from the specified file.
3361 * On success a newly allocated object is returned, otherwise NULL. */
3362 static robj
*rdbLoadObject(int type
, FILE *fp
) {
3365 if (type
== REDIS_STRING
) {
3366 /* Read string value */
3367 if ((o
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3368 tryObjectEncoding(o
);
3369 } else if (type
== REDIS_LIST
|| type
== REDIS_SET
) {
3370 /* Read list/set value */
3373 if ((listlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3374 o
= (type
== REDIS_LIST
) ? createListObject() : createSetObject();
3375 /* It's faster to expand the dict to the right size asap in order
3376 * to avoid rehashing */
3377 if (type
== REDIS_SET
&& listlen
> DICT_HT_INITIAL_SIZE
)
3378 dictExpand(o
->ptr
,listlen
);
3379 /* Load every single element of the list/set */
3383 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3384 tryObjectEncoding(ele
);
3385 if (type
== REDIS_LIST
) {
3386 listAddNodeTail((list
*)o
->ptr
,ele
);
3388 dictAdd((dict
*)o
->ptr
,ele
,NULL
);
3391 } else if (type
== REDIS_ZSET
) {
3392 /* Read sorted set value */
3396 if ((zsetlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3397 o
= createZsetObject();
3399 /* Load every single element of the sorted set */
3402 double *score
= zmalloc(sizeof(double));
3404 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3405 tryObjectEncoding(ele
);
3406 if (rdbLoadDoubleValue(fp
,score
) == -1) return NULL
;
3407 dictAdd(zs
->dict
,ele
,score
);
3408 zslInsert(zs
->zsl
,*score
,ele
);
3409 incrRefCount(ele
); /* added to skiplist */
3412 redisAssert(0 != 0);
3417 static int rdbLoad(char *filename
) {
3419 robj
*keyobj
= NULL
;
3421 int type
, retval
, rdbver
;
3422 dict
*d
= server
.db
[0].dict
;
3423 redisDb
*db
= server
.db
+0;
3425 time_t expiretime
= -1, now
= time(NULL
);
3426 long long loadedkeys
= 0;
3428 fp
= fopen(filename
,"r");
3429 if (!fp
) return REDIS_ERR
;
3430 if (fread(buf
,9,1,fp
) == 0) goto eoferr
;
3432 if (memcmp(buf
,"REDIS",5) != 0) {
3434 redisLog(REDIS_WARNING
,"Wrong signature trying to load DB from file");
3437 rdbver
= atoi(buf
+5);
3440 redisLog(REDIS_WARNING
,"Can't handle RDB format version %d",rdbver
);
3447 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3448 if (type
== REDIS_EXPIRETIME
) {
3449 if ((expiretime
= rdbLoadTime(fp
)) == -1) goto eoferr
;
3450 /* We read the time so we need to read the object type again */
3451 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3453 if (type
== REDIS_EOF
) break;
3454 /* Handle SELECT DB opcode as a special case */
3455 if (type
== REDIS_SELECTDB
) {
3456 if ((dbid
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
)
3458 if (dbid
>= (unsigned)server
.dbnum
) {
3459 redisLog(REDIS_WARNING
,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server
.dbnum
);
3462 db
= server
.db
+dbid
;
3467 if ((keyobj
= rdbLoadStringObject(fp
)) == NULL
) goto eoferr
;
3469 if ((o
= rdbLoadObject(type
,fp
)) == NULL
) goto eoferr
;
3470 /* Add the new object in the hash table */
3471 retval
= dictAdd(d
,keyobj
,o
);
3472 if (retval
== DICT_ERR
) {
3473 redisLog(REDIS_WARNING
,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj
->ptr
);
3476 /* Set the expire time if needed */
3477 if (expiretime
!= -1) {
3478 setExpire(db
,keyobj
,expiretime
);
3479 /* Delete this key if already expired */
3480 if (expiretime
< now
) deleteKey(db
,keyobj
);
3484 /* Handle swapping while loading big datasets when VM is on */
3486 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
3487 while (zmalloc_used_memory() > server
.vm_max_memory
) {
3488 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
3495 eoferr
: /* unexpected end of file is handled here with a fatal exit */
3496 if (keyobj
) decrRefCount(keyobj
);
3497 redisLog(REDIS_WARNING
,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3499 return REDIS_ERR
; /* Just to avoid warning */
3502 /*================================== Commands =============================== */
3504 static void authCommand(redisClient
*c
) {
3505 if (!server
.requirepass
|| !strcmp(c
->argv
[1]->ptr
, server
.requirepass
)) {
3506 c
->authenticated
= 1;
3507 addReply(c
,shared
.ok
);
3509 c
->authenticated
= 0;
3510 addReplySds(c
,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3514 static void pingCommand(redisClient
*c
) {
3515 addReply(c
,shared
.pong
);
3518 static void echoCommand(redisClient
*c
) {
3519 addReplyBulkLen(c
,c
->argv
[1]);
3520 addReply(c
,c
->argv
[1]);
3521 addReply(c
,shared
.crlf
);
3524 /*=================================== Strings =============================== */
3526 static void setGenericCommand(redisClient
*c
, int nx
) {
3529 if (nx
) deleteIfVolatile(c
->db
,c
->argv
[1]);
3530 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3531 if (retval
== DICT_ERR
) {
3533 /* If the key refers to a swapped value, we want a new key object
3534 * to overwrite the old. So we delete the old key in the database.
3535 * This will also make sure that swap pages used by the old object
3536 * will be marked as free. */
3537 if (deleteIfSwapped(c
->db
,c
->argv
[1]))
3538 incrRefCount(c
->argv
[1]);
3539 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3540 incrRefCount(c
->argv
[2]);
3542 addReply(c
,shared
.czero
);
3546 incrRefCount(c
->argv
[1]);
3547 incrRefCount(c
->argv
[2]);
3550 removeExpire(c
->db
,c
->argv
[1]);
3551 addReply(c
, nx
? shared
.cone
: shared
.ok
);
3554 static void setCommand(redisClient
*c
) {
3555 setGenericCommand(c
,0);
3558 static void setnxCommand(redisClient
*c
) {
3559 setGenericCommand(c
,1);
3562 static int getGenericCommand(redisClient
*c
) {
3563 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[1]);
3566 addReply(c
,shared
.nullbulk
);
3569 if (o
->type
!= REDIS_STRING
) {
3570 addReply(c
,shared
.wrongtypeerr
);
3573 addReplyBulkLen(c
,o
);
3575 addReply(c
,shared
.crlf
);
3581 static void getCommand(redisClient
*c
) {
3582 getGenericCommand(c
);
3585 static void getsetCommand(redisClient
*c
) {
3586 if (getGenericCommand(c
) == REDIS_ERR
) return;
3587 if (dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]) == DICT_ERR
) {
3588 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3590 incrRefCount(c
->argv
[1]);
3592 incrRefCount(c
->argv
[2]);
3594 removeExpire(c
->db
,c
->argv
[1]);
3597 static void mgetCommand(redisClient
*c
) {
3600 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-1));
3601 for (j
= 1; j
< c
->argc
; j
++) {
3602 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[j
]);
3604 addReply(c
,shared
.nullbulk
);
3606 if (o
->type
!= REDIS_STRING
) {
3607 addReply(c
,shared
.nullbulk
);
3609 addReplyBulkLen(c
,o
);
3611 addReply(c
,shared
.crlf
);
3617 static void msetGenericCommand(redisClient
*c
, int nx
) {
3618 int j
, busykeys
= 0;
3620 if ((c
->argc
% 2) == 0) {
3621 addReplySds(c
,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3624 /* Handle the NX flag. The MSETNX semantics are to return zero and set
3625 * nothing at all if at least one key already exists. */
3627 for (j
= 1; j
< c
->argc
; j
+= 2) {
3628 if (lookupKeyWrite(c
->db
,c
->argv
[j
]) != NULL
) {
3634 addReply(c
, shared
.czero
);
3638 for (j
= 1; j
< c
->argc
; j
+= 2) {
3641 tryObjectEncoding(c
->argv
[j
+1]);
3642 retval
= dictAdd(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
3643 if (retval
== DICT_ERR
) {
3644 dictReplace(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
3645 incrRefCount(c
->argv
[j
+1]);
3647 incrRefCount(c
->argv
[j
]);
3648 incrRefCount(c
->argv
[j
+1]);
3650 removeExpire(c
->db
,c
->argv
[j
]);
3652 server
.dirty
+= (c
->argc
-1)/2;
3653 addReply(c
, nx
? shared
.cone
: shared
.ok
);
3656 static void msetCommand(redisClient
*c
) {
3657 msetGenericCommand(c
,0);
3660 static void msetnxCommand(redisClient
*c
) {
3661 msetGenericCommand(c
,1);
3664 static void incrDecrCommand(redisClient
*c
, long long incr
) {
3669 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3673 if (o
->type
!= REDIS_STRING
) {
3678 if (o
->encoding
== REDIS_ENCODING_RAW
)
3679 value
= strtoll(o
->ptr
, &eptr
, 10);
3680 else if (o
->encoding
== REDIS_ENCODING_INT
)
3681 value
= (long)o
->ptr
;
3683 redisAssert(1 != 1);
3688 o
= createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",value
));
3689 tryObjectEncoding(o
);
3690 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],o
);
3691 if (retval
== DICT_ERR
) {
3692 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
3693 removeExpire(c
->db
,c
->argv
[1]);
3695 incrRefCount(c
->argv
[1]);
3698 addReply(c
,shared
.colon
);
3700 addReply(c
,shared
.crlf
);
3703 static void incrCommand(redisClient
*c
) {
3704 incrDecrCommand(c
,1);
3707 static void decrCommand(redisClient
*c
) {
3708 incrDecrCommand(c
,-1);
3711 static void incrbyCommand(redisClient
*c
) {
3712 long long incr
= strtoll(c
->argv
[2]->ptr
, NULL
, 10);
3713 incrDecrCommand(c
,incr
);
3716 static void decrbyCommand(redisClient
*c
) {
3717 long long incr
= strtoll(c
->argv
[2]->ptr
, NULL
, 10);
3718 incrDecrCommand(c
,-incr
);
3721 static void appendCommand(redisClient
*c
) {
3726 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3728 /* Create the key */
3729 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3730 incrRefCount(c
->argv
[1]);
3731 incrRefCount(c
->argv
[2]);
3732 totlen
= stringObjectLen(c
->argv
[2]);
3736 de
= dictFind(c
->db
->dict
,c
->argv
[1]);
3739 o
= dictGetEntryVal(de
);
3740 if (o
->type
!= REDIS_STRING
) {
3741 addReply(c
,shared
.wrongtypeerr
);
3744 /* If the object is specially encoded or shared we have to make
3746 if (o
->refcount
!= 1 || o
->encoding
!= REDIS_ENCODING_RAW
) {
3747 robj
*decoded
= getDecodedObject(o
);
3749 o
= createStringObject(decoded
->ptr
, sdslen(decoded
->ptr
));
3750 decrRefCount(decoded
);
3751 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
3754 if (c
->argv
[2]->encoding
== REDIS_ENCODING_RAW
) {
3755 o
->ptr
= sdscatlen(o
->ptr
,
3756 c
->argv
[2]->ptr
, sdslen(c
->argv
[2]->ptr
));
3758 o
->ptr
= sdscatprintf(o
->ptr
, "%ld",
3759 (unsigned long) c
->argv
[2]->ptr
);
3761 totlen
= sdslen(o
->ptr
);
3764 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen
));
3767 /* ========================= Type agnostic commands ========================= */
3769 static void delCommand(redisClient
*c
) {
3772 for (j
= 1; j
< c
->argc
; j
++) {
3773 if (deleteKey(c
->db
,c
->argv
[j
])) {
3780 addReply(c
,shared
.czero
);
3783 addReply(c
,shared
.cone
);
3786 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",deleted
));
3791 static void existsCommand(redisClient
*c
) {
3792 addReply(c
,lookupKeyRead(c
->db
,c
->argv
[1]) ? shared
.cone
: shared
.czero
);
3795 static void selectCommand(redisClient
*c
) {
3796 int id
= atoi(c
->argv
[1]->ptr
);
3798 if (selectDb(c
,id
) == REDIS_ERR
) {
3799 addReplySds(c
,sdsnew("-ERR invalid DB index\r\n"));
3801 addReply(c
,shared
.ok
);
3805 static void randomkeyCommand(redisClient
*c
) {
3809 de
= dictGetRandomKey(c
->db
->dict
);
3810 if (!de
|| expireIfNeeded(c
->db
,dictGetEntryKey(de
)) == 0) break;
3813 addReply(c
,shared
.plus
);
3814 addReply(c
,shared
.crlf
);
3816 addReply(c
,shared
.plus
);
3817 addReply(c
,dictGetEntryKey(de
));
3818 addReply(c
,shared
.crlf
);
3822 static void keysCommand(redisClient
*c
) {
3825 sds pattern
= c
->argv
[1]->ptr
;
3826 int plen
= sdslen(pattern
);
3827 unsigned long numkeys
= 0;
3828 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
3830 di
= dictGetIterator(c
->db
->dict
);
3832 decrRefCount(lenobj
);
3833 while((de
= dictNext(di
)) != NULL
) {
3834 robj
*keyobj
= dictGetEntryKey(de
);
3836 sds key
= keyobj
->ptr
;
3837 if ((pattern
[0] == '*' && pattern
[1] == '\0') ||
3838 stringmatchlen(pattern
,plen
,key
,sdslen(key
),0)) {
3839 if (expireIfNeeded(c
->db
,keyobj
) == 0) {
3840 addReplyBulkLen(c
,keyobj
);
3842 addReply(c
,shared
.crlf
);
3847 dictReleaseIterator(di
);
3848 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",numkeys
);
3851 static void dbsizeCommand(redisClient
*c
) {
3853 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c
->db
->dict
)));
3856 static void lastsaveCommand(redisClient
*c
) {
3858 sdscatprintf(sdsempty(),":%lu\r\n",server
.lastsave
));
3861 static void typeCommand(redisClient
*c
) {
3865 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
3870 case REDIS_STRING
: type
= "+string"; break;
3871 case REDIS_LIST
: type
= "+list"; break;
3872 case REDIS_SET
: type
= "+set"; break;
3873 case REDIS_ZSET
: type
= "+zset"; break;
3874 default: type
= "unknown"; break;
3877 addReplySds(c
,sdsnew(type
));
3878 addReply(c
,shared
.crlf
);
3881 static void saveCommand(redisClient
*c
) {
3882 if (server
.bgsavechildpid
!= -1) {
3883 addReplySds(c
,sdsnew("-ERR background save in progress\r\n"));
3886 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
3887 addReply(c
,shared
.ok
);
3889 addReply(c
,shared
.err
);
3893 static void bgsaveCommand(redisClient
*c
) {
3894 if (server
.bgsavechildpid
!= -1) {
3895 addReplySds(c
,sdsnew("-ERR background save already in progress\r\n"));
3898 if (rdbSaveBackground(server
.dbfilename
) == REDIS_OK
) {
3899 char *status
= "+Background saving started\r\n";
3900 addReplySds(c
,sdsnew(status
));
3902 addReply(c
,shared
.err
);
3906 static void shutdownCommand(redisClient
*c
) {
3907 redisLog(REDIS_WARNING
,"User requested shutdown, saving DB...");
3908 /* Kill the saving child if there is a background saving in progress.
3909 We want to avoid race conditions, for instance our saving child may
3910 overwrite the synchronous saving done by SHUTDOWN. */
3911 if (server
.bgsavechildpid
!= -1) {
3912 redisLog(REDIS_WARNING
,"There is a live saving child. Killing it!");
3913 kill(server
.bgsavechildpid
,SIGKILL
);
3914 rdbRemoveTempFile(server
.bgsavechildpid
);
3916 if (server
.appendonly
) {
3917 /* Append only file: fsync() the AOF and exit */
3918 fsync(server
.appendfd
);
3919 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
3922 /* Snapshotting. Perform a SYNC SAVE and exit */
3923 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
3924 if (server
.daemonize
)
3925 unlink(server
.pidfile
);
3926 redisLog(REDIS_WARNING
,"%zu bytes used at exit",zmalloc_used_memory());
3927 redisLog(REDIS_WARNING
,"Server exit now, bye bye...");
3928 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
3931 /* Ooops.. error saving! The best we can do is to continue operating.
3932 * Note that if there was a background saving process, in the next
3933 * cron() Redis will be notified that the background saving aborted,
3934 * handling special stuff like slaves pending for synchronization... */
3935 redisLog(REDIS_WARNING
,"Error trying to save the DB, can't exit");
3936 addReplySds(c
,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
3941 static void renameGenericCommand(redisClient
*c
, int nx
) {
3944 /* To use the same key as src and dst is probably an error */
3945 if (sdscmp(c
->argv
[1]->ptr
,c
->argv
[2]->ptr
) == 0) {
3946 addReply(c
,shared
.sameobjecterr
);
3950 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3952 addReply(c
,shared
.nokeyerr
);
3956 deleteIfVolatile(c
->db
,c
->argv
[2]);
3957 if (dictAdd(c
->db
->dict
,c
->argv
[2],o
) == DICT_ERR
) {
3960 addReply(c
,shared
.czero
);
3963 dictReplace(c
->db
->dict
,c
->argv
[2],o
);
3965 incrRefCount(c
->argv
[2]);
3967 deleteKey(c
->db
,c
->argv
[1]);
3969 addReply(c
,nx
? shared
.cone
: shared
.ok
);
3972 static void renameCommand(redisClient
*c
) {
3973 renameGenericCommand(c
,0);
3976 static void renamenxCommand(redisClient
*c
) {
3977 renameGenericCommand(c
,1);
3980 static void moveCommand(redisClient
*c
) {
3985 /* Obtain source and target DB pointers */
3988 if (selectDb(c
,atoi(c
->argv
[2]->ptr
)) == REDIS_ERR
) {
3989 addReply(c
,shared
.outofrangeerr
);
3993 selectDb(c
,srcid
); /* Back to the source DB */
3995 /* If the target DB is the same as the source DB the move is
3996 * probably an error. */
3998 addReply(c
,shared
.sameobjecterr
);
4002 /* Check if the element exists and get a reference */
4003 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4005 addReply(c
,shared
.czero
);
4009 /* Try to add the element to the target DB */
4010 deleteIfVolatile(dst
,c
->argv
[1]);
4011 if (dictAdd(dst
->dict
,c
->argv
[1],o
) == DICT_ERR
) {
4012 addReply(c
,shared
.czero
);
4015 incrRefCount(c
->argv
[1]);
4018 /* OK! key moved, free the entry in the source DB */
4019 deleteKey(src
,c
->argv
[1]);
4021 addReply(c
,shared
.cone
);
4024 /* =================================== Lists ================================ */
4025 static void pushGenericCommand(redisClient
*c
, int where
) {
4029 lobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4031 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4032 addReply(c
,shared
.ok
);
4035 lobj
= createListObject();
4037 if (where
== REDIS_HEAD
) {
4038 listAddNodeHead(list
,c
->argv
[2]);
4040 listAddNodeTail(list
,c
->argv
[2]);
4042 dictAdd(c
->db
->dict
,c
->argv
[1],lobj
);
4043 incrRefCount(c
->argv
[1]);
4044 incrRefCount(c
->argv
[2]);
4046 if (lobj
->type
!= REDIS_LIST
) {
4047 addReply(c
,shared
.wrongtypeerr
);
4050 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4051 addReply(c
,shared
.ok
);
4055 if (where
== REDIS_HEAD
) {
4056 listAddNodeHead(list
,c
->argv
[2]);
4058 listAddNodeTail(list
,c
->argv
[2]);
4060 incrRefCount(c
->argv
[2]);
4063 addReply(c
,shared
.ok
);
4066 static void lpushCommand(redisClient
*c
) {
4067 pushGenericCommand(c
,REDIS_HEAD
);
4070 static void rpushCommand(redisClient
*c
) {
4071 pushGenericCommand(c
,REDIS_TAIL
);
4074 static void llenCommand(redisClient
*c
) {
4078 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4080 addReply(c
,shared
.czero
);
4083 if (o
->type
!= REDIS_LIST
) {
4084 addReply(c
,shared
.wrongtypeerr
);
4087 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",listLength(l
)));
4092 static void lindexCommand(redisClient
*c
) {
4094 int index
= atoi(c
->argv
[2]->ptr
);
4096 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4098 addReply(c
,shared
.nullbulk
);
4100 if (o
->type
!= REDIS_LIST
) {
4101 addReply(c
,shared
.wrongtypeerr
);
4103 list
*list
= o
->ptr
;
4106 ln
= listIndex(list
, index
);
4108 addReply(c
,shared
.nullbulk
);
4110 robj
*ele
= listNodeValue(ln
);
4111 addReplyBulkLen(c
,ele
);
4113 addReply(c
,shared
.crlf
);
4119 static void lsetCommand(redisClient
*c
) {
4121 int index
= atoi(c
->argv
[2]->ptr
);
4123 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4125 addReply(c
,shared
.nokeyerr
);
4127 if (o
->type
!= REDIS_LIST
) {
4128 addReply(c
,shared
.wrongtypeerr
);
4130 list
*list
= o
->ptr
;
4133 ln
= listIndex(list
, index
);
4135 addReply(c
,shared
.outofrangeerr
);
4137 robj
*ele
= listNodeValue(ln
);
4140 listNodeValue(ln
) = c
->argv
[3];
4141 incrRefCount(c
->argv
[3]);
4142 addReply(c
,shared
.ok
);
4149 static void popGenericCommand(redisClient
*c
, int where
) {
4152 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4154 addReply(c
,shared
.nullbulk
);
4156 if (o
->type
!= REDIS_LIST
) {
4157 addReply(c
,shared
.wrongtypeerr
);
4159 list
*list
= o
->ptr
;
4162 if (where
== REDIS_HEAD
)
4163 ln
= listFirst(list
);
4165 ln
= listLast(list
);
4168 addReply(c
,shared
.nullbulk
);
4170 robj
*ele
= listNodeValue(ln
);
4171 addReplyBulkLen(c
,ele
);
4173 addReply(c
,shared
.crlf
);
4174 listDelNode(list
,ln
);
4181 static void lpopCommand(redisClient
*c
) {
4182 popGenericCommand(c
,REDIS_HEAD
);
4185 static void rpopCommand(redisClient
*c
) {
4186 popGenericCommand(c
,REDIS_TAIL
);
4189 static void lrangeCommand(redisClient
*c
) {
4191 int start
= atoi(c
->argv
[2]->ptr
);
4192 int end
= atoi(c
->argv
[3]->ptr
);
4194 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4196 addReply(c
,shared
.nullmultibulk
);
4198 if (o
->type
!= REDIS_LIST
) {
4199 addReply(c
,shared
.wrongtypeerr
);
4201 list
*list
= o
->ptr
;
4203 int llen
= listLength(list
);
4207 /* convert negative indexes */
4208 if (start
< 0) start
= llen
+start
;
4209 if (end
< 0) end
= llen
+end
;
4210 if (start
< 0) start
= 0;
4211 if (end
< 0) end
= 0;
4213 /* indexes sanity checks */
4214 if (start
> end
|| start
>= llen
) {
4215 /* Out of range start or start > end result in empty list */
4216 addReply(c
,shared
.emptymultibulk
);
4219 if (end
>= llen
) end
= llen
-1;
4220 rangelen
= (end
-start
)+1;
4222 /* Return the result in form of a multi-bulk reply */
4223 ln
= listIndex(list
, start
);
4224 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",rangelen
));
4225 for (j
= 0; j
< rangelen
; j
++) {
4226 ele
= listNodeValue(ln
);
4227 addReplyBulkLen(c
,ele
);
4229 addReply(c
,shared
.crlf
);
4236 static void ltrimCommand(redisClient
*c
) {
4238 int start
= atoi(c
->argv
[2]->ptr
);
4239 int end
= atoi(c
->argv
[3]->ptr
);
4241 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4243 addReply(c
,shared
.ok
);
4245 if (o
->type
!= REDIS_LIST
) {
4246 addReply(c
,shared
.wrongtypeerr
);
4248 list
*list
= o
->ptr
;
4250 int llen
= listLength(list
);
4251 int j
, ltrim
, rtrim
;
4253 /* convert negative indexes */
4254 if (start
< 0) start
= llen
+start
;
4255 if (end
< 0) end
= llen
+end
;
4256 if (start
< 0) start
= 0;
4257 if (end
< 0) end
= 0;
4259 /* indexes sanity checks */
4260 if (start
> end
|| start
>= llen
) {
4261 /* Out of range start or start > end result in empty list */
4265 if (end
>= llen
) end
= llen
-1;
4270 /* Remove list elements to perform the trim */
4271 for (j
= 0; j
< ltrim
; j
++) {
4272 ln
= listFirst(list
);
4273 listDelNode(list
,ln
);
4275 for (j
= 0; j
< rtrim
; j
++) {
4276 ln
= listLast(list
);
4277 listDelNode(list
,ln
);
4280 addReply(c
,shared
.ok
);
4285 static void lremCommand(redisClient
*c
) {
4288 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4290 addReply(c
,shared
.czero
);
4292 if (o
->type
!= REDIS_LIST
) {
4293 addReply(c
,shared
.wrongtypeerr
);
4295 list
*list
= o
->ptr
;
4296 listNode
*ln
, *next
;
4297 int toremove
= atoi(c
->argv
[2]->ptr
);
4302 toremove
= -toremove
;
4305 ln
= fromtail
? list
->tail
: list
->head
;
4307 robj
*ele
= listNodeValue(ln
);
4309 next
= fromtail
? ln
->prev
: ln
->next
;
4310 if (compareStringObjects(ele
,c
->argv
[3]) == 0) {
4311 listDelNode(list
,ln
);
4314 if (toremove
&& removed
== toremove
) break;
4318 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",removed
));
4323 /* This is the semantic of this command:
4324 * RPOPLPUSH srclist dstlist:
4325 * IF LLEN(srclist) > 0
4326 * element = RPOP srclist
4327 * LPUSH dstlist element
4334 * The idea is to be able to get an element from a list in a reliable way
4335 * since the element is not just returned but pushed against another list
4336 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4338 static void rpoplpushcommand(redisClient
*c
) {
4341 sobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4343 addReply(c
,shared
.nullbulk
);
4345 if (sobj
->type
!= REDIS_LIST
) {
4346 addReply(c
,shared
.wrongtypeerr
);
4348 list
*srclist
= sobj
->ptr
;
4349 listNode
*ln
= listLast(srclist
);
4352 addReply(c
,shared
.nullbulk
);
4354 robj
*dobj
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4355 robj
*ele
= listNodeValue(ln
);
4358 if (dobj
&& dobj
->type
!= REDIS_LIST
) {
4359 addReply(c
,shared
.wrongtypeerr
);
4363 /* Add the element to the target list (unless it's directly
4364 * passed to some BLPOP-ing client) */
4365 if (!handleClientsWaitingListPush(c
,c
->argv
[2],ele
)) {
4367 /* Create the list if the key does not exist */
4368 dobj
= createListObject();
4369 dictAdd(c
->db
->dict
,c
->argv
[2],dobj
);
4370 incrRefCount(c
->argv
[2]);
4372 dstlist
= dobj
->ptr
;
4373 listAddNodeHead(dstlist
,ele
);
4377 /* Send the element to the client as reply as well */
4378 addReplyBulkLen(c
,ele
);
4380 addReply(c
,shared
.crlf
);
4382 /* Finally remove the element from the source list */
4383 listDelNode(srclist
,ln
);
4391 /* ==================================== Sets ================================ */
4393 static void saddCommand(redisClient
*c
) {
4396 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4398 set
= createSetObject();
4399 dictAdd(c
->db
->dict
,c
->argv
[1],set
);
4400 incrRefCount(c
->argv
[1]);
4402 if (set
->type
!= REDIS_SET
) {
4403 addReply(c
,shared
.wrongtypeerr
);
4407 if (dictAdd(set
->ptr
,c
->argv
[2],NULL
) == DICT_OK
) {
4408 incrRefCount(c
->argv
[2]);
4410 addReply(c
,shared
.cone
);
4412 addReply(c
,shared
.czero
);
4416 static void sremCommand(redisClient
*c
) {
4419 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4421 addReply(c
,shared
.czero
);
4423 if (set
->type
!= REDIS_SET
) {
4424 addReply(c
,shared
.wrongtypeerr
);
4427 if (dictDelete(set
->ptr
,c
->argv
[2]) == DICT_OK
) {
4429 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4430 addReply(c
,shared
.cone
);
4432 addReply(c
,shared
.czero
);
4437 static void smoveCommand(redisClient
*c
) {
4438 robj
*srcset
, *dstset
;
4440 srcset
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4441 dstset
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4443 /* If the source key does not exist return 0, if it's of the wrong type
4445 if (srcset
== NULL
|| srcset
->type
!= REDIS_SET
) {
4446 addReply(c
, srcset
? shared
.wrongtypeerr
: shared
.czero
);
4449 /* Error if the destination key is not a set as well */
4450 if (dstset
&& dstset
->type
!= REDIS_SET
) {
4451 addReply(c
,shared
.wrongtypeerr
);
4454 /* Remove the element from the source set */
4455 if (dictDelete(srcset
->ptr
,c
->argv
[3]) == DICT_ERR
) {
4456 /* Key not found in the src set! return zero */
4457 addReply(c
,shared
.czero
);
4461 /* Add the element to the destination set */
4463 dstset
= createSetObject();
4464 dictAdd(c
->db
->dict
,c
->argv
[2],dstset
);
4465 incrRefCount(c
->argv
[2]);
4467 if (dictAdd(dstset
->ptr
,c
->argv
[3],NULL
) == DICT_OK
)
4468 incrRefCount(c
->argv
[3]);
4469 addReply(c
,shared
.cone
);
4472 static void sismemberCommand(redisClient
*c
) {
4475 set
= lookupKeyRead(c
->db
,c
->argv
[1]);
4477 addReply(c
,shared
.czero
);
4479 if (set
->type
!= REDIS_SET
) {
4480 addReply(c
,shared
.wrongtypeerr
);
4483 if (dictFind(set
->ptr
,c
->argv
[2]))
4484 addReply(c
,shared
.cone
);
4486 addReply(c
,shared
.czero
);
4490 static void scardCommand(redisClient
*c
) {
4494 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4496 addReply(c
,shared
.czero
);
4499 if (o
->type
!= REDIS_SET
) {
4500 addReply(c
,shared
.wrongtypeerr
);
4503 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4509 static void spopCommand(redisClient
*c
) {
4513 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4515 addReply(c
,shared
.nullbulk
);
4517 if (set
->type
!= REDIS_SET
) {
4518 addReply(c
,shared
.wrongtypeerr
);
4521 de
= dictGetRandomKey(set
->ptr
);
4523 addReply(c
,shared
.nullbulk
);
4525 robj
*ele
= dictGetEntryKey(de
);
4527 addReplyBulkLen(c
,ele
);
4529 addReply(c
,shared
.crlf
);
4530 dictDelete(set
->ptr
,ele
);
4531 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4537 static void srandmemberCommand(redisClient
*c
) {
4541 set
= lookupKeyRead(c
->db
,c
->argv
[1]);
4543 addReply(c
,shared
.nullbulk
);
4545 if (set
->type
!= REDIS_SET
) {
4546 addReply(c
,shared
.wrongtypeerr
);
4549 de
= dictGetRandomKey(set
->ptr
);
4551 addReply(c
,shared
.nullbulk
);
4553 robj
*ele
= dictGetEntryKey(de
);
4555 addReplyBulkLen(c
,ele
);
4557 addReply(c
,shared
.crlf
);
4562 static int qsortCompareSetsByCardinality(const void *s1
, const void *s2
) {
4563 dict
**d1
= (void*) s1
, **d2
= (void*) s2
;
4565 return dictSize(*d1
)-dictSize(*d2
);
4568 static void sinterGenericCommand(redisClient
*c
, robj
**setskeys
, unsigned long setsnum
, robj
*dstkey
) {
4569 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
4572 robj
*lenobj
= NULL
, *dstset
= NULL
;
4573 unsigned long j
, cardinality
= 0;
4575 for (j
= 0; j
< setsnum
; j
++) {
4579 lookupKeyWrite(c
->db
,setskeys
[j
]) :
4580 lookupKeyRead(c
->db
,setskeys
[j
]);
4584 if (deleteKey(c
->db
,dstkey
))
4586 addReply(c
,shared
.czero
);
4588 addReply(c
,shared
.nullmultibulk
);
4592 if (setobj
->type
!= REDIS_SET
) {
4594 addReply(c
,shared
.wrongtypeerr
);
4597 dv
[j
] = setobj
->ptr
;
4599 /* Sort sets from the smallest to largest, this will improve our
4600 * algorithm's performance */
4601 qsort(dv
,setsnum
,sizeof(dict
*),qsortCompareSetsByCardinality
);
4603 /* The first thing we should output is the total number of elements...
4604 * since this is a multi-bulk write, but at this stage we don't know
4605 * the intersection set size, so we use a trick, append an empty object
4606 * to the output list and save the pointer to later modify it with the
4609 lenobj
= createObject(REDIS_STRING
,NULL
);
4611 decrRefCount(lenobj
);
4613 /* If we have a target key where to store the resulting set
4614 * create this key with an empty set inside */
4615 dstset
= createSetObject();
4618 /* Iterate all the elements of the first (smallest) set, and test
4619 * the element against all the other sets, if at least one set does
4620 * not include the element it is discarded */
4621 di
= dictGetIterator(dv
[0]);
4623 while((de
= dictNext(di
)) != NULL
) {
4626 for (j
= 1; j
< setsnum
; j
++)
4627 if (dictFind(dv
[j
],dictGetEntryKey(de
)) == NULL
) break;
4629 continue; /* at least one set does not contain the member */
4630 ele
= dictGetEntryKey(de
);
4632 addReplyBulkLen(c
,ele
);
4634 addReply(c
,shared
.crlf
);
4637 dictAdd(dstset
->ptr
,ele
,NULL
);
4641 dictReleaseIterator(di
);
4644 /* Store the resulting set into the target */
4645 deleteKey(c
->db
,dstkey
);
4646 dictAdd(c
->db
->dict
,dstkey
,dstset
);
4647 incrRefCount(dstkey
);
4651 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",cardinality
);
4653 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4654 dictSize((dict
*)dstset
->ptr
)));
4660 static void sinterCommand(redisClient
*c
) {
4661 sinterGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
);
4664 static void sinterstoreCommand(redisClient
*c
) {
4665 sinterGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1]);
4668 #define REDIS_OP_UNION 0
4669 #define REDIS_OP_DIFF 1
4671 static void sunionDiffGenericCommand(redisClient
*c
, robj
**setskeys
, int setsnum
, robj
*dstkey
, int op
) {
4672 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
4675 robj
*dstset
= NULL
;
4676 int j
, cardinality
= 0;
4678 for (j
= 0; j
< setsnum
; j
++) {
4682 lookupKeyWrite(c
->db
,setskeys
[j
]) :
4683 lookupKeyRead(c
->db
,setskeys
[j
]);
4688 if (setobj
->type
!= REDIS_SET
) {
4690 addReply(c
,shared
.wrongtypeerr
);
4693 dv
[j
] = setobj
->ptr
;
4696 /* We need a temp set object to store our union. If the dstkey
4697 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4698 * this set object will be the resulting object to set into the target key*/
4699 dstset
= createSetObject();
4701 /* Iterate all the elements of all the sets, add every element a single
4702 * time to the result set */
4703 for (j
= 0; j
< setsnum
; j
++) {
4704 if (op
== REDIS_OP_DIFF
&& j
== 0 && !dv
[j
]) break; /* result set is empty */
4705 if (!dv
[j
]) continue; /* non existing keys are like empty sets */
4707 di
= dictGetIterator(dv
[j
]);
4709 while((de
= dictNext(di
)) != NULL
) {
4712 /* dictAdd will not add the same element multiple times */
4713 ele
= dictGetEntryKey(de
);
4714 if (op
== REDIS_OP_UNION
|| j
== 0) {
4715 if (dictAdd(dstset
->ptr
,ele
,NULL
) == DICT_OK
) {
4719 } else if (op
== REDIS_OP_DIFF
) {
4720 if (dictDelete(dstset
->ptr
,ele
) == DICT_OK
) {
4725 dictReleaseIterator(di
);
4727 if (op
== REDIS_OP_DIFF
&& cardinality
== 0) break; /* result set is empty */
4730 /* Output the content of the resulting set, if not in STORE mode */
4732 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",cardinality
));
4733 di
= dictGetIterator(dstset
->ptr
);
4734 while((de
= dictNext(di
)) != NULL
) {
4737 ele
= dictGetEntryKey(de
);
4738 addReplyBulkLen(c
,ele
);
4740 addReply(c
,shared
.crlf
);
4742 dictReleaseIterator(di
);
4744 /* If we have a target key where to store the resulting set
4745 * create this key with the result set inside */
4746 deleteKey(c
->db
,dstkey
);
4747 dictAdd(c
->db
->dict
,dstkey
,dstset
);
4748 incrRefCount(dstkey
);
4753 decrRefCount(dstset
);
4755 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4756 dictSize((dict
*)dstset
->ptr
)));
4762 static void sunionCommand(redisClient
*c
) {
4763 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_UNION
);
4766 static void sunionstoreCommand(redisClient
*c
) {
4767 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_UNION
);
4770 static void sdiffCommand(redisClient
*c
) {
4771 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_DIFF
);
4774 static void sdiffstoreCommand(redisClient
*c
) {
4775 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_DIFF
);
4778 /* ==================================== ZSets =============================== */
4780 /* ZSETs are ordered sets using two data structures to hold the same elements
4781 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4784 * The elements are added to a hash table mapping Redis objects to scores.
4785 * At the same time the elements are added to a skip list mapping scores
4786 * to Redis objects (so objects are sorted by scores in this "view"). */
4788 /* This skiplist implementation is almost a C translation of the original
4789 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4790 * Alternative to Balanced Trees", modified in three ways:
4791 * a) this implementation allows for repeated values.
4792 * b) the comparison is not just by key (our 'score') but by satellite data.
4793 * c) there is a back pointer, so it's a doubly linked list with the back
4794 * pointers being only at "level 1". This allows to traverse the list
4795 * from tail to head, useful for ZREVRANGE. */
4797 static zskiplistNode
*zslCreateNode(int level
, double score
, robj
*obj
) {
4798 zskiplistNode
*zn
= zmalloc(sizeof(*zn
));
4800 zn
->forward
= zmalloc(sizeof(zskiplistNode
*) * level
);
4801 zn
->span
= zmalloc(sizeof(unsigned long) * level
);
4807 static zskiplist
*zslCreate(void) {
4811 zsl
= zmalloc(sizeof(*zsl
));
4814 zsl
->header
= zslCreateNode(ZSKIPLIST_MAXLEVEL
,0,NULL
);
4815 for (j
= 0; j
< ZSKIPLIST_MAXLEVEL
; j
++) {
4816 zsl
->header
->forward
[j
] = NULL
;
4817 zsl
->header
->span
[j
] = 0;
4819 zsl
->header
->backward
= NULL
;
4824 static void zslFreeNode(zskiplistNode
*node
) {
4825 decrRefCount(node
->obj
);
4826 zfree(node
->forward
);
4831 static void zslFree(zskiplist
*zsl
) {
4832 zskiplistNode
*node
= zsl
->header
->forward
[0], *next
;
4834 zfree(zsl
->header
->forward
);
4835 zfree(zsl
->header
->span
);
4838 next
= node
->forward
[0];
4845 static int zslRandomLevel(void) {
4847 while ((random()&0xFFFF) < (ZSKIPLIST_P
* 0xFFFF))
4852 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
) {
4853 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
4854 unsigned long span
[ZSKIPLIST_MAXLEVEL
];
4858 for (i
= zsl
->level
-1; i
>= 0; i
--) {
4859 /* store span that is crossed to reach the insert position */
4860 span
[i
] = i
== (zsl
->level
-1) ? 0 : span
[i
+1];
4862 while (x
->forward
[i
] &&
4863 (x
->forward
[i
]->score
< score
||
4864 (x
->forward
[i
]->score
== score
&&
4865 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0))) {
4866 span
[i
] += x
->span
[i
];
4871 /* we assume the key is not already inside, since we allow duplicated
4872 * scores, and the re-insertion of score and redis object should never
4873 * happen since the caller of zslInsert() should test in the hash table
4874 * if the element is already inside or not. */
4875 level
= zslRandomLevel();
4876 if (level
> zsl
->level
) {
4877 for (i
= zsl
->level
; i
< level
; i
++) {
4879 update
[i
] = zsl
->header
;
4880 update
[i
]->span
[i
] = zsl
->length
;
4884 x
= zslCreateNode(level
,score
,obj
);
4885 for (i
= 0; i
< level
; i
++) {
4886 x
->forward
[i
] = update
[i
]->forward
[i
];
4887 update
[i
]->forward
[i
] = x
;
4889 /* update span covered by update[i] as x is inserted here */
4890 x
->span
[i
] = update
[i
]->span
[i
] - (span
[0] - span
[i
]);
4891 update
[i
]->span
[i
] = (span
[0] - span
[i
]) + 1;
4894 /* increment span for untouched levels */
4895 for (i
= level
; i
< zsl
->level
; i
++) {
4896 update
[i
]->span
[i
]++;
4899 x
->backward
= (update
[0] == zsl
->header
) ? NULL
: update
[0];
4901 x
->forward
[0]->backward
= x
;
4907 /* Delete an element with matching score/object from the skiplist. */
4908 static int zslDelete(zskiplist
*zsl
, double score
, robj
*obj
) {
4909 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
4913 for (i
= zsl
->level
-1; i
>= 0; i
--) {
4914 while (x
->forward
[i
] &&
4915 (x
->forward
[i
]->score
< score
||
4916 (x
->forward
[i
]->score
== score
&&
4917 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0)))
4921 /* We may have multiple elements with the same score, what we need
4922 * is to find the element with both the right score and object. */
4924 if (x
&& score
== x
->score
&& compareStringObjects(x
->obj
,obj
) == 0) {
4925 for (i
= 0; i
< zsl
->level
; i
++) {
4926 if (update
[i
]->forward
[i
] == x
) {
4927 update
[i
]->span
[i
] += x
->span
[i
] - 1;
4928 update
[i
]->forward
[i
] = x
->forward
[i
];
4930 update
[i
]->span
[i
] -= 1;
4933 if (x
->forward
[0]) {
4934 x
->forward
[0]->backward
= (x
->backward
== zsl
->header
) ?
4937 zsl
->tail
= x
->backward
;
4940 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
4945 return 0; /* not found */
4947 return 0; /* not found */
4950 /* Delete all the elements with score between min and max from the skiplist.
4951 * Min and max are inclusive, so a score >= min && score <= max is deleted.
4952 * Note that this function takes the reference to the hash table view of the
4953 * sorted set, in order to remove the elements from the hash table too. */
4954 static unsigned long zslDeleteRange(zskiplist
*zsl
, double min
, double max
, dict
*dict
) {
4955 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
4956 unsigned long removed
= 0;
4960 for (i
= zsl
->level
-1; i
>= 0; i
--) {
4961 while (x
->forward
[i
] && x
->forward
[i
]->score
< min
)
4965 /* We may have multiple elements with the same score, what we need
4966 * is to find the element with both the right score and object. */
4968 while (x
&& x
->score
<= max
) {
4969 zskiplistNode
*next
;
4971 for (i
= 0; i
< zsl
->level
; i
++) {
4972 if (update
[i
]->forward
[i
] == x
) {
4973 update
[i
]->span
[i
] += x
->span
[i
] - 1;
4974 update
[i
]->forward
[i
] = x
->forward
[i
];
4976 update
[i
]->span
[i
] -= 1;
4979 if (x
->forward
[0]) {
4980 x
->forward
[0]->backward
= (x
->backward
== zsl
->header
) ?
4983 zsl
->tail
= x
->backward
;
4985 next
= x
->forward
[0];
4986 dictDelete(dict
,x
->obj
);
4988 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
4994 return removed
; /* not found */
4997 /* Find the first node having a score equal or greater than the specified one.
4998 * Returns NULL if there is no match. */
4999 static zskiplistNode
*zslFirstWithScore(zskiplist
*zsl
, double score
) {
5004 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5005 while (x
->forward
[i
] && x
->forward
[i
]->score
< score
)
5008 /* We may have multiple elements with the same score: x->forward[0] is
5009 * the first node whose score is >= the requested one (NULL if none). */
5010 return x
->forward
[0];
5013 /* The actual Z-commands implementations */
5015 /* This generic command implements both ZADD and ZINCRBY.
5016 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5017 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5018 static void zaddGenericCommand(redisClient
*c
, robj
*key
, robj
*ele
, double scoreval
, int doincrement
) {
5023 zsetobj
= lookupKeyWrite(c
->db
,key
);
5024 if (zsetobj
== NULL
) {
5025 zsetobj
= createZsetObject();
5026 dictAdd(c
->db
->dict
,key
,zsetobj
);
5029 if (zsetobj
->type
!= REDIS_ZSET
) {
5030 addReply(c
,shared
.wrongtypeerr
);
5036 /* Ok now since we implement both ZADD and ZINCRBY here the code
5037 * needs to handle the two different conditions. It's all about setting
5038 * '*score', that is, the new score to set, to the right value. */
5039 score
= zmalloc(sizeof(double));
5043 /* Read the old score. If the element was not present starts from 0 */
5044 de
= dictFind(zs
->dict
,ele
);
5046 double *oldscore
= dictGetEntryVal(de
);
5047 *score
= *oldscore
+ scoreval
;
5055 /* What follows is a simple remove and re-insert operation that is common
5056 * to both ZADD and ZINCRBY... */
5057 if (dictAdd(zs
->dict
,ele
,score
) == DICT_OK
) {
5058 /* case 1: New element */
5059 incrRefCount(ele
); /* added to hash */
5060 zslInsert(zs
->zsl
,*score
,ele
);
5061 incrRefCount(ele
); /* added to skiplist */
5064 addReplyDouble(c
,*score
);
5066 addReply(c
,shared
.cone
);
5071 /* case 2: Score update operation */
5072 de
= dictFind(zs
->dict
,ele
);
5073 redisAssert(de
!= NULL
);
5074 oldscore
= dictGetEntryVal(de
);
5075 if (*score
!= *oldscore
) {
5078 /* Remove and insert the element in the skip list with new score */
5079 deleted
= zslDelete(zs
->zsl
,*oldscore
,ele
);
5080 redisAssert(deleted
!= 0);
5081 zslInsert(zs
->zsl
,*score
,ele
);
5083 /* Update the score in the hash table */
5084 dictReplace(zs
->dict
,ele
,score
);
5090 addReplyDouble(c
,*score
);
5092 addReply(c
,shared
.czero
);
5096 static void zaddCommand(redisClient
*c
) {
5099 scoreval
= strtod(c
->argv
[2]->ptr
,NULL
);
5100 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,0);
5103 static void zincrbyCommand(redisClient
*c
) {
5106 scoreval
= strtod(c
->argv
[2]->ptr
,NULL
);
5107 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,1);
5110 static void zremCommand(redisClient
*c
) {
5114 zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5115 if (zsetobj
== NULL
) {
5116 addReply(c
,shared
.czero
);
5122 if (zsetobj
->type
!= REDIS_ZSET
) {
5123 addReply(c
,shared
.wrongtypeerr
);
5127 de
= dictFind(zs
->dict
,c
->argv
[2]);
5129 addReply(c
,shared
.czero
);
5132 /* Delete from the skiplist */
5133 oldscore
= dictGetEntryVal(de
);
5134 deleted
= zslDelete(zs
->zsl
,*oldscore
,c
->argv
[2]);
5135 redisAssert(deleted
!= 0);
5137 /* Delete from the hash table */
5138 dictDelete(zs
->dict
,c
->argv
[2]);
5139 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5141 addReply(c
,shared
.cone
);
5145 static void zremrangebyscoreCommand(redisClient
*c
) {
5146 double min
= strtod(c
->argv
[2]->ptr
,NULL
);
5147 double max
= strtod(c
->argv
[3]->ptr
,NULL
);
5151 zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5152 if (zsetobj
== NULL
) {
5153 addReply(c
,shared
.czero
);
5157 if (zsetobj
->type
!= REDIS_ZSET
) {
5158 addReply(c
,shared
.wrongtypeerr
);
5162 deleted
= zslDeleteRange(zs
->zsl
,min
,max
,zs
->dict
);
5163 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5164 server
.dirty
+= deleted
;
5165 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",deleted
));
5169 static void zrangeGenericCommand(redisClient
*c
, int reverse
) {
5171 int start
= atoi(c
->argv
[2]->ptr
);
5172 int end
= atoi(c
->argv
[3]->ptr
);
5175 if (c
->argc
== 5 && !strcasecmp(c
->argv
[4]->ptr
,"withscores")) {
5177 } else if (c
->argc
>= 5) {
5178 addReply(c
,shared
.syntaxerr
);
5182 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5184 addReply(c
,shared
.nullmultibulk
);
5186 if (o
->type
!= REDIS_ZSET
) {
5187 addReply(c
,shared
.wrongtypeerr
);
5189 zset
*zsetobj
= o
->ptr
;
5190 zskiplist
*zsl
= zsetobj
->zsl
;
5193 int llen
= zsl
->length
;
5197 /* convert negative indexes */
5198 if (start
< 0) start
= llen
+start
;
5199 if (end
< 0) end
= llen
+end
;
5200 if (start
< 0) start
= 0;
5201 if (end
< 0) end
= 0;
5203 /* indexes sanity checks */
5204 if (start
> end
|| start
>= llen
) {
5205 /* Out of range start or start > end result in empty list */
5206 addReply(c
,shared
.emptymultibulk
);
5209 if (end
>= llen
) end
= llen
-1;
5210 rangelen
= (end
-start
)+1;
5212 /* Return the result in form of a multi-bulk reply */
5218 ln
= zsl
->header
->forward
[0];
5220 ln
= ln
->forward
[0];
5223 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",
5224 withscores
? (rangelen
*2) : rangelen
));
5225 for (j
= 0; j
< rangelen
; j
++) {
5227 addReplyBulkLen(c
,ele
);
5229 addReply(c
,shared
.crlf
);
5231 addReplyDouble(c
,ln
->score
);
5232 ln
= reverse
? ln
->backward
: ln
->forward
[0];
5238 static void zrangeCommand(redisClient
*c
) {
5239 zrangeGenericCommand(c
,0);
5242 static void zrevrangeCommand(redisClient
*c
) {
5243 zrangeGenericCommand(c
,1);
5246 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5247 * If justcount is non-zero, just the count is returned. */
5248 static void genericZrangebyscoreCommand(redisClient
*c
, int justcount
) {
5251 int minex
= 0, maxex
= 0; /* are min or max exclusive? */
5252 int offset
= 0, limit
= -1;
5256 /* Parse the min-max interval. If one of the values is prefixed
5257 * by the "(" character, it's considered "open". For instance
5258 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5259 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5260 if (((char*)c
->argv
[2]->ptr
)[0] == '(') {
5261 min
= strtod((char*)c
->argv
[2]->ptr
+1,NULL
);
5264 min
= strtod(c
->argv
[2]->ptr
,NULL
);
5266 if (((char*)c
->argv
[3]->ptr
)[0] == '(') {
5267 max
= strtod((char*)c
->argv
[3]->ptr
+1,NULL
);
5270 max
= strtod(c
->argv
[3]->ptr
,NULL
);
5273 /* Parse "WITHSCORES": note that if the command was called with
5274 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5275 * enter the following paths to parse WITHSCORES and LIMIT. */
5276 if (c
->argc
== 5 || c
->argc
== 8) {
5277 if (strcasecmp(c
->argv
[c
->argc
-1]->ptr
,"withscores") == 0)
5282 if (c
->argc
!= (4 + withscores
) && c
->argc
!= (7 + withscores
))
5286 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5291 if (c
->argc
== (7 + withscores
) && strcasecmp(c
->argv
[4]->ptr
,"limit")) {
5292 addReply(c
,shared
.syntaxerr
);
5294 } else if (c
->argc
== (7 + withscores
)) {
5295 offset
= atoi(c
->argv
[5]->ptr
);
5296 limit
= atoi(c
->argv
[6]->ptr
);
5297 if (offset
< 0) offset
= 0;
5300 /* Ok, lookup the key and get the range */
5301 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5303 addReply(c
,justcount
? shared
.czero
: shared
.nullmultibulk
);
5305 if (o
->type
!= REDIS_ZSET
) {
5306 addReply(c
,shared
.wrongtypeerr
);
5308 zset
*zsetobj
= o
->ptr
;
5309 zskiplist
*zsl
= zsetobj
->zsl
;
5311 robj
*ele
, *lenobj
= NULL
;
5312 unsigned long rangelen
= 0;
5314 /* Get the first node with the score >= min, or with
5315 * score > min if 'minex' is true. */
5316 ln
= zslFirstWithScore(zsl
,min
);
5317 while (minex
&& ln
&& ln
->score
== min
) ln
= ln
->forward
[0];
5320 /* No element matching the specified interval */
5321 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
5325 /* We don't know in advance how many matching elements there
5326 * are in the list, so we push this object that will represent
5327 * the multi-bulk length in the output buffer, and will "fix"
5330 lenobj
= createObject(REDIS_STRING
,NULL
);
5332 decrRefCount(lenobj
);
5335 while(ln
&& (maxex
? (ln
->score
< max
) : (ln
->score
<= max
))) {
5338 ln
= ln
->forward
[0];
5341 if (limit
== 0) break;
5344 addReplyBulkLen(c
,ele
);
5346 addReply(c
,shared
.crlf
);
5348 addReplyDouble(c
,ln
->score
);
5350 ln
= ln
->forward
[0];
5352 if (limit
> 0) limit
--;
5355 addReplyLong(c
,(long)rangelen
);
5357 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",
5358 withscores
? (rangelen
*2) : rangelen
);
5364 static void zrangebyscoreCommand(redisClient
*c
) {
5365 genericZrangebyscoreCommand(c
,0);
5368 static void zcountCommand(redisClient
*c
) {
5369 genericZrangebyscoreCommand(c
,1);
5372 static void zcardCommand(redisClient
*c
) {
5376 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5378 addReply(c
,shared
.czero
);
5381 if (o
->type
!= REDIS_ZSET
) {
5382 addReply(c
,shared
.wrongtypeerr
);
5385 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",zs
->zsl
->length
));
5390 static void zscoreCommand(redisClient
*c
) {
5394 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5396 addReply(c
,shared
.nullbulk
);
5399 if (o
->type
!= REDIS_ZSET
) {
5400 addReply(c
,shared
.wrongtypeerr
);
5405 de
= dictFind(zs
->dict
,c
->argv
[2]);
5407 addReply(c
,shared
.nullbulk
);
5409 double *score
= dictGetEntryVal(de
);
5411 addReplyDouble(c
,*score
);
5417 static void zrankCommand(redisClient
*c
) {
5419 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5421 addReply(c
,shared
.nullbulk
);
5424 if (o
->type
!= REDIS_ZSET
) {
5425 addReply(c
,shared
.wrongtypeerr
);
5430 zskiplist
*zsl
= zs
->zsl
;
5431 dictEntry
*de
= dictFind(zs
->dict
,c
->argv
[2]);
5433 addReply(c
,shared
.nullbulk
);
5437 double *score
= dictGetEntryVal(de
);
5439 unsigned long rank
= 0;
5443 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5444 while (x
->forward
[i
] &&
5445 (x
->forward
[i
]->score
< *score
||
5446 (x
->forward
[i
]->score
== *score
&&
5447 compareStringObjects(x
->forward
[i
]->obj
,c
->argv
[2]) < 0))) {
5452 if (x
->forward
[i
] && compareStringObjects(x
->forward
[i
]->obj
,c
->argv
[2]) == 0) {
5453 addReplyLong(c
, rank
);
5458 addReply(c
,shared
.nullbulk
);
5461 /* ========================= Non type-specific commands ==================== */
5463 static void flushdbCommand(redisClient
*c
) {
5464 server
.dirty
+= dictSize(c
->db
->dict
);
5465 dictEmpty(c
->db
->dict
);
5466 dictEmpty(c
->db
->expires
);
5467 addReply(c
,shared
.ok
);
5470 static void flushallCommand(redisClient
*c
) {
5471 server
.dirty
+= emptyDb();
5472 addReply(c
,shared
.ok
);
5473 rdbSave(server
.dbfilename
);
5477 static redisSortOperation
*createSortOperation(int type
, robj
*pattern
) {
5478 redisSortOperation
*so
= zmalloc(sizeof(*so
));
5480 so
->pattern
= pattern
;
5484 /* Return the value associated to the key with a name obtained
5485 * substituting the first occurrence of '*' in 'pattern' with 'subst' */
5486 static robj
*lookupKeyByPattern(redisDb
*db
, robj
*pattern
, robj
*subst
) {
5490 int prefixlen
, sublen
, postfixlen
;
5491 /* Exploit the internal sds representation to create an sds string allocated on the stack in order to make this function faster */
5495 char buf
[REDIS_SORTKEY_MAX
+1];
5498 /* If the pattern is "#" return the substitution object itself in order
5499 * to implement the "SORT ... GET #" feature. */
5500 spat
= pattern
->ptr
;
5501 if (spat
[0] == '#' && spat
[1] == '\0') {
5505 /* The substitution object may be specially encoded. If so we create
5506 * a decoded object on the fly. Otherwise getDecodedObject will just
5507 * increment the ref count, that we'll decrement later. */
5508 subst
= getDecodedObject(subst
);
5511 if (sdslen(spat
)+sdslen(ssub
)-1 > REDIS_SORTKEY_MAX
) return NULL
;
5512 p
= strchr(spat
,'*');
5514 decrRefCount(subst
);
5519 sublen
= sdslen(ssub
);
5520 postfixlen
= sdslen(spat
)-(prefixlen
+1);
5521 memcpy(keyname
.buf
,spat
,prefixlen
);
5522 memcpy(keyname
.buf
+prefixlen
,ssub
,sublen
);
5523 memcpy(keyname
.buf
+prefixlen
+sublen
,p
+1,postfixlen
);
5524 keyname
.buf
[prefixlen
+sublen
+postfixlen
] = '\0';
5525 keyname
.len
= prefixlen
+sublen
+postfixlen
;
5527 initStaticStringObject(keyobj
,((char*)&keyname
)+(sizeof(long)*2))
5528 decrRefCount(subst
);
5530 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
5531 return lookupKeyRead(db
,&keyobj
);
5534 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5535 * the additional parameter is not standard but BSD-specific, we have to
5536 * pass sorting parameters via the global 'server' structure */
5537 static int sortCompare(const void *s1
, const void *s2
) {
5538 const redisSortObject
*so1
= s1
, *so2
= s2
;
5541 if (!server
.sort_alpha
) {
5542 /* Numeric sorting. Here it's trivial as we precomputed scores */
5543 if (so1
->u
.score
> so2
->u
.score
) {
5545 } else if (so1
->u
.score
< so2
->u
.score
) {
5551 /* Alphanumeric sorting */
5552 if (server
.sort_bypattern
) {
5553 if (!so1
->u
.cmpobj
|| !so2
->u
.cmpobj
) {
5554 /* At least one compare object is NULL */
5555 if (so1
->u
.cmpobj
== so2
->u
.cmpobj
)
5557 else if (so1
->u
.cmpobj
== NULL
)
5562 /* We have both the objects, use strcoll */
5563 cmp
= strcoll(so1
->u
.cmpobj
->ptr
,so2
->u
.cmpobj
->ptr
);
5566 /* Compare elements directly */
5569 dec1
= getDecodedObject(so1
->obj
);
5570 dec2
= getDecodedObject(so2
->obj
);
5571 cmp
= strcoll(dec1
->ptr
,dec2
->ptr
);
5576 return server
.sort_desc
? -cmp
: cmp
;
5579 /* The SORT command is the most complex command in Redis. Warning: this code
5580 * is optimized for speed and a bit less for readability */
5581 static void sortCommand(redisClient
*c
) {
5584 int desc
= 0, alpha
= 0;
5585 int limit_start
= 0, limit_count
= -1, start
, end
;
5586 int j
, dontsort
= 0, vectorlen
;
5587 int getop
= 0; /* GET operation counter */
5588 robj
*sortval
, *sortby
= NULL
, *storekey
= NULL
;
5589 redisSortObject
*vector
; /* Resulting vector to sort */
5591 /* Lookup the key to sort. It must be of the right types */
5592 sortval
= lookupKeyRead(c
->db
,c
->argv
[1]);
5593 if (sortval
== NULL
) {
5594 addReply(c
,shared
.nullmultibulk
);
5597 if (sortval
->type
!= REDIS_SET
&& sortval
->type
!= REDIS_LIST
&&
5598 sortval
->type
!= REDIS_ZSET
)
5600 addReply(c
,shared
.wrongtypeerr
);
5604 /* Create a list of operations to perform for every sorted element.
5605 * Operations can be GET/DEL/INCR/DECR */
5606 operations
= listCreate();
5607 listSetFreeMethod(operations
,zfree
);
5610 /* Now we need to protect sortval incrementing its count, in the future
5611 * SORT may have options able to overwrite/delete keys during the sorting
5612 * and the sorted key itself may get destroyed */
5613 incrRefCount(sortval
);
5615 /* The SORT command has an SQL-alike syntax, parse it */
5616 while(j
< c
->argc
) {
5617 int leftargs
= c
->argc
-j
-1;
5618 if (!strcasecmp(c
->argv
[j
]->ptr
,"asc")) {
5620 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"desc")) {
5622 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"alpha")) {
5624 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"limit") && leftargs
>= 2) {
5625 limit_start
= atoi(c
->argv
[j
+1]->ptr
);
5626 limit_count
= atoi(c
->argv
[j
+2]->ptr
);
5628 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"store") && leftargs
>= 1) {
5629 storekey
= c
->argv
[j
+1];
5631 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"by") && leftargs
>= 1) {
5632 sortby
= c
->argv
[j
+1];
5633 /* If the BY pattern does not contain '*', i.e. it is constant,
5634 * we don't need to sort nor to lookup the weight keys. */
5635 if (strchr(c
->argv
[j
+1]->ptr
,'*') == NULL
) dontsort
= 1;
5637 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"get") && leftargs
>= 1) {
5638 listAddNodeTail(operations
,createSortOperation(
5639 REDIS_SORT_GET
,c
->argv
[j
+1]));
5643 decrRefCount(sortval
);
5644 listRelease(operations
);
5645 addReply(c
,shared
.syntaxerr
);
5651 /* Load the sorting vector with all the objects to sort */
5652 switch(sortval
->type
) {
5653 case REDIS_LIST
: vectorlen
= listLength((list
*)sortval
->ptr
); break;
5654 case REDIS_SET
: vectorlen
= dictSize((dict
*)sortval
->ptr
); break;
5655 case REDIS_ZSET
: vectorlen
= dictSize(((zset
*)sortval
->ptr
)->dict
); break;
5656 default: vectorlen
= 0; redisAssert(0); /* Avoid GCC warning */
5658 vector
= zmalloc(sizeof(redisSortObject
)*vectorlen
);
5661 if (sortval
->type
== REDIS_LIST
) {
5662 list
*list
= sortval
->ptr
;
5666 listRewind(list
,&li
);
5667 while((ln
= listNext(&li
))) {
5668 robj
*ele
= ln
->value
;
5669 vector
[j
].obj
= ele
;
5670 vector
[j
].u
.score
= 0;
5671 vector
[j
].u
.cmpobj
= NULL
;
5679 if (sortval
->type
== REDIS_SET
) {
5682 zset
*zs
= sortval
->ptr
;
5686 di
= dictGetIterator(set
);
5687 while((setele
= dictNext(di
)) != NULL
) {
5688 vector
[j
].obj
= dictGetEntryKey(setele
);
5689 vector
[j
].u
.score
= 0;
5690 vector
[j
].u
.cmpobj
= NULL
;
5693 dictReleaseIterator(di
);
5695 redisAssert(j
== vectorlen
);
5697 /* Now it's time to load the right scores in the sorting vector */
5698 if (dontsort
== 0) {
5699 for (j
= 0; j
< vectorlen
; j
++) {
5703 byval
= lookupKeyByPattern(c
->db
,sortby
,vector
[j
].obj
);
5704 if (!byval
|| byval
->type
!= REDIS_STRING
) continue;
5706 vector
[j
].u
.cmpobj
= getDecodedObject(byval
);
5708 if (byval
->encoding
== REDIS_ENCODING_RAW
) {
5709 vector
[j
].u
.score
= strtod(byval
->ptr
,NULL
);
5711 /* Don't need to decode the object if it's
5712 * integer-encoded (the only encoding supported) so
5713 * far. We can just cast it */
5714 if (byval
->encoding
== REDIS_ENCODING_INT
) {
5715 vector
[j
].u
.score
= (long)byval
->ptr
;
5717 redisAssert(1 != 1);
5722 if (vector
[j
].obj
->encoding
== REDIS_ENCODING_RAW
)
5723 vector
[j
].u
.score
= strtod(vector
[j
].obj
->ptr
,NULL
);
5725 if (vector
[j
].obj
->encoding
== REDIS_ENCODING_INT
)
5726 vector
[j
].u
.score
= (long) vector
[j
].obj
->ptr
;
5728 redisAssert(1 != 1);
5735 /* We are ready to sort the vector... perform a bit of sanity check
5736 * on the LIMIT option too. We'll use a partial version of quicksort. */
5737 start
= (limit_start
< 0) ? 0 : limit_start
;
5738 end
= (limit_count
< 0) ? vectorlen
-1 : start
+limit_count
-1;
5739 if (start
>= vectorlen
) {
5740 start
= vectorlen
-1;
5743 if (end
>= vectorlen
) end
= vectorlen
-1;
5745 if (dontsort
== 0) {
5746 server
.sort_desc
= desc
;
5747 server
.sort_alpha
= alpha
;
5748 server
.sort_bypattern
= sortby
? 1 : 0;
5749 if (sortby
&& (start
!= 0 || end
!= vectorlen
-1))
5750 pqsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
, start
,end
);
5752 qsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
);
5755 /* Send command output to the output buffer, performing the specified
5756 * GET/DEL/INCR/DECR operations if any. */
5757 outputlen
= getop
? getop
*(end
-start
+1) : end
-start
+1;
5758 if (storekey
== NULL
) {
5759 /* STORE option not specified, send the sorting result to client */
5760 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",outputlen
));
5761 for (j
= start
; j
<= end
; j
++) {
5766 addReplyBulkLen(c
,vector
[j
].obj
);
5767 addReply(c
,vector
[j
].obj
);
5768 addReply(c
,shared
.crlf
);
5770 listRewind(operations
,&li
);
5771 while((ln
= listNext(&li
))) {
5772 redisSortOperation
*sop
= ln
->value
;
5773 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
5776 if (sop
->type
== REDIS_SORT_GET
) {
5777 if (!val
|| val
->type
!= REDIS_STRING
) {
5778 addReply(c
,shared
.nullbulk
);
5780 addReplyBulkLen(c
,val
);
5782 addReply(c
,shared
.crlf
);
5785 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
5790 robj
*listObject
= createListObject();
5791 list
*listPtr
= (list
*) listObject
->ptr
;
5793 /* STORE option specified, set the sorting result as a List object */
5794 for (j
= start
; j
<= end
; j
++) {
5799 listAddNodeTail(listPtr
,vector
[j
].obj
);
5800 incrRefCount(vector
[j
].obj
);
5802 listRewind(operations
,&li
);
5803 while((ln
= listNext(&li
))) {
5804 redisSortOperation
*sop
= ln
->value
;
5805 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
5808 if (sop
->type
== REDIS_SORT_GET
) {
5809 if (!val
|| val
->type
!= REDIS_STRING
) {
5810 listAddNodeTail(listPtr
,createStringObject("",0));
5812 listAddNodeTail(listPtr
,val
);
5816 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
5820 if (dictReplace(c
->db
->dict
,storekey
,listObject
)) {
5821 incrRefCount(storekey
);
5823 /* Note: we add 1 because the DB is dirty anyway since even if the
5824 * SORT result is empty a new key is set and maybe the old content
5826 server
.dirty
+= 1+outputlen
;
5827 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",outputlen
));
5831 decrRefCount(sortval
);
5832 listRelease(operations
);
5833 for (j
= 0; j
< vectorlen
; j
++) {
5834 if (sortby
&& alpha
&& vector
[j
].u
.cmpobj
)
5835 decrRefCount(vector
[j
].u
.cmpobj
);
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the formatted value plus the
 * terminating NUL; the longest possible output is "16777216.00T" (13 bytes
 * with the terminator).
 *
 * Values of 1 TiB and above are reported in "T" units. Without that final
 * branch such values matched no case and 's' was left unwritten, so the
 * caller would have printed an uninitialised buffer. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* 1 TiB and above: always produce some output. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
5861 /* Create the string returned by the INFO command. This is decoupled
5862 * from the INFO command itself as we need to report the same information
5863 * on memory corruption problems. */
/* The result is built with sdscatprintf() as "name:value\r\n" lines: the
 * general server fields first, then the master_* fields only when
 * server.masterhost is set, the vm_* fields only when server.vm_enabled is
 * set, and finally one "db<N>:keys=..,expires=.." line for every database
 * whose main or expires dictionary is non-empty.
 * The arguments of each sdscatprintf() call must stay in the same order as
 * the conversion specifiers in its format string.
 * NOTE(review): the embedded line numbers skip several lines of this
 * function (some format lines and arguments are not visible here) --
 * confirm against the upstream file before editing. */
5864 static sds
genRedisInfoString(void) {
5866 time_t uptime
= time(NULL
)-server
.stat_starttime
;
5870 bytesToHuman(hmem
,zmalloc_used_memory());
5871 info
= sdscatprintf(sdsempty(),
5872 "redis_version:%s\r\n"
5874 "multiplexing_api:%s\r\n"
5875 "process_id:%ld\r\n"
5876 "uptime_in_seconds:%ld\r\n"
5877 "uptime_in_days:%ld\r\n"
5878 "connected_clients:%d\r\n"
5879 "connected_slaves:%d\r\n"
5880 "blocked_clients:%d\r\n"
5881 "used_memory:%zu\r\n"
5882 "used_memory_human:%s\r\n"
5883 "changes_since_last_save:%lld\r\n"
5884 "bgsave_in_progress:%d\r\n"
5885 "last_save_time:%ld\r\n"
5886 "bgrewriteaof_in_progress:%d\r\n"
5887 "total_connections_received:%lld\r\n"
5888 "total_commands_processed:%lld\r\n"
5892 (sizeof(long) == 8) ? "64" : "32",
5897 listLength(server
.clients
)-listLength(server
.slaves
),
5898 listLength(server
.slaves
),
5899 server
.blpop_blocked_clients
,
5900 zmalloc_used_memory(),
5903 server
.bgsavechildpid
!= -1,
5905 server
.bgrewritechildpid
!= -1,
5906 server
.stat_numconnections
,
5907 server
.stat_numcommands
,
5908 server
.vm_enabled
!= 0,
5909 server
.masterhost
== NULL
? "master" : "slave"
5911 if (server
.masterhost
) {
5912 info
= sdscatprintf(info
,
5913 "master_host:%s\r\n"
5914 "master_port:%d\r\n"
5915 "master_link_status:%s\r\n"
5916 "master_last_io_seconds_ago:%d\r\n"
5919 (server
.replstate
== REDIS_REPL_CONNECTED
) ?
5921 server
.master
? ((int)(time(NULL
)-server
.master
->lastinteraction
)) : -1
5924 if (server
.vm_enabled
) {
5926 info
= sdscatprintf(info
,
5927 "vm_conf_max_memory:%llu\r\n"
5928 "vm_conf_page_size:%llu\r\n"
5929 "vm_conf_pages:%llu\r\n"
5930 "vm_stats_used_pages:%llu\r\n"
5931 "vm_stats_swapped_objects:%llu\r\n"
5932 "vm_stats_swappin_count:%llu\r\n"
5933 "vm_stats_swappout_count:%llu\r\n"
5934 "vm_stats_io_newjobs_len:%lu\r\n"
5935 "vm_stats_io_processing_len:%lu\r\n"
5936 "vm_stats_io_processed_len:%lu\r\n"
5937 "vm_stats_io_active_threads:%lu\r\n"
5938 "vm_stats_blocked_clients:%lu\r\n"
5939 ,(unsigned long long) server
.vm_max_memory
,
5940 (unsigned long long) server
.vm_page_size
,
5941 (unsigned long long) server
.vm_pages
,
5942 (unsigned long long) server
.vm_stats_used_pages
,
5943 (unsigned long long) server
.vm_stats_swapped_objects
,
5944 (unsigned long long) server
.vm_stats_swapins
,
5945 (unsigned long long) server
.vm_stats_swapouts
,
5946 (unsigned long) listLength(server
.io_newjobs
),
5947 (unsigned long) listLength(server
.io_processing
),
5948 (unsigned long) listLength(server
.io_processed
),
5949 (unsigned long) server
.io_active_threads
,
5950 (unsigned long) server
.vm_blocked_clients
5954 for (j
= 0; j
< server
.dbnum
; j
++) {
5955 long long keys
, vkeys
;
5957 keys
= dictSize(server
.db
[j
].dict
);
5958 vkeys
= dictSize(server
.db
[j
].expires
);
5959 if (keys
|| vkeys
) {
5960 info
= sdscatprintf(info
, "db%d:keys=%lld,expires=%lld\r\n",
5967 static void infoCommand(redisClient
*c
) {
5968 sds info
= genRedisInfoString();
5969 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",
5970 (unsigned long)sdslen(info
)));
5971 addReplySds(c
,info
);
5972 addReply(c
,shared
.crlf
);
/* MONITOR command. A client that already has REDIS_SLAVE set is ignored
 * (no reply is sent). Otherwise the client is flagged
 * REDIS_SLAVE|REDIS_MONITOR, appended to server.monitors and answered with
 * shared.ok.
 * NOTE(review): the embedded line numbers skip 5980, so one statement of the
 * original function is not visible in this listing -- confirm upstream. */
5975 static void monitorCommand(redisClient
*c
) {
5976 /* ignore MONITOR if already slave or in monitor mode */
5977 if (c
->flags
& REDIS_SLAVE
) return;
5979 c
->flags
|= (REDIS_SLAVE
|REDIS_MONITOR
);
5981 listAddNodeTail(server
.monitors
,c
);
5982 addReply(c
,shared
.ok
);
5985 /* ================================= Expire ================================= */
/* Remove the entry for 'key' from db->expires. The branch taken depends on
 * whether dictDelete() reported DICT_OK.
 * NOTE(review): the bodies of both branches are not visible in this listing
 * -- presumably 1 is returned when an expire was removed and 0 otherwise;
 * confirm upstream. */
5986 static int removeExpire(redisDb
*db
, robj
*key
) {
5987 if (dictDelete(db
->expires
,key
) == DICT_OK
) {
/* Associate the expire time 'when' with 'key' by adding an entry to
 * db->expires. The time_t is stored directly in the entry's value slot
 * through a (void*) cast, which is why readers cast the value back to
 * time_t. dictAdd() returning DICT_ERR selects the failure branch.
 * NOTE(review): the bodies of both branches are not visible in this listing
 * (presumably 0 on failure, 1 on success) -- confirm upstream. */
5994 static int setExpire(redisDb
*db
, robj
*key
, time_t when
) {
5995 if (dictAdd(db
->expires
,key
,(void*)when
) == DICT_ERR
) {
6003 /* Return the expire time of the specified key, or -1 if no expire
6004 * is associated with this key (i.e. the key is non volatile) */
6005 static time_t getExpire(redisDb
*db
, robj
*key
) {
6008 /* No expire? return ASAP */
6009 if (dictSize(db
->expires
) == 0 ||
6010 (de
= dictFind(db
->expires
,key
)) == NULL
) return -1;
6012 return (time_t) dictGetEntryVal(de
);
6015 static int expireIfNeeded(redisDb
*db
, robj
*key
) {
6019 /* No expire? return ASAP */
6020 if (dictSize(db
->expires
) == 0 ||
6021 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
6023 /* Lookup the expire */
6024 when
= (time_t) dictGetEntryVal(de
);
6025 if (time(NULL
) <= when
) return 0;
6027 /* Delete the key */
6028 dictDelete(db
->expires
,key
);
6029 return dictDelete(db
->dict
,key
) == DICT_OK
;
/* Delete 'key' if it has an entry in db->expires, whatever its expire time
 * is (no comparison with the current time is made here).
 * Returns 0 when the key has no expire; otherwise the key is removed from
 * db->expires and db->dict and the result says whether the removal from
 * db->dict returned DICT_OK.
 * NOTE(review): the embedded line numbers skip 6040, so one statement before
 * the deletes is not visible in this listing -- confirm upstream. */
6032 static int deleteIfVolatile(redisDb
*db
, robj
*key
) {
6035 /* No expire? return ASAP */
6036 if (dictSize(db
->expires
) == 0 ||
6037 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
6039 /* Delete the key */
6041 dictDelete(db
->expires
,key
);
6042 return dictDelete(db
->dict
,key
) == DICT_OK
;
/* Shared implementation of EXPIRE and EXPIREAT. 'seconds' is relative to the
 * current time (expireatCommand() subtracts time(NULL) before calling).
 * The key is looked up in c->db->dict. The visible paths are:
 *  - reply shared.czero;
 *  - delete the key with deleteKey(), incrementing server.dirty when it was
 *    deleted, and reply shared.cone;
 *  - compute when = time(NULL)+seconds and call setExpire(), replying
 *    shared.cone when it returns non-zero and shared.czero otherwise.
 * NOTE(review): the conditions selecting each path are not visible in this
 * listing (presumably key not found / non-positive seconds / normal case) --
 * confirm upstream. */
6045 static void expireGenericCommand(redisClient
*c
, robj
*key
, time_t seconds
) {
6048 de
= dictFind(c
->db
->dict
,key
);
6050 addReply(c
,shared
.czero
);
6054 if (deleteKey(c
->db
,key
)) server
.dirty
++;
6055 addReply(c
, shared
.cone
);
6058 time_t when
= time(NULL
)+seconds
;
6059 if (setExpire(c
->db
,key
,when
)) {
6060 addReply(c
,shared
.cone
);
6063 addReply(c
,shared
.czero
);
6069 static void expireCommand(redisClient
*c
) {
6070 expireGenericCommand(c
,c
->argv
[1],strtol(c
->argv
[2]->ptr
,NULL
,10));
6073 static void expireatCommand(redisClient
*c
) {
6074 expireGenericCommand(c
,c
->argv
[1],strtol(c
->argv
[2]->ptr
,NULL
,10)-time(NULL
));
/* TTL command: replies with an integer (":<ttl>\r\n"). The value is the
 * expire time of the key in argv[1], as returned by getExpire(), minus the
 * current time; a negative difference is clamped to -1.
 * NOTE(review): the declarations, the initial value of 'ttl' and the guard
 * around the subtraction are not visible in this listing -- presumably -1 is
 * also the answer for keys without an expire; confirm upstream. */
6077 static void ttlCommand(redisClient
*c
) {
6081 expire
= getExpire(c
->db
,c
->argv
[1]);
6083 ttl
= (int) (expire
-time(NULL
));
6084 if (ttl
< 0) ttl
= -1;
6086 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",ttl
));
6089 /* ================================ MULTI/EXEC ============================== */
6091 /* Client state initialization for MULTI/EXEC */
6092 static void initClientMultiState(redisClient
*c
) {
6093 c
->mstate
.commands
= NULL
;
6094 c
->mstate
.count
= 0;
6097 /* Release all the resources associated with MULTI/EXEC state */
/* Walks the c->mstate.count queued commands, dropping one reference on every
 * argument object of each command, and finally frees the commands array
 * itself. The array pointer is not reset here; the visible callers follow up
 * with initClientMultiState().
 * NOTE(review): the embedded line numbers skip 6107, so a statement inside
 * the outer loop is not visible in this listing -- confirm upstream. */
6098 static void freeClientMultiState(redisClient
*c
) {
6101 for (j
= 0; j
< c
->mstate
.count
; j
++) {
6103 multiCmd
*mc
= c
->mstate
.commands
+j
;
6105 for (i
= 0; i
< mc
->argc
; i
++)
6106 decrRefCount(mc
->argv
[i
]);
6109 zfree(c
->mstate
.commands
);
6112 /* Add a new command into the MULTI commands queue */
/* Grows c->mstate.commands by one multiCmd slot with zrealloc() and fills the
 * new slot (at index c->mstate.count) with a private copy of the client's
 * current argv array, taking one reference on every argument object so the
 * queued command keeps them alive.
 * NOTE(review): the embedded line numbers skip 6120, 6121 and 6126; the
 * statements storing 'cmd'/argc in the slot and incrementing mstate.count
 * are presumably there -- confirm upstream. */
6113 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
) {
6117 c
->mstate
.commands
= zrealloc(c
->mstate
.commands
,
6118 sizeof(multiCmd
)*(c
->mstate
.count
+1));
6119 mc
= c
->mstate
.commands
+c
->mstate
.count
;
6122 mc
->argv
= zmalloc(sizeof(robj
*)*c
->argc
);
6123 memcpy(mc
->argv
,c
->argv
,sizeof(robj
*)*c
->argc
);
6124 for (j
= 0; j
< c
->argc
; j
++)
6125 incrRefCount(mc
->argv
[j
]);
6129 static void multiCommand(redisClient
*c
) {
6130 c
->flags
|= REDIS_MULTI
;
6131 addReply(c
,shared
.ok
);
6134 static void discardCommand(redisClient
*c
) {
6135 if (!(c
->flags
& REDIS_MULTI
)) {
6136 addReplySds(c
,sdsnew("-ERR DISCARD without MULTI\r\n"));
6140 freeClientMultiState(c
);
6141 initClientMultiState(c
);
6142 c
->flags
&= (~REDIS_MULTI
);
6143 addReply(c
,shared
.ok
);
6146 static void execCommand(redisClient
*c
) {
6151 if (!(c
->flags
& REDIS_MULTI
)) {
6152 addReplySds(c
,sdsnew("-ERR EXEC without MULTI\r\n"));
6156 orig_argv
= c
->argv
;
6157 orig_argc
= c
->argc
;
6158 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->mstate
.count
));
6159 for (j
= 0; j
< c
->mstate
.count
; j
++) {
6160 c
->argc
= c
->mstate
.commands
[j
].argc
;
6161 c
->argv
= c
->mstate
.commands
[j
].argv
;
6162 call(c
,c
->mstate
.commands
[j
].cmd
);
6164 c
->argv
= orig_argv
;
6165 c
->argc
= orig_argc
;
6166 freeClientMultiState(c
);
6167 initClientMultiState(c
);
6168 c
->flags
&= (~REDIS_MULTI
);
6171 /* =========================== Blocking Operations ========================= */
6173 /* Currently Redis blocking operations support is limited to list POP ops,
6174 * so the current implementation is not fully generic, but it is also not
6175 * completely specific so it will not require a rewrite to support new
6176 * kind of blocking operations in the future.
6178 * Still it's important to note that list blocking operations can be already
6179 * used as a notification mechanism in order to implement other blocking
6180 * operations at application level, so there must be a very strong evidence
6181 * of usefulness and generality before new blocking operations are implemented.
6183 * This is how the current blocking POP works; we use BLPOP as an example:
6184 * - If the user calls BLPOP and the key exists and contains a non-empty list
6185 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6186 * if there is no need to block.
6187 * - If instead BLPOP is called and the key does not exist or the list is
6188 * empty we need to block. In order to do so we remove the notification for
6189 * new data to read in the client socket (so that we'll not serve new
6190 * requests if the blocking request is not served). Also we put the client
6191 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
6192 * blocking for these keys.
6193 * - If a PUSH operation against a key with blocked clients waiting is
6194 * performed, we serve the first in the list: basically instead of pushing
6195 * the new element inside the list we return it to the (first / oldest)
6196 * blocking client, unblock the client, and remove it from the list.
6198 * The above comment and the source code should be enough in order to understand
6199 * the implementation and modify / fix it later.
6202 /* Set a client in blocking mode for the specified key, with the specified
6204 static void blockForKeys(redisClient
*c
, robj
**keys
, int numkeys
, time_t timeout
) {
/* Records the 'numkeys' keys and the timeout in the client structure, and
 * registers the client in c->db->blockingkeys under every key. Each key
 * gets one reference for the client's own array and one more when it is
 * added as a new dictionary key. Finally the client is flagged
 * REDIS_BLOCKED and server.blpop_blocked_clients is incremented.
 * 'timeout' is stored as-is in c->blockingto; the visible caller passes an
 * absolute time, or the unmodified value when it is not > 0.
 * NOTE(review): the creation of the per-key list 'l' and the if/else
 * around the dictAdd()/dictGetEntryVal() paths are not visible in this
 * listing -- confirm upstream. */
6209 c
->blockingkeys
= zmalloc(sizeof(robj
*)*numkeys
);
6210 c
->blockingkeysnum
= numkeys
;
6211 c
->blockingto
= timeout
;
6212 for (j
= 0; j
< numkeys
; j
++) {
6213 /* Add the key in the client structure, to map clients -> keys */
6214 c
->blockingkeys
[j
] = keys
[j
];
6215 incrRefCount(keys
[j
]);
6217 /* And in the other "side", to map keys -> clients */
6218 de
= dictFind(c
->db
->blockingkeys
,keys
[j
]);
6222 /* For every key we take a list of clients blocked for it */
6224 retval
= dictAdd(c
->db
->blockingkeys
,keys
[j
],l
);
6225 incrRefCount(keys
[j
]);
6226 assert(retval
== DICT_OK
);
6228 l
= dictGetEntryVal(de
);
6230 listAddNodeTail(l
,c
);
6232 /* Mark the client as a blocked client */
6233 c
->flags
|= REDIS_BLOCKED
;
6234 server
.blpop_blocked_clients
++;
6237 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
/* Undoes blockForKeys(): for every key the client was waiting on, the client
 * is removed from that key's waiters list (the dictionary entry is deleted
 * when the list becomes empty) and the client's reference on the key is
 * dropped. Then the keys array is freed, REDIS_BLOCKED is cleared and
 * server.blpop_blocked_clients is decremented. If the client still has
 * unprocessed input in c->querybuf it is processed right away.
 * Requires c->blockingkeys != NULL (asserted). */
6238 static void unblockClientWaitingData(redisClient
*c
) {
6243 assert(c
->blockingkeys
!= NULL
);
6244 /* The client may wait for multiple keys, so unblock it for every key. */
6245 for (j
= 0; j
< c
->blockingkeysnum
; j
++) {
6246 /* Remove this client from the list of clients waiting for this key. */
6247 de
= dictFind(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
6249 l
= dictGetEntryVal(de
);
6250 listDelNode(l
,listSearchKey(l
,c
));
6251 /* If the list is empty we need to remove it to avoid wasting memory */
6252 if (listLength(l
) == 0)
6253 dictDelete(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
6254 decrRefCount(c
->blockingkeys
[j
]);
6256 /* Cleanup the client structure */
6257 zfree(c
->blockingkeys
);
6258 c
->blockingkeys
= NULL
;
6259 c
->flags
&= (~REDIS_BLOCKED
);
6260 server
.blpop_blocked_clients
--;
6261 /* We want to process data if there is some command waiting
6262 * in the input buffer. Note that this is safe even if
6263 * unblockClientWaitingData() gets called from freeClient() because
6264 * freeClient() will be smart enough to call this function
6265 * *after* c->querybuf was set to NULL. */
6266 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0) processInputBuffer(c
);
6269 /* This should be called from any function PUSHing into lists.
6270 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6271 * 'ele' is the element pushed.
6273 * If the function returns 0 there was no client waiting for a list push
6276 * If the function returns 1 there was a client waiting for a list push
6277 * against this key, the element was passed to this client thus it's not
6278 * needed to actually add it to the list and the caller should return asap. */
/* Looks 'key' up in c->db->blockingkeys and returns 0 when no client is
 * waiting on it. Otherwise the receiver taken from the waiters list gets a
 * two-element multi bulk reply (the key name, then 'ele') and is unblocked
 * with unblockClientWaitingData().
 * NOTE(review): the selection of the list node 'ln' and the final return
 * are not visible in this listing -- confirm upstream. */
6279 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
) {
6280 struct dictEntry
*de
;
6281 redisClient
*receiver
;
6285 de
= dictFind(c
->db
->blockingkeys
,key
);
6286 if (de
== NULL
) return 0;
6287 l
= dictGetEntryVal(de
);
6290 receiver
= ln
->value
;
6292 addReplySds(receiver
,sdsnew("*2\r\n"));
6293 addReplyBulkLen(receiver
,key
);
6294 addReply(receiver
,key
);
6295 addReply(receiver
,shared
.crlf
);
6296 addReplyBulkLen(receiver
,ele
);
6297 addReply(receiver
,ele
);
6298 addReply(receiver
,shared
.crlf
);
6299 unblockClientWaitingData(receiver
);
6303 /* Blocking RPOP/LPOP */
/* Shared body of BLPOP and BRPOP; 'where' (REDIS_HEAD or REDIS_TAIL in the
 * visible callers) is forwarded to popGenericCommand().
 * The keys argv[1]..argv[argc-2] are examined in order. A value that is not
 * a list gets shared.wrongtypeerr. A non-empty list is served immediately:
 * the "*2" header and the key name are emitted, then popGenericCommand() is
 * called with the client's argv/argc temporarily replaced, and the original
 * argv/argc are restored afterwards.
 * If no key could be served, the last argument is parsed as the timeout
 * (made absolute by adding time(NULL) when > 0) and the client is blocked
 * on all the keys with blockForKeys().
 * NOTE(review): the NULL check on the looked-up object, the setup of the
 * temporary argv[0]/argc, and the returns ending the early paths are not
 * visible in this listing -- confirm upstream. */
6304 static void blockingPopGenericCommand(redisClient
*c
, int where
) {
6309 for (j
= 1; j
< c
->argc
-1; j
++) {
6310 o
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
6312 if (o
->type
!= REDIS_LIST
) {
6313 addReply(c
,shared
.wrongtypeerr
);
6316 list
*list
= o
->ptr
;
6317 if (listLength(list
) != 0) {
6318 /* If the list contains elements fall back to the usual
6319 * non-blocking POP operation */
6320 robj
*argv
[2], **orig_argv
;
6323 /* We need to alter the command arguments before to call
6324 * popGenericCommand() as the command takes a single key. */
6325 orig_argv
= c
->argv
;
6326 orig_argc
= c
->argc
;
6327 argv
[1] = c
->argv
[j
];
6331 /* Also the return value is different, we need to output
6332 * the multi bulk reply header and the key name. The
6333 * "real" command will add the last element (the value)
6334 * for us. If this sounds like a hack to you it's just
6335 * because it is... */
6336 addReplySds(c
,sdsnew("*2\r\n"));
6337 addReplyBulkLen(c
,argv
[1]);
6338 addReply(c
,argv
[1]);
6339 addReply(c
,shared
.crlf
);
6340 popGenericCommand(c
,where
);
6342 /* Fix the client structure with the original stuff */
6343 c
->argv
= orig_argv
;
6344 c
->argc
= orig_argc
;
6350 /* If the list is empty or the key does not exist we must block */
6351 timeout
= strtol(c
->argv
[c
->argc
-1]->ptr
,NULL
,10);
6352 if (timeout
> 0) timeout
+= time(NULL
);
6353 blockForKeys(c
,c
->argv
+1,c
->argc
-2,timeout
);
6356 static void blpopCommand(redisClient
*c
) {
6357 blockingPopGenericCommand(c
,REDIS_HEAD
);
6360 static void brpopCommand(redisClient
*c
) {
6361 blockingPopGenericCommand(c
,REDIS_TAIL
);
6364 /* =============================== Replication ============================= */
/* Blocking write helper used by replication. Waits for 'fd' to become
 * writable with aeWait() (third argument 1000 -- presumably milliseconds,
 * TODO confirm) and then write()s from 'ptr'. A write() error returns -1.
 * The time elapsed since the call started, measured with time(), is
 * compared against 'timeout' (seconds).
 * 'ret' is initialised to the requested size.
 * NOTE(review): the loop, the pointer/size bookkeeping, the timeout branch
 * body and the final return are not visible in this listing -- confirm
 * upstream. */
6366 static int syncWrite(int fd
, char *ptr
, ssize_t size
, int timeout
) {
6367 ssize_t nwritten
, ret
= size
;
6368 time_t start
= time(NULL
);
6372 if (aeWait(fd
,AE_WRITABLE
,1000) & AE_WRITABLE
) {
6373 nwritten
= write(fd
,ptr
,size
);
6374 if (nwritten
== -1) return -1;
6378 if ((time(NULL
)-start
) > timeout
) {
/* Blocking read helper used by replication; the counterpart of syncWrite().
 * Waits for 'fd' to become readable with aeWait() (third argument 1000 --
 * presumably milliseconds, TODO confirm) and then read()s into 'ptr'.
 * A read() error returns -1. The time elapsed since the call started,
 * measured with time(), is compared against 'timeout' (seconds).
 * NOTE(review): the loop, the pointer/size bookkeeping, the timeout branch
 * body and the final return are not visible in this listing -- confirm
 * upstream. */
6386 static int syncRead(int fd
, char *ptr
, ssize_t size
, int timeout
) {
6387 ssize_t nread
, totread
= 0;
6388 time_t start
= time(NULL
);
6392 if (aeWait(fd
,AE_READABLE
,1000) & AE_READABLE
) {
6393 nread
= read(fd
,ptr
,size
);
6394 if (nread
== -1) return -1;
6399 if ((time(NULL
)-start
) > timeout
) {
/* Reads a line from 'fd' one byte at a time through syncRead(), returning -1
 * as soon as a single-byte read fails. When at least one byte has been
 * stored and the byte just before the current write position is '\r', that
 * byte is replaced by the NUL terminator.
 * NOTE(review): the loop, the end-of-line test, the handling of 'size' and
 * the success return are not visible in this listing -- confirm upstream. */
6407 static int syncReadLine(int fd
, char *ptr
, ssize_t size
, int timeout
) {
6414 if (syncRead(fd
,&c
,1,timeout
) == -1) return -1;
6417 if (nread
&& *(ptr
-1) == '\r') *(ptr
-1) = '\0';
/* SYNC command: register the calling client as a slave.
 * The command is ignored when the client is already flagged REDIS_SLAVE and
 * refused when the client has pending replies. Then:
 *  - if a BGSAVE is already running, the slaves list is scanned for one in
 *    REDIS_REPL_WAIT_BGSAVE_END state; when found, its reply list is
 *    duplicated into this client, which also enters WAIT_BGSAVE_END;
 *    otherwise the client enters REDIS_REPL_WAIT_BGSAVE_START and waits for
 *    the next BGSAVE;
 *  - if no BGSAVE is running one is started with rdbSaveBackground(); a
 *    failure is logged and reported to the client.
 * Finally the client is flagged REDIS_SLAVE and appended to server.slaves.
 * NOTE(review): the declarations, the returns after the error replies and
 * the test that tells "slave found" from "not found" after the scan are not
 * visible in this listing -- confirm upstream. */
6428 static void syncCommand(redisClient
*c
) {
6429 /* ignore SYNC if already slave or in monitor mode */
6430 if (c
->flags
& REDIS_SLAVE
) return;
6432 /* SYNC can't be issued when the server has pending data to send to
6433 * the client about already issued commands. We need a fresh reply
6434 * buffer registering the differences between the BGSAVE and the current
6435 * dataset, so that we can copy to other slaves if needed. */
6436 if (listLength(c
->reply
) != 0) {
6437 addReplySds(c
,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6441 redisLog(REDIS_NOTICE
,"Slave ask for synchronization");
6442 /* Here we need to check if there is a background saving operation
6443 * in progress, or if it is required to start one */
6444 if (server
.bgsavechildpid
!= -1) {
6445 /* Ok a background save is in progress. Let's check if it is a good
6446 * one for replication, i.e. if there is another slave that is
6447 * registering differences since the server forked to save */
6452 listRewind(server
.slaves
,&li
);
6453 while((ln
= listNext(&li
))) {
6455 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) break;
6458 /* Perfect, the server is already registering differences for
6459 * another slave. Set the right state, and copy the buffer. */
6460 listRelease(c
->reply
);
6461 c
->reply
= listDup(slave
->reply
);
6462 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
6463 redisLog(REDIS_NOTICE
,"Waiting for end of BGSAVE for SYNC");
6465 /* No way, we need to wait for the next BGSAVE in order to
6466 * register differences */
6467 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
6468 redisLog(REDIS_NOTICE
,"Waiting for next BGSAVE for SYNC");
6471 /* Ok we don't have a BGSAVE in progress, let's start one */
6472 redisLog(REDIS_NOTICE
,"Starting BGSAVE for SYNC");
6473 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
6474 redisLog(REDIS_NOTICE
,"Replication failed, can't BGSAVE");
6475 addReplySds(c
,sdsnew("-ERR Unalbe to perform background save\r\n"));
6478 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
6481 c
->flags
|= REDIS_SLAVE
;
6483 listAddNodeTail(server
.slaves
,c
);
/* Writable-event handler that streams the dump file to a slave; 'privdata'
 * is the slave's redisClient.
 * On the first invocation (slave->repldboff == 0) a "$<count>\r\n" bulk
 * header is written to 'fd'. Every invocation then seeks slave->repldbfd to
 * repldboff, reads up to REDIS_IOBUF_LEN bytes, writes them to 'fd' and
 * advances repldboff by the number of bytes actually written, so a short
 * write is resumed from the right offset on the next event.
 * When repldboff reaches repldbsize the file is closed, this handler is
 * removed, the slave becomes REDIS_REPL_ONLINE and sendReplyToClient is
 * installed as the writable handler.
 * NOTE(review): the header format uses %lld with an (unsigned long long)
 * argument. The value formatted in the header and the cleanup following each
 * error log are not visible in this listing -- confirm upstream. */
6487 static void sendBulkToSlave(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
6488 redisClient
*slave
= privdata
;
6490 REDIS_NOTUSED(mask
);
6491 char buf
[REDIS_IOBUF_LEN
];
6492 ssize_t nwritten
, buflen
;
6494 if (slave
->repldboff
== 0) {
6495 /* Write the bulk write count before to transfer the DB. In theory here
6496 * we don't know how much room there is in the output buffer of the
6497 * socket, but in practice SO_SNDLOWAT (the minimum count for output
6498 * operations) will never be smaller than the few bytes we need. */
6501 bulkcount
= sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6503 if (write(fd
,bulkcount
,sdslen(bulkcount
)) != (signed)sdslen(bulkcount
))
6511 lseek(slave
->repldbfd
,slave
->repldboff
,SEEK_SET
);
6512 buflen
= read(slave
->repldbfd
,buf
,REDIS_IOBUF_LEN
);
6514 redisLog(REDIS_WARNING
,"Read error sending DB to slave: %s",
6515 (buflen
== 0) ? "premature EOF" : strerror(errno
));
6519 if ((nwritten
= write(fd
,buf
,buflen
)) == -1) {
6520 redisLog(REDIS_VERBOSE
,"Write error sending DB to slave: %s",
6525 slave
->repldboff
+= nwritten
;
6526 if (slave
->repldboff
== slave
->repldbsize
) {
6527 close(slave
->repldbfd
);
6528 slave
->repldbfd
= -1;
6529 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
6530 slave
->replstate
= REDIS_REPL_ONLINE
;
6531 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
,
6532 sendReplyToClient
, slave
) == AE_ERR
) {
6536 addReplySds(slave
,sdsempty());
6537 redisLog(REDIS_NOTICE
,"Synchronization with slave succeeded");
6541 /* This function is called at the end of every backgrond saving.
6542 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6543 * otherwise REDIS_ERR is passed to the function.
6545 * The goal of this function is to handle slaves waiting for a successful
6546 * background saving in order to perform non-blocking synchronization. */
/* Walks server.slaves once:
 *  - a slave in REDIS_REPL_WAIT_BGSAVE_START is moved to
 *    REDIS_REPL_WAIT_BGSAVE_END (it will be served by the next BGSAVE);
 *  - a slave in REDIS_REPL_WAIT_BGSAVE_END is served now: if 'bgsaveerr' is
 *    not REDIS_OK the failure is logged; otherwise the dump file is opened
 *    and stat'ed, the transfer offset/size are initialised, the slave enters
 *    REDIS_REPL_SEND_BULK and sendBulkToSlave replaces its writable handler.
 * Afterwards a new rdbSaveBackground() may be started; if that fails the
 * failure is logged and the list is walked again for slaves still matching
 * REDIS_REPL_WAIT_BGSAVE_START.
 * NOTE(review): the use of 'startbgsave', and what is done to a slave on
 * each failure path, are not visible in this listing -- confirm upstream. */
6547 static void updateSlavesWaitingBgsave(int bgsaveerr
) {
6549 int startbgsave
= 0;
6552 listRewind(server
.slaves
,&li
);
6553 while((ln
= listNext(&li
))) {
6554 redisClient
*slave
= ln
->value
;
6556 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) {
6558 slave
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
6559 } else if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) {
6560 struct redis_stat buf
;
6562 if (bgsaveerr
!= REDIS_OK
) {
6564 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE child returned an error");
6567 if ((slave
->repldbfd
= open(server
.dbfilename
,O_RDONLY
)) == -1 ||
6568 redis_fstat(slave
->repldbfd
,&buf
) == -1) {
6570 redisLog(REDIS_WARNING
,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno
));
6573 slave
->repldboff
= 0;
6574 slave
->repldbsize
= buf
.st_size
;
6575 slave
->replstate
= REDIS_REPL_SEND_BULK
;
6576 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
6577 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
, sendBulkToSlave
, slave
) == AE_ERR
) {
6584 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
6587 listRewind(server
.slaves
,&li
);
6588 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE failed");
6589 while((ln
= listNext(&li
))) {
6590 redisClient
*slave
= ln
->value
;
6592 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
)
/* Slave side of the initial synchronization, done with blocking I/O:
 *  1. connect to server.masterhost:server.masterport;
 *  2. if server.masterauth is set, send "AUTH <password>" and require a
 *     reply starting with '+';
 *  3. send SYNC and read the bulk header, which must start with '$'; the
 *     dump size is parsed with atoi();
 *  4. copy the payload from the socket into a temporary file, at most 1024
 *     bytes per read;
 *  5. rename the temp file to server.dbfilename and load it with rdbLoad();
 *  6. wrap the socket in a client stored in server.master (flagged
 *     REDIS_MASTER, marked authenticated) and set server.replstate to
 *     REDIS_REPL_CONNECTED.
 * Every failure is logged with redisLog().
 * NOTE(review): the AUTH write length is strlen(masterauth)+7, which assumes
 * snprintf() did not truncate the 1024-byte authcmd buffer. Return values
 * and descriptor cleanup on the error paths are not visible in this listing
 * -- confirm upstream. */
6599 static int syncWithMaster(void) {
6600 char buf
[1024], tmpfile
[256], authcmd
[1024];
6602 int fd
= anetTcpConnect(NULL
,server
.masterhost
,server
.masterport
);
6606 redisLog(REDIS_WARNING
,"Unable to connect to MASTER: %s",
6611 /* AUTH with the master if required. */
6612 if(server
.masterauth
) {
6613 snprintf(authcmd
, 1024, "AUTH %s\r\n", server
.masterauth
);
6614 if (syncWrite(fd
, authcmd
, strlen(server
.masterauth
)+7, 5) == -1) {
6616 redisLog(REDIS_WARNING
,"Unable to AUTH to MASTER: %s",
6620 /* Read the AUTH result. */
6621 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
6623 redisLog(REDIS_WARNING
,"I/O error reading auth result from MASTER: %s",
6627 if (buf
[0] != '+') {
6629 redisLog(REDIS_WARNING
,"Cannot AUTH to MASTER, is the masterauth password correct?");
6634 /* Issue the SYNC command */
6635 if (syncWrite(fd
,"SYNC \r\n",7,5) == -1) {
6637 redisLog(REDIS_WARNING
,"I/O error writing to MASTER: %s",
6641 /* Read the bulk write count */
6642 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
6644 redisLog(REDIS_WARNING
,"I/O error reading bulk count from MASTER: %s",
6648 if (buf
[0] != '$') {
6650 redisLog(REDIS_WARNING
,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6653 dumpsize
= atoi(buf
+1);
6654 redisLog(REDIS_NOTICE
,"Receiving %d bytes data dump from MASTER",dumpsize
);
6655 /* Read the bulk write data on a temp file */
6656 snprintf(tmpfile
,256,"temp-%d.%ld.rdb",(int)time(NULL
),(long int)random());
6657 dfd
= open(tmpfile
,O_CREAT
|O_WRONLY
,0644);
6660 redisLog(REDIS_WARNING
,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno
));
6664 int nread
, nwritten
;
6666 nread
= read(fd
,buf
,(dumpsize
< 1024)?dumpsize
:1024);
6668 redisLog(REDIS_WARNING
,"I/O error trying to sync with MASTER: %s",
6674 nwritten
= write(dfd
,buf
,nread
);
6675 if (nwritten
== -1) {
6676 redisLog(REDIS_WARNING
,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno
));
6684 if (rename(tmpfile
,server
.dbfilename
) == -1) {
6685 redisLog(REDIS_WARNING
,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno
));
6691 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
6692 redisLog(REDIS_WARNING
,"Failed trying to load the MASTER synchronization DB from disk");
6696 server
.master
= createClient(fd
);
6697 server
.master
->flags
|= REDIS_MASTER
;
6698 server
.master
->authenticated
= 1;
6699 server
.replstate
= REDIS_REPL_CONNECTED
;
6703 static void slaveofCommand(redisClient
*c
) {
6704 if (!strcasecmp(c
->argv
[1]->ptr
,"no") &&
6705 !strcasecmp(c
->argv
[2]->ptr
,"one")) {
6706 if (server
.masterhost
) {
6707 sdsfree(server
.masterhost
);
6708 server
.masterhost
= NULL
;
6709 if (server
.master
) freeClient(server
.master
);
6710 server
.replstate
= REDIS_REPL_NONE
;
6711 redisLog(REDIS_NOTICE
,"MASTER MODE enabled (user request)");
6714 sdsfree(server
.masterhost
);
6715 server
.masterhost
= sdsdup(c
->argv
[1]->ptr
);
6716 server
.masterport
= atoi(c
->argv
[2]->ptr
);
6717 if (server
.master
) freeClient(server
.master
);
6718 server
.replstate
= REDIS_REPL_CONNECT
;
6719 redisLog(REDIS_NOTICE
,"SLAVE OF %s:%d enabled (user request)",
6720 server
.masterhost
, server
.masterport
);
6722 addReply(c
,shared
.ok
);
6725 /* ============================ Maxmemory directive ======================== */
6727 /* Try to free one object form the pre-allocated objects free list.
6728 * This is useful under low mem conditions as by default we take 1 million
6729 * free objects allocated. On success REDIS_OK is returned, otherwise
6731 static int tryFreeOneObjectFromFreelist(void) {
/* When server.vm_enabled is set, server.obj_freelist_mutex is taken around
 * the access to server.objfreelist and released on both branches.
 * If the list is non-empty its first node is detached: the node's value is
 * read into 'o' and the node is removed with listDelNode().
 * The visible caller compares the result against REDIS_OK.
 * NOTE(review): what is done with 'o' and both return statements are not
 * visible in this listing -- confirm upstream. */
6734 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
6735 if (listLength(server
.objfreelist
)) {
6736 listNode
*head
= listFirst(server
.objfreelist
);
6737 o
= listNodeValue(head
);
6738 listDelNode(server
.objfreelist
,head
);
6739 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
6743 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
6748 /* This function gets called when 'maxmemory' is set on the config file to limit
6749 * the max memory used by the server, and we are out of memory.
6750 * This function will try to, in order:
6752 * - Free objects from the free list
6753 * - Try to remove keys with an EXPIRE set
6755 * It is not possible to free enough memory to reach used-memory < maxmemory
6756 * the server will start refusing commands that will enlarge even more the
6759 static void freeMemoryIfNeeded(void) {
/* Loops while server.maxmemory is set and zmalloc_used_memory() exceeds
 * it. Each round first tries to release one object from the free list and
 * restarts the loop on success. Failing that, for every database whose
 * expires dictionary is non-empty it samples three random volatile keys,
 * keeps the one with the smallest expire time and removes it with
 * deleteKey(). The function returns as soon as a round frees nothing, so
 * it cannot spin forever when there is nothing left to evict.
 * NOTE(review): the declarations of 'minttl'/'t', the update of 'minttl'
 * and the place where 'freed' is set are not visible in this listing --
 * confirm upstream. */
6760 while (server
.maxmemory
&& zmalloc_used_memory() > server
.maxmemory
) {
6761 int j
, k
, freed
= 0;
6763 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
6764 for (j
= 0; j
< server
.dbnum
; j
++) {
6766 robj
*minkey
= NULL
;
6767 struct dictEntry
*de
;
6769 if (dictSize(server
.db
[j
].expires
)) {
6771 /* From a sample of three keys drop the one nearest to
6772 * the natural expire */
6773 for (k
= 0; k
< 3; k
++) {
6776 de
= dictGetRandomKey(server
.db
[j
].expires
);
6777 t
= (time_t) dictGetEntryVal(de
);
6778 if (minttl
== -1 || t
< minttl
) {
6779 minkey
= dictGetEntryKey(de
);
6783 deleteKey(server
.db
+j
,minkey
);
6786 if (!freed
) return; /* nothing to free... */
6790 /* ============================== Append Only file ========================== */
6792 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
6793 sds buf
= sdsempty();
6799 /* The DB this command was targetting is not the same as the last command
6800 * we appendend. To issue a SELECT command is needed. */
6801 if (dictid
!= server
.appendseldb
) {
6804 snprintf(seldb
,sizeof(seldb
),"%d",dictid
);
6805 buf
= sdscatprintf(buf
,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
6806 (unsigned long)strlen(seldb
),seldb
);
6807 server
.appendseldb
= dictid
;
6810 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6811 * EXPIREs into EXPIREATs calls */
6812 if (cmd
->proc
== expireCommand
) {
6815 tmpargv
[0] = createStringObject("EXPIREAT",8);
6816 tmpargv
[1] = argv
[1];
6817 incrRefCount(argv
[1]);
6818 when
= time(NULL
)+strtol(argv
[2]->ptr
,NULL
,10);
6819 tmpargv
[2] = createObject(REDIS_STRING
,
6820 sdscatprintf(sdsempty(),"%ld",when
));
6824 /* Append the actual command */
6825 buf
= sdscatprintf(buf
,"*%d\r\n",argc
);
6826 for (j
= 0; j
< argc
; j
++) {
6829 o
= getDecodedObject(o
);
6830 buf
= sdscatprintf(buf
,"$%lu\r\n",(unsigned long)sdslen(o
->ptr
));
6831 buf
= sdscatlen(buf
,o
->ptr
,sdslen(o
->ptr
));
6832 buf
= sdscatlen(buf
,"\r\n",2);
6836 /* Free the objects from the modified argv for EXPIREAT */
6837 if (cmd
->proc
== expireCommand
) {
6838 for (j
= 0; j
< 3; j
++)
6839 decrRefCount(argv
[j
]);
6842 /* We want to perform a single write. This should be guaranteed atomic
6843 * at least if the filesystem we are writing is a real physical one.
6844 * While this will save us against the server being killed I don't think
6845 * there is much to do about the whole server stopping for power problems
6847 nwritten
= write(server
.appendfd
,buf
,sdslen(buf
));
6848 if (nwritten
!= (signed)sdslen(buf
)) {
6849 /* Ooops, we are in troubles. The best thing to do for now is
6850 * to simply exit instead to give the illusion that everything is
6851 * working as expected. */
6852 if (nwritten
== -1) {
6853 redisLog(REDIS_WARNING
,"Exiting on error writing to the append-only file: %s",strerror(errno
));
6855 redisLog(REDIS_WARNING
,"Exiting on short write while writing to the append-only file: %s",strerror(errno
));
6859 /* If a background append only file rewriting is in progress we want to
6860 * accumulate the differences between the child DB and the current one
6861 * in a buffer, so that when the child process will do its work we
6862 * can append the differences to the new append only file. */
6863 if (server
.bgrewritechildpid
!= -1)
6864 server
.bgrewritebuf
= sdscatlen(server
.bgrewritebuf
,buf
,sdslen(buf
));
6868 if (server
.appendfsync
== APPENDFSYNC_ALWAYS
||
6869 (server
.appendfsync
== APPENDFSYNC_EVERYSEC
&&
6870 now
-server
.lastfsync
> 1))
6872 fsync(server
.appendfd
); /* Let's try to get this data on the disk */
6873 server
.lastfsync
= now
;
6877 /* In Redis commands are always executed in the context of a client, so in
6878 * order to load the append only file we need to create a fake client. */
6879 static struct redisClient
*createFakeClient(void) {
6880 struct redisClient
*c
= zmalloc(sizeof(*c
));
6884 c
->querybuf
= sdsempty();
6888 /* We set the fake client as a slave waiting for the synchronization
6889 * so that Redis will not try to send replies to this client. */
6890 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
6891 c
->reply
= listCreate();
6892 listSetFreeMethod(c
->reply
,decrRefCount
);
6893 listSetDupMethod(c
->reply
,dupClientReplyValue
);
6897 static void freeFakeClient(struct redisClient
*c
) {
6898 sdsfree(c
->querybuf
);
6899 listRelease(c
->reply
);
6903 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
6904 * error (the append only file is zero-length) REDIS_ERR is returned. On
6905 * fatal error an error message is logged and the program exists. */
6906 int loadAppendOnlyFile(char *filename
) {
6907 struct redisClient
*fakeClient
;
6908 FILE *fp
= fopen(filename
,"r");
6909 struct redis_stat sb
;
6910 unsigned long long loadedkeys
= 0;
6912 if (redis_fstat(fileno(fp
),&sb
) != -1 && sb
.st_size
== 0)
6916 redisLog(REDIS_WARNING
,"Fatal error: can't open the append log file for reading: %s",strerror(errno
));
6920 fakeClient
= createFakeClient();
6927 struct redisCommand
*cmd
;
6929 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) {
6935 if (buf
[0] != '*') goto fmterr
;
6937 argv
= zmalloc(sizeof(robj
*)*argc
);
6938 for (j
= 0; j
< argc
; j
++) {
6939 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) goto readerr
;
6940 if (buf
[0] != '$') goto fmterr
;
6941 len
= strtol(buf
+1,NULL
,10);
6942 argsds
= sdsnewlen(NULL
,len
);
6943 if (len
&& fread(argsds
,len
,1,fp
) == 0) goto fmterr
;
6944 argv
[j
] = createObject(REDIS_STRING
,argsds
);
6945 if (fread(buf
,2,1,fp
) == 0) goto fmterr
; /* discard CRLF */
6948 /* Command lookup */
6949 cmd
= lookupCommand(argv
[0]->ptr
);
6951 redisLog(REDIS_WARNING
,"Unknown command '%s' reading the append only file", argv
[0]->ptr
);
6954 /* Try object sharing and encoding */
6955 if (server
.shareobjects
) {
6957 for(j
= 1; j
< argc
; j
++)
6958 argv
[j
] = tryObjectSharing(argv
[j
]);
6960 if (cmd
->flags
& REDIS_CMD_BULK
)
6961 tryObjectEncoding(argv
[argc
-1]);
6962 /* Run the command in the context of a fake client */
6963 fakeClient
->argc
= argc
;
6964 fakeClient
->argv
= argv
;
6965 cmd
->proc(fakeClient
);
6966 /* Discard the reply objects list from the fake client */
6967 while(listLength(fakeClient
->reply
))
6968 listDelNode(fakeClient
->reply
,listFirst(fakeClient
->reply
));
6969 /* Clean up, ready for the next command */
6970 for (j
= 0; j
< argc
; j
++) decrRefCount(argv
[j
]);
6972 /* Handle swapping while loading big datasets when VM is on */
6974 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
6975 while (zmalloc_used_memory() > server
.vm_max_memory
) {
6976 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
6981 freeFakeClient(fakeClient
);
6986 redisLog(REDIS_WARNING
,"Unexpected end of file reading the append only file");
6988 redisLog(REDIS_WARNING
,"Unrecoverable error reading the append only file: %s", strerror(errno
));
6992 redisLog(REDIS_WARNING
,"Bad file format reading the append only file");
6996 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
6997 static int fwriteBulk(FILE *fp
, robj
*obj
) {
7001 /* Avoid the incr/decr ref count business if possible to help
7002 * copy-on-write (we are often in a child process when this function
7004 * Also makes sure that key objects don't get incrRefCount-ed when VM
7006 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
7007 obj
= getDecodedObject(obj
);
7010 snprintf(buf
,sizeof(buf
),"$%ld\r\n",(long)sdslen(obj
->ptr
));
7011 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) goto err
;
7012 if (sdslen(obj
->ptr
) && fwrite(obj
->ptr
,sdslen(obj
->ptr
),1,fp
) == 0)
7014 if (fwrite("\r\n",2,1,fp
) == 0) goto err
;
7015 if (decrrc
) decrRefCount(obj
);
7018 if (decrrc
) decrRefCount(obj
);
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 * The value is formatted with "%.17g". Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    /* The advertised count does not include the trailing CRLF. */
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    /* The advertised count does not include the trailing CRLF. */
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
7044 /* Write a sequence of commands able to fully rebuild the dataset into
7045 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7046 static int rewriteAppendOnlyFile(char *filename
) {
7047 dictIterator
*di
= NULL
;
7052 time_t now
= time(NULL
);
7054 /* Note that we have to use a different temp name here compared to the
7055 * one used by rewriteAppendOnlyFileBackground() function. */
7056 snprintf(tmpfile
,256,"temp-rewriteaof-%d.aof", (int) getpid());
7057 fp
= fopen(tmpfile
,"w");
7059 redisLog(REDIS_WARNING
, "Failed rewriting the append only file: %s", strerror(errno
));
7062 for (j
= 0; j
< server
.dbnum
; j
++) {
7063 char selectcmd
[] = "*2\r\n$6\r\nSELECT\r\n";
7064 redisDb
*db
= server
.db
+j
;
7066 if (dictSize(d
) == 0) continue;
7067 di
= dictGetIterator(d
);
7073 /* SELECT the new DB */
7074 if (fwrite(selectcmd
,sizeof(selectcmd
)-1,1,fp
) == 0) goto werr
;
7075 if (fwriteBulkLong(fp
,j
) == 0) goto werr
;
7077 /* Iterate this DB writing every entry */
7078 while((de
= dictNext(di
)) != NULL
) {
7083 key
= dictGetEntryKey(de
);
7084 /* If the value for this key is swapped, load a preview in memory.
7085 * We use a "swapped" flag to remember if we need to free the
7086 * value object instead to just increment the ref count anyway
7087 * in order to avoid copy-on-write of pages if we are forked() */
7088 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
7089 key
->storage
== REDIS_VM_SWAPPING
) {
7090 o
= dictGetEntryVal(de
);
7093 o
= vmPreviewObject(key
);
7096 expiretime
= getExpire(db
,key
);
7098 /* Save the key and associated value */
7099 if (o
->type
== REDIS_STRING
) {
7100 /* Emit a SET command */
7101 char cmd
[]="*3\r\n$3\r\nSET\r\n";
7102 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7104 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7105 if (fwriteBulk(fp
,o
) == 0) goto werr
;
7106 } else if (o
->type
== REDIS_LIST
) {
7107 /* Emit the RPUSHes needed to rebuild the list */
7108 list
*list
= o
->ptr
;
7112 listRewind(list
,&li
);
7113 while((ln
= listNext(&li
))) {
7114 char cmd
[]="*3\r\n$5\r\nRPUSH\r\n";
7115 robj
*eleobj
= listNodeValue(ln
);
7117 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7118 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7119 if (fwriteBulk(fp
,eleobj
) == 0) goto werr
;
7121 } else if (o
->type
== REDIS_SET
) {
7122 /* Emit the SADDs needed to rebuild the set */
7124 dictIterator
*di
= dictGetIterator(set
);
7127 while((de
= dictNext(di
)) != NULL
) {
7128 char cmd
[]="*3\r\n$4\r\nSADD\r\n";
7129 robj
*eleobj
= dictGetEntryKey(de
);
7131 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7132 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7133 if (fwriteBulk(fp
,eleobj
) == 0) goto werr
;
7135 dictReleaseIterator(di
);
7136 } else if (o
->type
== REDIS_ZSET
) {
7137 /* Emit the ZADDs needed to rebuild the sorted set */
7139 dictIterator
*di
= dictGetIterator(zs
->dict
);
7142 while((de
= dictNext(di
)) != NULL
) {
7143 char cmd
[]="*4\r\n$4\r\nZADD\r\n";
7144 robj
*eleobj
= dictGetEntryKey(de
);
7145 double *score
= dictGetEntryVal(de
);
7147 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7148 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7149 if (fwriteBulkDouble(fp
,*score
) == 0) goto werr
;
7150 if (fwriteBulk(fp
,eleobj
) == 0) goto werr
;
7152 dictReleaseIterator(di
);
7154 redisAssert(0 != 0);
7156 /* Save the expire time */
7157 if (expiretime
!= -1) {
7158 char cmd
[]="*3\r\n$8\r\nEXPIREAT\r\n";
7159 /* If this key is already expired skip it */
7160 if (expiretime
< now
) continue;
7161 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7162 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7163 if (fwriteBulkLong(fp
,expiretime
) == 0) goto werr
;
7165 if (swapped
) decrRefCount(o
);
7167 dictReleaseIterator(di
);
7170 /* Make sure data will not remain on the OS's output buffers */
7175 /* Use RENAME to make sure the DB file is changed atomically only
7176 * if the generate DB file is ok. */
7177 if (rename(tmpfile
,filename
) == -1) {
7178 redisLog(REDIS_WARNING
,"Error moving temp append only file on the final destination: %s", strerror(errno
));
7182 redisLog(REDIS_NOTICE
,"SYNC append only file rewrite performed");
7188 redisLog(REDIS_WARNING
,"Write error writing append only file on disk: %s", strerror(errno
));
7189 if (di
) dictReleaseIterator(di
);
7193 /* This is how rewriting of the append only file in background works:
7195 * 1) The user calls BGREWRITEAOF
7196 * 2) Redis calls this function, that forks():
7197 * 2a) the child rewrite the append only file in a temp file.
7198 * 2b) the parent accumulates differences in server.bgrewritebuf.
7199 * 3) When the child finished '2a' exists.
7200 * 4) The parent will trap the exit code, if it's OK, will append the
7201 * data accumulated into server.bgrewritebuf into the temp file, and
7202 * finally will rename(2) the temp file in the actual file name.
7203 * The the new file is reopened as the new append only file. Profit!
7205 static int rewriteAppendOnlyFileBackground(void) {
7208 if (server
.bgrewritechildpid
!= -1) return REDIS_ERR
;
7209 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
7210 if ((childpid
= fork()) == 0) {
7214 if (server
.vm_enabled
) vmReopenSwapFile();
7216 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7217 if (rewriteAppendOnlyFile(tmpfile
) == REDIS_OK
) {
7224 if (childpid
== -1) {
7225 redisLog(REDIS_WARNING
,
7226 "Can't rewrite append only file in background: fork: %s",
7230 redisLog(REDIS_NOTICE
,
7231 "Background append only file rewriting started by pid %d",childpid
);
7232 server
.bgrewritechildpid
= childpid
;
7233 /* We set appendseldb to -1 in order to force the next call to the
7234 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7235 * accumulated by the parent into server.bgrewritebuf will start
7236 * with a SELECT statement and it will be safe to merge. */
7237 server
.appendseldb
= -1;
7240 return REDIS_OK
; /* unreached */
7243 static void bgrewriteaofCommand(redisClient
*c
) {
7244 if (server
.bgrewritechildpid
!= -1) {
7245 addReplySds(c
,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7248 if (rewriteAppendOnlyFileBackground() == REDIS_OK
) {
7249 char *status
= "+Background append only file rewriting started\r\n";
7250 addReplySds(c
,sdsnew(status
));
7252 addReply(c
,shared
.err
);
/* Remove the temp file that the background rewriting child with pid
 * 'childpid' writes to (see rewriteAppendOnlyFileBackground() for the
 * naming scheme). Errors from unlink(2) are ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non blocking) VM.
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as
 * parallel as a fully non-blocking VM.
 */
7284 /* =================== Virtual Memory - Blocking Side ====================== */
7286 /* substitute the first occurrence of '%p' with the process pid in the
7287 * swap file name. */
7288 static void expandVmSwapFilename(void) {
7289 char *p
= strstr(server
.vm_swap_file
,"%p");
7295 new = sdscat(new,server
.vm_swap_file
);
7296 new = sdscatprintf(new,"%ld",(long) getpid());
7297 new = sdscat(new,p
+2);
7298 zfree(server
.vm_swap_file
);
7299 server
.vm_swap_file
= new;
7302 static void vmInit(void) {
7307 if (server
.vm_max_threads
!= 0)
7308 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7310 expandVmSwapFilename();
7311 redisLog(REDIS_NOTICE
,"Using '%s' as swap file",server
.vm_swap_file
);
7312 if ((server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b")) == NULL
) {
7313 server
.vm_fp
= fopen(server
.vm_swap_file
,"w+b");
7315 if (server
.vm_fp
== NULL
) {
7316 redisLog(REDIS_WARNING
,
7317 "Impossible to open the swap file: %s. Exiting.",
7321 server
.vm_fd
= fileno(server
.vm_fp
);
7322 server
.vm_next_page
= 0;
7323 server
.vm_near_pages
= 0;
7324 server
.vm_stats_used_pages
= 0;
7325 server
.vm_stats_swapped_objects
= 0;
7326 server
.vm_stats_swapouts
= 0;
7327 server
.vm_stats_swapins
= 0;
7328 totsize
= server
.vm_pages
*server
.vm_page_size
;
7329 redisLog(REDIS_NOTICE
,"Allocating %lld bytes of swap file",totsize
);
7330 if (ftruncate(server
.vm_fd
,totsize
) == -1) {
7331 redisLog(REDIS_WARNING
,"Can't ftruncate swap file: %s. Exiting.",
7335 redisLog(REDIS_NOTICE
,"Swap file allocated with success");
7337 server
.vm_bitmap
= zmalloc((server
.vm_pages
+7)/8);
7338 redisLog(REDIS_VERBOSE
,"Allocated %lld bytes page table for %lld pages",
7339 (long long) (server
.vm_pages
+7)/8, server
.vm_pages
);
7340 memset(server
.vm_bitmap
,0,(server
.vm_pages
+7)/8);
7342 /* Initialize threaded I/O (used by Virtual Memory) */
7343 server
.io_newjobs
= listCreate();
7344 server
.io_processing
= listCreate();
7345 server
.io_processed
= listCreate();
7346 server
.io_ready_clients
= listCreate();
7347 pthread_mutex_init(&server
.io_mutex
,NULL
);
7348 pthread_mutex_init(&server
.obj_freelist_mutex
,NULL
);
7349 pthread_mutex_init(&server
.io_swapfile_mutex
,NULL
);
7350 server
.io_active_threads
= 0;
7351 if (pipe(pipefds
) == -1) {
7352 redisLog(REDIS_WARNING
,"Unable to intialized VM: pipe(2): %s. Exiting."
7356 server
.io_ready_pipe_read
= pipefds
[0];
7357 server
.io_ready_pipe_write
= pipefds
[1];
7358 redisAssert(anetNonBlock(NULL
,server
.io_ready_pipe_read
) != ANET_ERR
);
7359 /* LZF requires a lot of stack */
7360 pthread_attr_init(&server
.io_threads_attr
);
7361 pthread_attr_getstacksize(&server
.io_threads_attr
, &stacksize
);
7362 while (stacksize
< REDIS_THREAD_STACK_SIZE
) stacksize
*= 2;
7363 pthread_attr_setstacksize(&server
.io_threads_attr
, stacksize
);
7364 /* Listen for events in the threaded I/O pipe */
7365 if (aeCreateFileEvent(server
.el
, server
.io_ready_pipe_read
, AE_READABLE
,
7366 vmThreadedIOCompletedJob
, NULL
) == AE_ERR
)
7367 oom("creating file event");
7370 /* Mark the page as used */
7371 static void vmMarkPageUsed(off_t page
) {
7372 off_t byte
= page
/8;
7374 redisAssert(vmFreePage(page
) == 1);
7375 server
.vm_bitmap
[byte
] |= 1<<bit
;
7378 /* Mark N contiguous pages as used, with 'page' being the first. */
7379 static void vmMarkPagesUsed(off_t page
, off_t count
) {
7382 for (j
= 0; j
< count
; j
++)
7383 vmMarkPageUsed(page
+j
);
7384 server
.vm_stats_used_pages
+= count
;
7385 redisLog(REDIS_DEBUG
,"Mark USED pages: %lld pages at %lld\n",
7386 (long long)count
, (long long)page
);
7389 /* Mark the page as free */
7390 static void vmMarkPageFree(off_t page
) {
7391 off_t byte
= page
/8;
7393 redisAssert(vmFreePage(page
) == 0);
7394 server
.vm_bitmap
[byte
] &= ~(1<<bit
);
7397 /* Mark N contiguous pages as free, with 'page' being the first. */
7398 static void vmMarkPagesFree(off_t page
, off_t count
) {
7401 for (j
= 0; j
< count
; j
++)
7402 vmMarkPageFree(page
+j
);
7403 server
.vm_stats_used_pages
-= count
;
7404 redisLog(REDIS_DEBUG
,"Mark FREE pages: %lld pages at %lld\n",
7405 (long long)count
, (long long)page
);
7408 /* Test if the page is free */
7409 static int vmFreePage(off_t page
) {
7410 off_t byte
= page
/8;
7412 return (server
.vm_bitmap
[byte
] & (1<<bit
)) == 0;
7415 /* Find N contiguous free pages storing the first page of the cluster in *first.
7416 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7417 * REDIS_ERR is returned.
7419 * This function uses a simple algorithm: we try to allocate
7420 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7421 * again from the start of the swap file searching for free spaces.
7423 * If it looks pretty clear that there are no free pages near our offset
7424 * we try to find less populated places doing a forward jump of
7425 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7426 * without hurry, and then we jump again and so forth...
7428 * This function can be improved using a free list to avoid to guess
7429 * too much, since we could collect data about freed pages.
7431 * note: I implemented this function just after watching an episode of
7432 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7434 static int vmFindContiguousPages(off_t
*first
, off_t n
) {
7435 off_t base
, offset
= 0, since_jump
= 0, numfree
= 0;
7437 if (server
.vm_near_pages
== REDIS_VM_MAX_NEAR_PAGES
) {
7438 server
.vm_near_pages
= 0;
7439 server
.vm_next_page
= 0;
7441 server
.vm_near_pages
++; /* Yet another try for pages near to the old ones */
7442 base
= server
.vm_next_page
;
7444 while(offset
< server
.vm_pages
) {
7445 off_t
this = base
+offset
;
7447 /* If we overflow, restart from page zero */
7448 if (this >= server
.vm_pages
) {
7449 this -= server
.vm_pages
;
7451 /* Just overflowed, what we found on tail is no longer
7452 * interesting, as it's no longer contiguous. */
7456 if (vmFreePage(this)) {
7457 /* This is a free page */
7459 /* Already got N free pages? Return to the caller, with success */
7461 *first
= this-(n
-1);
7462 server
.vm_next_page
= this+1;
7463 redisLog(REDIS_DEBUG
, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n
, (long long) *first
);
7467 /* The current one is not a free page */
7471 /* Fast-forward if the current page is not free and we already
7472 * searched enough near this place. */
7474 if (!numfree
&& since_jump
>= REDIS_VM_MAX_RANDOM_JUMP
/4) {
7475 offset
+= random() % REDIS_VM_MAX_RANDOM_JUMP
;
7477 /* Note that even if we rewind after the jump, we are don't need
7478 * to make sure numfree is set to zero as we only jump *if* it
7479 * is set to zero. */
7481 /* Otherwise just check the next page */
7488 /* Write the specified object at the specified page of the swap file */
7489 static int vmWriteObjectOnSwap(robj
*o
, off_t page
) {
7490 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
7491 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
7492 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
7493 redisLog(REDIS_WARNING
,
7494 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
7498 rdbSaveObject(server
.vm_fp
,o
);
7499 fflush(server
.vm_fp
);
7500 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
7504 /* Swap the 'val' object relative to 'key' into disk. Store all the information
7505 * needed to later retrieve the object into the key object.
7506 * If we can't find enough contiguous empty pages to swap the object on disk
7507 * REDIS_ERR is returned. */
7508 static int vmSwapObjectBlocking(robj
*key
, robj
*val
) {
7509 off_t pages
= rdbSavedObjectPages(val
,NULL
);
7512 assert(key
->storage
== REDIS_VM_MEMORY
);
7513 assert(key
->refcount
== 1);
7514 if (vmFindContiguousPages(&page
,pages
) == REDIS_ERR
) return REDIS_ERR
;
7515 if (vmWriteObjectOnSwap(val
,page
) == REDIS_ERR
) return REDIS_ERR
;
7516 key
->vm
.page
= page
;
7517 key
->vm
.usedpages
= pages
;
7518 key
->storage
= REDIS_VM_SWAPPED
;
7519 key
->vtype
= val
->type
;
7520 decrRefCount(val
); /* Deallocate the object from memory. */
7521 vmMarkPagesUsed(page
,pages
);
7522 redisLog(REDIS_DEBUG
,"VM: object %s swapped out at %lld (%lld pages)",
7523 (unsigned char*) key
->ptr
,
7524 (unsigned long long) page
, (unsigned long long) pages
);
7525 server
.vm_stats_swapped_objects
++;
7526 server
.vm_stats_swapouts
++;
7530 static robj
*vmReadObjectFromSwap(off_t page
, int type
) {
7533 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
7534 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
7535 redisLog(REDIS_WARNING
,
7536 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
7540 o
= rdbLoadObject(type
,server
.vm_fp
);
7542 redisLog(REDIS_WARNING
, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno
));
7545 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
7549 /* Load the value object relative to the 'key' object from swap to memory.
7550 * The newly allocated object is returned.
7552 * If preview is true the unserialized object is returned to the caller but
7553 * no changes are made to the key object, nor the pages are marked as freed */
7554 static robj
*vmGenericLoadObject(robj
*key
, int preview
) {
7557 redisAssert(key
->storage
== REDIS_VM_SWAPPED
|| key
->storage
== REDIS_VM_LOADING
);
7558 val
= vmReadObjectFromSwap(key
->vm
.page
,key
->vtype
);
7560 key
->storage
= REDIS_VM_MEMORY
;
7561 key
->vm
.atime
= server
.unixtime
;
7562 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
7563 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk",
7564 (unsigned char*) key
->ptr
);
7565 server
.vm_stats_swapped_objects
--;
7567 redisLog(REDIS_DEBUG
, "VM: object %s previewed from disk",
7568 (unsigned char*) key
->ptr
);
7570 server
.vm_stats_swapins
++;
7574 /* Plain object loading, from swap to memory */
7575 static robj
*vmLoadObject(robj
*key
) {
7576 /* If we are loading the object in background, stop it, we
7577 * need to load this object synchronously ASAP. */
7578 if (key
->storage
== REDIS_VM_LOADING
)
7579 vmCancelThreadedIOJob(key
);
7580 return vmGenericLoadObject(key
,0);
7583 /* Just load the value on disk, without to modify the key.
7584 * This is useful when we want to perform some operation on the value
7585 * without to really bring it from swap to memory, like while saving the
7586 * dataset or rewriting the append only log. */
7587 static robj
*vmPreviewObject(robj
*key
) {
7588 return vmGenericLoadObject(key
,1);
7591 /* How a good candidate is this object for swapping?
7592 * The better candidate it is, the greater the returned value.
7594 * Currently we try to perform a fast estimation of the object size in
7595 * memory, and combine it with aging informations.
7597 * Basically swappability = idle-time * log(estimated size)
7599 * Bigger objects are preferred over smaller objects, but not
7600 * proportionally, this is why we use the logarithm. This algorithm is
7601 * just a first try and will probably be tuned later. */
7602 static double computeObjectSwappability(robj
*o
) {
7603 time_t age
= server
.unixtime
- o
->vm
.atime
;
7607 struct dictEntry
*de
;
7610 if (age
<= 0) return 0;
7613 if (o
->encoding
!= REDIS_ENCODING_RAW
) {
7616 asize
= sdslen(o
->ptr
)+sizeof(*o
)+sizeof(long)*2;
7621 listNode
*ln
= listFirst(l
);
7623 asize
= sizeof(list
);
7625 robj
*ele
= ln
->value
;
7628 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
7629 (sizeof(*o
)+sdslen(ele
->ptr
)) :
7631 asize
+= (sizeof(listNode
)+elesize
)*listLength(l
);
7636 z
= (o
->type
== REDIS_ZSET
);
7637 d
= z
? ((zset
*)o
->ptr
)->dict
: o
->ptr
;
7639 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
7640 if (z
) asize
+= sizeof(zset
)-sizeof(dict
);
7645 de
= dictGetRandomKey(d
);
7646 ele
= dictGetEntryKey(de
);
7647 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
7648 (sizeof(*o
)+sdslen(ele
->ptr
)) :
7650 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
7651 if (z
) asize
+= sizeof(zskiplistNode
)*dictSize(d
);
7655 return (double)age
*log(1+asize
);
7658 /* Try to swap an object that's a good candidate for swapping.
7659 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
7660 * to swap any object at all.
7662 * If 'usethreaded' is true, Redis will try to swap the object in background
7663 * using I/O threads. */
7664 static int vmSwapOneObject(int usethreads
) {
7666 struct dictEntry
*best
= NULL
;
7667 double best_swappability
= 0;
7668 redisDb
*best_db
= NULL
;
7671 for (j
= 0; j
< server
.dbnum
; j
++) {
7672 redisDb
*db
= server
.db
+j
;
7673 /* Why maxtries is set to 100?
7674 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7675 * are swappable objects */
7678 if (dictSize(db
->dict
) == 0) continue;
7679 for (i
= 0; i
< 5; i
++) {
7681 double swappability
;
7683 if (maxtries
) maxtries
--;
7684 de
= dictGetRandomKey(db
->dict
);
7685 key
= dictGetEntryKey(de
);
7686 val
= dictGetEntryVal(de
);
7687 /* Only swap objects that are currently in memory.
7689 * Also don't swap shared objects if threaded VM is on, as we
7690 * try to ensure that the main thread does not touch the
7691 * object while the I/O thread is using it, but we can't
7692 * control other keys without adding additional mutex. */
7693 if (key
->storage
!= REDIS_VM_MEMORY
||
7694 (server
.vm_max_threads
!= 0 && val
->refcount
!= 1)) {
7695 if (maxtries
) i
--; /* don't count this try */
7698 swappability
= computeObjectSwappability(val
);
7699 if (!best
|| swappability
> best_swappability
) {
7701 best_swappability
= swappability
;
7706 if (best
== NULL
) return REDIS_ERR
;
7707 key
= dictGetEntryKey(best
);
7708 val
= dictGetEntryVal(best
);
7710 redisLog(REDIS_DEBUG
,"Key with best swappability: %s, %f",
7711 key
->ptr
, best_swappability
);
7713 /* Unshare the key if needed */
7714 if (key
->refcount
> 1) {
7715 robj
*newkey
= dupStringObject(key
);
7717 key
= dictGetEntryKey(best
) = newkey
;
7721 vmSwapObjectThreaded(key
,val
,best_db
);
7724 if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
7725 dictGetEntryVal(best
) = NULL
;
/* Swap one object out synchronously, without using the I/O threads.
 * Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
/* Swap one object out in background, using the I/O threads.
 * Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
7741 /* Return true if it's safe to swap out objects in a given moment.
7742 * Basically we don't want to swap objects out while there is a BGSAVE
7743 * or a BGAEOREWRITE running in backgroud. */
7744 static int vmCanSwapOut(void) {
7745 return (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1);
7748 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
7749 * and was deleted. Otherwise 0 is returned. */
7750 static int deleteIfSwapped(redisDb
*db
, robj
*key
) {
7754 if ((de
= dictFind(db
->dict
,key
)) == NULL
) return 0;
7755 foundkey
= dictGetEntryKey(de
);
7756 if (foundkey
->storage
== REDIS_VM_MEMORY
) return 0;
7761 /* =================== Virtual Memory - Threaded I/O ======================= */
7763 static void freeIOJob(iojob
*j
) {
7764 if ((j
->type
== REDIS_IOJOB_PREPARE_SWAP
||
7765 j
->type
== REDIS_IOJOB_DO_SWAP
||
7766 j
->type
== REDIS_IOJOB_LOAD
) && j
->val
!= NULL
)
7767 decrRefCount(j
->val
);
7768 decrRefCount(j
->key
);
7772 /* Every time a thread finished a Job, it writes a byte into the write side
7773 * of an unix pipe in order to "awake" the main thread, and this function
/* File-event handler attached to the read side of the I/O completion pipe.
 *
 * Every byte read from 'fd' stands for one completed job: the first node of
 * server.io_processed is unlinked and post-processed here according to the
 * job type:
 *   - REDIS_IOJOB_LOAD: the key goes back to REDIS_VM_MEMORY, its swap pages
 *     are marked free, the loaded value is stored in the dict entry and the
 *     clients blocked on the key are handled.
 *   - REDIS_IOJOB_PREPARE_SWAP: if swapping is possible and contiguous pages
 *     are found, the pages are marked used and the job is rebranded as
 *     REDIS_IOJOB_DO_SWAP; otherwise the key is put back to REDIS_VM_MEMORY.
 *   - REDIS_IOJOB_DO_SWAP: the key becomes REDIS_VM_SWAPPED, the in-memory
 *     value is released and the dict entry value is set to NULL.
 *
 * 'toprocess' is computed on the first iteration from the length of
 * server.io_processed and REDIS_MAX_COMPLETED_JOBS_PROCESSED (as a
 * percentage, never less than 1); the function returns once 'processed'
 * reaches it. A read(2) error other than EAGAIN is logged.
 *
 * 'privdata' and 'mask' are unused.
 * NOTE(review): several statements of this function (local declarations,
 * locking, job release, counters) are not visible in this view; the summary
 * above only covers what is. */
7775 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
,
7779 int retval
, processed
= 0, toprocess
= -1, trytoswap
= 1;
7781 REDIS_NOTUSED(mask
);
7782 REDIS_NOTUSED(privdata
);
7784 /* For every byte we read in the read side of the pipe, there is one
7785 * I/O job completed to process. */
7786 while((retval
= read(fd
,buf
,1)) == 1) {
7790 struct dictEntry
*de
;
7792 redisLog(REDIS_DEBUG
,"Processing I/O completed job");
7794 /* Get the processed element (the oldest one) */
7796 assert(listLength(server
.io_processed
) != 0);
7797 if (toprocess
== -1) {
7798 toprocess
= (listLength(server
.io_processed
)*REDIS_MAX_COMPLETED_JOBS_PROCESSED
)/100;
7799 if (toprocess
<= 0) toprocess
= 1;
7801 ln
= listFirst(server
.io_processed
);
7803 listDelNode(server
.io_processed
,ln
);
7805 /* If this job is marked as canceled, just ignore it */
7810 /* Post process it in the main thread, as there are things we
7811 * can do just here to avoid race conditions and/or invasive locks */
7812 redisLog(REDIS_DEBUG
,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j
, j
->type
, (void*)j
->key
, (char*)j
->key
->ptr
, j
->key
->refcount
);
7813 de
= dictFind(j
->db
->dict
,j
->key
);
7815 key
= dictGetEntryKey(de
);
7816 if (j
->type
== REDIS_IOJOB_LOAD
) {
7819 /* Key loaded, bring it at home */
7820 key
->storage
= REDIS_VM_MEMORY
;
7821 key
->vm
.atime
= server
.unixtime
;
7822 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
7823 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk (threaded)",
7824 (unsigned char*) key
->ptr
);
7825 server
.vm_stats_swapped_objects
--;
7826 server
.vm_stats_swapins
++;
7827 dictGetEntryVal(de
) = j
->val
;
7828 incrRefCount(j
->val
);
7831 /* Handle clients waiting for this key to be loaded. */
7832 handleClientsBlockedOnSwappedKey(db
,key
);
7833 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
7834 /* Now we know the amount of pages required to swap this object.
7835 * Let's find some space for it, and queue this task again
7836 * rebranded as REDIS_IOJOB_DO_SWAP. */
7837 if (!vmCanSwapOut() ||
7838 vmFindContiguousPages(&j
->page
,j
->pages
) == REDIS_ERR
)
7840 /* Ooops... no space or we can't swap as there is
7841 * a fork()ed Redis trying to save stuff on disk. */
7843 key
->storage
= REDIS_VM_MEMORY
; /* undo operation */
7845 /* Note that we need to mark this pages as used now,
7846 * if the job will be canceled, we'll mark them as freed
7848 vmMarkPagesUsed(j
->page
,j
->pages
);
7849 j
->type
= REDIS_IOJOB_DO_SWAP
;
7854 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
7857 /* Key swapped. We can finally free some memory. */
7858 if (key
->storage
!= REDIS_VM_SWAPPING
) {
7859 printf("key->storage: %d\n",key
->storage
);
7860 printf("key->name: %s\n",(char*)key
->ptr
);
7861 printf("key->refcount: %d\n",key
->refcount
);
7862 printf("val: %p\n",(void*)j
->val
);
7863 printf("val->type: %d\n",j
->val
->type
);
7864 printf("val->ptr: %s\n",(char*)j
->val
->ptr
);
7866 redisAssert(key
->storage
== REDIS_VM_SWAPPING
);
7867 val
= dictGetEntryVal(de
);
7868 key
->vm
.page
= j
->page
;
7869 key
->vm
.usedpages
= j
->pages
;
7870 key
->storage
= REDIS_VM_SWAPPED
;
7871 key
->vtype
= j
->val
->type
;
7872 decrRefCount(val
); /* Deallocate the object from memory. */
7873 dictGetEntryVal(de
) = NULL
;
7874 redisLog(REDIS_DEBUG
,
7875 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
7876 (unsigned char*) key
->ptr
,
7877 (unsigned long long) j
->page
, (unsigned long long) j
->pages
);
7878 server
.vm_stats_swapped_objects
++;
7879 server
.vm_stats_swapouts
++;
7881 /* Put a few more swap requests in queue if we are still
7883 if (trytoswap
&& vmCanSwapOut() &&
7884 zmalloc_used_memory() > server
.vm_max_memory
)
7889 more
= listLength(server
.io_newjobs
) <
7890 (unsigned) server
.vm_max_threads
;
7892 /* Don't waste CPU time if swappable objects are rare. */
7893 if (vmSwapOneObjectThreaded() == REDIS_ERR
) {
7901 if (processed
== toprocess
) return;
7903 if (retval
< 0 && errno
!= EAGAIN
) {
7904 redisLog(REDIS_WARNING
,
7905 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
7910 static void lockThreadedIO(void) {
7911 pthread_mutex_lock(&server
.io_mutex
);
7914 static void unlockThreadedIO(void) {
7915 pthread_mutex_unlock(&server
.io_mutex
);
7918 /* Remove the specified object from the threaded I/O queue if still not
7919 * processed, otherwise make sure to flag it as canceled. */
/* 'o' is the key object whose in-flight swap-out or load has to be undone;
 * it must be in REDIS_VM_LOADING or REDIS_VM_SWAPPING state (asserted).
 *
 * The three job lists (0: io_newjobs, 1: io_processing, 2: io_processed) are
 * scanned for a job that is not already canceled and whose key compares
 * equal to 'o'. When one is found, the pages of a DO_SWAP job are marked
 * free unless the job sits in the processing list, the job is canceled in a
 * way that depends on the list it was found in, and the storage state of
 * 'o' is rolled back (LOADING -> SWAPPED, SWAPPING -> MEMORY).
 *
 * NOTE(review): the statements inside the per-list switch cases and the
 * locking calls are not visible in this view. */
7920 static void vmCancelThreadedIOJob(robj
*o
) {
7922 server
.io_newjobs
, /* 0 */
7923 server
.io_processing
, /* 1 */
7924 server
.io_processed
/* 2 */
7928 assert(o
->storage
== REDIS_VM_LOADING
|| o
->storage
== REDIS_VM_SWAPPING
);
7931 /* Search for a matching key in one of the queues */
7932 for (i
= 0; i
< 3; i
++) {
7936 listRewind(lists
[i
],&li
);
7937 while ((ln
= listNext(&li
)) != NULL
) {
7938 iojob
*job
= ln
->value
;
7940 if (job
->canceled
) continue; /* Skip this, already canceled. */
7941 if (compareStringObjects(job
->key
,o
) == 0) {
7942 redisLog(REDIS_DEBUG
,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
7943 (void*)job
, (char*)o
->ptr
, job
->type
, i
);
7944 /* Mark the pages as free since the swap didn't happen
7945 * or happened but is now discarded. */
7946 if (i
!= 1 && job
->type
== REDIS_IOJOB_DO_SWAP
)
7947 vmMarkPagesFree(job
->page
,job
->pages
);
7948 /* Cancel the job. It depends on the list the job is
7951 case 0: /* io_newjobs */
7952 /* If the job was yet not processed the best thing to do
7953 * is to remove it from the queue at all */
7955 listDelNode(lists
[i
],ln
);
7957 case 1: /* io_processing */
7958 /* Oh Shi- the thread is messing with the Job:
7960 * Probably it's accessing the object if this is a
7961 * PREPARE_SWAP or DO_SWAP job.
7962 * If it's a LOAD job it may be reading from disk and
7963 * if we don't wait for the job to terminate before to
7964 * cancel it, maybe in a few microseconds data can be
7965 * corrupted in this pages. So the short story is:
7967 * Better to wait for the job to move into the
7968 * next queue (processed)... */
7970 /* We try again and again until the job is completed. */
7972 /* But let's wait some time for the I/O thread
7973 * to finish with this job. After all this condition
7974 * should be very rare. */
7977 case 2: /* io_processed */
7978 /* The job was already processed, that's easy...
7979 * just mark it as canceled so that we'll ignore it
7980 * when processing completed jobs. */
7984 /* Finally we have to adjust the storage type of the object
7985 * in order to "UNDO" the operation. */
7986 if (o
->storage
== REDIS_VM_LOADING
)
7987 o
->storage
= REDIS_VM_SWAPPED
;
7988 else if (o
->storage
== REDIS_VM_SWAPPING
)
7989 o
->storage
= REDIS_VM_MEMORY
;
7996 assert(1 != 1); /* We should never reach this */
7999 static void *IOThreadEntryPoint(void *arg
) {
8004 pthread_detach(pthread_self());
8006 /* Get a new job to process */
8008 if (listLength(server
.io_newjobs
) == 0) {
8009 /* No new jobs in queue, exit. */
8010 redisLog(REDIS_DEBUG
,"Thread %ld exiting, nothing to do",
8011 (long) pthread_self());
8012 server
.io_active_threads
--;
8016 ln
= listFirst(server
.io_newjobs
);
8018 listDelNode(server
.io_newjobs
,ln
);
8019 /* Add the job in the processing queue */
8020 j
->thread
= pthread_self();
8021 listAddNodeTail(server
.io_processing
,j
);
8022 ln
= listLast(server
.io_processing
); /* We use ln later to remove it */
8024 redisLog(REDIS_DEBUG
,"Thread %ld got a new job (type %d): %p about key '%s'",
8025 (long) pthread_self(), j
->type
, (void*)j
, (char*)j
->key
->ptr
);
8027 /* Process the Job */
8028 if (j
->type
== REDIS_IOJOB_LOAD
) {
8029 j
->val
= vmReadObjectFromSwap(j
->page
,j
->key
->vtype
);
8030 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
8031 FILE *fp
= fopen("/dev/null","w+");
8032 j
->pages
= rdbSavedObjectPages(j
->val
,fp
);
8034 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
8035 if (vmWriteObjectOnSwap(j
->val
,j
->page
) == REDIS_ERR
)
8039 /* Done: insert the job into the processed queue */
8040 redisLog(REDIS_DEBUG
,"Thread %ld completed the job: %p (key %s)",
8041 (long) pthread_self(), (void*)j
, (char*)j
->key
->ptr
);
8043 listDelNode(server
.io_processing
,ln
);
8044 listAddNodeTail(server
.io_processed
,j
);
8047 /* Signal the main thread there is new stuff to process */
8048 assert(write(server
.io_ready_pipe_write
,"x",1) == 1);
8050 return NULL
; /* never reached */
8053 static void spawnIOThread(void) {
8055 sigset_t mask
, omask
;
8058 sigaddset(&mask
,SIGCHLD
);
8059 sigaddset(&mask
,SIGHUP
);
8060 sigaddset(&mask
,SIGPIPE
);
8061 pthread_sigmask(SIG_SETMASK
, &mask
, &omask
);
8062 pthread_create(&thread
,&server
.io_threads_attr
,IOThreadEntryPoint
,NULL
);
8063 pthread_sigmask(SIG_SETMASK
, &omask
, NULL
);
8064 server
.io_active_threads
++;
8067 /* We need to wait for the last thread to exit before we are able to
8068 * fork() in order to BGSAVE or BGREWRITEAOF. */
8069 static void waitEmptyIOJobsQueue(void) {
8071 int io_processed_len
;
8074 if (listLength(server
.io_newjobs
) == 0 &&
8075 listLength(server
.io_processing
) == 0 &&
8076 server
.io_active_threads
== 0)
8081 /* While waiting for empty jobs queue condition we post-process some
8082 * finshed job, as I/O threads may be hanging trying to write against
8083 * the io_ready_pipe_write FD but there are so much pending jobs that
8085 io_processed_len
= listLength(server
.io_processed
);
8087 if (io_processed_len
) {
8088 vmThreadedIOCompletedJob(NULL
,server
.io_ready_pipe_read
,NULL
,0);
8089 usleep(1000); /* 1 millisecond */
8091 usleep(10000); /* 10 milliseconds */
8096 static void vmReopenSwapFile(void) {
8097 /* Note: we don't close the old one as we are in the child process
8098 * and don't want to mess at all with the original file object. */
8099 server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b");
8100 if (server
.vm_fp
== NULL
) {
8101 redisLog(REDIS_WARNING
,"Can't re-open the VM swap file: %s. Exiting.",
8102 server
.vm_swap_file
);
8105 server
.vm_fd
= fileno(server
.vm_fp
);
8108 /* This function must be called while with threaded IO locked */
8109 static void queueIOJob(iojob
*j
) {
8110 redisLog(REDIS_DEBUG
,"Queued IO Job %p type %d about key '%s'\n",
8111 (void*)j
, j
->type
, (char*)j
->key
->ptr
);
8112 listAddNodeTail(server
.io_newjobs
,j
);
8113 if (server
.io_active_threads
< server
.vm_max_threads
)
/* Begin a background swap-out of the value associated with 'key'.
 *
 * The key must currently be in memory and must not be shared (both
 * conditions are asserted). A job of type REDIS_IOJOB_PREPARE_SWAP is
 * allocated with a private duplicate of the key and no owning thread yet,
 * and the key is flagged as REDIS_VM_SWAPPING.
 *
 * NOTE(review): the statements that attach 'val' and 'db' to the job, queue
 * it and produce the return value are not visible in this view. */
8117 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
) {
8120 assert(key
->storage
== REDIS_VM_MEMORY
);
8121 assert(key
->refcount
== 1);
8123 j
= zmalloc(sizeof(*j
));
8124 j
->type
= REDIS_IOJOB_PREPARE_SWAP
;
8126 j
->key
= dupStringObject(key
);
8130 j
->thread
= (pthread_t
) -1;
8131 key
->storage
= REDIS_VM_SWAPPING
;
8139 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8141 /* This function makes the client 'c' wait for the key 'key' to be loaded.
8142 * If there is not already a job loading the key, it is created.
8143 * The key is added to the io_keys list in the client structure, and also
8144 * in the hash table mapping swapped keys to waiting clients, that is,
8145 * server.io_waited_keys. */
/* 'c' is the client about to access 'key' in its currently selected DB.
 *
 * Returns 0 right away when the key is not in the dataset. If the key is
 * being swapped out, the swap is canceled. Otherwise the key is appended to
 * c->io_keys, the client is appended to the list stored under 'key' in
 * c->db->io_keys (the list is added to that dict when the key has no entry
 * yet), and, if the key is in REDIS_VM_SWAPPED state, it is flagged
 * REDIS_VM_LOADING and a REDIS_IOJOB_LOAD job is prepared for it.
 *
 * NOTE(review): the other return statements, the reference counting of
 * 'key' and the queueing of the job are not visible in this view. */
8146 static int waitForSwappedKey(redisClient
*c
, robj
*key
) {
8147 struct dictEntry
*de
;
8151 /* If the key does not exist or is already in RAM we don't need to
8152 * block the client at all. */
8153 de
= dictFind(c
->db
->dict
,key
);
8154 if (de
== NULL
) return 0;
8155 o
= dictGetEntryKey(de
);
8156 if (o
->storage
== REDIS_VM_MEMORY
) {
8158 } else if (o
->storage
== REDIS_VM_SWAPPING
) {
8159 /* We were swapping the key, undo it! */
8160 vmCancelThreadedIOJob(o
);
8164 /* OK: the key is either swapped, or being loaded just now. */
8166 /* Add the key to the list of keys this client is waiting for.
8167 * This maps clients to keys they are waiting for. */
8168 listAddNodeTail(c
->io_keys
,key
);
8171 /* Add the client to the swapped keys => clients waiting map. */
8172 de
= dictFind(c
->db
->io_keys
,key
);
8176 /* For every key we take a list of clients blocked for it */
8178 retval
= dictAdd(c
->db
->io_keys
,key
,l
);
8180 assert(retval
== DICT_OK
);
8182 l
= dictGetEntryVal(de
);
8184 listAddNodeTail(l
,c
);
8186 /* Are we already loading the key from disk? If not create a job */
8187 if (o
->storage
== REDIS_VM_SWAPPED
) {
8190 o
->storage
= REDIS_VM_LOADING
;
8191 j
= zmalloc(sizeof(*j
));
8192 j
->type
= REDIS_IOJOB_LOAD
;
8194 j
->key
= dupStringObject(key
);
8195 j
->key
->vtype
= o
->vtype
;
8196 j
->page
= o
->vm
.page
;
8199 j
->thread
= (pthread_t
) -1;
8207 /* Is this client attempting to run a command against swapped keys?
8208 * If so, block it ASAP, load the keys in background, then resume it.
8210 * The important idea about this function is that it can fail! If keys will
8211 * still be swapped when the client is resumed, this key lookups will
8212 * just block loading keys from disk. In practical terms this should only
8213 * happen with SORT BY command or if there is a bug in this function.
8215 * Return 1 if the client is marked as blocked, 0 if the client can
8216 * continue as the keys it is going to access appear to be in memory. */
8217 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
) {
8220 if (cmd
->vm_firstkey
== 0) return 0;
8221 last
= cmd
->vm_lastkey
;
8222 if (last
< 0) last
= c
->argc
+last
;
8223 for (j
= cmd
->vm_firstkey
; j
<= last
; j
+= cmd
->vm_keystep
)
8224 waitForSwappedKey(c
,c
->argv
[j
]);
8225 /* If the client was blocked for at least one key, mark it as blocked. */
8226 if (listLength(c
->io_keys
)) {
8227 c
->flags
|= REDIS_IO_WAIT
;
8228 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
8229 server
.vm_blocked_clients
++;
8236 /* Remove the 'key' from the list of blocked keys for a given client.
8238 * The function returns 1 when there are no longer blocking keys after
8239 * the current one was removed (and the client can be unblocked). */
/* 'c' is a client waiting for swapped keys, 'key' the key it no longer has
 * to wait for. The first entry of c->io_keys comparing equal to 'key' is
 * removed, and the list stored under 'key' in c->db->io_keys is searched for
 * the client; when that list is empty the dict entry is deleted.
 *
 * NOTE(review): local declarations, sanity checks and the removal of the
 * client from the per-key list are not visible in this view. */
8240 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
) {
8244 struct dictEntry
*de
;
8246 /* Remove the key from the list of keys this client is waiting for. */
8247 listRewind(c
->io_keys
,&li
);
8248 while ((ln
= listNext(&li
)) != NULL
) {
8249 if (compareStringObjects(ln
->value
,key
) == 0) {
8250 listDelNode(c
->io_keys
,ln
);
8256 /* Remove the client from the key => waiting clients map. */
8257 de
= dictFind(c
->db
->io_keys
,key
);
8259 l
= dictGetEntryVal(de
);
8260 ln
= listSearchKey(l
,c
);
8263 if (listLength(l
) == 0)
8264 dictDelete(c
->db
->io_keys
,key
);
8266 return listLength(c
->io_keys
) == 0;
8269 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
) {
8270 struct dictEntry
*de
;
8275 de
= dictFind(db
->io_keys
,key
);
8278 l
= dictGetEntryVal(de
);
8279 len
= listLength(l
);
8280 /* Note: we can't use something like while(listLength(l)) as the list
8281 * can be freed by the calling function when we remove the last element. */
8284 redisClient
*c
= ln
->value
;
8286 if (dontWaitForSwappedKey(c
,key
)) {
8287 /* Put the client in the list of clients ready to go as we
8288 * loaded all the keys about it. */
8289 listAddNodeTail(server
.io_ready_clients
,c
);
8294 /* ================================= Debugging ============================== */
/* Implementation of the DEBUG command. The subcommand is taken from argv[1]
 * and matched case-insensitively:
 *
 *   SEGFAULT       (handling not visible in this view)
 *   RELOAD         save the dataset to server.dbfilename and load it back;
 *                  replies with an error if either step fails.
 *   LOADAOF        load server.appendfilename; error reply on failure.
 *   OBJECT <key>   reply with key/value addresses, refcounts, encoding and
 *                  serialized length, or with the swap page and page count
 *                  when VM is enabled and the value is not in memory.
 *   SWAPOUT <key>  requires VM to be enabled; a shared key object is first
 *                  replaced by a private copy, then the value is swapped out
 *                  synchronously if the key is in memory.
 *
 * Anything else gets the syntax error reply. */
8296 static void debugCommand(redisClient
*c
) {
8297 if (!strcasecmp(c
->argv
[1]->ptr
,"segfault")) {
8299 } else if (!strcasecmp(c
->argv
[1]->ptr
,"reload")) {
8300 if (rdbSave(server
.dbfilename
) != REDIS_OK
) {
8301 addReply(c
,shared
.err
);
8305 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
8306 addReply(c
,shared
.err
);
8309 redisLog(REDIS_WARNING
,"DB reloaded by DEBUG RELOAD");
8310 addReply(c
,shared
.ok
);
8311 } else if (!strcasecmp(c
->argv
[1]->ptr
,"loadaof")) {
8313 if (loadAppendOnlyFile(server
.appendfilename
) != REDIS_OK
) {
8314 addReply(c
,shared
.err
);
8317 redisLog(REDIS_WARNING
,"Append Only File loaded by DEBUG LOADAOF");
8318 addReply(c
,shared
.ok
);
8319 } else if (!strcasecmp(c
->argv
[1]->ptr
,"object") && c
->argc
== 3) {
8320 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
8324 addReply(c
,shared
.nokeyerr
);
8327 key
= dictGetEntryKey(de
);
8328 val
= dictGetEntryVal(de
);
8329 if (!server
.vm_enabled
|| (key
->storage
== REDIS_VM_MEMORY
||
8330 key
->storage
== REDIS_VM_SWAPPING
)) {
8331 addReplySds(c
,sdscatprintf(sdsempty(),
8332 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8333 "encoding:%d serializedlength:%lld\r\n",
8334 (void*)key
, key
->refcount
, (void*)val
, val
->refcount
,
8335 val
->encoding
, (long long) rdbSavedObjectLen(val
,NULL
)));
8337 addReplySds(c
,sdscatprintf(sdsempty(),
8338 "+Key at:%p refcount:%d, value swapped at: page %llu "
8339 "using %llu pages\r\n",
8340 (void*)key
, key
->refcount
, (unsigned long long) key
->vm
.page
,
8341 (unsigned long long) key
->vm
.usedpages
));
8343 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapout") && c
->argc
== 3) {
8344 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
8347 if (!server
.vm_enabled
) {
8348 addReplySds(c
,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8352 addReply(c
,shared
.nokeyerr
);
8355 key
= dictGetEntryKey(de
);
8356 val
= dictGetEntryVal(de
);
8357 /* If the key is shared we want to create a copy */
8358 if (key
->refcount
> 1) {
8359 robj
*newkey
= dupStringObject(key
);
8361 key
= dictGetEntryKey(de
) = newkey
;
8364 if (key
->storage
!= REDIS_VM_MEMORY
) {
8365 addReplySds(c
,sdsnew("-ERR This key is not in memory\r\n"));
8366 } else if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
8367 dictGetEntryVal(de
) = NULL
;
8368 addReply(c
,shared
.ok
);
8370 addReply(c
,shared
.err
);
8373 addReplySds(c
,sdsnew(
8374 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
/* Report a failed assertion in the log: 'estr' is the text of the
 * expression that did not hold, 'file' and 'line' locate it in the source.
 * When HAVE_BACKTRACE is defined a further message announces that a SIGSEGV
 * is being forced so that the stack trace gets printed.
 * NOTE(review): the statement that actually triggers the signal is not
 * visible in this view. */
8378 static void _redisAssert(char *estr
, char *file
, int line
) {
8379 redisLog(REDIS_WARNING
,"=== ASSERTION FAILED ===");
8380 redisLog(REDIS_WARNING
,"==> %s:%d '%s' is not true\n",file
,line
,estr
);
8381 #ifdef HAVE_BACKTRACE
8382 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
8387 /* =================================== Main! ================================ */
/* Read the first line (at most 63 characters) of
 * /proc/sys/vm/overcommit_memory.
 * NOTE(review): how the text is converted into the int result and what is
 * returned when the file can't be opened or read is not visible in this
 * view; the only visible caller compares the result against 0. */
8390 int linuxOvercommitMemoryValue(void) {
8391 FILE *fp
= fopen("/proc/sys/vm/overcommit_memory","r");
8395 if (fgets(buf
,64,fp
) == NULL
) {
8404 void linuxOvercommitMemoryWarning(void) {
8405 if (linuxOvercommitMemoryValue() == 0) {
8406 redisLog(REDIS_WARNING
,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8409 #endif /* __linux__ */
8411 static void daemonize(void) {
8415 if (fork() != 0) exit(0); /* parent exits */
8416 setsid(); /* create a new session */
8418 /* Every output goes to /dev/null. If Redis is daemonized but
8419 * the 'logfile' is set to 'stdout' in the configuration file
8420 * it will not log at all. */
8421 if ((fd
= open("/dev/null", O_RDWR
, 0)) != -1) {
8422 dup2(fd
, STDIN_FILENO
);
8423 dup2(fd
, STDOUT_FILENO
);
8424 dup2(fd
, STDERR_FILENO
);
8425 if (fd
> STDERR_FILENO
) close(fd
);
8427 /* Try to write the pid file */
8428 fp
= fopen(server
.pidfile
,"w");
8430 fprintf(fp
,"%d\n",getpid());
/* Program entry point.
 *
 * With one argument the save parameters are reset and the configuration
 * file argv[1] is loaded; with more than one the usage line is printed on
 * stderr; with none a warning about the default configuration is logged.
 * The process is daemonized when server.daemonize is set. The dataset is
 * then loaded from the append only file when server.appendonly is set,
 * otherwise from the RDB file, logging the time taken on success. Finally
 * beforeSleep is installed on the event loop, which is deleted before
 * returning.
 * NOTE(review): server initialization, the event loop run and the return
 * value are not visible in this view. */
8435 int main(int argc
, char **argv
) {
8440 resetServerSaveParams();
8441 loadServerConfig(argv
[1]);
8442 } else if (argc
> 2) {
8443 fprintf(stderr
,"Usage: ./redis-server [/path/to/redis.conf]\n");
8446 redisLog(REDIS_WARNING
,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8448 if (server
.daemonize
) daemonize();
8450 redisLog(REDIS_NOTICE
,"Server started, Redis version " REDIS_VERSION
);
8452 linuxOvercommitMemoryWarning();
8455 if (server
.appendonly
) {
8456 if (loadAppendOnlyFile(server
.appendfilename
) == REDIS_OK
)
8457 redisLog(REDIS_NOTICE
,"DB loaded from append only file: %ld seconds",time(NULL
)-start
);
8459 if (rdbLoad(server
.dbfilename
) == REDIS_OK
)
8460 redisLog(REDIS_NOTICE
,"DB loaded from disk: %ld seconds",time(NULL
)-start
);
8462 redisLog(REDIS_NOTICE
,"The server is now ready to accept connections on port %d", server
.port
);
8463 aeSetBeforeSleepProc(server
.el
,beforeSleep
);
8465 aeDeleteEventLoop(server
.el
);
8469 /* ============================= Backtrace support ========================= */
8471 #ifdef HAVE_BACKTRACE
8472 static char *findFuncName(void *pointer
, unsigned long *offset
);
/* Return the instruction pointer stored in the signal context 'uc'.
 * The field holding it is platform specific and is selected through the
 * preprocessor: FreeBSD, dietlibc, Mac OS X (before and from 10.6, 32 and
 * 64 bit), Linux on x86 / x86-64 and Linux on IA64.
 * NOTE(review): the Linux branch indexes gregs[] with REG_EIP for 64 bit
 * targets too; confirm that this is the intended register there. */
8474 static void *getMcontextEip(ucontext_t
*uc
) {
8475 #if defined(__FreeBSD__)
8476 return (void*) uc
->uc_mcontext
.mc_eip
;
8477 #elif defined(__dietlibc__)
8478 return (void*) uc
->uc_mcontext
.eip
;
8479 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
8481 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
8483 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
8485 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
8486 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
8487 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
8489 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
8491 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
8492 return (void*) uc
->uc_mcontext
.gregs
[REG_EIP
]; /* Linux 32/64 bit */
8493 #elif defined(__ia64__) /* Linux IA64 */
8494 return (void*) uc
->uc_mcontext
.sc_ip
;
8500 static void segvHandler(int sig
, siginfo_t
*info
, void *secret
) {
8502 char **messages
= NULL
;
8503 int i
, trace_size
= 0;
8504 unsigned long offset
=0;
8505 ucontext_t
*uc
= (ucontext_t
*) secret
;
8507 REDIS_NOTUSED(info
);
8509 redisLog(REDIS_WARNING
,
8510 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION
, sig
);
8511 infostring
= genRedisInfoString();
8512 redisLog(REDIS_WARNING
, "%s",infostring
);
8513 /* It's not safe to sdsfree() the returned string under memory
8514 * corruption conditions. Let it leak as we are going to abort */
8516 trace_size
= backtrace(trace
, 100);
8517 /* overwrite sigaction with caller's address */
8518 if (getMcontextEip(uc
) != NULL
) {
8519 trace
[1] = getMcontextEip(uc
);
8521 messages
= backtrace_symbols(trace
, trace_size
);
8523 for (i
=1; i
<trace_size
; ++i
) {
8524 char *fn
= findFuncName(trace
[i
], &offset
), *p
;
8526 p
= strchr(messages
[i
],'+');
8527 if (!fn
|| (p
&& ((unsigned long)strtol(p
+1,NULL
,10)) < offset
)) {
8528 redisLog(REDIS_WARNING
,"%s", messages
[i
]);
8530 redisLog(REDIS_WARNING
,"%d redis-server %p %s + %d", i
, trace
[i
], fn
, (unsigned int)offset
);
8533 /* free(messages); Don't call free() with possibly corrupted memory. */
8537 static void setupSigSegvAction(void) {
8538 struct sigaction act
;
8540 sigemptyset (&act
.sa_mask
);
8541 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8542 * is used. Otherwise, sa_handler is used */
8543 act
.sa_flags
= SA_NODEFER
| SA_ONSTACK
| SA_RESETHAND
| SA_SIGINFO
;
8544 act
.sa_sigaction
= segvHandler
;
8545 sigaction (SIGSEGV
, &act
, NULL
);
8546 sigaction (SIGBUS
, &act
, NULL
);
8547 sigaction (SIGFPE
, &act
, NULL
);
8548 sigaction (SIGILL
, &act
, NULL
);
8549 sigaction (SIGBUS
, &act
, NULL
);
8553 #include "staticsymbols.h"
8554 /* This function try to convert a pointer into a function name. It's used in
8555 * oreder to provide a backtrace under segmentation fault that's able to
8556 * display functions declared as static (otherwise the backtrace is useless). */
8557 static char *findFuncName(void *pointer
, unsigned long *offset
){
8559 unsigned long off
, minoff
= 0;
8561 /* Try to match against the Symbol with the smallest offset */
8562 for (i
=0; symsTable
[i
].pointer
; i
++) {
8563 unsigned long lp
= (unsigned long) pointer
;
8565 if (lp
!= (unsigned long)-1 && lp
>= symsTable
[i
].pointer
) {
8566 off
=lp
-symsTable
[i
].pointer
;
8567 if (ret
< 0 || off
< minoff
) {
8573 if (ret
== -1) return NULL
;
8575 return symsTable
[ret
].name
;
8577 #else /* HAVE_BACKTRACE */
/* Variant of setupSigSegvAction() compiled when HAVE_BACKTRACE is not
 * defined. NOTE(review): the body is not visible in this view — presumably
 * empty, so that no crash handler is installed; confirm. */
8578 static void setupSigSegvAction(void) {
8580 #endif /* HAVE_BACKTRACE */