2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
30 #define REDIS_VERSION "1.3.4"
40 #define __USE_POSIX199309
47 #endif /* HAVE_BACKTRACE */
55 #include <arpa/inet.h>
59 #include <sys/resource.h>
66 #include "solarisfixes.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
83 /* Static server configuration */
84 #define REDIS_SERVERPORT 6379 /* TCP port */
85 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
86 #define REDIS_IOBUF_LEN 1024
87 #define REDIS_LOADBUF_LEN 1024
88 #define REDIS_STATIC_ARGS 4
89 #define REDIS_DEFAULT_DBNUM 16
90 #define REDIS_CONFIGLINE_MAX 1024
91 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
92 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
93 #define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
94 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
95 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
98 #define REDIS_WRITEV_THRESHOLD 3
99 /* Max number of iovecs used for each writev call */
100 #define REDIS_WRITEV_IOVEC_COUNT 256
102 /* Hash table parameters */
103 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
106 #define REDIS_CMD_BULK 1 /* Bulk write command */
107 #define REDIS_CMD_INLINE 2 /* Inline command */
108 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
109 this flag will return an error when the 'maxmemory' option is set in the
110 config file and the server is using more than maxmemory bytes of memory.
111 In short these commands are denied on low memory conditions. */
112 #define REDIS_CMD_DENYOOM 4
115 #define REDIS_STRING 0
121 /* Objects encoding */
122 #define REDIS_ENCODING_RAW 0 /* Raw representation */
123 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
125 /* Object types only used for dumping to disk */
126 #define REDIS_EXPIRETIME 253
127 #define REDIS_SELECTDB 254
128 #define REDIS_EOF 255
130 /* Defines related to the dump file format. To store 32 bits lengths for short
131 * keys requires a lot of space, so we check the most significant 2 bits of
132 * the first byte to interpret the length:
134 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
135 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
136 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
137 * 11|000000 this means: specially encoded object will follow. The six bits
138 * number specify the kind of object that follows.
139 * See the REDIS_RDB_ENC_* defines.
141 * Lengths up to 63 are stored using a single byte, most DB keys, and many
142 * values, will fit inside. */
143 #define REDIS_RDB_6BITLEN 0
144 #define REDIS_RDB_14BITLEN 1
145 #define REDIS_RDB_32BITLEN 2
146 #define REDIS_RDB_ENCVAL 3
147 #define REDIS_RDB_LENERR UINT_MAX
149 /* When a length of a string object stored on disk has the first two bits
150 * set, the remaining six bits specify a special encoding for the object
151 * according to the following defines: */
152 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
153 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
154 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
155 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
157 /* Virtual memory object->where field. */
158 #define REDIS_VM_MEMORY 0 /* The object is on memory */
159 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
160 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
161 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
163 /* Virtual memory static configuration stuff.
164 * Check vmFindContiguousPages() to know more about these magic numbers. */
165 #define REDIS_VM_MAX_NEAR_PAGES 65536
166 #define REDIS_VM_MAX_RANDOM_JUMP 4096
167 #define REDIS_VM_MAX_THREADS 32
168 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
169 /* The following is the *percentage* of completed I/O jobs to process when the
170 * handler is called. While Virtual Memory I/O operations are performed by
171 * threads, these operations must be processed by the main thread when completed
172 * in order to take effect. */
173 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
176 #define REDIS_SLAVE 1 /* This client is a slave server */
177 #define REDIS_MASTER 2 /* This client is a master server */
178 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
179 #define REDIS_MULTI 8 /* This client is in a MULTI context */
180 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
181 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
183 /* Slave replication state - slave side */
184 #define REDIS_REPL_NONE 0 /* No active replication */
185 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
186 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
188 /* Slave replication state - from the point of view of master
189 * Note that in SEND_BULK and ONLINE state the slave receives new updates
190 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
191 * to start the next background saving in order to send updates to it. */
192 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
193 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
194 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
195 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
197 /* List related stuff */
201 /* Sort operations */
202 #define REDIS_SORT_GET 0
203 #define REDIS_SORT_ASC 1
204 #define REDIS_SORT_DESC 2
205 #define REDIS_SORTKEY_MAX 1024
208 #define REDIS_DEBUG 0
209 #define REDIS_VERBOSE 1
210 #define REDIS_NOTICE 2
211 #define REDIS_WARNING 3
213 /* Anti-warning macro... */
214 #define REDIS_NOTUSED(V) ((void) V)
216 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
217 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
219 /* Append only defines */
220 #define APPENDFSYNC_NO 0
221 #define APPENDFSYNC_ALWAYS 1
222 #define APPENDFSYNC_EVERYSEC 2
224 /* We can print the stacktrace, so our assert is defined this way: */
/* redisAssert(_e): evaluates the expression _e. When it is true the macro
 * yields (void)0; when it is false it calls _redisAssert() with the
 * stringified expression, the current source file name and line number,
 * and then terminates the process with _exit(1). */
225 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
/* Failure reporter invoked by redisAssert(): receives the text of the failed
 * expression (estr), the file name (file) and the line number (line). */
226 static void _redisAssert(char *estr
, char *file
, int line
);
228 /*================================= Data types ============================== */
230 /* A redis object, that is a type able to hold a string / list / set */
232 /* The VM object structure */
233 struct redisObjectVM
{
234 off_t page
; /* the page at which the object is stored on disk */
235 off_t usedpages
; /* number of pages used on disk */
236 time_t atime
; /* Last access time */
239 /* The actual Redis Object */
240 typedef struct redisObject
{
243 unsigned char encoding
;
244 unsigned char storage
; /* If this object is a key, where is the value?
245 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
246 unsigned char vtype
; /* If this object is a key, and value is swapped out,
247 * this is the type of the swapped out object. */
249 /* VM fields, these are only allocated if VM is active, otherwise the
250 * object allocation function will just allocate
251 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
252 * Redis without VM active will not have any overhead. */
253 struct redisObjectVM vm
;
256 /* Macro used to initialize a Redis object allocated on the stack.
257 * Note that this macro is kept near the structure definition to make sure
258 * we'll update it when the structure is changed, to avoid bugs like
259 * bug #85 introduced exactly in this way. */
260 #define initStaticStringObject(_var,_ptr) do { \
262 _var.type = REDIS_STRING; \
263 _var.encoding = REDIS_ENCODING_RAW; \
265 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
268 typedef struct redisDb
{
269 dict
*dict
; /* The keyspace for this DB */
270 dict
*expires
; /* Timeout of keys with a timeout set */
271 dict
*blockingkeys
; /* Keys with clients waiting for data (BLPOP) */
272 dict
*io_keys
; /* Keys with clients waiting for VM I/O */
276 /* Client MULTI/EXEC state */
277 typedef struct multiCmd
{
280 struct redisCommand
*cmd
;
283 typedef struct multiState
{
284 multiCmd
*commands
; /* Array of MULTI commands */
285 int count
; /* Total number of MULTI commands */
288 /* With multiplexing we need to keep per-client state.
289 * Clients are kept in a linked list. */
290 typedef struct redisClient
{
295 robj
**argv
, **mbargv
;
297 int bulklen
; /* bulk read len. -1 if not in bulk read mode */
298 int multibulk
; /* multi bulk command format active */
301 time_t lastinteraction
; /* time of the last interaction, used for timeout */
302 int flags
; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
303 int slaveseldb
; /* slave selected db, if this client is a slave */
304 int authenticated
; /* when requirepass is non-NULL */
305 int replstate
; /* replication state if this is a slave */
306 int repldbfd
; /* replication DB file descriptor */
307 long repldboff
; /* replication DB file offset */
308 off_t repldbsize
; /* replication DB file size */
309 multiState mstate
; /* MULTI/EXEC state */
310 robj
**blockingkeys
; /* The key we are waiting to terminate a blocking
311 * operation such as BLPOP. Otherwise NULL. */
312 int blockingkeysnum
; /* Number of blocking keys */
313 time_t blockingto
; /* Blocking operation timeout. If UNIX current time
314 * is >= blockingto then the operation timed out. */
315 list
*io_keys
; /* Keys this client is waiting to be loaded from the
316 * swap file in order to continue. */
324 /* Global server state structure */
329 dict
*sharingpool
; /* Pool used for object sharing */
330 unsigned int sharingpoolsize
;
331 long long dirty
; /* changes to DB from the last save */
333 list
*slaves
, *monitors
;
334 char neterr
[ANET_ERR_LEN
];
336 int cronloops
; /* number of times the cron function run */
337 list
*objfreelist
; /* A list of freed objects to avoid malloc() */
338 time_t lastsave
; /* Unix time of the last successful save */
339 /* Fields used only for stats */
340 time_t stat_starttime
; /* server start time */
341 long long stat_numcommands
; /* number of processed commands */
342 long long stat_numconnections
; /* number of connections received */
355 pid_t bgsavechildpid
;
356 pid_t bgrewritechildpid
;
357 sds bgrewritebuf
; /* buffer taken by parent during append only rewrite */
358 struct saveparam
*saveparams
;
363 char *appendfilename
;
367 /* Replication related */
372 redisClient
*master
; /* client that is master for this slave */
374 unsigned int maxclients
;
375 unsigned long long maxmemory
;
376 unsigned int blpop_blocked_clients
;
377 unsigned int vm_blocked_clients
;
378 /* Sort parameters - qsort_r() is only available under BSD so we
379 * have to take this state global, in order to pass it to sortCompare() */
383 /* Virtual memory configuration */
388 unsigned long long vm_max_memory
;
389 /* Virtual memory state */
392 off_t vm_next_page
; /* Next probably empty page */
393 off_t vm_near_pages
; /* Number of pages allocated sequentially */
394 unsigned char *vm_bitmap
; /* Bitmap of free/used pages */
395 time_t unixtime
; /* Unix time sampled every second. */
396 /* Virtual memory I/O threads stuff */
397 /* An I/O thread process an element taken from the io_jobs queue and
398 * put the result of the operation in the io_done list. While the
399 * job is being processed, it's put on io_processing queue. */
400 list
*io_newjobs
; /* List of VM I/O jobs yet to be processed */
401 list
*io_processing
; /* List of VM I/O jobs being processed */
402 list
*io_processed
; /* List of VM I/O jobs already processed */
403 list
*io_ready_clients
; /* Clients ready to be unblocked. All keys loaded */
404 pthread_mutex_t io_mutex
; /* lock to access io_jobs/io_done/io_thread_job */
405 pthread_mutex_t obj_freelist_mutex
; /* safe redis objects creation/free */
406 pthread_mutex_t io_swapfile_mutex
; /* So we can lseek + write */
407 pthread_attr_t io_threads_attr
; /* attributes for threads creation */
408 int io_active_threads
; /* Number of running I/O threads */
409 int vm_max_threads
; /* Max number of I/O threads running at the same time */
410 /* Our main thread is blocked on the event loop, looking for sockets ready
411 * to be read or written, so when a threaded I/O operation is ready to be
412 * processed by the main thread, the I/O thread will use a unix pipe to
413 * awake the main thread. The following are the two pipe FDs. */
414 int io_ready_pipe_read
;
415 int io_ready_pipe_write
;
416 /* Virtual memory stats */
417 unsigned long long vm_stats_used_pages
;
418 unsigned long long vm_stats_swapped_objects
;
419 unsigned long long vm_stats_swapouts
;
420 unsigned long long vm_stats_swapins
;
424 typedef void redisCommandProc(redisClient
*c
);
425 struct redisCommand
{
427 redisCommandProc
*proc
;
430 /* What keys should be loaded in background when calling this command? */
431 int vm_firstkey
; /* The first argument that's a key (0 = no keys) */
432 int vm_lastkey
; /* The last argument that's a key */
433 int vm_keystep
; /* The step between first and last key */
436 struct redisFunctionSym
{
438 unsigned long pointer
;
441 typedef struct _redisSortObject
{
449 typedef struct _redisSortOperation
{
452 } redisSortOperation
;
454 /* ZSETs use a specialized version of Skiplists */
456 typedef struct zskiplistNode
{
457 struct zskiplistNode
**forward
;
458 struct zskiplistNode
*backward
;
464 typedef struct zskiplist
{
465 struct zskiplistNode
*header
, *tail
;
466 unsigned long length
;
470 typedef struct zset
{
475 /* Our shared "common" objects */
477 struct sharedObjectsStruct
{
478 robj
*crlf
, *ok
, *err
, *emptybulk
, *czero
, *cone
, *pong
, *space
,
479 *colon
, *nullbulk
, *nullmultibulk
, *queued
,
480 *emptymultibulk
, *wrongtypeerr
, *nokeyerr
, *syntaxerr
, *sameobjecterr
,
481 *outofrangeerr
, *plus
,
482 *select0
, *select1
, *select2
, *select3
, *select4
,
483 *select5
, *select6
, *select7
, *select8
, *select9
;
486 /* Global vars that are actually used as constants. The following double
487 * values are used for double on-disk serialization, and are initialized
488 * at runtime to avoid strange compiler optimizations. */
490 static double R_Zero
, R_PosInf
, R_NegInf
, R_Nan
;
492 /* VM threaded I/O request message */
493 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
494 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
495 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
496 typedef struct iojob
{
497 int type
; /* Request type, REDIS_IOJOB_* */
498 redisDb
*db
;/* Redis database */
499 robj
*key
; /* This I/O request is about swapping this key */
500 robj
*val
; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
501 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
502 off_t page
; /* Swap page where to read/write the object */
503 off_t pages
; /* Swap pages needed to save object. PREPARE_SWAP return val */
504 int canceled
; /* True if this command was canceled by blocking side of VM */
505 pthread_t thread
; /* ID of the thread processing this entry */
508 /*================================ Prototypes =============================== */
510 static void freeStringObject(robj
*o
);
511 static void freeListObject(robj
*o
);
512 static void freeSetObject(robj
*o
);
513 static void decrRefCount(void *o
);
514 static robj
*createObject(int type
, void *ptr
);
515 static void freeClient(redisClient
*c
);
516 static int rdbLoad(char *filename
);
517 static void addReply(redisClient
*c
, robj
*obj
);
518 static void addReplySds(redisClient
*c
, sds s
);
519 static void incrRefCount(robj
*o
);
520 static int rdbSaveBackground(char *filename
);
521 static robj
*createStringObject(char *ptr
, size_t len
);
522 static robj
*dupStringObject(robj
*o
);
523 static void replicationFeedSlaves(list
*slaves
, struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
524 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
525 static int syncWithMaster(void);
526 static robj
*tryObjectSharing(robj
*o
);
527 static int tryObjectEncoding(robj
*o
);
528 static robj
*getDecodedObject(robj
*o
);
529 static int removeExpire(redisDb
*db
, robj
*key
);
530 static int expireIfNeeded(redisDb
*db
, robj
*key
);
531 static int deleteIfVolatile(redisDb
*db
, robj
*key
);
532 static int deleteIfSwapped(redisDb
*db
, robj
*key
);
533 static int deleteKey(redisDb
*db
, robj
*key
);
534 static time_t getExpire(redisDb
*db
, robj
*key
);
535 static int setExpire(redisDb
*db
, robj
*key
, time_t when
);
536 static void updateSlavesWaitingBgsave(int bgsaveerr
);
537 static void freeMemoryIfNeeded(void);
538 static int processCommand(redisClient
*c
);
539 static void setupSigSegvAction(void);
540 static void rdbRemoveTempFile(pid_t childpid
);
541 static void aofRemoveTempFile(pid_t childpid
);
542 static size_t stringObjectLen(robj
*o
);
543 static void processInputBuffer(redisClient
*c
);
544 static zskiplist
*zslCreate(void);
545 static void zslFree(zskiplist
*zsl
);
546 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
);
547 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
548 static void initClientMultiState(redisClient
*c
);
549 static void freeClientMultiState(redisClient
*c
);
550 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
);
551 static void unblockClientWaitingData(redisClient
*c
);
552 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
);
553 static void vmInit(void);
554 static void vmMarkPagesFree(off_t page
, off_t count
);
555 static robj
*vmLoadObject(robj
*key
);
556 static robj
*vmPreviewObject(robj
*key
);
557 static int vmSwapOneObjectBlocking(void);
558 static int vmSwapOneObjectThreaded(void);
559 static int vmCanSwapOut(void);
560 static int tryFreeOneObjectFromFreelist(void);
561 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
562 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
563 static void vmCancelThreadedIOJob(robj
*o
);
564 static void lockThreadedIO(void);
565 static void unlockThreadedIO(void);
566 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
);
567 static void freeIOJob(iojob
*j
);
568 static void queueIOJob(iojob
*j
);
569 static int vmWriteObjectOnSwap(robj
*o
, off_t page
);
570 static robj
*vmReadObjectFromSwap(off_t page
, int type
);
571 static void waitEmptyIOJobsQueue(void);
572 static void vmReopenSwapFile(void);
573 static int vmFreePage(off_t page
);
574 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
);
575 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
);
576 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
);
577 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
578 static struct redisCommand
*lookupCommand(char *name
);
579 static void call(redisClient
*c
, struct redisCommand
*cmd
);
580 static void resetClient(redisClient
*c
);
582 static void authCommand(redisClient
*c
);
583 static void pingCommand(redisClient
*c
);
584 static void echoCommand(redisClient
*c
);
585 static void setCommand(redisClient
*c
);
586 static void setnxCommand(redisClient
*c
);
587 static void getCommand(redisClient
*c
);
588 static void delCommand(redisClient
*c
);
589 static void existsCommand(redisClient
*c
);
590 static void incrCommand(redisClient
*c
);
591 static void decrCommand(redisClient
*c
);
592 static void incrbyCommand(redisClient
*c
);
593 static void decrbyCommand(redisClient
*c
);
594 static void selectCommand(redisClient
*c
);
595 static void randomkeyCommand(redisClient
*c
);
596 static void keysCommand(redisClient
*c
);
597 static void dbsizeCommand(redisClient
*c
);
598 static void lastsaveCommand(redisClient
*c
);
599 static void saveCommand(redisClient
*c
);
600 static void bgsaveCommand(redisClient
*c
);
601 static void bgrewriteaofCommand(redisClient
*c
);
602 static void shutdownCommand(redisClient
*c
);
603 static void moveCommand(redisClient
*c
);
604 static void renameCommand(redisClient
*c
);
605 static void renamenxCommand(redisClient
*c
);
606 static void lpushCommand(redisClient
*c
);
607 static void rpushCommand(redisClient
*c
);
608 static void lpopCommand(redisClient
*c
);
609 static void rpopCommand(redisClient
*c
);
610 static void llenCommand(redisClient
*c
);
611 static void lindexCommand(redisClient
*c
);
612 static void lrangeCommand(redisClient
*c
);
613 static void ltrimCommand(redisClient
*c
);
614 static void typeCommand(redisClient
*c
);
615 static void lsetCommand(redisClient
*c
);
616 static void saddCommand(redisClient
*c
);
617 static void sremCommand(redisClient
*c
);
618 static void smoveCommand(redisClient
*c
);
619 static void sismemberCommand(redisClient
*c
);
620 static void scardCommand(redisClient
*c
);
621 static void spopCommand(redisClient
*c
);
622 static void srandmemberCommand(redisClient
*c
);
623 static void sinterCommand(redisClient
*c
);
624 static void sinterstoreCommand(redisClient
*c
);
625 static void sunionCommand(redisClient
*c
);
626 static void sunionstoreCommand(redisClient
*c
);
627 static void sdiffCommand(redisClient
*c
);
628 static void sdiffstoreCommand(redisClient
*c
);
629 static void syncCommand(redisClient
*c
);
630 static void flushdbCommand(redisClient
*c
);
631 static void flushallCommand(redisClient
*c
);
632 static void sortCommand(redisClient
*c
);
633 static void lremCommand(redisClient
*c
);
634 static void rpoplpushcommand(redisClient
*c
);
635 static void infoCommand(redisClient
*c
);
636 static void mgetCommand(redisClient
*c
);
637 static void monitorCommand(redisClient
*c
);
638 static void expireCommand(redisClient
*c
);
639 static void expireatCommand(redisClient
*c
);
640 static void getsetCommand(redisClient
*c
);
641 static void ttlCommand(redisClient
*c
);
642 static void slaveofCommand(redisClient
*c
);
643 static void debugCommand(redisClient
*c
);
644 static void msetCommand(redisClient
*c
);
645 static void msetnxCommand(redisClient
*c
);
646 static void zaddCommand(redisClient
*c
);
647 static void zincrbyCommand(redisClient
*c
);
648 static void zrangeCommand(redisClient
*c
);
649 static void zrangebyscoreCommand(redisClient
*c
);
650 static void zcountCommand(redisClient
*c
);
651 static void zrevrangeCommand(redisClient
*c
);
652 static void zcardCommand(redisClient
*c
);
653 static void zremCommand(redisClient
*c
);
654 static void zscoreCommand(redisClient
*c
);
655 static void zremrangebyscoreCommand(redisClient
*c
);
656 static void multiCommand(redisClient
*c
);
657 static void execCommand(redisClient
*c
);
658 static void discardCommand(redisClient
*c
);
659 static void blpopCommand(redisClient
*c
);
660 static void brpopCommand(redisClient
*c
);
661 static void appendCommand(redisClient
*c
);
662 static void zrankCommand(redisClient
*c
);
664 /*================================= Globals ================================= */
667 static struct redisServer server
; /* server global state */
668 static struct redisCommand cmdTable
[] = {
669 {"get",getCommand
,2,REDIS_CMD_INLINE
,1,1,1},
670 {"set",setCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,0,0,0},
671 {"setnx",setnxCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,0,0,0},
672 {"append",appendCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
673 {"del",delCommand
,-2,REDIS_CMD_INLINE
,0,0,0},
674 {"exists",existsCommand
,2,REDIS_CMD_INLINE
,1,1,1},
675 {"incr",incrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
676 {"decr",decrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
677 {"mget",mgetCommand
,-2,REDIS_CMD_INLINE
,1,-1,1},
678 {"rpush",rpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
679 {"lpush",lpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
680 {"rpop",rpopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
681 {"lpop",lpopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
682 {"brpop",brpopCommand
,-3,REDIS_CMD_INLINE
,1,1,1},
683 {"blpop",blpopCommand
,-3,REDIS_CMD_INLINE
,1,1,1},
684 {"llen",llenCommand
,2,REDIS_CMD_INLINE
,1,1,1},
685 {"lindex",lindexCommand
,3,REDIS_CMD_INLINE
,1,1,1},
686 {"lset",lsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
687 {"lrange",lrangeCommand
,4,REDIS_CMD_INLINE
,1,1,1},
688 {"ltrim",ltrimCommand
,4,REDIS_CMD_INLINE
,1,1,1},
689 {"lrem",lremCommand
,4,REDIS_CMD_BULK
,1,1,1},
690 {"rpoplpush",rpoplpushcommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,2,1},
691 {"sadd",saddCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
692 {"srem",sremCommand
,3,REDIS_CMD_BULK
,1,1,1},
693 {"smove",smoveCommand
,4,REDIS_CMD_BULK
,1,2,1},
694 {"sismember",sismemberCommand
,3,REDIS_CMD_BULK
,1,1,1},
695 {"scard",scardCommand
,2,REDIS_CMD_INLINE
,1,1,1},
696 {"spop",spopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
697 {"srandmember",srandmemberCommand
,2,REDIS_CMD_INLINE
,1,1,1},
698 {"sinter",sinterCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
699 {"sinterstore",sinterstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
700 {"sunion",sunionCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
701 {"sunionstore",sunionstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
702 {"sdiff",sdiffCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
703 {"sdiffstore",sdiffstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
704 {"smembers",sinterCommand
,2,REDIS_CMD_INLINE
,1,1,1},
705 {"zadd",zaddCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
706 {"zincrby",zincrbyCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
707 {"zrem",zremCommand
,3,REDIS_CMD_BULK
,1,1,1},
708 {"zremrangebyscore",zremrangebyscoreCommand
,4,REDIS_CMD_INLINE
,1,1,1},
709 {"zrange",zrangeCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
710 {"zrangebyscore",zrangebyscoreCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
711 {"zcount",zcountCommand
,4,REDIS_CMD_INLINE
,1,1,1},
712 {"zrevrange",zrevrangeCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
713 {"zcard",zcardCommand
,2,REDIS_CMD_INLINE
,1,1,1},
714 {"zscore",zscoreCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
715 {"zrank",zrankCommand
,3,REDIS_CMD_INLINE
,1,1,1},
716 {"incrby",incrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
717 {"decrby",decrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
718 {"getset",getsetCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
719 {"mset",msetCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,-1,2},
720 {"msetnx",msetnxCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,-1,2},
721 {"randomkey",randomkeyCommand
,1,REDIS_CMD_INLINE
,0,0,0},
722 {"select",selectCommand
,2,REDIS_CMD_INLINE
,0,0,0},
723 {"move",moveCommand
,3,REDIS_CMD_INLINE
,1,1,1},
724 {"rename",renameCommand
,3,REDIS_CMD_INLINE
,1,1,1},
725 {"renamenx",renamenxCommand
,3,REDIS_CMD_INLINE
,1,1,1},
726 {"expire",expireCommand
,3,REDIS_CMD_INLINE
,0,0,0},
727 {"expireat",expireatCommand
,3,REDIS_CMD_INLINE
,0,0,0},
728 {"keys",keysCommand
,2,REDIS_CMD_INLINE
,0,0,0},
729 {"dbsize",dbsizeCommand
,1,REDIS_CMD_INLINE
,0,0,0},
730 {"auth",authCommand
,2,REDIS_CMD_INLINE
,0,0,0},
731 {"ping",pingCommand
,1,REDIS_CMD_INLINE
,0,0,0},
732 {"echo",echoCommand
,2,REDIS_CMD_BULK
,0,0,0},
733 {"save",saveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
734 {"bgsave",bgsaveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
735 {"bgrewriteaof",bgrewriteaofCommand
,1,REDIS_CMD_INLINE
,0,0,0},
736 {"shutdown",shutdownCommand
,1,REDIS_CMD_INLINE
,0,0,0},
737 {"lastsave",lastsaveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
738 {"type",typeCommand
,2,REDIS_CMD_INLINE
,1,1,1},
739 {"multi",multiCommand
,1,REDIS_CMD_INLINE
,0,0,0},
740 {"exec",execCommand
,1,REDIS_CMD_INLINE
,0,0,0},
741 {"discard",discardCommand
,1,REDIS_CMD_INLINE
,0,0,0},
742 {"sync",syncCommand
,1,REDIS_CMD_INLINE
,0,0,0},
743 {"flushdb",flushdbCommand
,1,REDIS_CMD_INLINE
,0,0,0},
744 {"flushall",flushallCommand
,1,REDIS_CMD_INLINE
,0,0,0},
745 {"sort",sortCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
746 {"info",infoCommand
,1,REDIS_CMD_INLINE
,0,0,0},
747 {"monitor",monitorCommand
,1,REDIS_CMD_INLINE
,0,0,0},
748 {"ttl",ttlCommand
,2,REDIS_CMD_INLINE
,1,1,1},
749 {"slaveof",slaveofCommand
,3,REDIS_CMD_INLINE
,0,0,0},
750 {"debug",debugCommand
,-2,REDIS_CMD_INLINE
,0,0,0},
751 {NULL
,NULL
,0,0,0,0,0}
754 /*============================ Utility functions ============================ */
756 /* Glob-style pattern matching. */
757 int stringmatchlen(const char *pattern
, int patternLen
,
758 const char *string
, int stringLen
, int nocase
)
763 while (pattern
[1] == '*') {
768 return 1; /* match */
770 if (stringmatchlen(pattern
+1, patternLen
-1,
771 string
, stringLen
, nocase
))
772 return 1; /* match */
776 return 0; /* no match */
780 return 0; /* no match */
790 not = pattern
[0] == '^';
797 if (pattern
[0] == '\\') {
800 if (pattern
[0] == string
[0])
802 } else if (pattern
[0] == ']') {
804 } else if (patternLen
== 0) {
808 } else if (pattern
[1] == '-' && patternLen
>= 3) {
809 int start
= pattern
[0];
810 int end
= pattern
[2];
818 start
= tolower(start
);
824 if (c
>= start
&& c
<= end
)
828 if (pattern
[0] == string
[0])
831 if (tolower((int)pattern
[0]) == tolower((int)string
[0]))
841 return 0; /* no match */
847 if (patternLen
>= 2) {
854 if (pattern
[0] != string
[0])
855 return 0; /* no match */
857 if (tolower((int)pattern
[0]) != tolower((int)string
[0]))
858 return 0; /* no match */
866 if (stringLen
== 0) {
867 while(*pattern
== '*') {
874 if (patternLen
== 0 && stringLen
== 0)
879 static void redisLog(int level
, const char *fmt
, ...) {
883 fp
= (server
.logfile
== NULL
) ? stdout
: fopen(server
.logfile
,"a");
887 if (level
>= server
.verbosity
) {
893 strftime(buf
,64,"%d %b %H:%M:%S",localtime(&now
));
894 fprintf(fp
,"[%d] %s %c ",(int)getpid(),buf
,c
[level
]);
895 vfprintf(fp
, fmt
, ap
);
901 if (server
.logfile
) fclose(fp
);
904 /*====================== Hash table type implementation ==================== */
906 /* This is a hash table type that uses the SDS dynamic strings library as
907 * keys and redis objects as values (objects can hold SDS strings,
/* Dict value destructor for values that are plain zmalloc()ed buffers.
 * The dict private data is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
916 static void dictListDestructor(void *privdata
, void *val
)
918 DICT_NOTUSED(privdata
);
919 listRelease((list
*)val
);
922 static int sdsDictKeyCompare(void *privdata
, const void *key1
,
926 DICT_NOTUSED(privdata
);
928 l1
= sdslen((sds
)key1
);
929 l2
= sdslen((sds
)key2
);
930 if (l1
!= l2
) return 0;
931 return memcmp(key1
, key2
, l1
) == 0;
934 static void dictRedisObjectDestructor(void *privdata
, void *val
)
936 DICT_NOTUSED(privdata
);
938 if (val
== NULL
) return; /* Values of swapped out keys as set to NULL */
942 static int dictObjKeyCompare(void *privdata
, const void *key1
,
945 const robj
*o1
= key1
, *o2
= key2
;
946 return sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
949 static unsigned int dictObjHash(const void *key
) {
951 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
954 static int dictEncObjKeyCompare(void *privdata
, const void *key1
,
957 robj
*o1
= (robj
*) key1
, *o2
= (robj
*) key2
;
960 o1
= getDecodedObject(o1
);
961 o2
= getDecodedObject(o2
);
962 cmp
= sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
968 static unsigned int dictEncObjHash(const void *key
) {
969 robj
*o
= (robj
*) key
;
971 if (o
->encoding
== REDIS_ENCODING_RAW
) {
972 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
974 if (o
->encoding
== REDIS_ENCODING_INT
) {
978 len
= snprintf(buf
,32,"%ld",(long)o
->ptr
);
979 return dictGenHashFunction((unsigned char*)buf
, len
);
983 o
= getDecodedObject(o
);
984 hash
= dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
991 /* Sets type and expires */
992 static dictType setDictType
= {
993 dictEncObjHash
, /* hash function */
996 dictEncObjKeyCompare
, /* key compare */
997 dictRedisObjectDestructor
, /* key destructor */
998 NULL
/* val destructor */
1001 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1002 static dictType zsetDictType
= {
1003 dictEncObjHash
, /* hash function */
1006 dictEncObjKeyCompare
, /* key compare */
1007 dictRedisObjectDestructor
, /* key destructor */
1008 dictVanillaFree
/* val destructor of malloc(sizeof(double)) */
1012 static dictType hashDictType
= {
1013 dictObjHash
, /* hash function */
1016 dictObjKeyCompare
, /* key compare */
1017 dictRedisObjectDestructor
, /* key destructor */
1018 dictRedisObjectDestructor
/* val destructor */
1022 static dictType keyptrDictType
= {
1023 dictObjHash
, /* hash function */
1026 dictObjKeyCompare
, /* key compare */
1027 dictRedisObjectDestructor
, /* key destructor */
1028 NULL
/* val destructor */
1031 /* Keylist hash table type has unencoded redis objects as keys and
1032 * lists as values. It's used for blocking operations (BLPOP) and to
1033 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1034 static dictType keylistDictType
= {
1035 dictObjHash
, /* hash function */
1038 dictObjKeyCompare
, /* key compare */
1039 dictRedisObjectDestructor
, /* key destructor */
1040 dictListDestructor
/* val destructor */
1043 /* ========================= Random utility functions ======================= */
1045 /* Redis generally does not try to recover from out of memory conditions
1046 * when allocating objects or strings, it is not clear if it will be possible
1047 * to report this condition to the client since the networking layer itself
1048 * is based on heap allocation for send buffers, so we simply abort.
1049 * At least the code will be simpler to read... */
1050 static void oom(const char *msg
) {
1051 redisLog(REDIS_WARNING
, "%s: Out of memory\n",msg
);
1056 /* ====================== Redis server networking stuff ===================== */
1057 static void closeTimedoutClients(void) {
1060 time_t now
= time(NULL
);
1063 listRewind(server
.clients
,&li
);
1064 while ((ln
= listNext(&li
)) != NULL
) {
1065 c
= listNodeValue(ln
);
1066 if (server
.maxidletime
&&
1067 !(c
->flags
& REDIS_SLAVE
) && /* no timeout for slaves */
1068 !(c
->flags
& REDIS_MASTER
) && /* no timeout for masters */
1069 (now
- c
->lastinteraction
> server
.maxidletime
))
1071 redisLog(REDIS_VERBOSE
,"Closing idle client");
1073 } else if (c
->flags
& REDIS_BLOCKED
) {
1074 if (c
->blockingto
!= 0 && c
->blockingto
< now
) {
1075 addReply(c
,shared
.nullmultibulk
);
1076 unblockClientWaitingData(c
);
1082 static int htNeedsResize(dict
*dict
) {
1083 long long size
, used
;
1085 size
= dictSlots(dict
);
1086 used
= dictSize(dict
);
1087 return (size
&& used
&& size
> DICT_HT_INITIAL_SIZE
&&
1088 (used
*100/size
< REDIS_HT_MINFILL
));
1091 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1092 * we resize the hash table to save memory */
1093 static void tryResizeHashTables(void) {
1096 for (j
= 0; j
< server
.dbnum
; j
++) {
1097 if (htNeedsResize(server
.db
[j
].dict
)) {
1098 redisLog(REDIS_VERBOSE
,"The hash table %d is too sparse, resize it...",j
);
1099 dictResize(server
.db
[j
].dict
);
1100 redisLog(REDIS_VERBOSE
,"Hash table %d resized.",j
);
1102 if (htNeedsResize(server
.db
[j
].expires
))
1103 dictResize(server
.db
[j
].expires
);
1107 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1108 void backgroundSaveDoneHandler(int statloc
) {
1109 int exitcode
= WEXITSTATUS(statloc
);
1110 int bysignal
= WIFSIGNALED(statloc
);
1112 if (!bysignal
&& exitcode
== 0) {
1113 redisLog(REDIS_NOTICE
,
1114 "Background saving terminated with success");
1116 server
.lastsave
= time(NULL
);
1117 } else if (!bysignal
&& exitcode
!= 0) {
1118 redisLog(REDIS_WARNING
, "Background saving error");
1120 redisLog(REDIS_WARNING
,
1121 "Background saving terminated by signal");
1122 rdbRemoveTempFile(server
.bgsavechildpid
);
1124 server
.bgsavechildpid
= -1;
1125 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1126 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1127 updateSlavesWaitingBgsave(exitcode
== 0 ? REDIS_OK
: REDIS_ERR
);
1130 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1132 void backgroundRewriteDoneHandler(int statloc
) {
1133 int exitcode
= WEXITSTATUS(statloc
);
1134 int bysignal
= WIFSIGNALED(statloc
);
1136 if (!bysignal
&& exitcode
== 0) {
1140 redisLog(REDIS_NOTICE
,
1141 "Background append only file rewriting terminated with success");
1142 /* Now it's time to flush the differences accumulated by the parent */
1143 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) server
.bgrewritechildpid
);
1144 fd
= open(tmpfile
,O_WRONLY
|O_APPEND
);
1146 redisLog(REDIS_WARNING
, "Not able to open the temp append only file produced by the child: %s", strerror(errno
));
1149 /* Flush our data... */
1150 if (write(fd
,server
.bgrewritebuf
,sdslen(server
.bgrewritebuf
)) !=
1151 (signed) sdslen(server
.bgrewritebuf
)) {
1152 redisLog(REDIS_WARNING
, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno
));
1156 redisLog(REDIS_NOTICE
,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server
.bgrewritebuf
));
1157 /* Now our work is to rename the temp file into the stable file. And
1158 * switch the file descriptor used by the server for append only. */
1159 if (rename(tmpfile
,server
.appendfilename
) == -1) {
1160 redisLog(REDIS_WARNING
,"Can't rename the temp append only file into the stable one: %s", strerror(errno
));
1164 /* Mission completed... almost */
1165 redisLog(REDIS_NOTICE
,"Append only file successfully rewritten.");
1166 if (server
.appendfd
!= -1) {
1167 /* If append only is actually enabled... */
1168 close(server
.appendfd
);
1169 server
.appendfd
= fd
;
1171 server
.appendseldb
= -1; /* Make sure it will issue SELECT */
1172 redisLog(REDIS_NOTICE
,"The new append only file was selected for future appends.");
1174 /* If append only is disabled we just generate a dump in this
1175 * format. Why not? */
1178 } else if (!bysignal
&& exitcode
!= 0) {
1179 redisLog(REDIS_WARNING
, "Background append only file rewriting error");
1181 redisLog(REDIS_WARNING
,
1182 "Background append only file rewriting terminated by signal");
1185 sdsfree(server
.bgrewritebuf
);
1186 server
.bgrewritebuf
= sdsempty();
1187 aofRemoveTempFile(server
.bgrewritechildpid
);
1188 server
.bgrewritechildpid
= -1;
1191 static int serverCron(struct aeEventLoop
*eventLoop
, long long id
, void *clientData
) {
1192 int j
, loops
= server
.cronloops
++;
1193 REDIS_NOTUSED(eventLoop
);
1195 REDIS_NOTUSED(clientData
);
1197 /* We take a cached value of the unix time in the global state because
1198 * with virtual memory and aging there is to store the current time
1199 * in objects at every object access, and accuracy is not needed.
1200 * To access a global var is faster than calling time(NULL) */
1201 server
.unixtime
= time(NULL
);
1203 /* Show some info about non-empty databases */
1204 for (j
= 0; j
< server
.dbnum
; j
++) {
1205 long long size
, used
, vkeys
;
1207 size
= dictSlots(server
.db
[j
].dict
);
1208 used
= dictSize(server
.db
[j
].dict
);
1209 vkeys
= dictSize(server
.db
[j
].expires
);
1210 if (!(loops
% 5) && (used
|| vkeys
)) {
1211 redisLog(REDIS_VERBOSE
,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j
,used
,vkeys
,size
);
1212 /* dictPrintStats(server.dict); */
1216 /* We don't want to resize the hash tables while a bacground saving
1217 * is in progress: the saving child is created using fork() that is
1218 * implemented with a copy-on-write semantic in most modern systems, so
1219 * if we resize the HT while there is the saving child at work actually
1220 * a lot of memory movements in the parent will cause a lot of pages
1222 if (server
.bgsavechildpid
== -1) tryResizeHashTables();
1224 /* Show information about connected clients */
1226 redisLog(REDIS_VERBOSE
,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1227 listLength(server
.clients
)-listLength(server
.slaves
),
1228 listLength(server
.slaves
),
1229 zmalloc_used_memory(),
1230 dictSize(server
.sharingpool
));
1233 /* Close connections of timedout clients */
1234 if ((server
.maxidletime
&& !(loops
% 10)) || server
.blpop_blocked_clients
)
1235 closeTimedoutClients();
1237 /* Check if a background saving or AOF rewrite in progress terminated */
1238 if (server
.bgsavechildpid
!= -1 || server
.bgrewritechildpid
!= -1) {
1242 if ((pid
= wait3(&statloc
,WNOHANG
,NULL
)) != 0) {
1243 if (pid
== server
.bgsavechildpid
) {
1244 backgroundSaveDoneHandler(statloc
);
1246 backgroundRewriteDoneHandler(statloc
);
1250 /* If there is not a background saving in progress check if
1251 * we have to save now */
1252 time_t now
= time(NULL
);
1253 for (j
= 0; j
< server
.saveparamslen
; j
++) {
1254 struct saveparam
*sp
= server
.saveparams
+j
;
1256 if (server
.dirty
>= sp
->changes
&&
1257 now
-server
.lastsave
> sp
->seconds
) {
1258 redisLog(REDIS_NOTICE
,"%d changes in %d seconds. Saving...",
1259 sp
->changes
, sp
->seconds
);
1260 rdbSaveBackground(server
.dbfilename
);
1266 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1267 * will use few CPU cycles if there are few expiring keys, otherwise
1268 * it will get more aggressive to avoid that too much memory is used by
1269 * keys that can be removed from the keyspace. */
1270 for (j
= 0; j
< server
.dbnum
; j
++) {
1272 redisDb
*db
= server
.db
+j
;
1274 /* Continue to expire if at the end of the cycle more than 25%
1275 * of the keys were expired. */
1277 long num
= dictSize(db
->expires
);
1278 time_t now
= time(NULL
);
1281 if (num
> REDIS_EXPIRELOOKUPS_PER_CRON
)
1282 num
= REDIS_EXPIRELOOKUPS_PER_CRON
;
1287 if ((de
= dictGetRandomKey(db
->expires
)) == NULL
) break;
1288 t
= (time_t) dictGetEntryVal(de
);
1290 deleteKey(db
,dictGetEntryKey(de
));
1294 } while (expired
> REDIS_EXPIRELOOKUPS_PER_CRON
/4);
1297 /* Swap a few keys on disk if we are over the memory limit and VM
1298 * is enbled. Try to free objects from the free list first. */
1299 if (vmCanSwapOut()) {
1300 while (server
.vm_enabled
&& zmalloc_used_memory() >
1301 server
.vm_max_memory
)
1305 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
1306 retval
= (server
.vm_max_threads
== 0) ?
1307 vmSwapOneObjectBlocking() :
1308 vmSwapOneObjectThreaded();
1309 if (retval
== REDIS_ERR
&& (loops
% 30) == 0 &&
1310 zmalloc_used_memory() >
1311 (server
.vm_max_memory
+server
.vm_max_memory
/10))
1313 redisLog(REDIS_WARNING
,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1315 /* Note that when using threade I/O we free just one object,
1316 * because anyway when the I/O thread in charge to swap this
1317 * object out will finish, the handler of completed jobs
1318 * will try to swap more objects if we are still out of memory. */
1319 if (retval
== REDIS_ERR
|| server
.vm_max_threads
> 0) break;
1323 /* Check if we should connect to a MASTER */
1324 if (server
.replstate
== REDIS_REPL_CONNECT
) {
1325 redisLog(REDIS_NOTICE
,"Connecting to MASTER...");
1326 if (syncWithMaster() == REDIS_OK
) {
1327 redisLog(REDIS_NOTICE
,"MASTER <-> SLAVE sync succeeded");
1333 /* This function gets called every time Redis is entering the
1334 * main loop of the event driven library, that is, before to sleep
1335 * for ready file descriptors. */
1336 static void beforeSleep(struct aeEventLoop
*eventLoop
) {
1337 REDIS_NOTUSED(eventLoop
);
1339 if (server
.vm_enabled
&& listLength(server
.io_ready_clients
)) {
1343 listRewind(server
.io_ready_clients
,&li
);
1344 while((ln
= listNext(&li
))) {
1345 redisClient
*c
= ln
->value
;
1346 struct redisCommand
*cmd
;
1348 /* Resume the client. */
1349 listDelNode(server
.io_ready_clients
,ln
);
1350 c
->flags
&= (~REDIS_IO_WAIT
);
1351 server
.vm_blocked_clients
--;
1352 aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
1353 readQueryFromClient
, c
);
1354 cmd
= lookupCommand(c
->argv
[0]->ptr
);
1355 assert(cmd
!= NULL
);
1358 /* There may be more data to process in the input buffer. */
1359 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0)
1360 processInputBuffer(c
);
1365 static void createSharedObjects(void) {
1366 shared
.crlf
= createObject(REDIS_STRING
,sdsnew("\r\n"));
1367 shared
.ok
= createObject(REDIS_STRING
,sdsnew("+OK\r\n"));
1368 shared
.err
= createObject(REDIS_STRING
,sdsnew("-ERR\r\n"));
1369 shared
.emptybulk
= createObject(REDIS_STRING
,sdsnew("$0\r\n\r\n"));
1370 shared
.czero
= createObject(REDIS_STRING
,sdsnew(":0\r\n"));
1371 shared
.cone
= createObject(REDIS_STRING
,sdsnew(":1\r\n"));
1372 shared
.nullbulk
= createObject(REDIS_STRING
,sdsnew("$-1\r\n"));
1373 shared
.nullmultibulk
= createObject(REDIS_STRING
,sdsnew("*-1\r\n"));
1374 shared
.emptymultibulk
= createObject(REDIS_STRING
,sdsnew("*0\r\n"));
1375 shared
.pong
= createObject(REDIS_STRING
,sdsnew("+PONG\r\n"));
1376 shared
.queued
= createObject(REDIS_STRING
,sdsnew("+QUEUED\r\n"));
1377 shared
.wrongtypeerr
= createObject(REDIS_STRING
,sdsnew(
1378 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1379 shared
.nokeyerr
= createObject(REDIS_STRING
,sdsnew(
1380 "-ERR no such key\r\n"));
1381 shared
.syntaxerr
= createObject(REDIS_STRING
,sdsnew(
1382 "-ERR syntax error\r\n"));
1383 shared
.sameobjecterr
= createObject(REDIS_STRING
,sdsnew(
1384 "-ERR source and destination objects are the same\r\n"));
1385 shared
.outofrangeerr
= createObject(REDIS_STRING
,sdsnew(
1386 "-ERR index out of range\r\n"));
1387 shared
.space
= createObject(REDIS_STRING
,sdsnew(" "));
1388 shared
.colon
= createObject(REDIS_STRING
,sdsnew(":"));
1389 shared
.plus
= createObject(REDIS_STRING
,sdsnew("+"));
1390 shared
.select0
= createStringObject("select 0\r\n",10);
1391 shared
.select1
= createStringObject("select 1\r\n",10);
1392 shared
.select2
= createStringObject("select 2\r\n",10);
1393 shared
.select3
= createStringObject("select 3\r\n",10);
1394 shared
.select4
= createStringObject("select 4\r\n",10);
1395 shared
.select5
= createStringObject("select 5\r\n",10);
1396 shared
.select6
= createStringObject("select 6\r\n",10);
1397 shared
.select7
= createStringObject("select 7\r\n",10);
1398 shared
.select8
= createStringObject("select 8\r\n",10);
1399 shared
.select9
= createStringObject("select 9\r\n",10);
1402 static void appendServerSaveParams(time_t seconds
, int changes
) {
1403 server
.saveparams
= zrealloc(server
.saveparams
,sizeof(struct saveparam
)*(server
.saveparamslen
+1));
1404 server
.saveparams
[server
.saveparamslen
].seconds
= seconds
;
1405 server
.saveparams
[server
.saveparamslen
].changes
= changes
;
1406 server
.saveparamslen
++;
1409 static void resetServerSaveParams() {
1410 zfree(server
.saveparams
);
1411 server
.saveparams
= NULL
;
1412 server
.saveparamslen
= 0;
1415 static void initServerConfig() {
1416 server
.dbnum
= REDIS_DEFAULT_DBNUM
;
1417 server
.port
= REDIS_SERVERPORT
;
1418 server
.verbosity
= REDIS_VERBOSE
;
1419 server
.maxidletime
= REDIS_MAXIDLETIME
;
1420 server
.saveparams
= NULL
;
1421 server
.logfile
= NULL
; /* NULL = log on standard output */
1422 server
.bindaddr
= NULL
;
1423 server
.glueoutputbuf
= 1;
1424 server
.daemonize
= 0;
1425 server
.appendonly
= 0;
1426 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1427 server
.lastfsync
= time(NULL
);
1428 server
.appendfd
= -1;
1429 server
.appendseldb
= -1; /* Make sure the first time will not match */
1430 server
.pidfile
= "/var/run/redis.pid";
1431 server
.dbfilename
= "dump.rdb";
1432 server
.appendfilename
= "appendonly.aof";
1433 server
.requirepass
= NULL
;
1434 server
.shareobjects
= 0;
1435 server
.rdbcompression
= 1;
1436 server
.sharingpoolsize
= 1024;
1437 server
.maxclients
= 0;
1438 server
.blpop_blocked_clients
= 0;
1439 server
.maxmemory
= 0;
1440 server
.vm_enabled
= 0;
1441 server
.vm_swap_file
= zstrdup("/tmp/redis-%p.vm");
1442 server
.vm_page_size
= 256; /* 256 bytes per page */
1443 server
.vm_pages
= 1024*1024*100; /* 104 millions of pages */
1444 server
.vm_max_memory
= 1024LL*1024*1024*1; /* 1 GB of RAM */
1445 server
.vm_max_threads
= 4;
1446 server
.vm_blocked_clients
= 0;
1448 resetServerSaveParams();
1450 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1451 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1452 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1453 /* Replication related */
1455 server
.masterauth
= NULL
;
1456 server
.masterhost
= NULL
;
1457 server
.masterport
= 6379;
1458 server
.master
= NULL
;
1459 server
.replstate
= REDIS_REPL_NONE
;
1461 /* Double constants initialization */
1463 R_PosInf
= 1.0/R_Zero
;
1464 R_NegInf
= -1.0/R_Zero
;
1465 R_Nan
= R_Zero
/R_Zero
;
1468 static void initServer() {
1471 signal(SIGHUP
, SIG_IGN
);
1472 signal(SIGPIPE
, SIG_IGN
);
1473 setupSigSegvAction();
1475 server
.devnull
= fopen("/dev/null","w");
1476 if (server
.devnull
== NULL
) {
1477 redisLog(REDIS_WARNING
, "Can't open /dev/null: %s", server
.neterr
);
1480 server
.clients
= listCreate();
1481 server
.slaves
= listCreate();
1482 server
.monitors
= listCreate();
1483 server
.objfreelist
= listCreate();
1484 createSharedObjects();
1485 server
.el
= aeCreateEventLoop();
1486 server
.db
= zmalloc(sizeof(redisDb
)*server
.dbnum
);
1487 server
.sharingpool
= dictCreate(&setDictType
,NULL
);
1488 server
.fd
= anetTcpServer(server
.neterr
, server
.port
, server
.bindaddr
);
1489 if (server
.fd
== -1) {
1490 redisLog(REDIS_WARNING
, "Opening TCP port: %s", server
.neterr
);
1493 for (j
= 0; j
< server
.dbnum
; j
++) {
1494 server
.db
[j
].dict
= dictCreate(&hashDictType
,NULL
);
1495 server
.db
[j
].expires
= dictCreate(&keyptrDictType
,NULL
);
1496 server
.db
[j
].blockingkeys
= dictCreate(&keylistDictType
,NULL
);
1497 if (server
.vm_enabled
)
1498 server
.db
[j
].io_keys
= dictCreate(&keylistDictType
,NULL
);
1499 server
.db
[j
].id
= j
;
1501 server
.cronloops
= 0;
1502 server
.bgsavechildpid
= -1;
1503 server
.bgrewritechildpid
= -1;
1504 server
.bgrewritebuf
= sdsempty();
1505 server
.lastsave
= time(NULL
);
1507 server
.stat_numcommands
= 0;
1508 server
.stat_numconnections
= 0;
1509 server
.stat_starttime
= time(NULL
);
1510 server
.unixtime
= time(NULL
);
1511 aeCreateTimeEvent(server
.el
, 1, serverCron
, NULL
, NULL
);
1512 if (aeCreateFileEvent(server
.el
, server
.fd
, AE_READABLE
,
1513 acceptHandler
, NULL
) == AE_ERR
) oom("creating file event");
1515 if (server
.appendonly
) {
1516 server
.appendfd
= open(server
.appendfilename
,O_WRONLY
|O_APPEND
|O_CREAT
,0644);
1517 if (server
.appendfd
== -1) {
1518 redisLog(REDIS_WARNING
, "Can't open the append-only file: %s",
1524 if (server
.vm_enabled
) vmInit();
1527 /* Empty the whole database */
1528 static long long emptyDb() {
1530 long long removed
= 0;
1532 for (j
= 0; j
< server
.dbnum
; j
++) {
1533 removed
+= dictSize(server
.db
[j
].dict
);
1534 dictEmpty(server
.db
[j
].dict
);
1535 dictEmpty(server
.db
[j
].expires
);
/* Convert a "yes"/"no" configuration argument (compared ignoring case)
 * into 1 or 0. Returns -1 for any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1546 /* I agree, this is a very rudimental way to load a configuration...
1547 will improve later if the config gets more complex */
1548 static void loadServerConfig(char *filename
) {
1550 char buf
[REDIS_CONFIGLINE_MAX
+1], *err
= NULL
;
1554 if (filename
[0] == '-' && filename
[1] == '\0')
1557 if ((fp
= fopen(filename
,"r")) == NULL
) {
1558 redisLog(REDIS_WARNING
,"Fatal error, can't open config file");
1563 while(fgets(buf
,REDIS_CONFIGLINE_MAX
+1,fp
) != NULL
) {
1569 line
= sdstrim(line
," \t\r\n");
1571 /* Skip comments and blank lines*/
1572 if (line
[0] == '#' || line
[0] == '\0') {
1577 /* Split into arguments */
1578 argv
= sdssplitlen(line
,sdslen(line
)," ",1,&argc
);
1579 sdstolower(argv
[0]);
1581 /* Execute config directives */
1582 if (!strcasecmp(argv
[0],"timeout") && argc
== 2) {
1583 server
.maxidletime
= atoi(argv
[1]);
1584 if (server
.maxidletime
< 0) {
1585 err
= "Invalid timeout value"; goto loaderr
;
1587 } else if (!strcasecmp(argv
[0],"port") && argc
== 2) {
1588 server
.port
= atoi(argv
[1]);
1589 if (server
.port
< 1 || server
.port
> 65535) {
1590 err
= "Invalid port"; goto loaderr
;
1592 } else if (!strcasecmp(argv
[0],"bind") && argc
== 2) {
1593 server
.bindaddr
= zstrdup(argv
[1]);
1594 } else if (!strcasecmp(argv
[0],"save") && argc
== 3) {
1595 int seconds
= atoi(argv
[1]);
1596 int changes
= atoi(argv
[2]);
1597 if (seconds
< 1 || changes
< 0) {
1598 err
= "Invalid save parameters"; goto loaderr
;
1600 appendServerSaveParams(seconds
,changes
);
1601 } else if (!strcasecmp(argv
[0],"dir") && argc
== 2) {
1602 if (chdir(argv
[1]) == -1) {
1603 redisLog(REDIS_WARNING
,"Can't chdir to '%s': %s",
1604 argv
[1], strerror(errno
));
1607 } else if (!strcasecmp(argv
[0],"loglevel") && argc
== 2) {
1608 if (!strcasecmp(argv
[1],"debug")) server
.verbosity
= REDIS_DEBUG
;
1609 else if (!strcasecmp(argv
[1],"verbose")) server
.verbosity
= REDIS_VERBOSE
;
1610 else if (!strcasecmp(argv
[1],"notice")) server
.verbosity
= REDIS_NOTICE
;
1611 else if (!strcasecmp(argv
[1],"warning")) server
.verbosity
= REDIS_WARNING
;
1613 err
= "Invalid log level. Must be one of debug, notice, warning";
1616 } else if (!strcasecmp(argv
[0],"logfile") && argc
== 2) {
1619 server
.logfile
= zstrdup(argv
[1]);
1620 if (!strcasecmp(server
.logfile
,"stdout")) {
1621 zfree(server
.logfile
);
1622 server
.logfile
= NULL
;
1624 if (server
.logfile
) {
1625 /* Test if we are able to open the file. The server will not
1626 * be able to abort just for this problem later... */
1627 logfp
= fopen(server
.logfile
,"a");
1628 if (logfp
== NULL
) {
1629 err
= sdscatprintf(sdsempty(),
1630 "Can't open the log file: %s", strerror(errno
));
1635 } else if (!strcasecmp(argv
[0],"databases") && argc
== 2) {
1636 server
.dbnum
= atoi(argv
[1]);
1637 if (server
.dbnum
< 1) {
1638 err
= "Invalid number of databases"; goto loaderr
;
1640 } else if (!strcasecmp(argv
[0],"maxclients") && argc
== 2) {
1641 server
.maxclients
= atoi(argv
[1]);
1642 } else if (!strcasecmp(argv
[0],"maxmemory") && argc
== 2) {
1643 server
.maxmemory
= strtoll(argv
[1], NULL
, 10);
1644 } else if (!strcasecmp(argv
[0],"slaveof") && argc
== 3) {
1645 server
.masterhost
= sdsnew(argv
[1]);
1646 server
.masterport
= atoi(argv
[2]);
1647 server
.replstate
= REDIS_REPL_CONNECT
;
1648 } else if (!strcasecmp(argv
[0],"masterauth") && argc
== 2) {
1649 server
.masterauth
= zstrdup(argv
[1]);
1650 } else if (!strcasecmp(argv
[0],"glueoutputbuf") && argc
== 2) {
1651 if ((server
.glueoutputbuf
= yesnotoi(argv
[1])) == -1) {
1652 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1654 } else if (!strcasecmp(argv
[0],"shareobjects") && argc
== 2) {
1655 if ((server
.shareobjects
= yesnotoi(argv
[1])) == -1) {
1656 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1658 } else if (!strcasecmp(argv
[0],"rdbcompression") && argc
== 2) {
1659 if ((server
.rdbcompression
= yesnotoi(argv
[1])) == -1) {
1660 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1662 } else if (!strcasecmp(argv
[0],"shareobjectspoolsize") && argc
== 2) {
1663 server
.sharingpoolsize
= atoi(argv
[1]);
1664 if (server
.sharingpoolsize
< 1) {
1665 err
= "invalid object sharing pool size"; goto loaderr
;
1667 } else if (!strcasecmp(argv
[0],"daemonize") && argc
== 2) {
1668 if ((server
.daemonize
= yesnotoi(argv
[1])) == -1) {
1669 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1671 } else if (!strcasecmp(argv
[0],"appendonly") && argc
== 2) {
1672 if ((server
.appendonly
= yesnotoi(argv
[1])) == -1) {
1673 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1675 } else if (!strcasecmp(argv
[0],"appendfsync") && argc
== 2) {
1676 if (!strcasecmp(argv
[1],"no")) {
1677 server
.appendfsync
= APPENDFSYNC_NO
;
1678 } else if (!strcasecmp(argv
[1],"always")) {
1679 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1680 } else if (!strcasecmp(argv
[1],"everysec")) {
1681 server
.appendfsync
= APPENDFSYNC_EVERYSEC
;
1683 err
= "argument must be 'no', 'always' or 'everysec'";
1686 } else if (!strcasecmp(argv
[0],"requirepass") && argc
== 2) {
1687 server
.requirepass
= zstrdup(argv
[1]);
1688 } else if (!strcasecmp(argv
[0],"pidfile") && argc
== 2) {
1689 server
.pidfile
= zstrdup(argv
[1]);
1690 } else if (!strcasecmp(argv
[0],"dbfilename") && argc
== 2) {
1691 server
.dbfilename
= zstrdup(argv
[1]);
1692 } else if (!strcasecmp(argv
[0],"vm-enabled") && argc
== 2) {
1693 if ((server
.vm_enabled
= yesnotoi(argv
[1])) == -1) {
1694 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1696 } else if (!strcasecmp(argv
[0],"vm-swap-file") && argc
== 2) {
1697 zfree(server
.vm_swap_file
);
1698 server
.vm_swap_file
= zstrdup(argv
[1]);
1699 } else if (!strcasecmp(argv
[0],"vm-max-memory") && argc
== 2) {
1700 server
.vm_max_memory
= strtoll(argv
[1], NULL
, 10);
1701 } else if (!strcasecmp(argv
[0],"vm-page-size") && argc
== 2) {
1702 server
.vm_page_size
= strtoll(argv
[1], NULL
, 10);
1703 } else if (!strcasecmp(argv
[0],"vm-pages") && argc
== 2) {
1704 server
.vm_pages
= strtoll(argv
[1], NULL
, 10);
1705 } else if (!strcasecmp(argv
[0],"vm-max-threads") && argc
== 2) {
1706 server
.vm_max_threads
= strtoll(argv
[1], NULL
, 10);
1708 err
= "Bad directive or wrong number of arguments"; goto loaderr
;
1710 for (j
= 0; j
< argc
; j
++)
1715 if (fp
!= stdin
) fclose(fp
);
1719 fprintf(stderr
, "\n*** FATAL CONFIG FILE ERROR ***\n");
1720 fprintf(stderr
, "Reading the configuration file, at line %d\n", linenum
);
1721 fprintf(stderr
, ">>> '%s'\n", line
);
1722 fprintf(stderr
, "%s\n", err
);
1726 static void freeClientArgv(redisClient
*c
) {
1729 for (j
= 0; j
< c
->argc
; j
++)
1730 decrRefCount(c
->argv
[j
]);
1731 for (j
= 0; j
< c
->mbargc
; j
++)
1732 decrRefCount(c
->mbargv
[j
]);
/* Tear down a client connection.
 *
 * Steps visible in this listing, in order: the query buffer is freed and
 * a client flagged REDIS_BLOCKED is unblocked; the readable and writable
 * file events are deleted; the reply list is released; the client is
 * removed from server.clients (its presence is asserted); a client in
 * REDIS_IO_WAIT with no pending io_keys is removed from
 * server.io_ready_clients, and while VM is enabled every key it still
 * waits for is handed to dontWaitForSwappedKey(); a REDIS_SLAVE client is
 * looked up in server.monitors or server.slaves depending on
 * REDIS_MONITOR; freeing the REDIS_MASTER client clears server.master and
 * sets the replication state to REDIS_REPL_CONNECT; finally the
 * MULTI state is released with freeClientMultiState().
 *
 * NOTE(review): this listing omits some source lines of the function
 * (the embedded line numbers have gaps), so further steps may exist. */
1737 static void freeClient(redisClient
*c
) {
1740 /* Note that if the client we are freeing is blocked into a blocking
1741 * call, we have to set querybuf to NULL *before* to call
1742 * unblockClientWaitingData() to avoid processInputBuffer() will get
1743 * called. Also it is important to remove the file events after
1744 * this, because this call adds the READABLE event. */
1745 sdsfree(c
->querybuf
);
1747 if (c
->flags
& REDIS_BLOCKED
)
1748 unblockClientWaitingData(c
);
1750 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
1751 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1752 listRelease(c
->reply
);
1755 /* Remove from the list of clients */
1756 ln
= listSearchKey(server
.clients
,c
);
1757 redisAssert(ln
!= NULL
);
1758 listDelNode(server
.clients
,ln
);
1759 /* Remove from the list of clients waiting for swapped keys */
1760 if (c
->flags
& REDIS_IO_WAIT
&& listLength(c
->io_keys
) == 0) {
1761 ln
= listSearchKey(server
.io_ready_clients
,c
);
1763 listDelNode(server
.io_ready_clients
,ln
);
1764 server
.vm_blocked_clients
--;
1767 while (server
.vm_enabled
&& listLength(c
->io_keys
)) {
1768 ln
= listFirst(c
->io_keys
);
1769 dontWaitForSwappedKey(c
,ln
->value
);
1771 listRelease(c
->io_keys
);
1773 if (c
->flags
& REDIS_SLAVE
) {
1774 if (c
->replstate
== REDIS_REPL_SEND_BULK
&& c
->repldbfd
!= -1)
1776 list
*l
= (c
->flags
& REDIS_MONITOR
) ? server
.monitors
: server
.slaves
;
1777 ln
= listSearchKey(l
,c
);
1778 redisAssert(ln
!= NULL
);
1781 if (c
->flags
& REDIS_MASTER
) {
1782 server
.master
= NULL
;
1783 server
.replstate
= REDIS_REPL_CONNECT
;
1787 freeClientMultiState(c
);
1791 #define GLUEREPLY_UP_TO (1024)
1792 static void glueReplyBuffersIfNeeded(redisClient
*c
) {
1794 char buf
[GLUEREPLY_UP_TO
];
1799 listRewind(c
->reply
,&li
);
1800 while((ln
= listNext(&li
))) {
1804 objlen
= sdslen(o
->ptr
);
1805 if (copylen
+ objlen
<= GLUEREPLY_UP_TO
) {
1806 memcpy(buf
+copylen
,o
->ptr
,objlen
);
1808 listDelNode(c
->reply
,ln
);
1810 if (copylen
== 0) return;
1814 /* Now the output buffer is empty, add the new single element */
1815 o
= createObject(REDIS_STRING
,sdsnewlen(buf
,copylen
));
1816 listAddNodeHead(c
->reply
,o
);
1819 static void sendReplyToClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
1820 redisClient
*c
= privdata
;
1821 int nwritten
= 0, totwritten
= 0, objlen
;
1824 REDIS_NOTUSED(mask
);
1826 /* Use writev() if we have enough buffers to send */
1827 if (!server
.glueoutputbuf
&&
1828 listLength(c
->reply
) > REDIS_WRITEV_THRESHOLD
&&
1829 !(c
->flags
& REDIS_MASTER
))
1831 sendReplyToClientWritev(el
, fd
, privdata
, mask
);
1835 while(listLength(c
->reply
)) {
1836 if (server
.glueoutputbuf
&& listLength(c
->reply
) > 1)
1837 glueReplyBuffersIfNeeded(c
);
1839 o
= listNodeValue(listFirst(c
->reply
));
1840 objlen
= sdslen(o
->ptr
);
1843 listDelNode(c
->reply
,listFirst(c
->reply
));
1847 if (c
->flags
& REDIS_MASTER
) {
1848 /* Don't reply to a master */
1849 nwritten
= objlen
- c
->sentlen
;
1851 nwritten
= write(fd
, ((char*)o
->ptr
)+c
->sentlen
, objlen
- c
->sentlen
);
1852 if (nwritten
<= 0) break;
1854 c
->sentlen
+= nwritten
;
1855 totwritten
+= nwritten
;
1856 /* If we fully sent the object on head go to the next one */
1857 if (c
->sentlen
== objlen
) {
1858 listDelNode(c
->reply
,listFirst(c
->reply
));
1861 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1862 * bytes, in a single threaded server it's a good idea to serve
1863 * other clients as well, even if a very large request comes from
1864 * super fast link that is always able to accept data (in real world
1865 * scenario think about 'KEYS *' against the loopback interfae) */
1866 if (totwritten
> REDIS_MAX_WRITE_PER_EVENT
) break;
1868 if (nwritten
== -1) {
1869 if (errno
== EAGAIN
) {
1872 redisLog(REDIS_VERBOSE
,
1873 "Error writing to client: %s", strerror(errno
));
1878 if (totwritten
> 0) c
->lastinteraction
= time(NULL
);
1879 if (listLength(c
->reply
) == 0) {
1881 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1885 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
)
1887 redisClient
*c
= privdata
;
1888 int nwritten
= 0, totwritten
= 0, objlen
, willwrite
;
1890 struct iovec iov
[REDIS_WRITEV_IOVEC_COUNT
];
1891 int offset
, ion
= 0;
1893 REDIS_NOTUSED(mask
);
1896 while (listLength(c
->reply
)) {
1897 offset
= c
->sentlen
;
1901 /* fill-in the iov[] array */
1902 for(node
= listFirst(c
->reply
); node
; node
= listNextNode(node
)) {
1903 o
= listNodeValue(node
);
1904 objlen
= sdslen(o
->ptr
);
1906 if (totwritten
+ objlen
- offset
> REDIS_MAX_WRITE_PER_EVENT
)
1909 if(ion
== REDIS_WRITEV_IOVEC_COUNT
)
1910 break; /* no more iovecs */
1912 iov
[ion
].iov_base
= ((char*)o
->ptr
) + offset
;
1913 iov
[ion
].iov_len
= objlen
- offset
;
1914 willwrite
+= objlen
- offset
;
1915 offset
= 0; /* just for the first item */
1922 /* write all collected blocks at once */
1923 if((nwritten
= writev(fd
, iov
, ion
)) < 0) {
1924 if (errno
!= EAGAIN
) {
1925 redisLog(REDIS_VERBOSE
,
1926 "Error writing to client: %s", strerror(errno
));
1933 totwritten
+= nwritten
;
1934 offset
= c
->sentlen
;
1936 /* remove written robjs from c->reply */
1937 while (nwritten
&& listLength(c
->reply
)) {
1938 o
= listNodeValue(listFirst(c
->reply
));
1939 objlen
= sdslen(o
->ptr
);
1941 if(nwritten
>= objlen
- offset
) {
1942 listDelNode(c
->reply
, listFirst(c
->reply
));
1943 nwritten
-= objlen
- offset
;
1947 c
->sentlen
+= nwritten
;
1955 c
->lastinteraction
= time(NULL
);
1957 if (listLength(c
->reply
) == 0) {
1959 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1963 static struct redisCommand
*lookupCommand(char *name
) {
1965 while(cmdTable
[j
].name
!= NULL
) {
1966 if (!strcasecmp(name
,cmdTable
[j
].name
)) return &cmdTable
[j
];
1972 /* resetClient prepare the client to process the next command */
1973 static void resetClient(redisClient
*c
) {
1979 /* Call() is the core of Redis execution of a command */
1980 static void call(redisClient
*c
, struct redisCommand
*cmd
) {
1983 dirty
= server
.dirty
;
1985 if (server
.appendonly
&& server
.dirty
-dirty
)
1986 feedAppendOnlyFile(cmd
,c
->db
->id
,c
->argv
,c
->argc
);
1987 if (server
.dirty
-dirty
&& listLength(server
.slaves
))
1988 replicationFeedSlaves(server
.slaves
,cmd
,c
->db
->id
,c
->argv
,c
->argc
);
1989 if (listLength(server
.monitors
))
1990 replicationFeedSlaves(server
.monitors
,cmd
,c
->db
->id
,c
->argv
,c
->argc
);
1991 server
.stat_numcommands
++;
1994 /* If this function gets called we already read a whole
1995 * command, arguments are in the client argv/argc fields.
1996 * processCommand() executes the command or prepares the
1997 * server for a bulk read from the client.
1999 * If 1 is returned the client is still alive and valid and
2000 * other operations can be performed by the caller. Otherwise
2001 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2002 static int processCommand(redisClient
*c
) {
2003 struct redisCommand
*cmd
;
2005 /* Free some memory if needed (maxmemory setting) */
2006 if (server
.maxmemory
) freeMemoryIfNeeded();
2008 /* Handle the multi bulk command type. This is an alternative protocol
2009 * supported by Redis in order to receive commands that are composed of
2010 * multiple binary-safe "bulk" arguments. The latency of processing is
2011 * a bit higher but this allows things like multi-sets, so if this
2012 * protocol is used only for MSET and similar commands this is a big win. */
2013 if (c
->multibulk
== 0 && c
->argc
== 1 && ((char*)(c
->argv
[0]->ptr
))[0] == '*') {
2014 c
->multibulk
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2015 if (c
->multibulk
<= 0) {
2019 decrRefCount(c
->argv
[c
->argc
-1]);
2023 } else if (c
->multibulk
) {
2024 if (c
->bulklen
== -1) {
2025 if (((char*)c
->argv
[0]->ptr
)[0] != '$') {
2026 addReplySds(c
,sdsnew("-ERR multi bulk protocol error\r\n"));
2030 int bulklen
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2031 decrRefCount(c
->argv
[0]);
2032 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2034 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2039 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2043 c
->mbargv
= zrealloc(c
->mbargv
,(sizeof(robj
*))*(c
->mbargc
+1));
2044 c
->mbargv
[c
->mbargc
] = c
->argv
[0];
2048 if (c
->multibulk
== 0) {
2052 /* Here we need to swap the multi-bulk argc/argv with the
2053 * normal argc/argv of the client structure. */
2055 c
->argv
= c
->mbargv
;
2056 c
->mbargv
= auxargv
;
2059 c
->argc
= c
->mbargc
;
2060 c
->mbargc
= auxargc
;
2062 /* We need to set bulklen to something different than -1
2063 * in order for the code below to process the command without
2064 * to try to read the last argument of a bulk command as
2065 * a special argument. */
2067 /* continue below and process the command */
2074 /* -- end of multi bulk commands processing -- */
2076 /* The QUIT command is handled as a special case. Normal command
2077 * procs are unable to close the client connection safely */
2078 if (!strcasecmp(c
->argv
[0]->ptr
,"quit")) {
2083 /* Now lookup the command and check ASAP about trivial error conditions
2084 * such wrong arity, bad command name and so forth. */
2085 cmd
= lookupCommand(c
->argv
[0]->ptr
);
2088 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2089 (char*)c
->argv
[0]->ptr
));
2092 } else if ((cmd
->arity
> 0 && cmd
->arity
!= c
->argc
) ||
2093 (c
->argc
< -cmd
->arity
)) {
2095 sdscatprintf(sdsempty(),
2096 "-ERR wrong number of arguments for '%s' command\r\n",
2100 } else if (server
.maxmemory
&& cmd
->flags
& REDIS_CMD_DENYOOM
&& zmalloc_used_memory() > server
.maxmemory
) {
2101 addReplySds(c
,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2104 } else if (cmd
->flags
& REDIS_CMD_BULK
&& c
->bulklen
== -1) {
2105 /* This is a bulk command, we have to read the last argument yet. */
2106 int bulklen
= atoi(c
->argv
[c
->argc
-1]->ptr
);
2108 decrRefCount(c
->argv
[c
->argc
-1]);
2109 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2111 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2116 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2117 /* It is possible that the bulk read is already in the
2118 * buffer. Check this condition and handle it accordingly.
2119 * This is just a fast path, alternative to call processInputBuffer().
2120 * It's a good idea since the code is small and this condition
2121 * happens most of the times. */
2122 if ((signed)sdslen(c
->querybuf
) >= c
->bulklen
) {
2123 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2125 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2127 /* Otherwise return... there is to read the last argument
2128 * from the socket. */
2132 /* Let's try to share objects on the command arguments vector */
2133 if (server
.shareobjects
) {
2135 for(j
= 1; j
< c
->argc
; j
++)
2136 c
->argv
[j
] = tryObjectSharing(c
->argv
[j
]);
2138 /* Let's try to encode the bulk object to save space. */
2139 if (cmd
->flags
& REDIS_CMD_BULK
)
2140 tryObjectEncoding(c
->argv
[c
->argc
-1]);
2142 /* Check if the user is authenticated */
2143 if (server
.requirepass
&& !c
->authenticated
&& cmd
->proc
!= authCommand
) {
2144 addReplySds(c
,sdsnew("-ERR operation not permitted\r\n"));
2149 /* Exec the command */
2150 if (c
->flags
& REDIS_MULTI
&& cmd
->proc
!= execCommand
&& cmd
->proc
!= discardCommand
) {
2151 queueMultiCommand(c
,cmd
);
2152 addReply(c
,shared
.queued
);
2154 if (server
.vm_enabled
&& server
.vm_max_threads
> 0 &&
2155 blockClientOnSwappedKeys(cmd
,c
)) return 1;
2159 /* Prepare the client for the next command */
2164 static void replicationFeedSlaves(list
*slaves
, struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
2169 /* (args*2)+1 is enough room for args, spaces, newlines */
2170 robj
*static_outv
[REDIS_STATIC_ARGS
*2+1];
2172 if (argc
<= REDIS_STATIC_ARGS
) {
2175 outv
= zmalloc(sizeof(robj
*)*(argc
*2+1));
2178 for (j
= 0; j
< argc
; j
++) {
2179 if (j
!= 0) outv
[outc
++] = shared
.space
;
2180 if ((cmd
->flags
& REDIS_CMD_BULK
) && j
== argc
-1) {
2183 lenobj
= createObject(REDIS_STRING
,
2184 sdscatprintf(sdsempty(),"%lu\r\n",
2185 (unsigned long) stringObjectLen(argv
[j
])));
2186 lenobj
->refcount
= 0;
2187 outv
[outc
++] = lenobj
;
2189 outv
[outc
++] = argv
[j
];
2191 outv
[outc
++] = shared
.crlf
;
2193 /* Increment all the refcounts at start and decrement at end in order to
2194 * be sure to free objects if there is no slave in a replication state
2195 * able to be feed with commands */
2196 for (j
= 0; j
< outc
; j
++) incrRefCount(outv
[j
]);
2197 listRewind(slaves
,&li
);
2198 while((ln
= listNext(&li
))) {
2199 redisClient
*slave
= ln
->value
;
2201 /* Don't feed slaves that are still waiting for BGSAVE to start */
2202 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) continue;
2204 /* Feed all the other slaves, MONITORs and so on */
2205 if (slave
->slaveseldb
!= dictid
) {
2209 case 0: selectcmd
= shared
.select0
; break;
2210 case 1: selectcmd
= shared
.select1
; break;
2211 case 2: selectcmd
= shared
.select2
; break;
2212 case 3: selectcmd
= shared
.select3
; break;
2213 case 4: selectcmd
= shared
.select4
; break;
2214 case 5: selectcmd
= shared
.select5
; break;
2215 case 6: selectcmd
= shared
.select6
; break;
2216 case 7: selectcmd
= shared
.select7
; break;
2217 case 8: selectcmd
= shared
.select8
; break;
2218 case 9: selectcmd
= shared
.select9
; break;
2220 selectcmd
= createObject(REDIS_STRING
,
2221 sdscatprintf(sdsempty(),"select %d\r\n",dictid
));
2222 selectcmd
->refcount
= 0;
2225 addReply(slave
,selectcmd
);
2226 slave
->slaveseldb
= dictid
;
2228 for (j
= 0; j
< outc
; j
++) addReply(slave
,outv
[j
]);
2230 for (j
= 0; j
< outc
; j
++) decrRefCount(outv
[j
]);
2231 if (outv
!= static_outv
) zfree(outv
);
2234 static void processInputBuffer(redisClient
*c
) {
2236 /* Before processing the input buffer, make sure the client is not
2237 * waiting for a blocking operation such as BLPOP. Note that on the first
2238 * iteration the client is never blocked, otherwise processInputBuffer
2239 * would not be called at all, but after the execution of the first commands
2240 * in the input buffer the client may be blocked, and the "goto again"
2241 * will try to reiterate. The following line will make it return asap. */
2242 if (c
->flags
& REDIS_BLOCKED
|| c
->flags
& REDIS_IO_WAIT
) return;
2243 if (c
->bulklen
== -1) {
2244 /* Read the first line of the query */
2245 char *p
= strchr(c
->querybuf
,'\n');
2252 query
= c
->querybuf
;
2253 c
->querybuf
= sdsempty();
2254 querylen
= 1+(p
-(query
));
2255 if (sdslen(query
) > querylen
) {
2256 /* leave data after the first line of the query in the buffer */
2257 c
->querybuf
= sdscatlen(c
->querybuf
,query
+querylen
,sdslen(query
)-querylen
);
2259 *p
= '\0'; /* remove "\n" */
2260 if (*(p
-1) == '\r') *(p
-1) = '\0'; /* and "\r" if any */
2261 sdsupdatelen(query
);
2263 /* Now we can split the query in arguments */
2264 argv
= sdssplitlen(query
,sdslen(query
)," ",1,&argc
);
2267 if (c
->argv
) zfree(c
->argv
);
2268 c
->argv
= zmalloc(sizeof(robj
*)*argc
);
2270 for (j
= 0; j
< argc
; j
++) {
2271 if (sdslen(argv
[j
])) {
2272 c
->argv
[c
->argc
] = createObject(REDIS_STRING
,argv
[j
]);
2280 /* Execute the command. If the client is still valid
2281 * after processCommand() return and there is something
2282 * on the query buffer try to process the next command. */
2283 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2285 /* Nothing to process, argc == 0. Just process the query
2286 * buffer if it's not empty or return to the caller */
2287 if (sdslen(c
->querybuf
)) goto again
;
2290 } else if (sdslen(c
->querybuf
) >= REDIS_REQUEST_MAX_SIZE
) {
2291 redisLog(REDIS_VERBOSE
, "Client protocol error");
2296 /* Bulk read handling. Note that if we are at this point
2297 the client already sent a command terminated with a newline,
2298 we are reading the bulk data that is actually the last
2299 argument of the command. */
2300 int qbl
= sdslen(c
->querybuf
);
2302 if (c
->bulklen
<= qbl
) {
2303 /* Copy everything but the final CRLF as final argument */
2304 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2306 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2307 /* Process the command. If the client is still valid after
2308 * the processing and there is more data in the buffer
2309 * try to parse it. */
2310 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2316 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2317 redisClient
*c
= (redisClient
*) privdata
;
2318 char buf
[REDIS_IOBUF_LEN
];
2321 REDIS_NOTUSED(mask
);
2323 nread
= read(fd
, buf
, REDIS_IOBUF_LEN
);
2325 if (errno
== EAGAIN
) {
2328 redisLog(REDIS_VERBOSE
, "Reading from client: %s",strerror(errno
));
2332 } else if (nread
== 0) {
2333 redisLog(REDIS_VERBOSE
, "Client closed connection");
2338 c
->querybuf
= sdscatlen(c
->querybuf
, buf
, nread
);
2339 c
->lastinteraction
= time(NULL
);
2343 if (!(c
->flags
& REDIS_BLOCKED
))
2344 processInputBuffer(c
);
2347 static int selectDb(redisClient
*c
, int id
) {
2348 if (id
< 0 || id
>= server
.dbnum
)
2350 c
->db
= &server
.db
[id
];
2354 static void *dupClientReplyValue(void *o
) {
2355 incrRefCount((robj
*)o
);
2359 static redisClient
*createClient(int fd
) {
2360 redisClient
*c
= zmalloc(sizeof(*c
));
2362 anetNonBlock(NULL
,fd
);
2363 anetTcpNoDelay(NULL
,fd
);
2364 if (!c
) return NULL
;
2367 c
->querybuf
= sdsempty();
2376 c
->lastinteraction
= time(NULL
);
2377 c
->authenticated
= 0;
2378 c
->replstate
= REDIS_REPL_NONE
;
2379 c
->reply
= listCreate();
2380 listSetFreeMethod(c
->reply
,decrRefCount
);
2381 listSetDupMethod(c
->reply
,dupClientReplyValue
);
2382 c
->blockingkeys
= NULL
;
2383 c
->blockingkeysnum
= 0;
2384 c
->io_keys
= listCreate();
2385 listSetFreeMethod(c
->io_keys
,decrRefCount
);
2386 if (aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
2387 readQueryFromClient
, c
) == AE_ERR
) {
2391 listAddNodeTail(server
.clients
,c
);
2392 initClientMultiState(c
);
2396 static void addReply(redisClient
*c
, robj
*obj
) {
2397 if (listLength(c
->reply
) == 0 &&
2398 (c
->replstate
== REDIS_REPL_NONE
||
2399 c
->replstate
== REDIS_REPL_ONLINE
) &&
2400 aeCreateFileEvent(server
.el
, c
->fd
, AE_WRITABLE
,
2401 sendReplyToClient
, c
) == AE_ERR
) return;
2403 if (server
.vm_enabled
&& obj
->storage
!= REDIS_VM_MEMORY
) {
2404 obj
= dupStringObject(obj
);
2405 obj
->refcount
= 0; /* getDecodedObject() will increment the refcount */
2407 listAddNodeTail(c
->reply
,getDecodedObject(obj
));
2410 static void addReplySds(redisClient
*c
, sds s
) {
2411 robj
*o
= createObject(REDIS_STRING
,s
);
2416 static void addReplyDouble(redisClient
*c
, double d
) {
2419 snprintf(buf
,sizeof(buf
),"%.17g",d
);
2420 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2421 (unsigned long) strlen(buf
),buf
));
2424 static void addReplyLong(redisClient
*c
, long l
) {
2428 len
= snprintf(buf
,sizeof(buf
),":%ld\r\n",l
);
2429 addReplySds(c
,sdsnewlen(buf
,len
));
2432 static void addReplyBulkLen(redisClient
*c
, robj
*obj
) {
2435 if (obj
->encoding
== REDIS_ENCODING_RAW
) {
2436 len
= sdslen(obj
->ptr
);
2438 long n
= (long)obj
->ptr
;
2440 /* Compute how many bytes will take this integer as a radix 10 string */
2446 while((n
= n
/10) != 0) {
2450 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len
));
2453 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2458 REDIS_NOTUSED(mask
);
2459 REDIS_NOTUSED(privdata
);
2461 cfd
= anetAccept(server
.neterr
, fd
, cip
, &cport
);
2462 if (cfd
== AE_ERR
) {
2463 redisLog(REDIS_VERBOSE
,"Accepting client connection: %s", server
.neterr
);
2466 redisLog(REDIS_VERBOSE
,"Accepted %s:%d", cip
, cport
);
2467 if ((c
= createClient(cfd
)) == NULL
) {
2468 redisLog(REDIS_WARNING
,"Error allocating resoures for the client");
2469 close(cfd
); /* May be already closed, just ingore errors */
2472 /* If maxclient directive is set and this is one client more... close the
2473 * connection. Note that we create the client instead of checking for this
2474 * condition beforehand, since now the socket is already set in nonblocking
2475 * mode and we can send an error for free using the Kernel I/O */
2476 if (server
.maxclients
&& listLength(server
.clients
) > server
.maxclients
) {
2477 char *err
= "-ERR max number of clients reached\r\n";
2479 /* That's a best effort error message, don't check write errors */
2480 if (write(c
->fd
,err
,strlen(err
)) == -1) {
2481 /* Nothing to do, Just to avoid the warning... */
2486 server
.stat_numconnections
++;
2489 /* ======================= Redis objects implementation ===================== */
2491 static robj
*createObject(int type
, void *ptr
) {
2494 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2495 if (listLength(server
.objfreelist
)) {
2496 listNode
*head
= listFirst(server
.objfreelist
);
2497 o
= listNodeValue(head
);
2498 listDelNode(server
.objfreelist
,head
);
2499 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2501 if (server
.vm_enabled
) {
2502 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2503 o
= zmalloc(sizeof(*o
));
2505 o
= zmalloc(sizeof(*o
)-sizeof(struct redisObjectVM
));
2509 o
->encoding
= REDIS_ENCODING_RAW
;
2512 if (server
.vm_enabled
) {
2513 /* Note that this code may run in the context of an I/O thread
2514 * and accessing to server.unixtime in theory is an error
2515 * (no locks). But in practice this is safe, and even if we read
2516 * garbage Redis will not fail, as it's just a statistical info */
2517 o
->vm
.atime
= server
.unixtime
;
2518 o
->storage
= REDIS_VM_MEMORY
;
2523 static robj
*createStringObject(char *ptr
, size_t len
) {
2524 return createObject(REDIS_STRING
,sdsnewlen(ptr
,len
));
2527 static robj
*dupStringObject(robj
*o
) {
2528 assert(o
->encoding
== REDIS_ENCODING_RAW
);
2529 return createStringObject(o
->ptr
,sdslen(o
->ptr
));
2532 static robj
*createListObject(void) {
2533 list
*l
= listCreate();
2535 listSetFreeMethod(l
,decrRefCount
);
2536 return createObject(REDIS_LIST
,l
);
2539 static robj
*createSetObject(void) {
2540 dict
*d
= dictCreate(&setDictType
,NULL
);
2541 return createObject(REDIS_SET
,d
);
2544 static robj
*createZsetObject(void) {
2545 zset
*zs
= zmalloc(sizeof(*zs
));
2547 zs
->dict
= dictCreate(&zsetDictType
,NULL
);
2548 zs
->zsl
= zslCreate();
2549 return createObject(REDIS_ZSET
,zs
);
2552 static void freeStringObject(robj
*o
) {
2553 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2558 static void freeListObject(robj
*o
) {
2559 listRelease((list
*) o
->ptr
);
2562 static void freeSetObject(robj
*o
) {
2563 dictRelease((dict
*) o
->ptr
);
2566 static void freeZsetObject(robj
*o
) {
2569 dictRelease(zs
->dict
);
2574 static void freeHashObject(robj
*o
) {
2575 dictRelease((dict
*) o
->ptr
);
2578 static void incrRefCount(robj
*o
) {
2579 redisAssert(!server
.vm_enabled
|| o
->storage
== REDIS_VM_MEMORY
);
2583 static void decrRefCount(void *obj
) {
2586 /* Object is a key of a swapped out value, or in the process of being
2588 if (server
.vm_enabled
&&
2589 (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
))
2591 if (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
) {
2592 redisAssert(o
->refcount
== 1);
2594 if (o
->storage
== REDIS_VM_LOADING
) vmCancelThreadedIOJob(obj
);
2595 redisAssert(o
->type
== REDIS_STRING
);
2596 freeStringObject(o
);
2597 vmMarkPagesFree(o
->vm
.page
,o
->vm
.usedpages
);
2598 pthread_mutex_lock(&server
.obj_freelist_mutex
);
2599 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2600 !listAddNodeHead(server
.objfreelist
,o
))
2602 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2603 server
.vm_stats_swapped_objects
--;
2606 /* Object is in memory, or in the process of being swapped out. */
2607 if (--(o
->refcount
) == 0) {
2608 if (server
.vm_enabled
&& o
->storage
== REDIS_VM_SWAPPING
)
2609 vmCancelThreadedIOJob(obj
);
2611 case REDIS_STRING
: freeStringObject(o
); break;
2612 case REDIS_LIST
: freeListObject(o
); break;
2613 case REDIS_SET
: freeSetObject(o
); break;
2614 case REDIS_ZSET
: freeZsetObject(o
); break;
2615 case REDIS_HASH
: freeHashObject(o
); break;
2616 default: redisAssert(0 != 0); break;
2618 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2619 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2620 !listAddNodeHead(server
.objfreelist
,o
))
2622 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2626 static robj
*lookupKey(redisDb
*db
, robj
*key
) {
2627 dictEntry
*de
= dictFind(db
->dict
,key
);
2629 robj
*key
= dictGetEntryKey(de
);
2630 robj
*val
= dictGetEntryVal(de
);
2632 if (server
.vm_enabled
) {
2633 if (key
->storage
== REDIS_VM_MEMORY
||
2634 key
->storage
== REDIS_VM_SWAPPING
)
2636 /* If we were swapping the object out, stop it, this key
2638 if (key
->storage
== REDIS_VM_SWAPPING
)
2639 vmCancelThreadedIOJob(key
);
2640 /* Update the access time of the key for the aging algorithm. */
2641 key
->vm
.atime
= server
.unixtime
;
2643 int notify
= (key
->storage
== REDIS_VM_LOADING
);
2645 /* Our value was swapped on disk. Bring it at home. */
2646 redisAssert(val
== NULL
);
2647 val
= vmLoadObject(key
);
2648 dictGetEntryVal(de
) = val
;
2650 /* Clients blocked by the VM subsystem may be waiting for
2652 if (notify
) handleClientsBlockedOnSwappedKey(db
,key
);
2661 static robj
*lookupKeyRead(redisDb
*db
, robj
*key
) {
2662 expireIfNeeded(db
,key
);
2663 return lookupKey(db
,key
);
2666 static robj
*lookupKeyWrite(redisDb
*db
, robj
*key
) {
2667 deleteIfVolatile(db
,key
);
2668 return lookupKey(db
,key
);
2671 static int deleteKey(redisDb
*db
, robj
*key
) {
2674 /* We need to protect key from destruction: after the first dictDelete()
2675 * it may happen that 'key' is no longer valid if we don't increment
2676 * it's count. This may happen when we get the object reference directly
2677 * from the hash table with dictRandomKey() or dict iterators */
2679 if (dictSize(db
->expires
)) dictDelete(db
->expires
,key
);
2680 retval
= dictDelete(db
->dict
,key
);
2683 return retval
== DICT_OK
;
2686 /* Try to share an object against the shared objects pool */
2687 static robj
*tryObjectSharing(robj
*o
) {
2688 struct dictEntry
*de
;
2691 if (o
== NULL
|| server
.shareobjects
== 0) return o
;
2693 redisAssert(o
->type
== REDIS_STRING
);
2694 de
= dictFind(server
.sharingpool
,o
);
2696 robj
*shared
= dictGetEntryKey(de
);
2698 c
= ((unsigned long) dictGetEntryVal(de
))+1;
2699 dictGetEntryVal(de
) = (void*) c
;
2700 incrRefCount(shared
);
2704 /* Here we are using a stream algorithm: Every time an object is
2705 * shared we increment its count, every time there is a miss we
2706 * decrement the counter of a random object. If this object reaches
2707 * zero we remove the object and put the current object instead. */
2708 if (dictSize(server
.sharingpool
) >=
2709 server
.sharingpoolsize
) {
2710 de
= dictGetRandomKey(server
.sharingpool
);
2711 redisAssert(de
!= NULL
);
2712 c
= ((unsigned long) dictGetEntryVal(de
))-1;
2713 dictGetEntryVal(de
) = (void*) c
;
2715 dictDelete(server
.sharingpool
,de
->key
);
2718 c
= 0; /* If the pool is empty we want to add this object */
2723 retval
= dictAdd(server
.sharingpool
,o
,(void*)1);
2724 redisAssert(retval
== DICT_OK
);
2731 /* Check if the nul-terminated string 's' can be represented by a long
2732 * (that is, is a number that fits into long without any other space or
2733 * character before or after the digits).
2735 * If so, the function returns REDIS_OK and *longval is set to the value
2736 * of the number. Otherwise REDIS_ERR is returned */
2737 static int isStringRepresentableAsLong(sds s
, long *longval
) {
2738 char buf
[32], *endptr
;
2742 value
= strtol(s
, &endptr
, 10);
2743 if (endptr
[0] != '\0') return REDIS_ERR
;
2744 slen
= snprintf(buf
,32,"%ld",value
);
2746 /* If the number converted back into a string is not identical
2747 * then it's not possible to encode the string as integer */
2748 if (sdslen(s
) != (unsigned)slen
|| memcmp(buf
,s
,slen
)) return REDIS_ERR
;
2749 if (longval
) *longval
= value
;
2753 /* Try to encode a string object in order to save space */
2754 static int tryObjectEncoding(robj
*o
) {
2758 if (o
->encoding
!= REDIS_ENCODING_RAW
)
2759 return REDIS_ERR
; /* Already encoded */
2761 /* It's not safe to encode shared objects: shared objects can be shared
2762 * everywhere in the "object space" of Redis. Encoded objects can only
2763 * appear as "values" (and not, for instance, as keys) */
2764 if (o
->refcount
> 1) return REDIS_ERR
;
2766 /* Currently we try to encode only strings */
2767 redisAssert(o
->type
== REDIS_STRING
);
2769 /* Check if we can represent this string as a long integer */
2770 if (isStringRepresentableAsLong(s
,&value
) == REDIS_ERR
) return REDIS_ERR
;
2772 /* Ok, this object can be encoded */
2773 o
->encoding
= REDIS_ENCODING_INT
;
2775 o
->ptr
= (void*) value
;
2779 /* Get a decoded version of an encoded object (returned as a new object).
2780 * If the object is already raw-encoded just increment the ref count. */
2781 static robj
*getDecodedObject(robj
*o
) {
2784 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2788 if (o
->type
== REDIS_STRING
&& o
->encoding
== REDIS_ENCODING_INT
) {
2791 snprintf(buf
,32,"%ld",(long)o
->ptr
);
2792 dec
= createStringObject(buf
,strlen(buf
));
2795 redisAssert(1 != 1);
2799 /* Compare two string objects via strcmp() or alike.
2800 * Note that the objects may be integer-encoded. In such a case we
2801 * use snprintf() to get a string representation of the numbers on the stack
2802 * and compare the strings, it's much faster than calling getDecodedObject().
2804 * Important note: if objects are not integer encoded, but binary-safe strings,
2805 * sdscmp() from sds.c will apply memcmp() so this function can be considered
2807 static int compareStringObjects(robj
*a
, robj
*b
) {
2808 redisAssert(a
->type
== REDIS_STRING
&& b
->type
== REDIS_STRING
);
2809 char bufa
[128], bufb
[128], *astr
, *bstr
;
2812 if (a
== b
) return 0;
2813 if (a
->encoding
!= REDIS_ENCODING_RAW
) {
2814 snprintf(bufa
,sizeof(bufa
),"%ld",(long) a
->ptr
);
2820 if (b
->encoding
!= REDIS_ENCODING_RAW
) {
2821 snprintf(bufb
,sizeof(bufb
),"%ld",(long) b
->ptr
);
2827 return bothsds
? sdscmp(astr
,bstr
) : strcmp(astr
,bstr
);
2830 static size_t stringObjectLen(robj
*o
) {
2831 redisAssert(o
->type
== REDIS_STRING
);
2832 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2833 return sdslen(o
->ptr
);
2837 return snprintf(buf
,32,"%ld",(long)o
->ptr
);
2841 /*============================ RDB saving/loading =========================== */
/* Write the single-byte type tag 'type' to 'fp'.
 *
 * Returns 0 on success, or -1 when fwrite() reports that nothing was
 * written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    if (fwrite(&type,1,1,fp) == 0) return -1;
    return 0;
}
/* Write the time 't' to 'fp' as a 32 bit signed integer.
 *
 * The value is narrowed to int32_t and its bytes are written exactly as
 * they sit in memory (no byte order conversion is applied here).
 *
 * Returns 0 on success, or -1 when fwrite() reports that nothing was
 * written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;

    if (fwrite(&t32,sizeof(t32),1,fp) == 0) return -1;
    return 0;
}
2854 /* check rdbLoadLen() comments for more info */
2855 static int rdbSaveLen(FILE *fp
, uint32_t len
) {
2856 unsigned char buf
[2];
2859 /* Save a 6 bit len */
2860 buf
[0] = (len
&0xFF)|(REDIS_RDB_6BITLEN
<<6);
2861 if (fwrite(buf
,1,1,fp
) == 0) return -1;
2862 } else if (len
< (1<<14)) {
2863 /* Save a 14 bit len */
2864 buf
[0] = ((len
>>8)&0xFF)|(REDIS_RDB_14BITLEN
<<6);
2866 if (fwrite(buf
,2,1,fp
) == 0) return -1;
2868 /* Save a 32 bit len */
2869 buf
[0] = (REDIS_RDB_32BITLEN
<<6);
2870 if (fwrite(buf
,1,1,fp
) == 0) return -1;
2872 if (fwrite(&len
,4,1,fp
) == 0) return -1;
2877 /* String objects in the form "2391" "-100" without any space and with a
2878 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2879 * encoded as integers to save space */
2880 static int rdbTryIntegerEncoding(sds s
, unsigned char *enc
) {
2882 char *endptr
, buf
[32];
2884 /* Check if it's possible to encode this value as a number */
2885 value
= strtoll(s
, &endptr
, 10);
2886 if (endptr
[0] != '\0') return 0;
2887 snprintf(buf
,32,"%lld",value
);
2889 /* If the number converted back into a string is not identical
2890 * then it's not possible to encode the string as integer */
2891 if (strlen(buf
) != sdslen(s
) || memcmp(buf
,s
,sdslen(s
))) return 0;
2893 /* Finally check if it fits in our ranges */
2894 if (value
>= -(1<<7) && value
<= (1<<7)-1) {
2895 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT8
;
2896 enc
[1] = value
&0xFF;
2898 } else if (value
>= -(1<<15) && value
<= (1<<15)-1) {
2899 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT16
;
2900 enc
[1] = value
&0xFF;
2901 enc
[2] = (value
>>8)&0xFF;
2903 } else if (value
>= -((long long)1<<31) && value
<= ((long long)1<<31)-1) {
2904 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT32
;
2905 enc
[1] = value
&0xFF;
2906 enc
[2] = (value
>>8)&0xFF;
2907 enc
[3] = (value
>>16)&0xFF;
2908 enc
[4] = (value
>>24)&0xFF;
2915 static int rdbSaveLzfStringObject(FILE *fp
, robj
*obj
) {
2916 unsigned int comprlen
, outlen
;
2920 /* We require at least four bytes compression for this to be worth it */
2921 outlen
= sdslen(obj
->ptr
)-4;
2922 if (outlen
<= 0) return 0;
2923 if ((out
= zmalloc(outlen
+1)) == NULL
) return 0;
2924 comprlen
= lzf_compress(obj
->ptr
, sdslen(obj
->ptr
), out
, outlen
);
2925 if (comprlen
== 0) {
2929 /* Data compressed! Let's save it on disk */
2930 byte
= (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_LZF
;
2931 if (fwrite(&byte
,1,1,fp
) == 0) goto writeerr
;
2932 if (rdbSaveLen(fp
,comprlen
) == -1) goto writeerr
;
2933 if (rdbSaveLen(fp
,sdslen(obj
->ptr
)) == -1) goto writeerr
;
2934 if (fwrite(out
,comprlen
,1,fp
) == 0) goto writeerr
;
2943 /* Save a string object as [len][data] on disk. If the object is a string
2944 * representation of an integer value we try to save it in a special form */
2945 static int rdbSaveStringObjectRaw(FILE *fp
, robj
*obj
) {
2949 len
= sdslen(obj
->ptr
);
2951 /* Try integer encoding */
2953 unsigned char buf
[5];
2954 if ((enclen
= rdbTryIntegerEncoding(obj
->ptr
,buf
)) > 0) {
2955 if (fwrite(buf
,enclen
,1,fp
) == 0) return -1;
2960 /* Try LZF compression - under 20 bytes it's unable to compress even
2961 * aaaaaaaaaaaaaaaaaa so skip it */
2962 if (server
.rdbcompression
&& len
> 20) {
2965 retval
= rdbSaveLzfStringObject(fp
,obj
);
2966 if (retval
== -1) return -1;
2967 if (retval
> 0) return 0;
2968 /* retval == 0 means data can't be compressed, save the old way */
2971 /* Store verbatim */
2972 if (rdbSaveLen(fp
,len
) == -1) return -1;
2973 if (len
&& fwrite(obj
->ptr
,len
,1,fp
) == 0) return -1;
2977 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
2978 static int rdbSaveStringObject(FILE *fp
, robj
*obj
) {
2981 /* Avoid incr/decr ref count business when possible.
2982 * This plays well with copy-on-write given that we are probably
2983 * in a child process (BGSAVE). Also this makes sure key objects
2984 * of swapped objects are not incRefCount-ed (an assert does not allow
2985 * this in order to avoid bugs) */
2986 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
2987 obj
= getDecodedObject(obj
);
2988 retval
= rdbSaveStringObjectRaw(fp
,obj
);
2991 retval
= rdbSaveStringObjectRaw(fp
,obj
);
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions, in which case no string follows the prefix byte:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Finite values are formatted with "%.17g"; the prefix byte is the strlen()
 * of that text and the text itself is written without a nul terminator.
 *
 * Returns 0 on success, or -1 when fwrite() reports that nothing was
 * written. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        /* buf[0] is reserved for the length prefix, text starts at buf+1 */
        snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3023 /* Save a Redis object. */
3024 static int rdbSaveObject(FILE *fp
, robj
*o
) {
3025 if (o
->type
== REDIS_STRING
) {
3026 /* Save a string value */
3027 if (rdbSaveStringObject(fp
,o
) == -1) return -1;
3028 } else if (o
->type
== REDIS_LIST
) {
3029 /* Save a list value */
3030 list
*list
= o
->ptr
;
3034 if (rdbSaveLen(fp
,listLength(list
)) == -1) return -1;
3035 listRewind(list
,&li
);
3036 while((ln
= listNext(&li
))) {
3037 robj
*eleobj
= listNodeValue(ln
);
3039 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3041 } else if (o
->type
== REDIS_SET
) {
3042 /* Save a set value */
3044 dictIterator
*di
= dictGetIterator(set
);
3047 if (rdbSaveLen(fp
,dictSize(set
)) == -1) return -1;
3048 while((de
= dictNext(di
)) != NULL
) {
3049 robj
*eleobj
= dictGetEntryKey(de
);
3051 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3053 dictReleaseIterator(di
);
3054 } else if (o
->type
== REDIS_ZSET
) {
3055 /* Save a set value */
3057 dictIterator
*di
= dictGetIterator(zs
->dict
);
3060 if (rdbSaveLen(fp
,dictSize(zs
->dict
)) == -1) return -1;
3061 while((de
= dictNext(di
)) != NULL
) {
3062 robj
*eleobj
= dictGetEntryKey(de
);
3063 double *score
= dictGetEntryVal(de
);
3065 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3066 if (rdbSaveDoubleValue(fp
,*score
) == -1) return -1;
3068 dictReleaseIterator(di
);
3070 redisAssert(0 != 0);
3075 /* Return the length the object will have on disk if saved with
3076 * the rdbSaveObject() function. Currently we use a trick to get
3077 * this length with very little changes to the code. In the future
3078 * we could switch to a faster solution. */
3079 static off_t
rdbSavedObjectLen(robj
*o
, FILE *fp
) {
3080 if (fp
== NULL
) fp
= server
.devnull
;
3082 assert(rdbSaveObject(fp
,o
) != 1);
3086 /* Return the number of pages required to save this object in the swap file */
3087 static off_t
rdbSavedObjectPages(robj
*o
, FILE *fp
) {
3088 off_t bytes
= rdbSavedObjectLen(o
,fp
);
3090 return (bytes
+(server
.vm_page_size
-1))/server
.vm_page_size
;
3093 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3094 static int rdbSave(char *filename
) {
3095 dictIterator
*di
= NULL
;
3100 time_t now
= time(NULL
);
3102 /* Wait for I/O therads to terminate, just in case this is a
3103 * foreground-saving, to avoid seeking the swap file descriptor at the
3105 if (server
.vm_enabled
)
3106 waitEmptyIOJobsQueue();
3108 snprintf(tmpfile
,256,"temp-%d.rdb", (int) getpid());
3109 fp
= fopen(tmpfile
,"w");
3111 redisLog(REDIS_WARNING
, "Failed saving the DB: %s", strerror(errno
));
3114 if (fwrite("REDIS0001",9,1,fp
) == 0) goto werr
;
3115 for (j
= 0; j
< server
.dbnum
; j
++) {
3116 redisDb
*db
= server
.db
+j
;
3118 if (dictSize(d
) == 0) continue;
3119 di
= dictGetIterator(d
);
3125 /* Write the SELECT DB opcode */
3126 if (rdbSaveType(fp
,REDIS_SELECTDB
) == -1) goto werr
;
3127 if (rdbSaveLen(fp
,j
) == -1) goto werr
;
3129 /* Iterate this DB writing every entry */
3130 while((de
= dictNext(di
)) != NULL
) {
3131 robj
*key
= dictGetEntryKey(de
);
3132 robj
*o
= dictGetEntryVal(de
);
3133 time_t expiretime
= getExpire(db
,key
);
3135 /* Save the expire time */
3136 if (expiretime
!= -1) {
3137 /* If this key is already expired skip it */
3138 if (expiretime
< now
) continue;
3139 if (rdbSaveType(fp
,REDIS_EXPIRETIME
) == -1) goto werr
;
3140 if (rdbSaveTime(fp
,expiretime
) == -1) goto werr
;
3142 /* Save the key and associated value. This requires special
3143 * handling if the value is swapped out. */
3144 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
3145 key
->storage
== REDIS_VM_SWAPPING
) {
3146 /* Save type, key, value */
3147 if (rdbSaveType(fp
,o
->type
) == -1) goto werr
;
3148 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3149 if (rdbSaveObject(fp
,o
) == -1) goto werr
;
3151 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3153 /* Get a preview of the object in memory */
3154 po
= vmPreviewObject(key
);
3155 /* Save type, key, value */
3156 if (rdbSaveType(fp
,key
->vtype
) == -1) goto werr
;
3157 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3158 if (rdbSaveObject(fp
,po
) == -1) goto werr
;
3159 /* Remove the loaded object from memory */
3163 dictReleaseIterator(di
);
3166 if (rdbSaveType(fp
,REDIS_EOF
) == -1) goto werr
;
3168 /* Make sure data will not remain on the OS's output buffers */
3173 /* Use RENAME to make sure the DB file is changed atomically only
3174 * if the generate DB file is ok. */
3175 if (rename(tmpfile
,filename
) == -1) {
3176 redisLog(REDIS_WARNING
,"Error moving temp DB file on the final destination: %s", strerror(errno
));
3180 redisLog(REDIS_NOTICE
,"DB saved on disk");
3182 server
.lastsave
= time(NULL
);
3188 redisLog(REDIS_WARNING
,"Write error saving DB on disk: %s", strerror(errno
));
3189 if (di
) dictReleaseIterator(di
);
/* Start a background save of the dataset into 'filename' using fork().
 *
 * Returns REDIS_ERR immediately when server.bgsavechildpid is not -1,
 * that is, when a background save is already registered.
 * With VM enabled the I/O jobs queue is drained before forking, and the
 * child reopens the swap file before calling rdbSave().
 * In the parent a failed fork() is logged as a warning; otherwise the child
 * pid is logged and stored in server.bgsavechildpid.
 *
 * NOTE(review): the statements the child runs after rdbSave() and the
 * return on the fork-failure path are not visible here -- confirm against
 * the complete file before changing this function. */
3193 static int rdbSaveBackground(char *filename
) {
3196 if (server
.bgsavechildpid
!= -1) return REDIS_ERR
;
3197 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
3198 if ((childpid
= fork()) == 0) {
3200 if (server
.vm_enabled
) vmReopenSwapFile();
3202 if (rdbSave(filename
) == REDIS_OK
) {
3209 if (childpid
== -1) {
3210 redisLog(REDIS_WARNING
,"Can't save in background: fork: %s",
3214 redisLog(REDIS_NOTICE
,"Background saving started by pid %d",childpid
);
3215 server
.bgsavechildpid
= childpid
;
3218 return REDIS_OK
; /* unreached */
/* Remove the temporary RDB file ("temp-<pid>.rdb" in the current working
 * directory) that belongs to the saving child with the given pid.
 * The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,256,"temp-%d.rdb", pid);
    unlink(path);
}
/* Read a one byte type / opcode from 'fp'.
 * Returns the byte as a value in the 0-255 range, or -1 if nothing could
 * be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 1) ? (int)opcode : -1;
}
/* Read a timestamp stored as a raw 4 byte signed integer (in the byte
 * order of the host) and return it as a time_t.
 * Returns -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,4,1,fp) != 1) return -1;
    return (time_t) raw;
}
3240 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3241 * of this file for a description of how this are stored on disk.
3243 * isencoded is set to 1 if the readed length is not actually a length but
3244 * an "encoding type", check the above comments for more info */
3245 static uint32_t rdbLoadLen(FILE *fp
, int *isencoded
) {
3246 unsigned char buf
[2];
3250 if (isencoded
) *isencoded
= 0;
3251 if (fread(buf
,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3252 type
= (buf
[0]&0xC0)>>6;
3253 if (type
== REDIS_RDB_6BITLEN
) {
3254 /* Read a 6 bit len */
3256 } else if (type
== REDIS_RDB_ENCVAL
) {
3257 /* Read a 6 bit len encoding type */
3258 if (isencoded
) *isencoded
= 1;
3260 } else if (type
== REDIS_RDB_14BITLEN
) {
3261 /* Read a 14 bit len */
3262 if (fread(buf
+1,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3263 return ((buf
[0]&0x3F)<<8)|buf
[1];
3265 /* Read a 32 bit len */
3266 if (fread(&len
,4,1,fp
) == 0) return REDIS_RDB_LENERR
;
3271 static robj
*rdbLoadIntegerObject(FILE *fp
, int enctype
) {
3272 unsigned char enc
[4];
3275 if (enctype
== REDIS_RDB_ENC_INT8
) {
3276 if (fread(enc
,1,1,fp
) == 0) return NULL
;
3277 val
= (signed char)enc
[0];
3278 } else if (enctype
== REDIS_RDB_ENC_INT16
) {
3280 if (fread(enc
,2,1,fp
) == 0) return NULL
;
3281 v
= enc
[0]|(enc
[1]<<8);
3283 } else if (enctype
== REDIS_RDB_ENC_INT32
) {
3285 if (fread(enc
,4,1,fp
) == 0) return NULL
;
3286 v
= enc
[0]|(enc
[1]<<8)|(enc
[2]<<16)|(enc
[3]<<24);
3289 val
= 0; /* anti-warning */
3292 return createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",val
));
3295 static robj
*rdbLoadLzfStringObject(FILE*fp
) {
3296 unsigned int len
, clen
;
3297 unsigned char *c
= NULL
;
3300 if ((clen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3301 if ((len
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3302 if ((c
= zmalloc(clen
)) == NULL
) goto err
;
3303 if ((val
= sdsnewlen(NULL
,len
)) == NULL
) goto err
;
3304 if (fread(c
,clen
,1,fp
) == 0) goto err
;
3305 if (lzf_decompress(c
,clen
,val
,len
) == 0) goto err
;
3307 return createObject(REDIS_STRING
,val
);
3314 static robj
*rdbLoadStringObject(FILE*fp
) {
3319 len
= rdbLoadLen(fp
,&isencoded
);
3322 case REDIS_RDB_ENC_INT8
:
3323 case REDIS_RDB_ENC_INT16
:
3324 case REDIS_RDB_ENC_INT32
:
3325 return tryObjectSharing(rdbLoadIntegerObject(fp
,len
));
3326 case REDIS_RDB_ENC_LZF
:
3327 return tryObjectSharing(rdbLoadLzfStringObject(fp
));
3333 if (len
== REDIS_RDB_LENERR
) return NULL
;
3334 val
= sdsnewlen(NULL
,len
);
3335 if (len
&& fread(val
,len
,1,fp
) == 0) {
3339 return tryObjectSharing(createObject(REDIS_STRING
,val
));
3342 /* For information about double serialization check rdbSaveDoubleValue() */
3343 static int rdbLoadDoubleValue(FILE *fp
, double *val
) {
3347 if (fread(&len
,1,1,fp
) == 0) return -1;
3349 case 255: *val
= R_NegInf
; return 0;
3350 case 254: *val
= R_PosInf
; return 0;
3351 case 253: *val
= R_Nan
; return 0;
3353 if (fread(buf
,len
,1,fp
) == 0) return -1;
3355 sscanf(buf
, "%lg", val
);
3360 /* Load a Redis object of the specified type from the specified file.
3361 * On success a newly allocated object is returned, otherwise NULL. */
3362 static robj
*rdbLoadObject(int type
, FILE *fp
) {
3365 if (type
== REDIS_STRING
) {
3366 /* Read string value */
3367 if ((o
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3368 tryObjectEncoding(o
);
3369 } else if (type
== REDIS_LIST
|| type
== REDIS_SET
) {
3370 /* Read list/set value */
3373 if ((listlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3374 o
= (type
== REDIS_LIST
) ? createListObject() : createSetObject();
3375 /* It's faster to expand the dict to the right size asap in order
3376 * to avoid rehashing */
3377 if (type
== REDIS_SET
&& listlen
> DICT_HT_INITIAL_SIZE
)
3378 dictExpand(o
->ptr
,listlen
);
3379 /* Load every single element of the list/set */
3383 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3384 tryObjectEncoding(ele
);
3385 if (type
== REDIS_LIST
) {
3386 listAddNodeTail((list
*)o
->ptr
,ele
);
3388 dictAdd((dict
*)o
->ptr
,ele
,NULL
);
3391 } else if (type
== REDIS_ZSET
) {
3392 /* Read list/set value */
3396 if ((zsetlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3397 o
= createZsetObject();
3399 /* Load every single element of the list/set */
3402 double *score
= zmalloc(sizeof(double));
3404 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3405 tryObjectEncoding(ele
);
3406 if (rdbLoadDoubleValue(fp
,score
) == -1) return NULL
;
3407 dictAdd(zs
->dict
,ele
,score
);
3408 zslInsert(zs
->zsl
,*score
,ele
);
3409 incrRefCount(ele
); /* added to skiplist */
3412 redisAssert(0 != 0);
3417 static int rdbLoad(char *filename
) {
3419 robj
*keyobj
= NULL
;
3421 int type
, retval
, rdbver
;
3422 dict
*d
= server
.db
[0].dict
;
3423 redisDb
*db
= server
.db
+0;
3425 time_t expiretime
= -1, now
= time(NULL
);
3426 long long loadedkeys
= 0;
3428 fp
= fopen(filename
,"r");
3429 if (!fp
) return REDIS_ERR
;
3430 if (fread(buf
,9,1,fp
) == 0) goto eoferr
;
3432 if (memcmp(buf
,"REDIS",5) != 0) {
3434 redisLog(REDIS_WARNING
,"Wrong signature trying to load DB from file");
3437 rdbver
= atoi(buf
+5);
3440 redisLog(REDIS_WARNING
,"Can't handle RDB format version %d",rdbver
);
3447 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3448 if (type
== REDIS_EXPIRETIME
) {
3449 if ((expiretime
= rdbLoadTime(fp
)) == -1) goto eoferr
;
3450 /* We read the time so we need to read the object type again */
3451 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3453 if (type
== REDIS_EOF
) break;
3454 /* Handle SELECT DB opcode as a special case */
3455 if (type
== REDIS_SELECTDB
) {
3456 if ((dbid
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
)
3458 if (dbid
>= (unsigned)server
.dbnum
) {
3459 redisLog(REDIS_WARNING
,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server
.dbnum
);
3462 db
= server
.db
+dbid
;
3467 if ((keyobj
= rdbLoadStringObject(fp
)) == NULL
) goto eoferr
;
3469 if ((o
= rdbLoadObject(type
,fp
)) == NULL
) goto eoferr
;
3470 /* Add the new object in the hash table */
3471 retval
= dictAdd(d
,keyobj
,o
);
3472 if (retval
== DICT_ERR
) {
3473 redisLog(REDIS_WARNING
,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj
->ptr
);
3476 /* Set the expire time if needed */
3477 if (expiretime
!= -1) {
3478 setExpire(db
,keyobj
,expiretime
);
3479 /* Delete this key if already expired */
3480 if (expiretime
< now
) deleteKey(db
,keyobj
);
3484 /* Handle swapping while loading big datasets when VM is on */
3486 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
3487 while (zmalloc_used_memory() > server
.vm_max_memory
) {
3488 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
3495 eoferr
: /* unexpected end of file is handled here with a fatal exit */
3496 if (keyobj
) decrRefCount(keyobj
);
3497 redisLog(REDIS_WARNING
,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3499 return REDIS_ERR
; /* Just to avoid warning */
3502 /*================================== Commands =============================== */
3504 static void authCommand(redisClient
*c
) {
3505 if (!server
.requirepass
|| !strcmp(c
->argv
[1]->ptr
, server
.requirepass
)) {
3506 c
->authenticated
= 1;
3507 addReply(c
,shared
.ok
);
3509 c
->authenticated
= 0;
3510 addReplySds(c
,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3514 static void pingCommand(redisClient
*c
) {
3515 addReply(c
,shared
.pong
);
3518 static void echoCommand(redisClient
*c
) {
3519 addReplyBulkLen(c
,c
->argv
[1]);
3520 addReply(c
,c
->argv
[1]);
3521 addReply(c
,shared
.crlf
);
3524 /*=================================== Strings =============================== */
3526 static void setGenericCommand(redisClient
*c
, int nx
) {
3529 if (nx
) deleteIfVolatile(c
->db
,c
->argv
[1]);
3530 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3531 if (retval
== DICT_ERR
) {
3533 /* If the key is about a swapped value, we want a new key object
3534 * to overwrite the old. So we delete the old key in the database.
3535 * This will also make sure that swap pages about the old object
3536 * will be marked as free. */
3537 if (deleteIfSwapped(c
->db
,c
->argv
[1]))
3538 incrRefCount(c
->argv
[1]);
3539 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3540 incrRefCount(c
->argv
[2]);
3542 addReply(c
,shared
.czero
);
3546 incrRefCount(c
->argv
[1]);
3547 incrRefCount(c
->argv
[2]);
3550 removeExpire(c
->db
,c
->argv
[1]);
3551 addReply(c
, nx
? shared
.cone
: shared
.ok
);
3554 static void setCommand(redisClient
*c
) {
3555 setGenericCommand(c
,0);
3558 static void setnxCommand(redisClient
*c
) {
3559 setGenericCommand(c
,1);
3562 static int getGenericCommand(redisClient
*c
) {
3563 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[1]);
3566 addReply(c
,shared
.nullbulk
);
3569 if (o
->type
!= REDIS_STRING
) {
3570 addReply(c
,shared
.wrongtypeerr
);
3573 addReplyBulkLen(c
,o
);
3575 addReply(c
,shared
.crlf
);
3581 static void getCommand(redisClient
*c
) {
3582 getGenericCommand(c
);
3585 static void getsetCommand(redisClient
*c
) {
3586 if (getGenericCommand(c
) == REDIS_ERR
) return;
3587 if (dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]) == DICT_ERR
) {
3588 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3590 incrRefCount(c
->argv
[1]);
3592 incrRefCount(c
->argv
[2]);
3594 removeExpire(c
->db
,c
->argv
[1]);
3597 static void mgetCommand(redisClient
*c
) {
3600 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-1));
3601 for (j
= 1; j
< c
->argc
; j
++) {
3602 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[j
]);
3604 addReply(c
,shared
.nullbulk
);
3606 if (o
->type
!= REDIS_STRING
) {
3607 addReply(c
,shared
.nullbulk
);
3609 addReplyBulkLen(c
,o
);
3611 addReply(c
,shared
.crlf
);
3617 static void msetGenericCommand(redisClient
*c
, int nx
) {
3618 int j
, busykeys
= 0;
3620 if ((c
->argc
% 2) == 0) {
3621 addReplySds(c
,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3624 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3625 * set nothing at all if at least one already key exists. */
3627 for (j
= 1; j
< c
->argc
; j
+= 2) {
3628 if (lookupKeyWrite(c
->db
,c
->argv
[j
]) != NULL
) {
3634 addReply(c
, shared
.czero
);
3638 for (j
= 1; j
< c
->argc
; j
+= 2) {
3641 tryObjectEncoding(c
->argv
[j
+1]);
3642 retval
= dictAdd(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
3643 if (retval
== DICT_ERR
) {
3644 dictReplace(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
3645 incrRefCount(c
->argv
[j
+1]);
3647 incrRefCount(c
->argv
[j
]);
3648 incrRefCount(c
->argv
[j
+1]);
3650 removeExpire(c
->db
,c
->argv
[j
]);
3652 server
.dirty
+= (c
->argc
-1)/2;
3653 addReply(c
, nx
? shared
.cone
: shared
.ok
);
3656 static void msetCommand(redisClient
*c
) {
3657 msetGenericCommand(c
,0);
3660 static void msetnxCommand(redisClient
*c
) {
3661 msetGenericCommand(c
,1);
3664 static void incrDecrCommand(redisClient
*c
, long long incr
) {
3669 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3673 if (o
->type
!= REDIS_STRING
) {
3678 if (o
->encoding
== REDIS_ENCODING_RAW
)
3679 value
= strtoll(o
->ptr
, &eptr
, 10);
3680 else if (o
->encoding
== REDIS_ENCODING_INT
)
3681 value
= (long)o
->ptr
;
3683 redisAssert(1 != 1);
3688 o
= createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",value
));
3689 tryObjectEncoding(o
);
3690 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],o
);
3691 if (retval
== DICT_ERR
) {
3692 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
3693 removeExpire(c
->db
,c
->argv
[1]);
3695 incrRefCount(c
->argv
[1]);
3698 addReply(c
,shared
.colon
);
3700 addReply(c
,shared
.crlf
);
3703 static void incrCommand(redisClient
*c
) {
3704 incrDecrCommand(c
,1);
3707 static void decrCommand(redisClient
*c
) {
3708 incrDecrCommand(c
,-1);
3711 static void incrbyCommand(redisClient
*c
) {
3712 long long incr
= strtoll(c
->argv
[2]->ptr
, NULL
, 10);
3713 incrDecrCommand(c
,incr
);
3716 static void decrbyCommand(redisClient
*c
) {
3717 long long incr
= strtoll(c
->argv
[2]->ptr
, NULL
, 10);
3718 incrDecrCommand(c
,-incr
);
3721 static void appendCommand(redisClient
*c
) {
3726 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3728 /* Create the key */
3729 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3730 incrRefCount(c
->argv
[1]);
3731 incrRefCount(c
->argv
[2]);
3732 totlen
= stringObjectLen(c
->argv
[2]);
3736 de
= dictFind(c
->db
->dict
,c
->argv
[1]);
3739 o
= dictGetEntryVal(de
);
3740 if (o
->type
!= REDIS_STRING
) {
3741 addReply(c
,shared
.wrongtypeerr
);
3744 /* If the object is specially encoded or shared we have to make
3746 if (o
->refcount
!= 1 || o
->encoding
!= REDIS_ENCODING_RAW
) {
3747 robj
*decoded
= getDecodedObject(o
);
3749 o
= createStringObject(decoded
->ptr
, sdslen(decoded
->ptr
));
3750 decrRefCount(decoded
);
3751 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
3754 if (c
->argv
[2]->encoding
== REDIS_ENCODING_RAW
) {
3755 o
->ptr
= sdscatlen(o
->ptr
,
3756 c
->argv
[2]->ptr
, sdslen(c
->argv
[2]->ptr
));
3758 o
->ptr
= sdscatprintf(o
->ptr
, "%ld",
3759 (unsigned long) c
->argv
[2]->ptr
);
3761 totlen
= sdslen(o
->ptr
);
3764 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen
));
3767 /* ========================= Type agnostic commands ========================= */
3769 static void delCommand(redisClient
*c
) {
3772 for (j
= 1; j
< c
->argc
; j
++) {
3773 if (deleteKey(c
->db
,c
->argv
[j
])) {
3780 addReply(c
,shared
.czero
);
3783 addReply(c
,shared
.cone
);
3786 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",deleted
));
3791 static void existsCommand(redisClient
*c
) {
3792 addReply(c
,lookupKeyRead(c
->db
,c
->argv
[1]) ? shared
.cone
: shared
.czero
);
3795 static void selectCommand(redisClient
*c
) {
3796 int id
= atoi(c
->argv
[1]->ptr
);
3798 if (selectDb(c
,id
) == REDIS_ERR
) {
3799 addReplySds(c
,sdsnew("-ERR invalid DB index\r\n"));
3801 addReply(c
,shared
.ok
);
3805 static void randomkeyCommand(redisClient
*c
) {
3809 de
= dictGetRandomKey(c
->db
->dict
);
3810 if (!de
|| expireIfNeeded(c
->db
,dictGetEntryKey(de
)) == 0) break;
3813 addReply(c
,shared
.plus
);
3814 addReply(c
,shared
.crlf
);
3816 addReply(c
,shared
.plus
);
3817 addReply(c
,dictGetEntryKey(de
));
3818 addReply(c
,shared
.crlf
);
3822 static void keysCommand(redisClient
*c
) {
3825 sds pattern
= c
->argv
[1]->ptr
;
3826 int plen
= sdslen(pattern
);
3827 unsigned long numkeys
= 0;
3828 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
3830 di
= dictGetIterator(c
->db
->dict
);
3832 decrRefCount(lenobj
);
3833 while((de
= dictNext(di
)) != NULL
) {
3834 robj
*keyobj
= dictGetEntryKey(de
);
3836 sds key
= keyobj
->ptr
;
3837 if ((pattern
[0] == '*' && pattern
[1] == '\0') ||
3838 stringmatchlen(pattern
,plen
,key
,sdslen(key
),0)) {
3839 if (expireIfNeeded(c
->db
,keyobj
) == 0) {
3840 addReplyBulkLen(c
,keyobj
);
3842 addReply(c
,shared
.crlf
);
3847 dictReleaseIterator(di
);
3848 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",numkeys
);
3851 static void dbsizeCommand(redisClient
*c
) {
3853 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c
->db
->dict
)));
3856 static void lastsaveCommand(redisClient
*c
) {
3858 sdscatprintf(sdsempty(),":%lu\r\n",server
.lastsave
));
3861 static void typeCommand(redisClient
*c
) {
3865 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
3870 case REDIS_STRING
: type
= "+string"; break;
3871 case REDIS_LIST
: type
= "+list"; break;
3872 case REDIS_SET
: type
= "+set"; break;
3873 case REDIS_ZSET
: type
= "+zset"; break;
3874 default: type
= "unknown"; break;
3877 addReplySds(c
,sdsnew(type
));
3878 addReply(c
,shared
.crlf
);
3881 static void saveCommand(redisClient
*c
) {
3882 if (server
.bgsavechildpid
!= -1) {
3883 addReplySds(c
,sdsnew("-ERR background save in progress\r\n"));
3886 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
3887 addReply(c
,shared
.ok
);
3889 addReply(c
,shared
.err
);
3893 static void bgsaveCommand(redisClient
*c
) {
3894 if (server
.bgsavechildpid
!= -1) {
3895 addReplySds(c
,sdsnew("-ERR background save already in progress\r\n"));
3898 if (rdbSaveBackground(server
.dbfilename
) == REDIS_OK
) {
3899 char *status
= "+Background saving started\r\n";
3900 addReplySds(c
,sdsnew(status
));
3902 addReply(c
,shared
.err
);
3906 static void shutdownCommand(redisClient
*c
) {
3907 redisLog(REDIS_WARNING
,"User requested shutdown, saving DB...");
3908 /* Kill the saving child if there is a background saving in progress.
3909 We want to avoid race conditions, for instance our saving child may
3910 overwrite the synchronous saving did by SHUTDOWN. */
3911 if (server
.bgsavechildpid
!= -1) {
3912 redisLog(REDIS_WARNING
,"There is a live saving child. Killing it!");
3913 kill(server
.bgsavechildpid
,SIGKILL
);
3914 rdbRemoveTempFile(server
.bgsavechildpid
);
3916 if (server
.appendonly
) {
3917 /* Append only file: fsync() the AOF and exit */
3918 fsync(server
.appendfd
);
3919 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
3922 /* Snapshotting. Perform a SYNC SAVE and exit */
3923 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
3924 if (server
.daemonize
)
3925 unlink(server
.pidfile
);
3926 redisLog(REDIS_WARNING
,"%zu bytes used at exit",zmalloc_used_memory());
3927 redisLog(REDIS_WARNING
,"Server exit now, bye bye...");
3928 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
3931 /* Ooops.. error saving! The best we can do is to continue operating.
3932 * Note that if there was a background saving process, in the next
3933 * cron() Redis will be notified that the background saving aborted,
3934 * handling special stuff like slaves pending for synchronization... */
3935 redisLog(REDIS_WARNING
,"Error trying to save the DB, can't exit");
3936 addReplySds(c
,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
3941 static void renameGenericCommand(redisClient
*c
, int nx
) {
3944 /* To use the same key as src and dst is probably an error */
3945 if (sdscmp(c
->argv
[1]->ptr
,c
->argv
[2]->ptr
) == 0) {
3946 addReply(c
,shared
.sameobjecterr
);
3950 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3952 addReply(c
,shared
.nokeyerr
);
3956 deleteIfVolatile(c
->db
,c
->argv
[2]);
3957 if (dictAdd(c
->db
->dict
,c
->argv
[2],o
) == DICT_ERR
) {
3960 addReply(c
,shared
.czero
);
3963 dictReplace(c
->db
->dict
,c
->argv
[2],o
);
3965 incrRefCount(c
->argv
[2]);
3967 deleteKey(c
->db
,c
->argv
[1]);
3969 addReply(c
,nx
? shared
.cone
: shared
.ok
);
3972 static void renameCommand(redisClient
*c
) {
3973 renameGenericCommand(c
,0);
3976 static void renamenxCommand(redisClient
*c
) {
3977 renameGenericCommand(c
,1);
3980 static void moveCommand(redisClient
*c
) {
3985 /* Obtain source and target DB pointers */
3988 if (selectDb(c
,atoi(c
->argv
[2]->ptr
)) == REDIS_ERR
) {
3989 addReply(c
,shared
.outofrangeerr
);
3993 selectDb(c
,srcid
); /* Back to the source DB */
3995 /* If the user is moving using as target the same
3996 * DB as the source DB it is probably an error. */
3998 addReply(c
,shared
.sameobjecterr
);
4002 /* Check if the element exists and get a reference */
4003 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4005 addReply(c
,shared
.czero
);
4009 /* Try to add the element to the target DB */
4010 deleteIfVolatile(dst
,c
->argv
[1]);
4011 if (dictAdd(dst
->dict
,c
->argv
[1],o
) == DICT_ERR
) {
4012 addReply(c
,shared
.czero
);
4015 incrRefCount(c
->argv
[1]);
4018 /* OK! key moved, free the entry in the source DB */
4019 deleteKey(src
,c
->argv
[1]);
4021 addReply(c
,shared
.cone
);
4024 /* =================================== Lists ================================ */
4025 static void pushGenericCommand(redisClient
*c
, int where
) {
4029 lobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4031 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4032 addReply(c
,shared
.ok
);
4035 lobj
= createListObject();
4037 if (where
== REDIS_HEAD
) {
4038 listAddNodeHead(list
,c
->argv
[2]);
4040 listAddNodeTail(list
,c
->argv
[2]);
4042 dictAdd(c
->db
->dict
,c
->argv
[1],lobj
);
4043 incrRefCount(c
->argv
[1]);
4044 incrRefCount(c
->argv
[2]);
4046 if (lobj
->type
!= REDIS_LIST
) {
4047 addReply(c
,shared
.wrongtypeerr
);
4050 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4051 addReply(c
,shared
.ok
);
4055 if (where
== REDIS_HEAD
) {
4056 listAddNodeHead(list
,c
->argv
[2]);
4058 listAddNodeTail(list
,c
->argv
[2]);
4060 incrRefCount(c
->argv
[2]);
4063 addReply(c
,shared
.ok
);
4066 static void lpushCommand(redisClient
*c
) {
4067 pushGenericCommand(c
,REDIS_HEAD
);
4070 static void rpushCommand(redisClient
*c
) {
4071 pushGenericCommand(c
,REDIS_TAIL
);
4074 static void llenCommand(redisClient
*c
) {
4078 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4080 addReply(c
,shared
.czero
);
4083 if (o
->type
!= REDIS_LIST
) {
4084 addReply(c
,shared
.wrongtypeerr
);
4087 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",listLength(l
)));
4092 static void lindexCommand(redisClient
*c
) {
4094 int index
= atoi(c
->argv
[2]->ptr
);
4096 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4098 addReply(c
,shared
.nullbulk
);
4100 if (o
->type
!= REDIS_LIST
) {
4101 addReply(c
,shared
.wrongtypeerr
);
4103 list
*list
= o
->ptr
;
4106 ln
= listIndex(list
, index
);
4108 addReply(c
,shared
.nullbulk
);
4110 robj
*ele
= listNodeValue(ln
);
4111 addReplyBulkLen(c
,ele
);
4113 addReply(c
,shared
.crlf
);
4119 static void lsetCommand(redisClient
*c
) {
4121 int index
= atoi(c
->argv
[2]->ptr
);
4123 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4125 addReply(c
,shared
.nokeyerr
);
4127 if (o
->type
!= REDIS_LIST
) {
4128 addReply(c
,shared
.wrongtypeerr
);
4130 list
*list
= o
->ptr
;
4133 ln
= listIndex(list
, index
);
4135 addReply(c
,shared
.outofrangeerr
);
4137 robj
*ele
= listNodeValue(ln
);
4140 listNodeValue(ln
) = c
->argv
[3];
4141 incrRefCount(c
->argv
[3]);
4142 addReply(c
,shared
.ok
);
4149 static void popGenericCommand(redisClient
*c
, int where
) {
4152 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4154 addReply(c
,shared
.nullbulk
);
4156 if (o
->type
!= REDIS_LIST
) {
4157 addReply(c
,shared
.wrongtypeerr
);
4159 list
*list
= o
->ptr
;
4162 if (where
== REDIS_HEAD
)
4163 ln
= listFirst(list
);
4165 ln
= listLast(list
);
4168 addReply(c
,shared
.nullbulk
);
4170 robj
*ele
= listNodeValue(ln
);
4171 addReplyBulkLen(c
,ele
);
4173 addReply(c
,shared
.crlf
);
4174 listDelNode(list
,ln
);
4181 static void lpopCommand(redisClient
*c
) {
4182 popGenericCommand(c
,REDIS_HEAD
);
4185 static void rpopCommand(redisClient
*c
) {
4186 popGenericCommand(c
,REDIS_TAIL
);
4189 static void lrangeCommand(redisClient
*c
) {
4191 int start
= atoi(c
->argv
[2]->ptr
);
4192 int end
= atoi(c
->argv
[3]->ptr
);
4194 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4196 addReply(c
,shared
.nullmultibulk
);
4198 if (o
->type
!= REDIS_LIST
) {
4199 addReply(c
,shared
.wrongtypeerr
);
4201 list
*list
= o
->ptr
;
4203 int llen
= listLength(list
);
4207 /* convert negative indexes */
4208 if (start
< 0) start
= llen
+start
;
4209 if (end
< 0) end
= llen
+end
;
4210 if (start
< 0) start
= 0;
4211 if (end
< 0) end
= 0;
4213 /* indexes sanity checks */
4214 if (start
> end
|| start
>= llen
) {
4215 /* Out of range start or start > end result in empty list */
4216 addReply(c
,shared
.emptymultibulk
);
4219 if (end
>= llen
) end
= llen
-1;
4220 rangelen
= (end
-start
)+1;
4222 /* Return the result in form of a multi-bulk reply */
4223 ln
= listIndex(list
, start
);
4224 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",rangelen
));
4225 for (j
= 0; j
< rangelen
; j
++) {
4226 ele
= listNodeValue(ln
);
4227 addReplyBulkLen(c
,ele
);
4229 addReply(c
,shared
.crlf
);
4236 static void ltrimCommand(redisClient
*c
) {
4238 int start
= atoi(c
->argv
[2]->ptr
);
4239 int end
= atoi(c
->argv
[3]->ptr
);
4241 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4243 addReply(c
,shared
.ok
);
4245 if (o
->type
!= REDIS_LIST
) {
4246 addReply(c
,shared
.wrongtypeerr
);
4248 list
*list
= o
->ptr
;
4250 int llen
= listLength(list
);
4251 int j
, ltrim
, rtrim
;
4253 /* convert negative indexes */
4254 if (start
< 0) start
= llen
+start
;
4255 if (end
< 0) end
= llen
+end
;
4256 if (start
< 0) start
= 0;
4257 if (end
< 0) end
= 0;
4259 /* indexes sanity checks */
4260 if (start
> end
|| start
>= llen
) {
4261 /* Out of range start or start > end result in empty list */
4265 if (end
>= llen
) end
= llen
-1;
4270 /* Remove list elements to perform the trim */
4271 for (j
= 0; j
< ltrim
; j
++) {
4272 ln
= listFirst(list
);
4273 listDelNode(list
,ln
);
4275 for (j
= 0; j
< rtrim
; j
++) {
4276 ln
= listLast(list
);
4277 listDelNode(list
,ln
);
4280 addReply(c
,shared
.ok
);
4285 static void lremCommand(redisClient
*c
) {
4288 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4290 addReply(c
,shared
.czero
);
4292 if (o
->type
!= REDIS_LIST
) {
4293 addReply(c
,shared
.wrongtypeerr
);
4295 list
*list
= o
->ptr
;
4296 listNode
*ln
, *next
;
4297 int toremove
= atoi(c
->argv
[2]->ptr
);
4302 toremove
= -toremove
;
4305 ln
= fromtail
? list
->tail
: list
->head
;
4307 robj
*ele
= listNodeValue(ln
);
4309 next
= fromtail
? ln
->prev
: ln
->next
;
4310 if (compareStringObjects(ele
,c
->argv
[3]) == 0) {
4311 listDelNode(list
,ln
);
4314 if (toremove
&& removed
== toremove
) break;
4318 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",removed
));
4323 /* This is the semantic of this command:
4324 * RPOPLPUSH srclist dstlist:
4325 * IF LLEN(srclist) > 0
4326 * element = RPOP srclist
4327 * LPUSH dstlist element
4334 * The idea is to be able to get an element from a list in a reliable way
4335 * since the element is not just returned but pushed against another list
4336 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4338 static void rpoplpushcommand(redisClient
*c
) {
4341 sobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4343 addReply(c
,shared
.nullbulk
);
4345 if (sobj
->type
!= REDIS_LIST
) {
4346 addReply(c
,shared
.wrongtypeerr
);
4348 list
*srclist
= sobj
->ptr
;
4349 listNode
*ln
= listLast(srclist
);
4352 addReply(c
,shared
.nullbulk
);
4354 robj
*dobj
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4355 robj
*ele
= listNodeValue(ln
);
4358 if (dobj
&& dobj
->type
!= REDIS_LIST
) {
4359 addReply(c
,shared
.wrongtypeerr
);
4363 /* Add the element to the target list (unless it's directly
4364 * passed to some BLPOP-ing client) */
4365 if (!handleClientsWaitingListPush(c
,c
->argv
[2],ele
)) {
4367 /* Create the list if the key does not exist */
4368 dobj
= createListObject();
4369 dictAdd(c
->db
->dict
,c
->argv
[2],dobj
);
4370 incrRefCount(c
->argv
[2]);
4372 dstlist
= dobj
->ptr
;
4373 listAddNodeHead(dstlist
,ele
);
4377 /* Send the element to the client as reply as well */
4378 addReplyBulkLen(c
,ele
);
4380 addReply(c
,shared
.crlf
);
4382 /* Finally remove the element from the source list */
4383 listDelNode(srclist
,ln
);
4391 /* ==================================== Sets ================================ */
4393 static void saddCommand(redisClient
*c
) {
4396 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4398 set
= createSetObject();
4399 dictAdd(c
->db
->dict
,c
->argv
[1],set
);
4400 incrRefCount(c
->argv
[1]);
4402 if (set
->type
!= REDIS_SET
) {
4403 addReply(c
,shared
.wrongtypeerr
);
4407 if (dictAdd(set
->ptr
,c
->argv
[2],NULL
) == DICT_OK
) {
4408 incrRefCount(c
->argv
[2]);
4410 addReply(c
,shared
.cone
);
4412 addReply(c
,shared
.czero
);
4416 static void sremCommand(redisClient
*c
) {
4419 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4421 addReply(c
,shared
.czero
);
4423 if (set
->type
!= REDIS_SET
) {
4424 addReply(c
,shared
.wrongtypeerr
);
4427 if (dictDelete(set
->ptr
,c
->argv
[2]) == DICT_OK
) {
4429 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4430 addReply(c
,shared
.cone
);
4432 addReply(c
,shared
.czero
);
4437 static void smoveCommand(redisClient
*c
) {
4438 robj
*srcset
, *dstset
;
4440 srcset
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4441 dstset
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4443 /* If the source key does not exist return 0, if it's of the wrong type
4445 if (srcset
== NULL
|| srcset
->type
!= REDIS_SET
) {
4446 addReply(c
, srcset
? shared
.wrongtypeerr
: shared
.czero
);
4449 /* Error if the destination key is not a set as well */
4450 if (dstset
&& dstset
->type
!= REDIS_SET
) {
4451 addReply(c
,shared
.wrongtypeerr
);
4454 /* Remove the element from the source set */
4455 if (dictDelete(srcset
->ptr
,c
->argv
[3]) == DICT_ERR
) {
4456 /* Key not found in the src set! return zero */
4457 addReply(c
,shared
.czero
);
4461 /* Add the element to the destination set */
4463 dstset
= createSetObject();
4464 dictAdd(c
->db
->dict
,c
->argv
[2],dstset
);
4465 incrRefCount(c
->argv
[2]);
4467 if (dictAdd(dstset
->ptr
,c
->argv
[3],NULL
) == DICT_OK
)
4468 incrRefCount(c
->argv
[3]);
4469 addReply(c
,shared
.cone
);
4472 static void sismemberCommand(redisClient
*c
) {
4475 set
= lookupKeyRead(c
->db
,c
->argv
[1]);
4477 addReply(c
,shared
.czero
);
4479 if (set
->type
!= REDIS_SET
) {
4480 addReply(c
,shared
.wrongtypeerr
);
4483 if (dictFind(set
->ptr
,c
->argv
[2]))
4484 addReply(c
,shared
.cone
);
4486 addReply(c
,shared
.czero
);
4490 static void scardCommand(redisClient
*c
) {
4494 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4496 addReply(c
,shared
.czero
);
4499 if (o
->type
!= REDIS_SET
) {
4500 addReply(c
,shared
.wrongtypeerr
);
4503 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4509 static void spopCommand(redisClient
*c
) {
4513 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4515 addReply(c
,shared
.nullbulk
);
4517 if (set
->type
!= REDIS_SET
) {
4518 addReply(c
,shared
.wrongtypeerr
);
4521 de
= dictGetRandomKey(set
->ptr
);
4523 addReply(c
,shared
.nullbulk
);
4525 robj
*ele
= dictGetEntryKey(de
);
4527 addReplyBulkLen(c
,ele
);
4529 addReply(c
,shared
.crlf
);
4530 dictDelete(set
->ptr
,ele
);
4531 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4537 static void srandmemberCommand(redisClient
*c
) {
4541 set
= lookupKeyRead(c
->db
,c
->argv
[1]);
4543 addReply(c
,shared
.nullbulk
);
4545 if (set
->type
!= REDIS_SET
) {
4546 addReply(c
,shared
.wrongtypeerr
);
4549 de
= dictGetRandomKey(set
->ptr
);
4551 addReply(c
,shared
.nullbulk
);
4553 robj
*ele
= dictGetEntryKey(de
);
4555 addReplyBulkLen(c
,ele
);
4557 addReply(c
,shared
.crlf
);
4562 static int qsortCompareSetsByCardinality(const void *s1
, const void *s2
) {
4563 dict
**d1
= (void*) s1
, **d2
= (void*) s2
;
4565 return dictSize(*d1
)-dictSize(*d2
);
4568 static void sinterGenericCommand(redisClient
*c
, robj
**setskeys
, unsigned long setsnum
, robj
*dstkey
) {
4569 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
4572 robj
*lenobj
= NULL
, *dstset
= NULL
;
4573 unsigned long j
, cardinality
= 0;
4575 for (j
= 0; j
< setsnum
; j
++) {
4579 lookupKeyWrite(c
->db
,setskeys
[j
]) :
4580 lookupKeyRead(c
->db
,setskeys
[j
]);
4584 if (deleteKey(c
->db
,dstkey
))
4586 addReply(c
,shared
.czero
);
4588 addReply(c
,shared
.nullmultibulk
);
4592 if (setobj
->type
!= REDIS_SET
) {
4594 addReply(c
,shared
.wrongtypeerr
);
4597 dv
[j
] = setobj
->ptr
;
4599 /* Sort sets from the smallest to largest, this will improve our
4600 * algorithm's performance */
4601 qsort(dv
,setsnum
,sizeof(dict
*),qsortCompareSetsByCardinality
);
4603 /* The first thing we should output is the total number of elements...
4604 * since this is a multi-bulk write, but at this stage we don't know
4605 * the intersection set size, so we use a trick, append an empty object
4606 * to the output list and save the pointer to later modify it with the
4609 lenobj
= createObject(REDIS_STRING
,NULL
);
4611 decrRefCount(lenobj
);
4613 /* If we have a target key where to store the resulting set
4614 * create this key with an empty set inside */
4615 dstset
= createSetObject();
4618 /* Iterate all the elements of the first (smallest) set, and test
4619 * the element against all the other sets, if at least one set does
4620 * not include the element it is discarded */
4621 di
= dictGetIterator(dv
[0]);
4623 while((de
= dictNext(di
)) != NULL
) {
4626 for (j
= 1; j
< setsnum
; j
++)
4627 if (dictFind(dv
[j
],dictGetEntryKey(de
)) == NULL
) break;
4629 continue; /* at least one set does not contain the member */
4630 ele
= dictGetEntryKey(de
);
4632 addReplyBulkLen(c
,ele
);
4634 addReply(c
,shared
.crlf
);
4637 dictAdd(dstset
->ptr
,ele
,NULL
);
4641 dictReleaseIterator(di
);
4644 /* Store the resulting set into the target */
4645 deleteKey(c
->db
,dstkey
);
4646 dictAdd(c
->db
->dict
,dstkey
,dstset
);
4647 incrRefCount(dstkey
);
4651 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",cardinality
);
4653 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4654 dictSize((dict
*)dstset
->ptr
)));
4660 static void sinterCommand(redisClient
*c
) {
4661 sinterGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
);
4664 static void sinterstoreCommand(redisClient
*c
) {
4665 sinterGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1]);
4668 #define REDIS_OP_UNION 0
4669 #define REDIS_OP_DIFF 1
4671 static void sunionDiffGenericCommand(redisClient
*c
, robj
**setskeys
, int setsnum
, robj
*dstkey
, int op
) {
4672 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
4675 robj
*dstset
= NULL
;
4676 int j
, cardinality
= 0;
4678 for (j
= 0; j
< setsnum
; j
++) {
4682 lookupKeyWrite(c
->db
,setskeys
[j
]) :
4683 lookupKeyRead(c
->db
,setskeys
[j
]);
4688 if (setobj
->type
!= REDIS_SET
) {
4690 addReply(c
,shared
.wrongtypeerr
);
4693 dv
[j
] = setobj
->ptr
;
4696 /* We need a temp set object to store our union. If the dstkey
4697 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4698 * this set object will be the resulting object to set into the target key*/
4699 dstset
= createSetObject();
4701 /* Iterate all the elements of all the sets, add every element a single
4702 * time to the result set */
4703 for (j
= 0; j
< setsnum
; j
++) {
4704 if (op
== REDIS_OP_DIFF
&& j
== 0 && !dv
[j
]) break; /* result set is empty */
4705 if (!dv
[j
]) continue; /* non existing keys are like empty sets */
4707 di
= dictGetIterator(dv
[j
]);
4709 while((de
= dictNext(di
)) != NULL
) {
4712 /* dictAdd will not add the same element multiple times */
4713 ele
= dictGetEntryKey(de
);
4714 if (op
== REDIS_OP_UNION
|| j
== 0) {
4715 if (dictAdd(dstset
->ptr
,ele
,NULL
) == DICT_OK
) {
4719 } else if (op
== REDIS_OP_DIFF
) {
4720 if (dictDelete(dstset
->ptr
,ele
) == DICT_OK
) {
4725 dictReleaseIterator(di
);
4727 if (op
== REDIS_OP_DIFF
&& cardinality
== 0) break; /* result set is empty */
4730 /* Output the content of the resulting set, if not in STORE mode */
4732 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",cardinality
));
4733 di
= dictGetIterator(dstset
->ptr
);
4734 while((de
= dictNext(di
)) != NULL
) {
4737 ele
= dictGetEntryKey(de
);
4738 addReplyBulkLen(c
,ele
);
4740 addReply(c
,shared
.crlf
);
4742 dictReleaseIterator(di
);
4744 /* If we have a target key where to store the resulting set
4745 * create this key with the result set inside */
4746 deleteKey(c
->db
,dstkey
);
4747 dictAdd(c
->db
->dict
,dstkey
,dstset
);
4748 incrRefCount(dstkey
);
4753 decrRefCount(dstset
);
4755 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4756 dictSize((dict
*)dstset
->ptr
)));
4762 static void sunionCommand(redisClient
*c
) {
4763 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_UNION
);
4766 static void sunionstoreCommand(redisClient
*c
) {
4767 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_UNION
);
4770 static void sdiffCommand(redisClient
*c
) {
4771 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_DIFF
);
4774 static void sdiffstoreCommand(redisClient
*c
) {
4775 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_DIFF
);
4778 /* ==================================== ZSets =============================== */
4780 /* ZSETs are ordered sets using two data structures to hold the same elements
4781 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4784 * The elements are added to an hash table mapping Redis objects to scores.
4785 * At the same time the elements are added to a skip list mapping scores
4786 * to Redis objects (so objects are sorted by scores in this "view"). */
4788 /* This skiplist implementation is almost a C translation of the original
4789 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4790 * Alternative to Balanced Trees", modified in three ways:
4791 * a) this implementation allows for repeated values.
4792 * b) the comparison is not just by key (our 'score') but by satellite data.
4793 * c) there is a back pointer, so it's a doubly linked list with the back
4794 * pointers being only at "level 1". This allows to traverse the list
4795 * from tail to head, useful for ZREVRANGE. */
4797 static zskiplistNode
*zslCreateNode(int level
, double score
, robj
*obj
) {
4798 zskiplistNode
*zn
= zmalloc(sizeof(*zn
));
4800 zn
->forward
= zmalloc(sizeof(zskiplistNode
*) * level
);
4801 zn
->span
= zmalloc(sizeof(unsigned int) * level
);
4807 static zskiplist
*zslCreate(void) {
4811 zsl
= zmalloc(sizeof(*zsl
));
4814 zsl
->header
= zslCreateNode(ZSKIPLIST_MAXLEVEL
,0,NULL
);
4815 for (j
= 0; j
< ZSKIPLIST_MAXLEVEL
; j
++) {
4816 zsl
->header
->forward
[j
] = NULL
;
4817 zsl
->header
->span
[j
] = 0;
4819 zsl
->header
->backward
= NULL
;
4824 static void zslFreeNode(zskiplistNode
*node
) {
4825 decrRefCount(node
->obj
);
4826 zfree(node
->forward
);
4831 static void zslFree(zskiplist
*zsl
) {
4832 zskiplistNode
*node
= zsl
->header
->forward
[0], *next
;
4834 zfree(zsl
->header
->forward
);
4835 zfree(zsl
->header
->span
);
4838 next
= node
->forward
[0];
4845 static int zslRandomLevel(void) {
4847 while ((random()&0xFFFF) < (ZSKIPLIST_P
* 0xFFFF))
4852 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
) {
4853 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
4854 unsigned int span
[ZSKIPLIST_MAXLEVEL
];
4858 for (i
= zsl
->level
-1; i
>= 0; i
--) {
4859 /* store span that is crossed to reach the insert position */
4860 span
[i
] = i
== (zsl
->level
-1) ? 0 : span
[i
+1];
4862 while (x
->forward
[i
] &&
4863 (x
->forward
[i
]->score
< score
||
4864 (x
->forward
[i
]->score
== score
&&
4865 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0))) {
4866 span
[i
] += x
->span
[i
];
4871 /* we assume the key is not already inside, since we allow duplicated
4872 * scores, and the re-insertion of score and redis object should never
4873 * happen since the caller of zslInsert() should test in the hash table
4874 * if the element is already inside or not. */
4875 level
= zslRandomLevel();
4876 if (level
> zsl
->level
) {
4877 for (i
= zsl
->level
; i
< level
; i
++) {
4879 update
[i
] = zsl
->header
;
4880 update
[i
]->span
[i
] = zsl
->length
;
4884 x
= zslCreateNode(level
,score
,obj
);
4885 for (i
= 0; i
< level
; i
++) {
4886 x
->forward
[i
] = update
[i
]->forward
[i
];
4887 update
[i
]->forward
[i
] = x
;
4889 /* update span covered by update[i] as x is inserted here */
4890 x
->span
[i
] = update
[i
]->span
[i
] - (span
[0] - span
[i
]);
4891 update
[i
]->span
[i
] = (span
[0] - span
[i
]) + 1;
4894 /* increment span for untouched levels */
4895 for (i
= level
; i
< zsl
->level
; i
++) {
4896 update
[i
]->span
[i
]++;
4899 x
->backward
= (update
[0] == zsl
->header
) ? NULL
: update
[0];
4901 x
->forward
[0]->backward
= x
;
4907 /* Delete an element with matching score/object from the skiplist. */
4908 static int zslDelete(zskiplist
*zsl
, double score
, robj
*obj
) {
4909 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
4913 for (i
= zsl
->level
-1; i
>= 0; i
--) {
4914 while (x
->forward
[i
] &&
4915 (x
->forward
[i
]->score
< score
||
4916 (x
->forward
[i
]->score
== score
&&
4917 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0)))
4921 /* We may have multiple elements with the same score, what we need
4922 * is to find the element with both the right score and object. */
4924 if (x
&& score
== x
->score
&& compareStringObjects(x
->obj
,obj
) == 0) {
4925 for (i
= 0; i
< zsl
->level
; i
++) {
4926 if (update
[i
]->forward
[i
] == x
) {
4927 update
[i
]->span
[i
] += x
->span
[i
] - 1;
4928 update
[i
]->forward
[i
] = x
->forward
[i
];
4930 update
[i
]->span
[i
] -= 1;
4933 if (x
->forward
[0]) {
4934 x
->forward
[0]->backward
= x
->backward
;
4936 zsl
->tail
= x
->backward
;
4939 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
4944 return 0; /* not found */
4946 return 0; /* not found */
4949 /* Delete all the elements with score between min and max from the skiplist.
4950 * Min and max are inclusive, so a score >= min && score <= max is deleted.
4951 * Note that this function takes the reference to the hash table view of the
4952 * sorted set, in order to remove the elements from the hash table too. */
4953 static unsigned long zslDeleteRange(zskiplist
*zsl
, double min
, double max
, dict
*dict
) {
4954 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
4955 unsigned long removed
= 0;
4959 for (i
= zsl
->level
-1; i
>= 0; i
--) {
4960 while (x
->forward
[i
] && x
->forward
[i
]->score
< min
)
4964 /* We may have multiple elements with the same score, what we need
4965 * is to find the element with both the right score and object. */
4967 while (x
&& x
->score
<= max
) {
4968 zskiplistNode
*next
;
4970 for (i
= 0; i
< zsl
->level
; i
++) {
4971 if (update
[i
]->forward
[i
] == x
) {
4972 update
[i
]->span
[i
] += x
->span
[i
] - 1;
4973 update
[i
]->forward
[i
] = x
->forward
[i
];
4975 update
[i
]->span
[i
] -= 1;
4978 if (x
->forward
[0]) {
4979 x
->forward
[0]->backward
= x
->backward
;
4981 zsl
->tail
= x
->backward
;
4983 next
= x
->forward
[0];
4984 dictDelete(dict
,x
->obj
);
4986 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
4992 return removed
; /* not found */
4995 /* Find the first node having a score equal or greater than the specified one.
4996 * Returns NULL if there is no match. */
4997 static zskiplistNode
*zslFirstWithScore(zskiplist
*zsl
, double score
) {
5002 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5003 while (x
->forward
[i
] && x
->forward
[i
]->score
< score
)
5006 /* We may have multiple elements with the same score, what we need
5007 * is to find the element with both the right score and object. */
5008 return x
->forward
[0];
5011 /* The actual Z-commands implementations */
5013 /* This generic command implements both ZADD and ZINCRBY.
5014 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5015 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5016 static void zaddGenericCommand(redisClient
*c
, robj
*key
, robj
*ele
, double scoreval
, int doincrement
) {
5021 zsetobj
= lookupKeyWrite(c
->db
,key
);
5022 if (zsetobj
== NULL
) {
5023 zsetobj
= createZsetObject();
5024 dictAdd(c
->db
->dict
,key
,zsetobj
);
5027 if (zsetobj
->type
!= REDIS_ZSET
) {
5028 addReply(c
,shared
.wrongtypeerr
);
5034 /* Ok now since we implement both ZADD and ZINCRBY here the code
5035 * needs to handle the two different conditions. It's all about setting
5036 * '*score', that is, the new score to set, to the right value. */
5037 score
= zmalloc(sizeof(double));
5041 /* Read the old score. If the element was not present starts from 0 */
5042 de
= dictFind(zs
->dict
,ele
);
5044 double *oldscore
= dictGetEntryVal(de
);
5045 *score
= *oldscore
+ scoreval
;
5053 /* What follows is a simple remove and re-insert operation that is common
5054 * to both ZADD and ZINCRBY... */
5055 if (dictAdd(zs
->dict
,ele
,score
) == DICT_OK
) {
5056 /* case 1: New element */
5057 incrRefCount(ele
); /* added to hash */
5058 zslInsert(zs
->zsl
,*score
,ele
);
5059 incrRefCount(ele
); /* added to skiplist */
5062 addReplyDouble(c
,*score
);
5064 addReply(c
,shared
.cone
);
5069 /* case 2: Score update operation */
5070 de
= dictFind(zs
->dict
,ele
);
5071 redisAssert(de
!= NULL
);
5072 oldscore
= dictGetEntryVal(de
);
5073 if (*score
!= *oldscore
) {
5076 /* Remove and insert the element in the skip list with new score */
5077 deleted
= zslDelete(zs
->zsl
,*oldscore
,ele
);
5078 redisAssert(deleted
!= 0);
5079 zslInsert(zs
->zsl
,*score
,ele
);
5081 /* Update the score in the hash table */
5082 dictReplace(zs
->dict
,ele
,score
);
5088 addReplyDouble(c
,*score
);
5090 addReply(c
,shared
.czero
);
5094 static void zaddCommand(redisClient
*c
) {
5097 scoreval
= strtod(c
->argv
[2]->ptr
,NULL
);
5098 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,0);
5101 static void zincrbyCommand(redisClient
*c
) {
5104 scoreval
= strtod(c
->argv
[2]->ptr
,NULL
);
5105 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,1);
5108 static void zremCommand(redisClient
*c
) {
5112 zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5113 if (zsetobj
== NULL
) {
5114 addReply(c
,shared
.czero
);
5120 if (zsetobj
->type
!= REDIS_ZSET
) {
5121 addReply(c
,shared
.wrongtypeerr
);
5125 de
= dictFind(zs
->dict
,c
->argv
[2]);
5127 addReply(c
,shared
.czero
);
5130 /* Delete from the skiplist */
5131 oldscore
= dictGetEntryVal(de
);
5132 deleted
= zslDelete(zs
->zsl
,*oldscore
,c
->argv
[2]);
5133 redisAssert(deleted
!= 0);
5135 /* Delete from the hash table */
5136 dictDelete(zs
->dict
,c
->argv
[2]);
5137 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5139 addReply(c
,shared
.cone
);
5143 static void zremrangebyscoreCommand(redisClient
*c
) {
5144 double min
= strtod(c
->argv
[2]->ptr
,NULL
);
5145 double max
= strtod(c
->argv
[3]->ptr
,NULL
);
5149 zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5150 if (zsetobj
== NULL
) {
5151 addReply(c
,shared
.czero
);
5155 if (zsetobj
->type
!= REDIS_ZSET
) {
5156 addReply(c
,shared
.wrongtypeerr
);
5160 deleted
= zslDeleteRange(zs
->zsl
,min
,max
,zs
->dict
);
5161 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5162 server
.dirty
+= deleted
;
5163 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",deleted
));
5167 static void zrangeGenericCommand(redisClient
*c
, int reverse
) {
5169 int start
= atoi(c
->argv
[2]->ptr
);
5170 int end
= atoi(c
->argv
[3]->ptr
);
5173 if (c
->argc
== 5 && !strcasecmp(c
->argv
[4]->ptr
,"withscores")) {
5175 } else if (c
->argc
>= 5) {
5176 addReply(c
,shared
.syntaxerr
);
5180 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5182 addReply(c
,shared
.nullmultibulk
);
5184 if (o
->type
!= REDIS_ZSET
) {
5185 addReply(c
,shared
.wrongtypeerr
);
5187 zset
*zsetobj
= o
->ptr
;
5188 zskiplist
*zsl
= zsetobj
->zsl
;
5191 int llen
= zsl
->length
;
5195 /* convert negative indexes */
5196 if (start
< 0) start
= llen
+start
;
5197 if (end
< 0) end
= llen
+end
;
5198 if (start
< 0) start
= 0;
5199 if (end
< 0) end
= 0;
5201 /* indexes sanity checks */
5202 if (start
> end
|| start
>= llen
) {
5203 /* Out of range start or start > end result in empty list */
5204 addReply(c
,shared
.emptymultibulk
);
5207 if (end
>= llen
) end
= llen
-1;
5208 rangelen
= (end
-start
)+1;
5210 /* Return the result in form of a multi-bulk reply */
5216 ln
= zsl
->header
->forward
[0];
5218 ln
= ln
->forward
[0];
5221 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",
5222 withscores
? (rangelen
*2) : rangelen
));
5223 for (j
= 0; j
< rangelen
; j
++) {
5225 addReplyBulkLen(c
,ele
);
5227 addReply(c
,shared
.crlf
);
5229 addReplyDouble(c
,ln
->score
);
5230 ln
= reverse
? ln
->backward
: ln
->forward
[0];
5236 static void zrangeCommand(redisClient
*c
) {
5237 zrangeGenericCommand(c
,0);
5240 static void zrevrangeCommand(redisClient
*c
) {
5241 zrangeGenericCommand(c
,1);
5244 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5245 * If justcount is non-zero, just the count is returned. */
5246 static void genericZrangebyscoreCommand(redisClient
*c
, int justcount
) {
5249 int minex
= 0, maxex
= 0; /* are min or max exclusive? */
5250 int offset
= 0, limit
= -1;
5254 /* Parse the min-max interval. If one of the values is prefixed
5255 * by the "(" character, it's considered "open". For instance
5256 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5257 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5258 if (((char*)c
->argv
[2]->ptr
)[0] == '(') {
5259 min
= strtod((char*)c
->argv
[2]->ptr
+1,NULL
);
5262 min
= strtod(c
->argv
[2]->ptr
,NULL
);
5264 if (((char*)c
->argv
[3]->ptr
)[0] == '(') {
5265 max
= strtod((char*)c
->argv
[3]->ptr
+1,NULL
);
5268 max
= strtod(c
->argv
[3]->ptr
,NULL
);
5271 /* Parse "WITHSCORES": note that if the command was called with
5272 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5273 * enter the following paths to parse WITHSCORES and LIMIT. */
5274 if (c
->argc
== 5 || c
->argc
== 8) {
5275 if (strcasecmp(c
->argv
[c
->argc
-1]->ptr
,"withscores") == 0)
5280 if (c
->argc
!= (4 + withscores
) && c
->argc
!= (7 + withscores
))
5284 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5289 if (c
->argc
== (7 + withscores
) && strcasecmp(c
->argv
[4]->ptr
,"limit")) {
5290 addReply(c
,shared
.syntaxerr
);
5292 } else if (c
->argc
== (7 + withscores
)) {
5293 offset
= atoi(c
->argv
[5]->ptr
);
5294 limit
= atoi(c
->argv
[6]->ptr
);
5295 if (offset
< 0) offset
= 0;
5298 /* Ok, lookup the key and get the range */
5299 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5301 addReply(c
,justcount
? shared
.czero
: shared
.nullmultibulk
);
5303 if (o
->type
!= REDIS_ZSET
) {
5304 addReply(c
,shared
.wrongtypeerr
);
5306 zset
*zsetobj
= o
->ptr
;
5307 zskiplist
*zsl
= zsetobj
->zsl
;
5309 robj
*ele
, *lenobj
= NULL
;
5310 unsigned long rangelen
= 0;
5312 /* Get the first node with the score >= min, or with
5313 * score > min if 'minex' is true. */
5314 ln
= zslFirstWithScore(zsl
,min
);
5315 while (minex
&& ln
&& ln
->score
== min
) ln
= ln
->forward
[0];
5318 /* No element matching the specified interval */
5319 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
5323 /* We don't know in advance how many matching elements there
5324 * are in the list, so we push this object that will represent
5325 * the multi-bulk length in the output buffer, and will "fix"
5328 lenobj
= createObject(REDIS_STRING
,NULL
);
5330 decrRefCount(lenobj
);
5333 while(ln
&& (maxex
? (ln
->score
< max
) : (ln
->score
<= max
))) {
5336 ln
= ln
->forward
[0];
5339 if (limit
== 0) break;
5342 addReplyBulkLen(c
,ele
);
5344 addReply(c
,shared
.crlf
);
5346 addReplyDouble(c
,ln
->score
);
5348 ln
= ln
->forward
[0];
5350 if (limit
> 0) limit
--;
5353 addReplyLong(c
,(long)rangelen
);
5355 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",
5356 withscores
? (rangelen
*2) : rangelen
);
5362 static void zrangebyscoreCommand(redisClient
*c
) {
5363 genericZrangebyscoreCommand(c
,0);
5366 static void zcountCommand(redisClient
*c
) {
5367 genericZrangebyscoreCommand(c
,1);
5370 static void zcardCommand(redisClient
*c
) {
5374 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5376 addReply(c
,shared
.czero
);
5379 if (o
->type
!= REDIS_ZSET
) {
5380 addReply(c
,shared
.wrongtypeerr
);
5383 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",zs
->zsl
->length
));
5388 static void zscoreCommand(redisClient
*c
) {
5392 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5394 addReply(c
,shared
.nullbulk
);
5397 if (o
->type
!= REDIS_ZSET
) {
5398 addReply(c
,shared
.wrongtypeerr
);
5403 de
= dictFind(zs
->dict
,c
->argv
[2]);
5405 addReply(c
,shared
.nullbulk
);
5407 double *score
= dictGetEntryVal(de
);
5409 addReplyDouble(c
,*score
);
5415 static void zrankCommand(redisClient
*c
) {
5417 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5419 addReply(c
,shared
.nullbulk
);
5422 if (o
->type
!= REDIS_ZSET
) {
5423 addReply(c
,shared
.wrongtypeerr
);
5428 zskiplist
*zsl
= zs
->zsl
;
5429 dictEntry
*de
= dictFind(zs
->dict
,c
->argv
[2]);
5431 addReply(c
,shared
.nullbulk
);
5435 double *score
= dictGetEntryVal(de
);
5437 unsigned int rank
= 0;
5441 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5442 while (x
->forward
[i
] &&
5443 (x
->forward
[i
]->score
< *score
||
5444 (x
->forward
[i
]->score
== *score
&&
5445 compareStringObjects(x
->forward
[i
]->obj
,c
->argv
[2]) < 0))) {
5450 if (x
->forward
[i
] && compareStringObjects(x
->forward
[i
]->obj
,c
->argv
[2]) == 0) {
5451 addReplyLong(c
, rank
);
5456 addReply(c
,shared
.nullbulk
);
5459 /* ========================= Non type-specific commands ==================== */
5461 static void flushdbCommand(redisClient
*c
) {
5462 server
.dirty
+= dictSize(c
->db
->dict
);
5463 dictEmpty(c
->db
->dict
);
5464 dictEmpty(c
->db
->expires
);
5465 addReply(c
,shared
.ok
);
5468 static void flushallCommand(redisClient
*c
) {
5469 server
.dirty
+= emptyDb();
5470 addReply(c
,shared
.ok
);
5471 rdbSave(server
.dbfilename
);
5475 static redisSortOperation
*createSortOperation(int type
, robj
*pattern
) {
5476 redisSortOperation
*so
= zmalloc(sizeof(*so
));
5478 so
->pattern
= pattern
;
5482 /* Return the value associated to the key with a name obtained
5483 * substituting the first occurrence of '*' in 'pattern' with 'subst' */
5484 static robj
*lookupKeyByPattern(redisDb
*db
, robj
*pattern
, robj
*subst
) {
5488 int prefixlen
, sublen
, postfixlen
;
5489 /* Exploit the internal sds representation to create an sds string allocated on the stack in order to make this function faster */
5493 char buf
[REDIS_SORTKEY_MAX
+1];
5496 /* If the pattern is "#" return the substitution object itself in order
5497 * to implement the "SORT ... GET #" feature. */
5498 spat
= pattern
->ptr
;
5499 if (spat
[0] == '#' && spat
[1] == '\0') {
5503 /* The substitution object may be specially encoded. If so we create
5504 * a decoded object on the fly. Otherwise getDecodedObject will just
5505 * increment the ref count, that we'll decrement later. */
5506 subst
= getDecodedObject(subst
);
5509 if (sdslen(spat
)+sdslen(ssub
)-1 > REDIS_SORTKEY_MAX
) return NULL
;
5510 p
= strchr(spat
,'*');
5512 decrRefCount(subst
);
5517 sublen
= sdslen(ssub
);
5518 postfixlen
= sdslen(spat
)-(prefixlen
+1);
5519 memcpy(keyname
.buf
,spat
,prefixlen
);
5520 memcpy(keyname
.buf
+prefixlen
,ssub
,sublen
);
5521 memcpy(keyname
.buf
+prefixlen
+sublen
,p
+1,postfixlen
);
5522 keyname
.buf
[prefixlen
+sublen
+postfixlen
] = '\0';
5523 keyname
.len
= prefixlen
+sublen
+postfixlen
;
5525 initStaticStringObject(keyobj
,((char*)&keyname
)+(sizeof(long)*2))
5526 decrRefCount(subst
);
5528 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
5529 return lookupKeyRead(db
,&keyobj
);
5532 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5533 * the additional parameter is not standard but a BSD-specific extension, we have to
5534 * pass sorting parameters via the global 'server' structure */
5535 static int sortCompare(const void *s1
, const void *s2
) {
5536 const redisSortObject
*so1
= s1
, *so2
= s2
;
5539 if (!server
.sort_alpha
) {
5540 /* Numeric sorting. Here it's trivial as we precomputed scores */
5541 if (so1
->u
.score
> so2
->u
.score
) {
5543 } else if (so1
->u
.score
< so2
->u
.score
) {
5549 /* Alphanumeric sorting */
5550 if (server
.sort_bypattern
) {
5551 if (!so1
->u
.cmpobj
|| !so2
->u
.cmpobj
) {
5552 /* At least one compare object is NULL */
5553 if (so1
->u
.cmpobj
== so2
->u
.cmpobj
)
5555 else if (so1
->u
.cmpobj
== NULL
)
5560 /* We have both the objects, use strcoll */
5561 cmp
= strcoll(so1
->u
.cmpobj
->ptr
,so2
->u
.cmpobj
->ptr
);
5564 /* Compare elements directly */
5567 dec1
= getDecodedObject(so1
->obj
);
5568 dec2
= getDecodedObject(so2
->obj
);
5569 cmp
= strcoll(dec1
->ptr
,dec2
->ptr
);
5574 return server
.sort_desc
? -cmp
: cmp
;
5577 /* The SORT command is the most complex command in Redis. Warning: this code
5578 * is optimized for speed and a bit less for readability */
5579 static void sortCommand(redisClient
*c
) {
5582 int desc
= 0, alpha
= 0;
5583 int limit_start
= 0, limit_count
= -1, start
, end
;
5584 int j
, dontsort
= 0, vectorlen
;
5585 int getop
= 0; /* GET operation counter */
5586 robj
*sortval
, *sortby
= NULL
, *storekey
= NULL
;
5587 redisSortObject
*vector
; /* Resulting vector to sort */
5589 /* Lookup the key to sort. It must be of the right types */
5590 sortval
= lookupKeyRead(c
->db
,c
->argv
[1]);
5591 if (sortval
== NULL
) {
5592 addReply(c
,shared
.nullmultibulk
);
5595 if (sortval
->type
!= REDIS_SET
&& sortval
->type
!= REDIS_LIST
&&
5596 sortval
->type
!= REDIS_ZSET
)
5598 addReply(c
,shared
.wrongtypeerr
);
5602 /* Create a list of operations to perform for every sorted element.
5603 * Operations can be GET/DEL/INCR/DECR */
5604 operations
= listCreate();
5605 listSetFreeMethod(operations
,zfree
);
5608 /* Now we need to protect sortval incrementing its count, in the future
5609 * SORT may have options able to overwrite/delete keys during the sorting
5610 * and the sorted key itself may get destroyed */
5611 incrRefCount(sortval
);
5613 /* The SORT command has an SQL-alike syntax, parse it */
5614 while(j
< c
->argc
) {
5615 int leftargs
= c
->argc
-j
-1;
5616 if (!strcasecmp(c
->argv
[j
]->ptr
,"asc")) {
5618 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"desc")) {
5620 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"alpha")) {
5622 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"limit") && leftargs
>= 2) {
5623 limit_start
= atoi(c
->argv
[j
+1]->ptr
);
5624 limit_count
= atoi(c
->argv
[j
+2]->ptr
);
5626 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"store") && leftargs
>= 1) {
5627 storekey
= c
->argv
[j
+1];
5629 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"by") && leftargs
>= 1) {
5630 sortby
= c
->argv
[j
+1];
5631 /* If the BY pattern does not contain '*', i.e. it is constant,
5632 * we don't need to sort nor to lookup the weight keys. */
5633 if (strchr(c
->argv
[j
+1]->ptr
,'*') == NULL
) dontsort
= 1;
5635 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"get") && leftargs
>= 1) {
5636 listAddNodeTail(operations
,createSortOperation(
5637 REDIS_SORT_GET
,c
->argv
[j
+1]));
5641 decrRefCount(sortval
);
5642 listRelease(operations
);
5643 addReply(c
,shared
.syntaxerr
);
5649 /* Load the sorting vector with all the objects to sort */
5650 switch(sortval
->type
) {
5651 case REDIS_LIST
: vectorlen
= listLength((list
*)sortval
->ptr
); break;
5652 case REDIS_SET
: vectorlen
= dictSize((dict
*)sortval
->ptr
); break;
5653 case REDIS_ZSET
: vectorlen
= dictSize(((zset
*)sortval
->ptr
)->dict
); break;
5654 default: vectorlen
= 0; redisAssert(0); /* Avoid GCC warning */
5656 vector
= zmalloc(sizeof(redisSortObject
)*vectorlen
);
5659 if (sortval
->type
== REDIS_LIST
) {
5660 list
*list
= sortval
->ptr
;
5664 listRewind(list
,&li
);
5665 while((ln
= listNext(&li
))) {
5666 robj
*ele
= ln
->value
;
5667 vector
[j
].obj
= ele
;
5668 vector
[j
].u
.score
= 0;
5669 vector
[j
].u
.cmpobj
= NULL
;
5677 if (sortval
->type
== REDIS_SET
) {
5680 zset
*zs
= sortval
->ptr
;
5684 di
= dictGetIterator(set
);
5685 while((setele
= dictNext(di
)) != NULL
) {
5686 vector
[j
].obj
= dictGetEntryKey(setele
);
5687 vector
[j
].u
.score
= 0;
5688 vector
[j
].u
.cmpobj
= NULL
;
5691 dictReleaseIterator(di
);
5693 redisAssert(j
== vectorlen
);
5695 /* Now it's time to load the right scores in the sorting vector */
5696 if (dontsort
== 0) {
5697 for (j
= 0; j
< vectorlen
; j
++) {
5701 byval
= lookupKeyByPattern(c
->db
,sortby
,vector
[j
].obj
);
5702 if (!byval
|| byval
->type
!= REDIS_STRING
) continue;
5704 vector
[j
].u
.cmpobj
= getDecodedObject(byval
);
5706 if (byval
->encoding
== REDIS_ENCODING_RAW
) {
5707 vector
[j
].u
.score
= strtod(byval
->ptr
,NULL
);
5709 /* Don't need to decode the object if it's
5710 * integer-encoded (the only encoding supported) so
5711 * far. We can just cast it */
5712 if (byval
->encoding
== REDIS_ENCODING_INT
) {
5713 vector
[j
].u
.score
= (long)byval
->ptr
;
5715 redisAssert(1 != 1);
5720 if (vector
[j
].obj
->encoding
== REDIS_ENCODING_RAW
)
5721 vector
[j
].u
.score
= strtod(vector
[j
].obj
->ptr
,NULL
);
5723 if (vector
[j
].obj
->encoding
== REDIS_ENCODING_INT
)
5724 vector
[j
].u
.score
= (long) vector
[j
].obj
->ptr
;
5726 redisAssert(1 != 1);
5733 /* We are ready to sort the vector... perform a bit of sanity check
5734 * on the LIMIT option too. We'll use a partial version of quicksort. */
5735 start
= (limit_start
< 0) ? 0 : limit_start
;
5736 end
= (limit_count
< 0) ? vectorlen
-1 : start
+limit_count
-1;
5737 if (start
>= vectorlen
) {
5738 start
= vectorlen
-1;
5741 if (end
>= vectorlen
) end
= vectorlen
-1;
5743 if (dontsort
== 0) {
5744 server
.sort_desc
= desc
;
5745 server
.sort_alpha
= alpha
;
5746 server
.sort_bypattern
= sortby
? 1 : 0;
5747 if (sortby
&& (start
!= 0 || end
!= vectorlen
-1))
5748 pqsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
, start
,end
);
5750 qsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
);
5753 /* Send command output to the output buffer, performing the specified
5754 * GET/DEL/INCR/DECR operations if any. */
5755 outputlen
= getop
? getop
*(end
-start
+1) : end
-start
+1;
5756 if (storekey
== NULL
) {
5757 /* STORE option not specified, send the sorting result to client */
5758 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",outputlen
));
5759 for (j
= start
; j
<= end
; j
++) {
5764 addReplyBulkLen(c
,vector
[j
].obj
);
5765 addReply(c
,vector
[j
].obj
);
5766 addReply(c
,shared
.crlf
);
5768 listRewind(operations
,&li
);
5769 while((ln
= listNext(&li
))) {
5770 redisSortOperation
*sop
= ln
->value
;
5771 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
5774 if (sop
->type
== REDIS_SORT_GET
) {
5775 if (!val
|| val
->type
!= REDIS_STRING
) {
5776 addReply(c
,shared
.nullbulk
);
5778 addReplyBulkLen(c
,val
);
5780 addReply(c
,shared
.crlf
);
5783 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
5788 robj
*listObject
= createListObject();
5789 list
*listPtr
= (list
*) listObject
->ptr
;
5791 /* STORE option specified, set the sorting result as a List object */
5792 for (j
= start
; j
<= end
; j
++) {
5797 listAddNodeTail(listPtr
,vector
[j
].obj
);
5798 incrRefCount(vector
[j
].obj
);
5800 listRewind(operations
,&li
);
5801 while((ln
= listNext(&li
))) {
5802 redisSortOperation
*sop
= ln
->value
;
5803 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
5806 if (sop
->type
== REDIS_SORT_GET
) {
5807 if (!val
|| val
->type
!= REDIS_STRING
) {
5808 listAddNodeTail(listPtr
,createStringObject("",0));
5810 listAddNodeTail(listPtr
,val
);
5814 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
5818 if (dictReplace(c
->db
->dict
,storekey
,listObject
)) {
5819 incrRefCount(storekey
);
5821 /* Note: we add 1 because the DB is dirty anyway since even if the
5822 * SORT result is empty a new key is set and maybe the old content
5824 server
.dirty
+= 1+outputlen
;
5825 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",outputlen
));
5829 decrRefCount(sortval
);
5830 listRelease(operations
);
5831 for (j
= 0; j
< vectorlen
; j
++) {
5832 if (sortby
&& alpha
&& vector
[j
].u
.cmpobj
)
5833 decrRefCount(vector
[j
].u
.cmpobj
);
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' receives the NUL terminated result and must be large enough to hold
 * it: the longest possible output is "16777216.00T" (12 characters plus the
 * terminator). Values below 1024 are printed as an exact byte count, larger
 * values with two decimals and a K/M/G/T suffix (powers of 1024).
 *
 * Every value of 'n' now writes something into 's': previously amounts of
 * 1024^4 bytes and above matched no branch and left the buffer untouched. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Catch-all so that the output buffer is never left unwritten */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
5859 /* Create the string returned by the INFO command. This is decoupled
5860 * from the INFO command itself as we need to report the same information
5861 * on memory corruption problems.
 *
 * The string is assembled with sdscatprintf(): first the general fields,
 * then the master_* fields only when server.masterhost is set, then the
 * vm_* fields only when server.vm_enabled is set, and finally one
 * "dbN:keys=..,expires=.." line for every database whose main dict or
 * expires dict is not empty. infoCommand() uses the result as an sds. */
5862 static sds
genRedisInfoString(void) {
5864 time_t uptime
= time(NULL
)-server
.stat_starttime
;
5868 bytesToHuman(hmem
,zmalloc_used_memory());
5869 info
= sdscatprintf(sdsempty(),
5870 "redis_version:%s\r\n"
5872 "multiplexing_api:%s\r\n"
5873 "process_id:%ld\r\n"
5874 "uptime_in_seconds:%ld\r\n"
5875 "uptime_in_days:%ld\r\n"
5876 "connected_clients:%d\r\n"
5877 "connected_slaves:%d\r\n"
5878 "blocked_clients:%d\r\n"
5879 "used_memory:%zu\r\n"
5880 "used_memory_human:%s\r\n"
5881 "changes_since_last_save:%lld\r\n"
5882 "bgsave_in_progress:%d\r\n"
5883 "last_save_time:%ld\r\n"
5884 "bgrewriteaof_in_progress:%d\r\n"
5885 "total_connections_received:%lld\r\n"
5886 "total_commands_processed:%lld\r\n"
5890 (sizeof(long) == 8) ? "64" : "32",
5895 listLength(server
.clients
)-listLength(server
.slaves
),
5896 listLength(server
.slaves
),
5897 server
.blpop_blocked_clients
,
5898 zmalloc_used_memory(),
5901 server
.bgsavechildpid
!= -1,
5903 server
.bgrewritechildpid
!= -1,
5904 server
.stat_numconnections
,
5905 server
.stat_numcommands
,
5906 server
.vm_enabled
!= 0,
5907 server
.masterhost
== NULL
? "master" : "slave"
5909 if (server
.masterhost
) {
5910 info
= sdscatprintf(info
,
5911 "master_host:%s\r\n"
5912 "master_port:%d\r\n"
5913 "master_link_status:%s\r\n"
5914 "master_last_io_seconds_ago:%d\r\n"
5917 (server
.replstate
== REDIS_REPL_CONNECTED
) ?
5919 server
.master
? ((int)(time(NULL
)-server
.master
->lastinteraction
)) : -1
5922 if (server
.vm_enabled
) {
5924 info
= sdscatprintf(info
,
5925 "vm_conf_max_memory:%llu\r\n"
5926 "vm_conf_page_size:%llu\r\n"
5927 "vm_conf_pages:%llu\r\n"
5928 "vm_stats_used_pages:%llu\r\n"
5929 "vm_stats_swapped_objects:%llu\r\n"
5930 "vm_stats_swappin_count:%llu\r\n"
5931 "vm_stats_swappout_count:%llu\r\n"
5932 "vm_stats_io_newjobs_len:%lu\r\n"
5933 "vm_stats_io_processing_len:%lu\r\n"
5934 "vm_stats_io_processed_len:%lu\r\n"
5935 "vm_stats_io_active_threads:%lu\r\n"
5936 "vm_stats_blocked_clients:%lu\r\n"
5937 ,(unsigned long long) server
.vm_max_memory
,
5938 (unsigned long long) server
.vm_page_size
,
5939 (unsigned long long) server
.vm_pages
,
5940 (unsigned long long) server
.vm_stats_used_pages
,
5941 (unsigned long long) server
.vm_stats_swapped_objects
,
5942 (unsigned long long) server
.vm_stats_swapins
,
5943 (unsigned long long) server
.vm_stats_swapouts
,
5944 (unsigned long) listLength(server
.io_newjobs
),
5945 (unsigned long) listLength(server
.io_processing
),
5946 (unsigned long) listLength(server
.io_processed
),
5947 (unsigned long) server
.io_active_threads
,
5948 (unsigned long) server
.vm_blocked_clients
5952 for (j
= 0; j
< server
.dbnum
; j
++) {
5953 long long keys
, vkeys
;
5955 keys
= dictSize(server
.db
[j
].dict
);
5956 vkeys
= dictSize(server
.db
[j
].expires
);
5957 if (keys
|| vkeys
) {
5958 info
= sdscatprintf(info
, "db%d:keys=%lld,expires=%lld\r\n",
5965 static void infoCommand(redisClient
*c
) {
5966 sds info
= genRedisInfoString();
5967 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",
5968 (unsigned long)sdslen(info
)));
5969 addReplySds(c
,info
);
5970 addReply(c
,shared
.crlf
);
/* MONITOR command. A client that already has REDIS_SLAVE set is ignored
 * without any reply; otherwise the client is flagged
 * REDIS_SLAVE|REDIS_MONITOR, appended to server.monitors and sent shared.ok. */
5973 static void monitorCommand(redisClient
*c
) {
5974 /* ignore MONITOR if already slave or in monitor mode (monitors carry
 * REDIS_SLAVE too, so this single flag test covers both cases) */
5975 if (c
->flags
& REDIS_SLAVE
) return;
5977 c
->flags
|= (REDIS_SLAVE
|REDIS_MONITOR
);
5979 listAddNodeTail(server
.monitors
,c
);
5980 addReply(c
,shared
.ok
);
5983 /* ================================= Expire ================================= */
/* Remove the expire associated with 'key' by deleting it from db->expires.
 * Only the dictDelete() == DICT_OK test is visible in this excerpt.
 * NOTE(review): presumably returns non-zero when an expire was actually
 * removed and zero otherwise - confirm against the full function. */
5984 static int removeExpire(redisDb
*db
, robj
*key
) {
5985 if (dictDelete(db
->expires
,key
) == DICT_OK
) {
/* Associate the expire time 'when' with 'key' by adding an entry to
 * db->expires. The time_t is stored directly in the entry value (cast to
 * void*), which is why readers cast dictGetEntryVal() back to time_t.
 * Only the dictAdd() == DICT_ERR test is visible in this excerpt;
 * expireGenericCommand() treats a non-zero result as success.
 * NOTE(review): presumably returns 0 when the key already had an expire. */
5992 static int setExpire(redisDb
*db
, robj
*key
, time_t when
) {
5993 if (dictAdd(db
->expires
,key
,(void*)when
) == DICT_ERR
) {
6001 /* Return the expire time of the specified key, or -1 if no expire
6002 * is associated with this key (i.e. the key is non volatile) */
6003 static time_t getExpire(redisDb
*db
, robj
*key
) {
6006 /* No expire? return ASAP */
6007 if (dictSize(db
->expires
) == 0 ||
6008 (de
= dictFind(db
->expires
,key
)) == NULL
) return -1;
6010 return (time_t) dictGetEntryVal(de
);
6013 static int expireIfNeeded(redisDb
*db
, robj
*key
) {
6017 /* No expire? return ASAP */
6018 if (dictSize(db
->expires
) == 0 ||
6019 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
6021 /* Lookup the expire */
6022 when
= (time_t) dictGetEntryVal(de
);
6023 if (time(NULL
) <= when
) return 0;
6025 /* Delete the key */
6026 dictDelete(db
->expires
,key
);
6027 return dictDelete(db
->dict
,key
) == DICT_OK
;
/* Delete 'key' when it has an expire set, no matter whether the expire time
 * has been reached (unlike expireIfNeeded(), no comparison with the current
 * time is made here). Returns 0 when the key has no entry in db->expires,
 * otherwise 1 if the key was removed from db->dict and 0 if it was not. */
6030 static int deleteIfVolatile(redisDb
*db
, robj
*key
) {
6033 /* No expire? return ASAP */
6034 if (dictSize(db
->expires
) == 0 ||
6035 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
6037 /* Delete the key */
6039 dictDelete(db
->expires
,key
);
6040 return dictDelete(db
->dict
,key
) == DICT_OK
;
/* Shared implementation of EXPIRE and EXPIREAT: 'seconds' is relative to the
 * current time (the absolute expire is computed as time(NULL)+seconds).
 * The reply is always either shared.czero or shared.cone.
 * The conditions selecting each branch are not visible in this excerpt.
 * NOTE(review): it looks like a missing key replies czero, a non-positive
 * 'seconds' deletes the key (counting it in server.dirty) and replies cone,
 * and otherwise the reply mirrors the result of setExpire() - confirm. */
6043 static void expireGenericCommand(redisClient
*c
, robj
*key
, time_t seconds
) {
6046 de
= dictFind(c
->db
->dict
,key
);
6048 addReply(c
,shared
.czero
);
6052 if (deleteKey(c
->db
,key
)) server
.dirty
++;
6053 addReply(c
, shared
.cone
);
6056 time_t when
= time(NULL
)+seconds
;
6057 if (setExpire(c
->db
,key
,when
)) {
6058 addReply(c
,shared
.cone
);
6061 addReply(c
,shared
.czero
);
6067 static void expireCommand(redisClient
*c
) {
6068 expireGenericCommand(c
,c
->argv
[1],strtol(c
->argv
[2]->ptr
,NULL
,10));
6071 static void expireatCommand(redisClient
*c
) {
6072 expireGenericCommand(c
,c
->argv
[1],strtol(c
->argv
[2]->ptr
,NULL
,10)-time(NULL
));
/* TTL command: reply ":<ttl>\r\n" where ttl is derived from the expire time
 * getExpire() reports for argv[1], minus the current time, truncated to int.
 * A negative difference is reported as -1.
 * NOTE(review): the declarations and the test guarding the subtraction are
 * not visible here; presumably ttl stays -1 when the key has no expire. */
6075 static void ttlCommand(redisClient
*c
) {
6079 expire
= getExpire(c
->db
,c
->argv
[1]);
6081 ttl
= (int) (expire
-time(NULL
));
6082 if (ttl
< 0) ttl
= -1;
6084 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",ttl
));
6087 /* ================================ MULTI/EXEC ============================== */
6089 /* Client state initialization for MULTI/EXEC */
6090 static void initClientMultiState(redisClient
*c
) {
6091 c
->mstate
.commands
= NULL
;
6092 c
->mstate
.count
= 0;
6095 /* Release all the resources associated with MULTI/EXEC state: for every
 * queued command drop one reference on each of its arguments, then free the
 * c->mstate.commands array itself. The mstate fields are not reset here;
 * the callers in this file invoke initClientMultiState() right afterwards. */
6096 static void freeClientMultiState(redisClient
*c
) {
6099 for (j
= 0; j
< c
->mstate
.count
; j
++) {
6101 multiCmd
*mc
= c
->mstate
.commands
+j
;
6103 for (i
= 0; i
< mc
->argc
; i
++)
6104 decrRefCount(mc
->argv
[i
]);
6107 zfree(c
->mstate
.commands
);
6110 /* Add a new command into the MULTI commands queue.
 *
 * The c->mstate.commands array is grown by one slot with zrealloc(); the new
 * slot (at index c->mstate.count) gets its own copy of the client's argv
 * pointer array, and the reference count of every argument is incremented
 * since the queue now points to it as well.
 * NOTE(review): the lines storing 'cmd'/argc in the slot and bumping
 * c->mstate.count are not visible in this excerpt. */
6111 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
) {
6115 c
->mstate
.commands
= zrealloc(c
->mstate
.commands
,
6116 sizeof(multiCmd
)*(c
->mstate
.count
+1));
6117 mc
= c
->mstate
.commands
+c
->mstate
.count
;
6120 mc
->argv
= zmalloc(sizeof(robj
*)*c
->argc
);
6121 memcpy(mc
->argv
,c
->argv
,sizeof(robj
*)*c
->argc
);
6122 for (j
= 0; j
< c
->argc
; j
++)
6123 incrRefCount(mc
->argv
[j
]);
6127 static void multiCommand(redisClient
*c
) {
6128 c
->flags
|= REDIS_MULTI
;
6129 addReply(c
,shared
.ok
);
6132 static void discardCommand(redisClient
*c
) {
6133 if (!(c
->flags
& REDIS_MULTI
)) {
6134 addReplySds(c
,sdsnew("-ERR DISCARD without MULTI\r\n"));
6138 freeClientMultiState(c
);
6139 initClientMultiState(c
);
6140 c
->flags
&= (~REDIS_MULTI
);
6141 addReply(c
,shared
.ok
);
/* EXEC command: run the commands queued since MULTI.
 * A client without REDIS_MULTI set gets an error. Otherwise a "*<count>\r\n"
 * header is sent, then for every queued command c->argc/c->argv are pointed
 * at the queued arguments and call() is invoked. Afterwards the original
 * argv/argc are restored, the MULTI state is released and reinitialized and
 * the REDIS_MULTI flag is cleared. */
6144 static void execCommand(redisClient
*c
) {
6149 if (!(c
->flags
& REDIS_MULTI
)) {
6150 addReplySds(c
,sdsnew("-ERR EXEC without MULTI\r\n"));
6154 orig_argv
= c
->argv
;
6155 orig_argc
= c
->argc
;
6156 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->mstate
.count
));
6157 for (j
= 0; j
< c
->mstate
.count
; j
++) {
6158 c
->argc
= c
->mstate
.commands
[j
].argc
;
6159 c
->argv
= c
->mstate
.commands
[j
].argv
;
6160 call(c
,c
->mstate
.commands
[j
].cmd
);
6162 c
->argv
= orig_argv
;
6163 c
->argc
= orig_argc
;
6164 freeClientMultiState(c
);
6165 initClientMultiState(c
);
6166 c
->flags
&= (~REDIS_MULTI
);
6169 /* =========================== Blocking Operations ========================= */
6171 /* Currently Redis blocking operations support is limited to list POP ops,
6172 * so the current implementation is not fully generic, but it is also not
6173 * completely specific so it will not require a rewrite to support new
6174 * kind of blocking operations in the future.
6176 * Still it's important to note that list blocking operations can be already
6177 * used as a notification mechanism in order to implement other blocking
6178 * operations at application level, so there must be a very strong evidence
6179 * of usefulness and generality before new blocking operations are implemented.
6181 * This is how the current blocking POP works, we use BLPOP as example:
6182 * - If the user calls BLPOP and the key exists and contains a non empty list
6183 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6184 * if there is no need to block.
6185 * - If instead BLPOP is called and the key does not exist or the list is
6186 * empty we need to block. In order to do so we remove the notification for
6187 * new data to read in the client socket (so that we'll not serve new
6188 * requests if the blocking request is not served). Also we put the client
6189 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
6190 * blocked on those keys.
6191 * - If a PUSH operation against a key with blocked clients waiting is
6192 * performed, we serve the first in the list: basically instead of pushing
6193 * the new element inside the list we return it to the (first / oldest)
6194 * blocked client, unblock the client, and remove it from the list.
6196 * The above comment and the source code should be enough in order to understand
6197 * the implementation and modify / fix it later.
/* Block client 'c' on the 'numkeys' keys in 'keys'; 'timeout' is stored
 * unchanged in c->blockingto. Every key is recorded twice, each time with
 * its reference count incremented: in the client's own c->blockingkeys
 * array, and (when not already present) as a key of c->db->blockingkeys,
 * whose value is the list of clients waiting on it. The client is appended
 * to that list. Finally the client is flagged REDIS_BLOCKED and
 * server.blpop_blocked_clients is incremented. */
6200 /* Set a client in blocking mode for the specified key, with the specified
6202 static void blockForKeys(redisClient
*c
, robj
**keys
, int numkeys
, time_t timeout
) {
6207 c
->blockingkeys
= zmalloc(sizeof(robj
*)*numkeys
);
6208 c
->blockingkeysnum
= numkeys
;
6209 c
->blockingto
= timeout
;
6210 for (j
= 0; j
< numkeys
; j
++) {
6211 /* Add the key in the client structure, to map clients -> keys */
6212 c
->blockingkeys
[j
] = keys
[j
];
6213 incrRefCount(keys
[j
]);
6215 /* And in the other "side", to map keys -> clients */
6216 de
= dictFind(c
->db
->blockingkeys
,keys
[j
]);
6220 /* For every key we take a list of clients blocked for it */
6222 retval
= dictAdd(c
->db
->blockingkeys
,keys
[j
],l
);
6223 incrRefCount(keys
[j
]);
6224 assert(retval
== DICT_OK
);
6226 l
= dictGetEntryVal(de
);
6228 listAddNodeTail(l
,c
);
6230 /* Mark the client as a blocked client */
6231 c
->flags
|= REDIS_BLOCKED
;
6232 server
.blpop_blocked_clients
++;
6235 /* Unblock a client that's waiting in a blocking operation such as BLPOP.
 *
 * For every key in c->blockingkeys the client is removed from the list of
 * waiters stored in c->db->blockingkeys (the dict entry is deleted when the
 * list becomes empty) and the reference on the key is dropped. Then the
 * c->blockingkeys array is freed and set to NULL, REDIS_BLOCKED is cleared,
 * server.blpop_blocked_clients is decremented and any data pending in
 * c->querybuf is processed. c->blockingkeys must not be NULL on entry. */
6236 static void unblockClientWaitingData(redisClient
*c
) {
6241 assert(c
->blockingkeys
!= NULL
);
6242 /* The client may wait for multiple keys, so unblock it for every key. */
6243 for (j
= 0; j
< c
->blockingkeysnum
; j
++) {
6244 /* Remove this client from the list of clients waiting for this key. */
6245 de
= dictFind(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
6247 l
= dictGetEntryVal(de
);
6248 listDelNode(l
,listSearchKey(l
,c
));
6249 /* If the list is empty we need to remove it to avoid wasting memory */
6250 if (listLength(l
) == 0)
6251 dictDelete(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
6252 decrRefCount(c
->blockingkeys
[j
]);
6254 /* Cleanup the client structure */
6255 zfree(c
->blockingkeys
);
6256 c
->blockingkeys
= NULL
;
6257 c
->flags
&= (~REDIS_BLOCKED
);
6258 server
.blpop_blocked_clients
--;
6259 /* We want to process data if there is some command waiting
6260 * in the input buffer. Note that this is safe even if
6261 * unblockClientWaitingData() gets called from freeClient() because
6262 * freeClient() will be smart enough to call this function
6263 * *after* c->querybuf was set to NULL. */
6264 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0) processInputBuffer(c
);
6267 /* This should be called from any function PUSHing into lists.
6268 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6269 * 'ele' is the element pushed.
6271 * If the function returns 0 there was no client waiting for a list push
 * against this key (no entry for 'key' in c->db->blockingkeys).
6274 * If the function returns 1 there was a client waiting for a list push
6275 * against this key, the element was passed to this client thus it's not
6276 * needed to actually add it to the list and the caller should return asap.
 *
 * The receiving client is sent a two element reply ("*2\r\n" followed by
 * the key and then the element, each with its length header and CRLF) and
 * is then unblocked with unblockClientWaitingData(). */
6277 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
) {
6278 struct dictEntry
*de
;
6279 redisClient
*receiver
;
6283 de
= dictFind(c
->db
->blockingkeys
,key
);
6284 if (de
== NULL
) return 0;
6285 l
= dictGetEntryVal(de
);
6288 receiver
= ln
->value
;
6290 addReplySds(receiver
,sdsnew("*2\r\n"));
6291 addReplyBulkLen(receiver
,key
);
6292 addReply(receiver
,key
);
6293 addReply(receiver
,shared
.crlf
);
6294 addReplyBulkLen(receiver
,ele
);
6295 addReply(receiver
,ele
);
6296 addReply(receiver
,shared
.crlf
);
6297 unblockClientWaitingData(receiver
);
6301 /* Blocking RPOP/LPOP. 'where' is forwarded unchanged to popGenericCommand()
 * (blpopCommand() passes REDIS_HEAD, brpopCommand() passes REDIS_TAIL).
 *
 * The keys are argv[1] .. argv[argc-2]; the last argument is the timeout.
 * Keys are examined in order: a key holding something that is not a list
 * gets a wrong type error, and the first non empty list is served at once
 * by temporarily rewriting the client's argv and calling the non-blocking
 * pop. If nothing can be served the timeout is parsed with strtol() (when
 * greater than zero it is turned into an absolute time by adding time(NULL))
 * and the client is blocked on all the keys with blockForKeys(). */
6302 static void blockingPopGenericCommand(redisClient
*c
, int where
) {
6307 for (j
= 1; j
< c
->argc
-1; j
++) {
6308 o
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
6310 if (o
->type
!= REDIS_LIST
) {
6311 addReply(c
,shared
.wrongtypeerr
);
6314 list
*list
= o
->ptr
;
6315 if (listLength(list
) != 0) {
6316 /* If the list contains elements fall back to the usual
6317 * non-blocking POP operation */
6318 robj
*argv
[2], **orig_argv
;
6321 /* We need to alter the command arguments before calling
6322 * popGenericCommand() as the command takes a single key. */
6323 orig_argv
= c
->argv
;
6324 orig_argc
= c
->argc
;
6325 argv
[1] = c
->argv
[j
];
6329 /* Also the return value is different, we need to output
6330 * the multi bulk reply header and the key name. The
6331 * "real" command will add the last element (the value)
6332 * for us. If this sounds like a hack to you it's just
6333 * because it is... */
6334 addReplySds(c
,sdsnew("*2\r\n"));
6335 addReplyBulkLen(c
,argv
[1]);
6336 addReply(c
,argv
[1]);
6337 addReply(c
,shared
.crlf
);
6338 popGenericCommand(c
,where
);
6340 /* Fix the client structure with the original stuff */
6341 c
->argv
= orig_argv
;
6342 c
->argc
= orig_argc
;
6348 /* If the list is empty or the key does not exist we must block */
6349 timeout
= strtol(c
->argv
[c
->argc
-1]->ptr
,NULL
,10);
6350 if (timeout
> 0) timeout
+= time(NULL
);
6351 blockForKeys(c
,c
->argv
+1,c
->argc
-2,timeout
);
6354 static void blpopCommand(redisClient
*c
) {
6355 blockingPopGenericCommand(c
,REDIS_HEAD
);
6358 static void brpopCommand(redisClient
*c
) {
6359 blockingPopGenericCommand(c
,REDIS_TAIL
);
6362 /* =============================== Replication ============================= */
/* Write 'size' bytes from 'ptr' to 'fd', waiting with aeWait() (argument
 * 1000 per wait) until the descriptor is reported writable before each
 * write(). Returns -1 as soon as write() fails. The time elapsed since the
 * call started is compared against 'timeout' (seconds, as time() is used).
 * NOTE(review): the loop structure, the timeout action and the success
 * return are not visible here; 'ret' is initialized to 'size', presumably
 * the value returned on success. */
6364 static int syncWrite(int fd
, char *ptr
, ssize_t size
, int timeout
) {
6365 ssize_t nwritten
, ret
= size
;
6366 time_t start
= time(NULL
);
6370 if (aeWait(fd
,AE_WRITABLE
,1000) & AE_WRITABLE
) {
6371 nwritten
= write(fd
,ptr
,size
);
6372 if (nwritten
== -1) return -1;
6376 if ((time(NULL
)-start
) > timeout
) {
/* Read from 'fd' into 'ptr', waiting with aeWait() (argument 1000 per wait)
 * until the descriptor is reported readable before each read(). Returns -1
 * as soon as read() fails. The time elapsed since the call started is
 * compared against 'timeout' (seconds, as time() is used).
 * NOTE(review): the loop structure, the timeout action and the success
 * return are not visible here; 'totread' presumably accumulates the bytes
 * read until 'size' is reached. */
6384 static int syncRead(int fd
, char *ptr
, ssize_t size
, int timeout
) {
6385 ssize_t nread
, totread
= 0;
6386 time_t start
= time(NULL
);
6390 if (aeWait(fd
,AE_READABLE
,1000) & AE_READABLE
) {
6391 nread
= read(fd
,ptr
,size
);
6392 if (nread
== -1) return -1;
6397 if ((time(NULL
)-start
) > timeout
) {
/* Read a line from 'fd' into 'ptr', one byte at a time through syncRead()
 * with the given 'timeout'. Returns -1 when a read fails. When at least one
 * byte was stored and the byte just before the current position is a
 * carriage return, it is overwritten with a NUL terminator.
 * NOTE(review): the loop, the newline test and the success return are not
 * visible in this excerpt. */
6405 static int syncReadLine(int fd
, char *ptr
, ssize_t size
, int timeout
) {
6412 if (syncRead(fd
,&c
,1,timeout
) == -1) return -1;
6415 if (nread
&& *(ptr
-1) == '\r') *(ptr
-1) = '\0';
/* SYNC command, issued by a slave asking for a full synchronization.
 * Ignored when the client already has REDIS_SLAVE set, refused when the
 * client still has pending replies. If a BGSAVE is already running the
 * server.slaves list is scanned for a slave in REDIS_REPL_WAIT_BGSAVE_END
 * state: when one exists its reply list is duplicated into this client,
 * otherwise the client waits for the next BGSAVE. If no BGSAVE is running
 * one is started. On success the client is flagged REDIS_SLAVE and appended
 * to server.slaves. */
6426 static void syncCommand(redisClient
*c
) {
6427 /* ignore SYNC if already slave or in monitor mode */
6428 if (c
->flags
& REDIS_SLAVE
) return;
6430 /* SYNC can't be issued when the server has pending data to send to
6431 * the client about already issued commands. We need a fresh reply
6432 * buffer registering the differences between the BGSAVE and the current
6433 * dataset, so that we can copy to other slaves if needed. */
6434 if (listLength(c
->reply
) != 0) {
6435 addReplySds(c
,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6439 redisLog(REDIS_NOTICE
,"Slave ask for synchronization");
6440 /* Here we need to check if there is a background saving operation
6441 * in progress, or if it is required to start one */
6442 if (server
.bgsavechildpid
!= -1) {
6443 /* Ok a background save is in progress. Let's check if it is a good
6444 * one for replication, i.e. if there is another slave that is
6445 * registering differences since the server forked to save */
6450 listRewind(server
.slaves
,&li
);
6451 while((ln
= listNext(&li
))) {
6453 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) break;
6456 /* Perfect, the server is already registering differences for
6457 * another slave. Set the right state, and copy the buffer. */
6458 listRelease(c
->reply
);
6459 c
->reply
= listDup(slave
->reply
);
6460 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
6461 redisLog(REDIS_NOTICE
,"Waiting for end of BGSAVE for SYNC");
6463 /* No way, we need to wait for the next BGSAVE in order to
6464 * register differences */
6465 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
6466 redisLog(REDIS_NOTICE
,"Waiting for next BGSAVE for SYNC");
6469 /* Ok we don't have a BGSAVE in progress, let's start one */
6470 redisLog(REDIS_NOTICE
,"Starting BGSAVE for SYNC");
6471 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
6472 redisLog(REDIS_NOTICE
,"Replication failed, can't BGSAVE");
6473 addReplySds(c
,sdsnew("-ERR Unalbe to perform background save\r\n"));
6476 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
6479 c
->flags
|= REDIS_SLAVE
;
6481 listAddNodeTail(server
.slaves
,c
);
/* File event handler, installed for AE_WRITABLE by
 * updateSlavesWaitingBgsave(), that transfers the dump file to a slave;
 * 'privdata' is the slave client. On the first invocation
 * (slave->repldboff == 0) a "$<count>\r\n" header is written. Each
 * invocation then seeks slave->repldbfd to repldboff, reads up to
 * REDIS_IOBUF_LEN bytes and writes them to 'fd', advancing repldboff by the
 * number of bytes actually written. When repldboff equals repldbsize the
 * dump descriptor is closed, this handler is replaced by sendReplyToClient
 * and the slave state becomes REDIS_REPL_ONLINE.
 * NOTE(review): the header is formatted with "%lld" while the argument is
 * cast to (unsigned long long). */
6485 static void sendBulkToSlave(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
6486 redisClient
*slave
= privdata
;
6488 REDIS_NOTUSED(mask
);
6489 char buf
[REDIS_IOBUF_LEN
];
6490 ssize_t nwritten
, buflen
;
6492 if (slave
->repldboff
== 0) {
6493 /* Write the bulk write count before transferring the DB. In theory here
6494 * we don't know how much room there is in the output buffer of the
6495 * socket, but in practice SO_SNDLOWAT (the minimum count for output
6496 * operations) will never be smaller than the few bytes we need. */
6499 bulkcount
= sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6501 if (write(fd
,bulkcount
,sdslen(bulkcount
)) != (signed)sdslen(bulkcount
))
6509 lseek(slave
->repldbfd
,slave
->repldboff
,SEEK_SET
);
6510 buflen
= read(slave
->repldbfd
,buf
,REDIS_IOBUF_LEN
);
6512 redisLog(REDIS_WARNING
,"Read error sending DB to slave: %s",
6513 (buflen
== 0) ? "premature EOF" : strerror(errno
));
6517 if ((nwritten
= write(fd
,buf
,buflen
)) == -1) {
6518 redisLog(REDIS_VERBOSE
,"Write error sending DB to slave: %s",
6523 slave
->repldboff
+= nwritten
;
6524 if (slave
->repldboff
== slave
->repldbsize
) {
6525 close(slave
->repldbfd
);
6526 slave
->repldbfd
= -1;
6527 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
6528 slave
->replstate
= REDIS_REPL_ONLINE
;
6529 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
,
6530 sendReplyToClient
, slave
) == AE_ERR
) {
6534 addReplySds(slave
,sdsempty());
6535 redisLog(REDIS_NOTICE
,"Synchronization with slave succeeded");
6539 /* This function is called at the end of every background saving.
6540 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6541 * otherwise REDIS_ERR is passed to the function.
6543 * The goal of this function is to handle slaves waiting for a successful
6544 * background saving in order to perform non-blocking synchronization.
 *
 * Slaves in REDIS_REPL_WAIT_BGSAVE_START state are moved to
 * REDIS_REPL_WAIT_BGSAVE_END. For slaves already in
 * REDIS_REPL_WAIT_BGSAVE_END state the dump file (server.dbfilename) is
 * opened and stat'ed, the transfer offset and size are initialized, the
 * state becomes REDIS_REPL_SEND_BULK and sendBulkToSlave is installed as
 * the AE_WRITABLE handler of the slave socket. A further BGSAVE may then be
 * started with rdbSaveBackground(); if that fails the slaves still in
 * REDIS_REPL_WAIT_BGSAVE_START state are visited again.
 * NOTE(review): the error handling for each failure is not visible in this
 * excerpt. */
6545 static void updateSlavesWaitingBgsave(int bgsaveerr
) {
6547 int startbgsave
= 0;
6550 listRewind(server
.slaves
,&li
);
6551 while((ln
= listNext(&li
))) {
6552 redisClient
*slave
= ln
->value
;
6554 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) {
6556 slave
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
6557 } else if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) {
6558 struct redis_stat buf
;
6560 if (bgsaveerr
!= REDIS_OK
) {
6562 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE child returned an error");
6565 if ((slave
->repldbfd
= open(server
.dbfilename
,O_RDONLY
)) == -1 ||
6566 redis_fstat(slave
->repldbfd
,&buf
) == -1) {
6568 redisLog(REDIS_WARNING
,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno
));
6571 slave
->repldboff
= 0;
6572 slave
->repldbsize
= buf
.st_size
;
6573 slave
->replstate
= REDIS_REPL_SEND_BULK
;
6574 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
6575 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
, sendBulkToSlave
, slave
) == AE_ERR
) {
6582 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
6585 listRewind(server
.slaves
,&li
);
6586 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE failed");
6587 while((ln
= listNext(&li
))) {
6588 redisClient
*slave
= ln
->value
;
6590 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
)
/* Slave side of the initial synchronization with the master.
 * Steps, in order: connect to server.masterhost:server.masterport; when
 * server.masterauth is set send "AUTH <password>" and require a reply
 * starting with '+'; send SYNC; read the bulk count line, which must start
 * with '$'; copy the announced number of bytes from the socket into a
 * temporary file, at most 1024 bytes per read; rename the temporary file to
 * server.dbfilename; load it with rdbLoad(); finally wrap the socket in a
 * client stored in server.master (flagged REDIS_MASTER, authenticated) and
 * set server.replstate to REDIS_REPL_CONNECTED.
 * Every failure is logged at REDIS_WARNING level.
 * NOTE(review): the cleanup and the values returned on failure and success
 * are not visible in this excerpt. */
6597 static int syncWithMaster(void) {
6598 char buf
[1024], tmpfile
[256], authcmd
[1024];
6600 int fd
= anetTcpConnect(NULL
,server
.masterhost
,server
.masterport
);
6604 redisLog(REDIS_WARNING
,"Unable to connect to MASTER: %s",
6609 /* AUTH with the master if required. */
6610 if(server
.masterauth
) {
6611 snprintf(authcmd
, 1024, "AUTH %s\r\n", server
.masterauth
);
6612 if (syncWrite(fd
, authcmd
, strlen(server
.masterauth
)+7, 5) == -1) {
6614 redisLog(REDIS_WARNING
,"Unable to AUTH to MASTER: %s",
6618 /* Read the AUTH result. */
6619 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
6621 redisLog(REDIS_WARNING
,"I/O error reading auth result from MASTER: %s",
6625 if (buf
[0] != '+') {
6627 redisLog(REDIS_WARNING
,"Cannot AUTH to MASTER, is the masterauth password correct?");
6632 /* Issue the SYNC command */
6633 if (syncWrite(fd
,"SYNC \r\n",7,5) == -1) {
6635 redisLog(REDIS_WARNING
,"I/O error writing to MASTER: %s",
6639 /* Read the bulk write count */
6640 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
6642 redisLog(REDIS_WARNING
,"I/O error reading bulk count from MASTER: %s",
6646 if (buf
[0] != '$') {
6648 redisLog(REDIS_WARNING
,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6651 dumpsize
= atoi(buf
+1);
6652 redisLog(REDIS_NOTICE
,"Receiving %d bytes data dump from MASTER",dumpsize
);
6653 /* Read the bulk write data on a temp file */
6654 snprintf(tmpfile
,256,"temp-%d.%ld.rdb",(int)time(NULL
),(long int)random());
6655 dfd
= open(tmpfile
,O_CREAT
|O_WRONLY
,0644);
6658 redisLog(REDIS_WARNING
,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno
));
6662 int nread
, nwritten
;
6664 nread
= read(fd
,buf
,(dumpsize
< 1024)?dumpsize
:1024);
6666 redisLog(REDIS_WARNING
,"I/O error trying to sync with MASTER: %s",
6672 nwritten
= write(dfd
,buf
,nread
);
6673 if (nwritten
== -1) {
6674 redisLog(REDIS_WARNING
,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno
));
6682 if (rename(tmpfile
,server
.dbfilename
) == -1) {
6683 redisLog(REDIS_WARNING
,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno
));
6689 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
6690 redisLog(REDIS_WARNING
,"Failed trying to load the MASTER synchronization DB from disk");
6694 server
.master
= createClient(fd
);
6695 server
.master
->flags
|= REDIS_MASTER
;
6696 server
.master
->authenticated
= 1;
6697 server
.replstate
= REDIS_REPL_CONNECTED
;
6701 static void slaveofCommand(redisClient
*c
) {
6702 if (!strcasecmp(c
->argv
[1]->ptr
,"no") &&
6703 !strcasecmp(c
->argv
[2]->ptr
,"one")) {
6704 if (server
.masterhost
) {
6705 sdsfree(server
.masterhost
);
6706 server
.masterhost
= NULL
;
6707 if (server
.master
) freeClient(server
.master
);
6708 server
.replstate
= REDIS_REPL_NONE
;
6709 redisLog(REDIS_NOTICE
,"MASTER MODE enabled (user request)");
6712 sdsfree(server
.masterhost
);
6713 server
.masterhost
= sdsdup(c
->argv
[1]->ptr
);
6714 server
.masterport
= atoi(c
->argv
[2]->ptr
);
6715 if (server
.master
) freeClient(server
.master
);
6716 server
.replstate
= REDIS_REPL_CONNECT
;
6717 redisLog(REDIS_NOTICE
,"SLAVE OF %s:%d enabled (user request)",
6718 server
.masterhost
, server
.masterport
);
6720 addReply(c
,shared
.ok
);
6723 /* ============================ Maxmemory directive ======================== */
/* Detach the first node of server.objfreelist, if the list is not empty,
 * taking its value before deleting the node. When server.vm_enabled is set
 * the list is accessed with server.obj_freelist_mutex held, and the mutex is
 * released on both the non-empty and the empty path.
 * NOTE(review): what is done with the detached object and the return
 * statements are not visible in this excerpt; freeMemoryIfNeeded() compares
 * the result against REDIS_OK. */
6725 /* Try to free one object form the pre-allocated objects free list.
6726 * This is useful under low mem conditions as by default we take 1 million
6727 * free objects allocated. On success REDIS_OK is returned, otherwise
6729 static int tryFreeOneObjectFromFreelist(void) {
6732 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
6733 if (listLength(server
.objfreelist
)) {
6734 listNode
*head
= listFirst(server
.objfreelist
);
6735 o
= listNodeValue(head
);
6736 listDelNode(server
.objfreelist
,head
);
6737 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
6741 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
/* Loop while server.maxmemory is set and zmalloc_used_memory() exceeds it.
 * Each iteration first tries tryFreeOneObjectFromFreelist(); when that does
 * not return REDIS_OK every database with a non-empty expires dict is
 * visited, three random entries of that dict are sampled, and the key with
 * the smallest expire time among them is deleted with deleteKey(). The
 * function returns when an iteration freed nothing. */
6746 /* This function gets called when 'maxmemory' is set on the config file to limit
6747 * the max memory used by the server, and we are out of memory.
6748 * This function will try to, in order:
6750 * - Free objects from the free list
6751 * - Try to remove keys with an EXPIRE set
6753 * If it is not possible to free enough memory to reach used-memory < maxmemory
6754 * the server will start refusing commands that will enlarge even more the
6757 static void freeMemoryIfNeeded(void) {
6758 while (server
.maxmemory
&& zmalloc_used_memory() > server
.maxmemory
) {
6759 int j
, k
, freed
= 0;
6761 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
6762 for (j
= 0; j
< server
.dbnum
; j
++) {
6764 robj
*minkey
= NULL
;
6765 struct dictEntry
*de
;
6767 if (dictSize(server
.db
[j
].expires
)) {
6769 /* From a sample of three keys drop the one nearest to
6770 * the natural expire */
6771 for (k
= 0; k
< 3; k
++) {
6774 de
= dictGetRandomKey(server
.db
[j
].expires
);
6775 t
= (time_t) dictGetEntryVal(de
);
6776 if (minttl
== -1 || t
< minttl
) {
6777 minkey
= dictGetEntryKey(de
);
6781 deleteKey(server
.db
+j
,minkey
);
6784 if (!freed
) return; /* nothing to free... */
6788 /* ============================== Append Only file ========================== */
6790 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
6791 sds buf
= sdsempty();
6797 /* The DB this command was targeting is not the same as the last command
6798 * we appended. We need to issue a SELECT command first. */
6799 if (dictid
!= server
.appendseldb
) {
6802 snprintf(seldb
,sizeof(seldb
),"%d",dictid
);
6803 buf
= sdscatprintf(buf
,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
6804 (unsigned long)strlen(seldb
),seldb
);
6805 server
.appendseldb
= dictid
;
6808 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6809 * EXPIREs into EXPIREATs calls */
6810 if (cmd
->proc
== expireCommand
) {
6813 tmpargv
[0] = createStringObject("EXPIREAT",8);
6814 tmpargv
[1] = argv
[1];
6815 incrRefCount(argv
[1]);
6816 when
= time(NULL
)+strtol(argv
[2]->ptr
,NULL
,10);
6817 tmpargv
[2] = createObject(REDIS_STRING
,
6818 sdscatprintf(sdsempty(),"%ld",when
));
6822 /* Append the actual command */
6823 buf
= sdscatprintf(buf
,"*%d\r\n",argc
);
6824 for (j
= 0; j
< argc
; j
++) {
6827 o
= getDecodedObject(o
);
6828 buf
= sdscatprintf(buf
,"$%lu\r\n",(unsigned long)sdslen(o
->ptr
));
6829 buf
= sdscatlen(buf
,o
->ptr
,sdslen(o
->ptr
));
6830 buf
= sdscatlen(buf
,"\r\n",2);
6834 /* Free the objects from the modified argv for EXPIREAT */
6835 if (cmd
->proc
== expireCommand
) {
6836 for (j
= 0; j
< 3; j
++)
6837 decrRefCount(argv
[j
]);
6840 /* We want to perform a single write. This should be guaranteed atomic
6841 * at least if the filesystem we are writing is a real physical one.
6842 * While this will save us against the server being killed I don't think
6843 * there is much to do about the whole server stopping for power problems
6845 nwritten
= write(server
.appendfd
,buf
,sdslen(buf
));
6846 if (nwritten
!= (signed)sdslen(buf
)) {
6847 /* Ooops, we are in troubles. The best thing to do for now is
6848 * to simply exit instead to give the illusion that everything is
6849 * working as expected. */
6850 if (nwritten
== -1) {
6851 redisLog(REDIS_WARNING
,"Exiting on error writing to the append-only file: %s",strerror(errno
));
6853 redisLog(REDIS_WARNING
,"Exiting on short write while writing to the append-only file: %s",strerror(errno
));
6857 /* If a background append only file rewriting is in progress we want to
6858 * accumulate the differences between the child DB and the current one
6859 * in a buffer, so that when the child process will do its work we
6860 * can append the differences to the new append only file. */
6861 if (server
.bgrewritechildpid
!= -1)
6862 server
.bgrewritebuf
= sdscatlen(server
.bgrewritebuf
,buf
,sdslen(buf
));
6866 if (server
.appendfsync
== APPENDFSYNC_ALWAYS
||
6867 (server
.appendfsync
== APPENDFSYNC_EVERYSEC
&&
6868 now
-server
.lastfsync
> 1))
6870 fsync(server
.appendfd
); /* Let's try to get this data on the disk */
6871 server
.lastfsync
= now
;
6875 /* In Redis commands are always executed in the context of a client, so in
6876 * order to load the append only file we need to create a fake client. */
6877 static struct redisClient
*createFakeClient(void) {
6878 struct redisClient
*c
= zmalloc(sizeof(*c
));
6882 c
->querybuf
= sdsempty();
6886 /* We set the fake client as a slave waiting for the synchronization
6887 * so that Redis will not try to send replies to this client. */
6888 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
6889 c
->reply
= listCreate();
6890 listSetFreeMethod(c
->reply
,decrRefCount
);
6891 listSetDupMethod(c
->reply
,dupClientReplyValue
);
6895 static void freeFakeClient(struct redisClient
*c
) {
6896 sdsfree(c
->querybuf
);
6897 listRelease(c
->reply
);
6901 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
6902 * error (the append only file is zero-length) REDIS_ERR is returned. On
6903 * fatal error an error message is logged and the program exists. */
6904 int loadAppendOnlyFile(char *filename
) {
6905 struct redisClient
*fakeClient
;
6906 FILE *fp
= fopen(filename
,"r");
6907 struct redis_stat sb
;
6908 unsigned long long loadedkeys
= 0;
6910 if (redis_fstat(fileno(fp
),&sb
) != -1 && sb
.st_size
== 0)
6914 redisLog(REDIS_WARNING
,"Fatal error: can't open the append log file for reading: %s",strerror(errno
));
6918 fakeClient
= createFakeClient();
6925 struct redisCommand
*cmd
;
6927 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) {
6933 if (buf
[0] != '*') goto fmterr
;
6935 argv
= zmalloc(sizeof(robj
*)*argc
);
6936 for (j
= 0; j
< argc
; j
++) {
6937 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) goto readerr
;
6938 if (buf
[0] != '$') goto fmterr
;
6939 len
= strtol(buf
+1,NULL
,10);
6940 argsds
= sdsnewlen(NULL
,len
);
6941 if (len
&& fread(argsds
,len
,1,fp
) == 0) goto fmterr
;
6942 argv
[j
] = createObject(REDIS_STRING
,argsds
);
6943 if (fread(buf
,2,1,fp
) == 0) goto fmterr
; /* discard CRLF */
6946 /* Command lookup */
6947 cmd
= lookupCommand(argv
[0]->ptr
);
6949 redisLog(REDIS_WARNING
,"Unknown command '%s' reading the append only file", argv
[0]->ptr
);
6952 /* Try object sharing and encoding */
6953 if (server
.shareobjects
) {
6955 for(j
= 1; j
< argc
; j
++)
6956 argv
[j
] = tryObjectSharing(argv
[j
]);
6958 if (cmd
->flags
& REDIS_CMD_BULK
)
6959 tryObjectEncoding(argv
[argc
-1]);
6960 /* Run the command in the context of a fake client */
6961 fakeClient
->argc
= argc
;
6962 fakeClient
->argv
= argv
;
6963 cmd
->proc(fakeClient
);
6964 /* Discard the reply objects list from the fake client */
6965 while(listLength(fakeClient
->reply
))
6966 listDelNode(fakeClient
->reply
,listFirst(fakeClient
->reply
));
6967 /* Clean up, ready for the next command */
6968 for (j
= 0; j
< argc
; j
++) decrRefCount(argv
[j
]);
6970 /* Handle swapping while loading big datasets when VM is on */
6972 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
6973 while (zmalloc_used_memory() > server
.vm_max_memory
) {
6974 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
6979 freeFakeClient(fakeClient
);
6984 redisLog(REDIS_WARNING
,"Unexpected end of file reading the append only file");
6986 redisLog(REDIS_WARNING
,"Unrecoverable error reading the append only file: %s", strerror(errno
));
6990 redisLog(REDIS_WARNING
,"Bad file format reading the append only file");
6994 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
6995 static int fwriteBulk(FILE *fp
, robj
*obj
) {
6999 /* Avoid the incr/decr ref count business if possible to help
7000 * copy-on-write (we are often in a child process when this function
7002 * Also makes sure that key objects don't get incrRefCount-ed when VM
7004 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
7005 obj
= getDecodedObject(obj
);
7008 snprintf(buf
,sizeof(buf
),"$%ld\r\n",(long)sdslen(obj
->ptr
));
7009 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) goto err
;
7010 if (sdslen(obj
->ptr
) && fwrite(obj
->ptr
,sdslen(obj
->ptr
),1,fp
) == 0)
7012 if (fwrite("\r\n",2,1,fp
) == 0) goto err
;
7013 if (decrrc
) decrRefCount(obj
);
7016 if (decrrc
) decrRefCount(obj
);
/* Write the double 'd', formatted with "%.17g", to 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is rendered together with its trailing CRLF, that is not
     * part of the advertised count: hence the -2 below. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
        return 0;
    return 1;
}
/* Write the long 'l', in decimal, to 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is rendered together with its trailing CRLF, that is not
     * part of the advertised count: hence the -2 below. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
        return 0;
    return 1;
}
7042 /* Write a sequence of commands able to fully rebuild the dataset into
7043 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7044 static int rewriteAppendOnlyFile(char *filename
) {
7045 dictIterator
*di
= NULL
;
7050 time_t now
= time(NULL
);
7052 /* Note that we have to use a different temp name here compared to the
7053 * one used by rewriteAppendOnlyFileBackground() function. */
7054 snprintf(tmpfile
,256,"temp-rewriteaof-%d.aof", (int) getpid());
7055 fp
= fopen(tmpfile
,"w");
7057 redisLog(REDIS_WARNING
, "Failed rewriting the append only file: %s", strerror(errno
));
7060 for (j
= 0; j
< server
.dbnum
; j
++) {
7061 char selectcmd
[] = "*2\r\n$6\r\nSELECT\r\n";
7062 redisDb
*db
= server
.db
+j
;
7064 if (dictSize(d
) == 0) continue;
7065 di
= dictGetIterator(d
);
7071 /* SELECT the new DB */
7072 if (fwrite(selectcmd
,sizeof(selectcmd
)-1,1,fp
) == 0) goto werr
;
7073 if (fwriteBulkLong(fp
,j
) == 0) goto werr
;
7075 /* Iterate this DB writing every entry */
7076 while((de
= dictNext(di
)) != NULL
) {
7081 key
= dictGetEntryKey(de
);
7082 /* If the value for this key is swapped, load a preview in memory.
7083 * We use a "swapped" flag to remember if we need to free the
7084 * value object instead to just increment the ref count anyway
7085 * in order to avoid copy-on-write of pages if we are forked() */
7086 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
7087 key
->storage
== REDIS_VM_SWAPPING
) {
7088 o
= dictGetEntryVal(de
);
7091 o
= vmPreviewObject(key
);
7094 expiretime
= getExpire(db
,key
);
7096 /* Save the key and associated value */
7097 if (o
->type
== REDIS_STRING
) {
7098 /* Emit a SET command */
7099 char cmd
[]="*3\r\n$3\r\nSET\r\n";
7100 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7102 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7103 if (fwriteBulk(fp
,o
) == 0) goto werr
;
7104 } else if (o
->type
== REDIS_LIST
) {
7105 /* Emit the RPUSHes needed to rebuild the list */
7106 list
*list
= o
->ptr
;
7110 listRewind(list
,&li
);
7111 while((ln
= listNext(&li
))) {
7112 char cmd
[]="*3\r\n$5\r\nRPUSH\r\n";
7113 robj
*eleobj
= listNodeValue(ln
);
7115 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7116 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7117 if (fwriteBulk(fp
,eleobj
) == 0) goto werr
;
7119 } else if (o
->type
== REDIS_SET
) {
7120 /* Emit the SADDs needed to rebuild the set */
7122 dictIterator
*di
= dictGetIterator(set
);
7125 while((de
= dictNext(di
)) != NULL
) {
7126 char cmd
[]="*3\r\n$4\r\nSADD\r\n";
7127 robj
*eleobj
= dictGetEntryKey(de
);
7129 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7130 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7131 if (fwriteBulk(fp
,eleobj
) == 0) goto werr
;
7133 dictReleaseIterator(di
);
7134 } else if (o
->type
== REDIS_ZSET
) {
7135 /* Emit the ZADDs needed to rebuild the sorted set */
7137 dictIterator
*di
= dictGetIterator(zs
->dict
);
7140 while((de
= dictNext(di
)) != NULL
) {
7141 char cmd
[]="*4\r\n$4\r\nZADD\r\n";
7142 robj
*eleobj
= dictGetEntryKey(de
);
7143 double *score
= dictGetEntryVal(de
);
7145 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7146 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7147 if (fwriteBulkDouble(fp
,*score
) == 0) goto werr
;
7148 if (fwriteBulk(fp
,eleobj
) == 0) goto werr
;
7150 dictReleaseIterator(di
);
7152 redisAssert(0 != 0);
7154 /* Save the expire time */
7155 if (expiretime
!= -1) {
7156 char cmd
[]="*3\r\n$8\r\nEXPIREAT\r\n";
7157 /* If this key is already expired skip it */
7158 if (expiretime
< now
) continue;
7159 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7160 if (fwriteBulk(fp
,key
) == 0) goto werr
;
7161 if (fwriteBulkLong(fp
,expiretime
) == 0) goto werr
;
7163 if (swapped
) decrRefCount(o
);
7165 dictReleaseIterator(di
);
7168 /* Make sure data will not remain on the OS's output buffers */
7173 /* Use RENAME to make sure the DB file is changed atomically only
7174 * if the generate DB file is ok. */
7175 if (rename(tmpfile
,filename
) == -1) {
7176 redisLog(REDIS_WARNING
,"Error moving temp append only file on the final destination: %s", strerror(errno
));
7180 redisLog(REDIS_NOTICE
,"SYNC append only file rewrite performed");
7186 redisLog(REDIS_WARNING
,"Write error writing append only file on disk: %s", strerror(errno
));
7187 if (di
) dictReleaseIterator(di
);
7191 /* This is how rewriting of the append only file in background works:
7193 * 1) The user calls BGREWRITEAOF
7194 * 2) Redis calls this function, that forks():
7195 * 2a) the child rewrite the append only file in a temp file.
7196 * 2b) the parent accumulates differences in server.bgrewritebuf.
7197 * 3) When the child finished '2a' exists.
7198 * 4) The parent will trap the exit code, if it's OK, will append the
7199 * data accumulated into server.bgrewritebuf into the temp file, and
7200 * finally will rename(2) the temp file in the actual file name.
7201 * The the new file is reopened as the new append only file. Profit!
7203 static int rewriteAppendOnlyFileBackground(void) {
7206 if (server
.bgrewritechildpid
!= -1) return REDIS_ERR
;
7207 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
7208 if ((childpid
= fork()) == 0) {
7212 if (server
.vm_enabled
) vmReopenSwapFile();
7214 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7215 if (rewriteAppendOnlyFile(tmpfile
) == REDIS_OK
) {
7222 if (childpid
== -1) {
7223 redisLog(REDIS_WARNING
,
7224 "Can't rewrite append only file in background: fork: %s",
7228 redisLog(REDIS_NOTICE
,
7229 "Background append only file rewriting started by pid %d",childpid
);
7230 server
.bgrewritechildpid
= childpid
;
7231 /* We set appendseldb to -1 in order to force the next call to the
7232 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7233 * accumulated by the parent into server.bgrewritebuf will start
7234 * with a SELECT statement and it will be safe to merge. */
7235 server
.appendseldb
= -1;
7238 return REDIS_OK
; /* unreached */
7241 static void bgrewriteaofCommand(redisClient
*c
) {
7242 if (server
.bgrewritechildpid
!= -1) {
7243 addReplySds(c
,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7246 if (rewriteAppendOnlyFileBackground() == REDIS_OK
) {
7247 char *status
= "+Background append only file rewriting started\r\n";
7248 addReplySds(c
,sdsnew(status
));
7250 addReply(c
,shared
.err
);
/* Remove the temp file used by the background append only file rewriting
 * child having pid 'childpid'. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
7261 /* Virtual Memory is composed mainly of two subsystems:
7262 * - Blocking Virtual Memory
7263 * - Threaded Virtual Memory I/O
7264 * The two parts are not fully decoupled, but functions are split among two
7265 * different sections of the source code (delimited by comments) in order to
7266 * make more clear what functionality is about the blocking VM and what about
7267 * the threaded (not blocking) VM.
7271 * Redis VM is a blocking VM (one that blocks reading swapped values from
7272 * disk into memory when a value swapped out is needed in memory) that is made
7273 * unblocking by trying to examine the command argument vector in order to
7274 * load in background values that will likely be needed in order to exec
7275 * the command. The command is executed only once all the relevant keys
7276 * are loaded into memory.
7278 * This basically is almost as simple as a blocking VM, but almost as parallel
7279 * as a fully non-blocking VM.
7282 /* =================== Virtual Memory - Blocking Side ====================== */
7284 /* substitute the first occurrence of '%p' with the process pid in the
7285 * swap file name. */
7286 static void expandVmSwapFilename(void) {
7287 char *p
= strstr(server
.vm_swap_file
,"%p");
7293 new = sdscat(new,server
.vm_swap_file
);
7294 new = sdscatprintf(new,"%ld",(long) getpid());
7295 new = sdscat(new,p
+2);
7296 zfree(server
.vm_swap_file
);
7297 server
.vm_swap_file
= new;
7300 static void vmInit(void) {
7305 if (server
.vm_max_threads
!= 0)
7306 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7308 expandVmSwapFilename();
7309 redisLog(REDIS_NOTICE
,"Using '%s' as swap file",server
.vm_swap_file
);
7310 if ((server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b")) == NULL
) {
7311 server
.vm_fp
= fopen(server
.vm_swap_file
,"w+b");
7313 if (server
.vm_fp
== NULL
) {
7314 redisLog(REDIS_WARNING
,
7315 "Impossible to open the swap file: %s. Exiting.",
7319 server
.vm_fd
= fileno(server
.vm_fp
);
7320 server
.vm_next_page
= 0;
7321 server
.vm_near_pages
= 0;
7322 server
.vm_stats_used_pages
= 0;
7323 server
.vm_stats_swapped_objects
= 0;
7324 server
.vm_stats_swapouts
= 0;
7325 server
.vm_stats_swapins
= 0;
7326 totsize
= server
.vm_pages
*server
.vm_page_size
;
7327 redisLog(REDIS_NOTICE
,"Allocating %lld bytes of swap file",totsize
);
7328 if (ftruncate(server
.vm_fd
,totsize
) == -1) {
7329 redisLog(REDIS_WARNING
,"Can't ftruncate swap file: %s. Exiting.",
7333 redisLog(REDIS_NOTICE
,"Swap file allocated with success");
7335 server
.vm_bitmap
= zmalloc((server
.vm_pages
+7)/8);
7336 redisLog(REDIS_VERBOSE
,"Allocated %lld bytes page table for %lld pages",
7337 (long long) (server
.vm_pages
+7)/8, server
.vm_pages
);
7338 memset(server
.vm_bitmap
,0,(server
.vm_pages
+7)/8);
7340 /* Initialize threaded I/O (used by Virtual Memory) */
7341 server
.io_newjobs
= listCreate();
7342 server
.io_processing
= listCreate();
7343 server
.io_processed
= listCreate();
7344 server
.io_ready_clients
= listCreate();
7345 pthread_mutex_init(&server
.io_mutex
,NULL
);
7346 pthread_mutex_init(&server
.obj_freelist_mutex
,NULL
);
7347 pthread_mutex_init(&server
.io_swapfile_mutex
,NULL
);
7348 server
.io_active_threads
= 0;
7349 if (pipe(pipefds
) == -1) {
7350 redisLog(REDIS_WARNING
,"Unable to intialized VM: pipe(2): %s. Exiting."
7354 server
.io_ready_pipe_read
= pipefds
[0];
7355 server
.io_ready_pipe_write
= pipefds
[1];
7356 redisAssert(anetNonBlock(NULL
,server
.io_ready_pipe_read
) != ANET_ERR
);
7357 /* LZF requires a lot of stack */
7358 pthread_attr_init(&server
.io_threads_attr
);
7359 pthread_attr_getstacksize(&server
.io_threads_attr
, &stacksize
);
7360 while (stacksize
< REDIS_THREAD_STACK_SIZE
) stacksize
*= 2;
7361 pthread_attr_setstacksize(&server
.io_threads_attr
, stacksize
);
7362 /* Listen for events in the threaded I/O pipe */
7363 if (aeCreateFileEvent(server
.el
, server
.io_ready_pipe_read
, AE_READABLE
,
7364 vmThreadedIOCompletedJob
, NULL
) == AE_ERR
)
7365 oom("creating file event");
7368 /* Mark the page as used */
7369 static void vmMarkPageUsed(off_t page
) {
7370 off_t byte
= page
/8;
7372 redisAssert(vmFreePage(page
) == 1);
7373 server
.vm_bitmap
[byte
] |= 1<<bit
;
7376 /* Mark N contiguous pages as used, with 'page' being the first. */
7377 static void vmMarkPagesUsed(off_t page
, off_t count
) {
7380 for (j
= 0; j
< count
; j
++)
7381 vmMarkPageUsed(page
+j
);
7382 server
.vm_stats_used_pages
+= count
;
7383 redisLog(REDIS_DEBUG
,"Mark USED pages: %lld pages at %lld\n",
7384 (long long)count
, (long long)page
);
7387 /* Mark the page as free */
7388 static void vmMarkPageFree(off_t page
) {
7389 off_t byte
= page
/8;
7391 redisAssert(vmFreePage(page
) == 0);
7392 server
.vm_bitmap
[byte
] &= ~(1<<bit
);
7395 /* Mark N contiguous pages as free, with 'page' being the first. */
7396 static void vmMarkPagesFree(off_t page
, off_t count
) {
7399 for (j
= 0; j
< count
; j
++)
7400 vmMarkPageFree(page
+j
);
7401 server
.vm_stats_used_pages
-= count
;
7402 redisLog(REDIS_DEBUG
,"Mark FREE pages: %lld pages at %lld\n",
7403 (long long)count
, (long long)page
);
7406 /* Test if the page is free */
7407 static int vmFreePage(off_t page
) {
7408 off_t byte
= page
/8;
7410 return (server
.vm_bitmap
[byte
] & (1<<bit
)) == 0;
7413 /* Find N contiguous free pages storing the first page of the cluster in *first.
7414 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7415 * REDIS_ERR is returned.
7417 * This function uses a simple algorithm: we try to allocate
7418 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7419 * again from the start of the swap file searching for free spaces.
7421 * If it looks pretty clear that there are no free pages near our offset
7422 * we try to find less populated places doing a forward jump of
7423 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7424 * without hurry, and then we jump again and so forth...
7426 * This function can be improved using a free list to avoid to guess
7427 * too much, since we could collect data about freed pages.
7429 * note: I implemented this function just after watching an episode of
7430 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7432 static int vmFindContiguousPages(off_t
*first
, off_t n
) {
7433 off_t base
, offset
= 0, since_jump
= 0, numfree
= 0;
7435 if (server
.vm_near_pages
== REDIS_VM_MAX_NEAR_PAGES
) {
7436 server
.vm_near_pages
= 0;
7437 server
.vm_next_page
= 0;
7439 server
.vm_near_pages
++; /* Yet another try for pages near to the old ones */
7440 base
= server
.vm_next_page
;
7442 while(offset
< server
.vm_pages
) {
7443 off_t
this = base
+offset
;
7445 /* If we overflow, restart from page zero */
7446 if (this >= server
.vm_pages
) {
7447 this -= server
.vm_pages
;
7449 /* Just overflowed, what we found on tail is no longer
7450 * interesting, as it's no longer contiguous. */
7454 if (vmFreePage(this)) {
7455 /* This is a free page */
7457 /* Already got N free pages? Return to the caller, with success */
7459 *first
= this-(n
-1);
7460 server
.vm_next_page
= this+1;
7461 redisLog(REDIS_DEBUG
, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n
, (long long) *first
);
7465 /* The current one is not a free page */
7469 /* Fast-forward if the current page is not free and we already
7470 * searched enough near this place. */
7472 if (!numfree
&& since_jump
>= REDIS_VM_MAX_RANDOM_JUMP
/4) {
7473 offset
+= random() % REDIS_VM_MAX_RANDOM_JUMP
;
7475 /* Note that even if we rewind after the jump, we are don't need
7476 * to make sure numfree is set to zero as we only jump *if* it
7477 * is set to zero. */
7479 /* Otherwise just check the next page */
7486 /* Write the specified object at the specified page of the swap file */
7487 static int vmWriteObjectOnSwap(robj
*o
, off_t page
) {
7488 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
7489 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
7490 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
7491 redisLog(REDIS_WARNING
,
7492 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
7496 rdbSaveObject(server
.vm_fp
,o
);
7497 fflush(server
.vm_fp
);
7498 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
7502 /* Swap the 'val' object relative to 'key' into disk. Store all the information
7503 * needed to later retrieve the object into the key object.
7504 * If we can't find enough contiguous empty pages to swap the object on disk
7505 * REDIS_ERR is returned. */
7506 static int vmSwapObjectBlocking(robj
*key
, robj
*val
) {
7507 off_t pages
= rdbSavedObjectPages(val
,NULL
);
7510 assert(key
->storage
== REDIS_VM_MEMORY
);
7511 assert(key
->refcount
== 1);
7512 if (vmFindContiguousPages(&page
,pages
) == REDIS_ERR
) return REDIS_ERR
;
7513 if (vmWriteObjectOnSwap(val
,page
) == REDIS_ERR
) return REDIS_ERR
;
7514 key
->vm
.page
= page
;
7515 key
->vm
.usedpages
= pages
;
7516 key
->storage
= REDIS_VM_SWAPPED
;
7517 key
->vtype
= val
->type
;
7518 decrRefCount(val
); /* Deallocate the object from memory. */
7519 vmMarkPagesUsed(page
,pages
);
7520 redisLog(REDIS_DEBUG
,"VM: object %s swapped out at %lld (%lld pages)",
7521 (unsigned char*) key
->ptr
,
7522 (unsigned long long) page
, (unsigned long long) pages
);
7523 server
.vm_stats_swapped_objects
++;
7524 server
.vm_stats_swapouts
++;
7528 static robj
*vmReadObjectFromSwap(off_t page
, int type
) {
7531 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
7532 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
7533 redisLog(REDIS_WARNING
,
7534 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
7538 o
= rdbLoadObject(type
,server
.vm_fp
);
7540 redisLog(REDIS_WARNING
, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno
));
7543 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
7547 /* Load the value object relative to the 'key' object from swap to memory.
7548 * The newly allocated object is returned.
7550 * If preview is true the unserialized object is returned to the caller but
7551 * no changes are made to the key object, nor the pages are marked as freed */
7552 static robj
*vmGenericLoadObject(robj
*key
, int preview
) {
7555 redisAssert(key
->storage
== REDIS_VM_SWAPPED
|| key
->storage
== REDIS_VM_LOADING
);
7556 val
= vmReadObjectFromSwap(key
->vm
.page
,key
->vtype
);
7558 key
->storage
= REDIS_VM_MEMORY
;
7559 key
->vm
.atime
= server
.unixtime
;
7560 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
7561 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk",
7562 (unsigned char*) key
->ptr
);
7563 server
.vm_stats_swapped_objects
--;
7565 redisLog(REDIS_DEBUG
, "VM: object %s previewed from disk",
7566 (unsigned char*) key
->ptr
);
7568 server
.vm_stats_swapins
++;
7572 /* Plain object loading, from swap to memory */
7573 static robj
*vmLoadObject(robj
*key
) {
7574 /* If we are loading the object in background, stop it, we
7575 * need to load this object synchronously ASAP. */
7576 if (key
->storage
== REDIS_VM_LOADING
)
7577 vmCancelThreadedIOJob(key
);
7578 return vmGenericLoadObject(key
,0);
7581 /* Just load the value on disk, without to modify the key.
7582 * This is useful when we want to perform some operation on the value
7583 * without to really bring it from swap to memory, like while saving the
7584 * dataset or rewriting the append only log. */
7585 static robj
*vmPreviewObject(robj
*key
) {
7586 return vmGenericLoadObject(key
,1);
7589 /* How a good candidate is this object for swapping?
7590 * The better candidate it is, the greater the returned value.
7592 * Currently we try to perform a fast estimation of the object size in
7593 * memory, and combine it with aging informations.
7595 * Basically swappability = idle-time * log(estimated size)
7597 * Bigger objects are preferred over smaller objects, but not
7598 * proportionally, this is why we use the logarithm. This algorithm is
7599 * just a first try and will probably be tuned later. */
7600 static double computeObjectSwappability(robj
*o
) {
7601 time_t age
= server
.unixtime
- o
->vm
.atime
;
7605 struct dictEntry
*de
;
7608 if (age
<= 0) return 0;
7611 if (o
->encoding
!= REDIS_ENCODING_RAW
) {
7614 asize
= sdslen(o
->ptr
)+sizeof(*o
)+sizeof(long)*2;
7619 listNode
*ln
= listFirst(l
);
7621 asize
= sizeof(list
);
7623 robj
*ele
= ln
->value
;
7626 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
7627 (sizeof(*o
)+sdslen(ele
->ptr
)) :
7629 asize
+= (sizeof(listNode
)+elesize
)*listLength(l
);
7634 z
= (o
->type
== REDIS_ZSET
);
7635 d
= z
? ((zset
*)o
->ptr
)->dict
: o
->ptr
;
7637 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
7638 if (z
) asize
+= sizeof(zset
)-sizeof(dict
);
7643 de
= dictGetRandomKey(d
);
7644 ele
= dictGetEntryKey(de
);
7645 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
7646 (sizeof(*o
)+sdslen(ele
->ptr
)) :
7648 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
7649 if (z
) asize
+= sizeof(zskiplistNode
)*dictSize(d
);
7653 return (double)age
*log(1+asize
);
7656 /* Try to swap an object that's a good candidate for swapping.
7657 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
7658 * to swap any object at all.
7660 * If 'usethreaded' is true, Redis will try to swap the object in background
7661 * using I/O threads. */
7662 static int vmSwapOneObject(int usethreads
) {
7664 struct dictEntry
*best
= NULL
;
7665 double best_swappability
= 0;
7666 redisDb
*best_db
= NULL
;
7669 for (j
= 0; j
< server
.dbnum
; j
++) {
7670 redisDb
*db
= server
.db
+j
;
7671 /* Why maxtries is set to 100?
7672 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7673 * are swappable objects */
7676 if (dictSize(db
->dict
) == 0) continue;
7677 for (i
= 0; i
< 5; i
++) {
7679 double swappability
;
7681 if (maxtries
) maxtries
--;
7682 de
= dictGetRandomKey(db
->dict
);
7683 key
= dictGetEntryKey(de
);
7684 val
= dictGetEntryVal(de
);
7685 /* Only swap objects that are currently in memory.
7687 * Also don't swap shared objects if threaded VM is on, as we
7688 * try to ensure that the main thread does not touch the
7689 * object while the I/O thread is using it, but we can't
7690 * control other keys without adding additional mutex. */
7691 if (key
->storage
!= REDIS_VM_MEMORY
||
7692 (server
.vm_max_threads
!= 0 && val
->refcount
!= 1)) {
7693 if (maxtries
) i
--; /* don't count this try */
7696 swappability
= computeObjectSwappability(val
);
7697 if (!best
|| swappability
> best_swappability
) {
7699 best_swappability
= swappability
;
7704 if (best
== NULL
) return REDIS_ERR
;
7705 key
= dictGetEntryKey(best
);
7706 val
= dictGetEntryVal(best
);
7708 redisLog(REDIS_DEBUG
,"Key with best swappability: %s, %f",
7709 key
->ptr
, best_swappability
);
7711 /* Unshare the key if needed */
7712 if (key
->refcount
> 1) {
7713 robj
*newkey
= dupStringObject(key
);
7715 key
= dictGetEntryKey(best
) = newkey
;
7719 vmSwapObjectThreaded(key
,val
,best_db
);
7722 if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
7723 dictGetEntryVal(best
) = NULL
;
/* Swap one object out in a blocking way, without using the I/O threads.
 * Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking() {
    int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
/* Swap one object out asking vmSwapOneObject() to use the I/O threads.
 * Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded() {
    int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
7739 /* Return true if it's safe to swap out objects in a given moment.
7740 * Basically we don't want to swap objects out while there is a BGSAVE
7741 * or a BGAEOREWRITE running in backgroud. */
7742 static int vmCanSwapOut(void) {
7743 return (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1);
7746 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
7747 * and was deleted. Otherwise 0 is returned. */
7748 static int deleteIfSwapped(redisDb
*db
, robj
*key
) {
7752 if ((de
= dictFind(db
->dict
,key
)) == NULL
) return 0;
7753 foundkey
= dictGetEntryKey(de
);
7754 if (foundkey
->storage
== REDIS_VM_MEMORY
) return 0;
7759 /* =================== Virtual Memory - Threaded I/O ======================= */
7761 static void freeIOJob(iojob
*j
) {
7762 if ((j
->type
== REDIS_IOJOB_PREPARE_SWAP
||
7763 j
->type
== REDIS_IOJOB_DO_SWAP
||
7764 j
->type
== REDIS_IOJOB_LOAD
) && j
->val
!= NULL
)
7765 decrRefCount(j
->val
);
7766 decrRefCount(j
->key
);
7770 /* Every time a thread finished a Job, it writes a byte into the write side
7771 * of an unix pipe in order to "awake" the main thread, and this function
7773 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
,
7777 int retval
, processed
= 0, toprocess
= -1, trytoswap
= 1;
7779 REDIS_NOTUSED(mask
);
7780 REDIS_NOTUSED(privdata
);
7782 /* For every byte we read in the read side of the pipe, there is one
7783 * I/O job completed to process. */
7784 while((retval
= read(fd
,buf
,1)) == 1) {
7788 struct dictEntry
*de
;
7790 redisLog(REDIS_DEBUG
,"Processing I/O completed job");
7792 /* Get the processed element (the oldest one) */
7794 assert(listLength(server
.io_processed
) != 0);
7795 if (toprocess
== -1) {
7796 toprocess
= (listLength(server
.io_processed
)*REDIS_MAX_COMPLETED_JOBS_PROCESSED
)/100;
7797 if (toprocess
<= 0) toprocess
= 1;
7799 ln
= listFirst(server
.io_processed
);
7801 listDelNode(server
.io_processed
,ln
);
7803 /* If this job is marked as canceled, just ignore it */
7808 /* Post process it in the main thread, as there are things we
7809 * can do just here to avoid race conditions and/or invasive locks */
7810 redisLog(REDIS_DEBUG
,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j
, j
->type
, (void*)j
->key
, (char*)j
->key
->ptr
, j
->key
->refcount
);
7811 de
= dictFind(j
->db
->dict
,j
->key
);
7813 key
= dictGetEntryKey(de
);
7814 if (j
->type
== REDIS_IOJOB_LOAD
) {
7817 /* Key loaded, bring it at home */
7818 key
->storage
= REDIS_VM_MEMORY
;
7819 key
->vm
.atime
= server
.unixtime
;
7820 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
7821 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk (threaded)",
7822 (unsigned char*) key
->ptr
);
7823 server
.vm_stats_swapped_objects
--;
7824 server
.vm_stats_swapins
++;
7825 dictGetEntryVal(de
) = j
->val
;
7826 incrRefCount(j
->val
);
7829 /* Handle clients waiting for this key to be loaded. */
7830 handleClientsBlockedOnSwappedKey(db
,key
);
7831 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
7832 /* Now we know the amount of pages required to swap this object.
7833 * Let's find some space for it, and queue this task again
7834 * rebranded as REDIS_IOJOB_DO_SWAP. */
7835 if (!vmCanSwapOut() ||
7836 vmFindContiguousPages(&j
->page
,j
->pages
) == REDIS_ERR
)
7838 /* Ooops... no space or we can't swap as there is
7839 * a fork()ed Redis trying to save stuff on disk. */
7841 key
->storage
= REDIS_VM_MEMORY
; /* undo operation */
7843 /* Note that we need to mark this pages as used now,
7844 * if the job will be canceled, we'll mark them as freed
7846 vmMarkPagesUsed(j
->page
,j
->pages
);
7847 j
->type
= REDIS_IOJOB_DO_SWAP
;
7852 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
7855 /* Key swapped. We can finally free some memory. */
7856 if (key
->storage
!= REDIS_VM_SWAPPING
) {
7857 printf("key->storage: %d\n",key
->storage
);
7858 printf("key->name: %s\n",(char*)key
->ptr
);
7859 printf("key->refcount: %d\n",key
->refcount
);
7860 printf("val: %p\n",(void*)j
->val
);
7861 printf("val->type: %d\n",j
->val
->type
);
7862 printf("val->ptr: %s\n",(char*)j
->val
->ptr
);
7864 redisAssert(key
->storage
== REDIS_VM_SWAPPING
);
7865 val
= dictGetEntryVal(de
);
7866 key
->vm
.page
= j
->page
;
7867 key
->vm
.usedpages
= j
->pages
;
7868 key
->storage
= REDIS_VM_SWAPPED
;
7869 key
->vtype
= j
->val
->type
;
7870 decrRefCount(val
); /* Deallocate the object from memory. */
7871 dictGetEntryVal(de
) = NULL
;
7872 redisLog(REDIS_DEBUG
,
7873 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
7874 (unsigned char*) key
->ptr
,
7875 (unsigned long long) j
->page
, (unsigned long long) j
->pages
);
7876 server
.vm_stats_swapped_objects
++;
7877 server
.vm_stats_swapouts
++;
7879 /* Put a few more swap requests in queue if we are still
7881 if (trytoswap
&& vmCanSwapOut() &&
7882 zmalloc_used_memory() > server
.vm_max_memory
)
7887 more
= listLength(server
.io_newjobs
) <
7888 (unsigned) server
.vm_max_threads
;
7890 /* Don't waste CPU time if swappable objects are rare. */
7891 if (vmSwapOneObjectThreaded() == REDIS_ERR
) {
7899 if (processed
== toprocess
) return;
7901 if (retval
< 0 && errno
!= EAGAIN
) {
7902 redisLog(REDIS_WARNING
,
7903 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
7908 static void lockThreadedIO(void) {
7909 pthread_mutex_lock(&server
.io_mutex
);
7912 static void unlockThreadedIO(void) {
7913 pthread_mutex_unlock(&server
.io_mutex
);
7916 /* Remove the specified object from the threaded I/O queue if still not
7917 * processed, otherwise make sure to flag it as canceled. */
7918 static void vmCancelThreadedIOJob(robj
*o
) {
7920 server
.io_newjobs
, /* 0 */
7921 server
.io_processing
, /* 1 */
7922 server
.io_processed
/* 2 */
7926 assert(o
->storage
== REDIS_VM_LOADING
|| o
->storage
== REDIS_VM_SWAPPING
);
7929 /* Search for a matching key in one of the queues */
7930 for (i
= 0; i
< 3; i
++) {
7934 listRewind(lists
[i
],&li
);
7935 while ((ln
= listNext(&li
)) != NULL
) {
7936 iojob
*job
= ln
->value
;
7938 if (job
->canceled
) continue; /* Skip this, already canceled. */
7939 if (compareStringObjects(job
->key
,o
) == 0) {
7940 redisLog(REDIS_DEBUG
,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
7941 (void*)job
, (char*)o
->ptr
, job
->type
, i
);
7942 /* Mark the pages as free since the swap didn't happened
7943 * or happened but is now discarded. */
7944 if (i
!= 1 && job
->type
== REDIS_IOJOB_DO_SWAP
)
7945 vmMarkPagesFree(job
->page
,job
->pages
);
7946 /* Cancel the job. It depends on the list the job is
7949 case 0: /* io_newjobs */
7950 /* If the job was yet not processed the best thing to do
7951 * is to remove it from the queue at all */
7953 listDelNode(lists
[i
],ln
);
7955 case 1: /* io_processing */
7956 /* Oh Shi- the thread is messing with the Job:
7958 * Probably it's accessing the object if this is a
7959 * PREPARE_SWAP or DO_SWAP job.
7960 * If it's a LOAD job it may be reading from disk and
7961 * if we don't wait for the job to terminate before to
7962 * cancel it, maybe in a few microseconds data can be
7963 * corrupted in this pages. So the short story is:
7965 * Better to wait for the job to move into the
7966 * next queue (processed)... */
7968 /* We try again and again until the job is completed. */
7970 /* But let's wait some time for the I/O thread
7971 * to finish with this job. After all this condition
7972 * should be very rare. */
7975 case 2: /* io_processed */
7976 /* The job was already processed, that's easy...
7977 * just mark it as canceled so that we'll ignore it
7978 * when processing completed jobs. */
7982 /* Finally we have to adjust the storage type of the object
7983 * in order to "UNDO" the operaiton. */
7984 if (o
->storage
== REDIS_VM_LOADING
)
7985 o
->storage
= REDIS_VM_SWAPPED
;
7986 else if (o
->storage
== REDIS_VM_SWAPPING
)
7987 o
->storage
= REDIS_VM_MEMORY
;
7994 assert(1 != 1); /* We should never reach this */
7997 static void *IOThreadEntryPoint(void *arg
) {
8002 pthread_detach(pthread_self());
8004 /* Get a new job to process */
8006 if (listLength(server
.io_newjobs
) == 0) {
8007 /* No new jobs in queue, exit. */
8008 redisLog(REDIS_DEBUG
,"Thread %ld exiting, nothing to do",
8009 (long) pthread_self());
8010 server
.io_active_threads
--;
8014 ln
= listFirst(server
.io_newjobs
);
8016 listDelNode(server
.io_newjobs
,ln
);
8017 /* Add the job in the processing queue */
8018 j
->thread
= pthread_self();
8019 listAddNodeTail(server
.io_processing
,j
);
8020 ln
= listLast(server
.io_processing
); /* We use ln later to remove it */
8022 redisLog(REDIS_DEBUG
,"Thread %ld got a new job (type %d): %p about key '%s'",
8023 (long) pthread_self(), j
->type
, (void*)j
, (char*)j
->key
->ptr
);
8025 /* Process the Job */
8026 if (j
->type
== REDIS_IOJOB_LOAD
) {
8027 j
->val
= vmReadObjectFromSwap(j
->page
,j
->key
->vtype
);
8028 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
8029 FILE *fp
= fopen("/dev/null","w+");
8030 j
->pages
= rdbSavedObjectPages(j
->val
,fp
);
8032 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
8033 if (vmWriteObjectOnSwap(j
->val
,j
->page
) == REDIS_ERR
)
8037 /* Done: insert the job into the processed queue */
8038 redisLog(REDIS_DEBUG
,"Thread %ld completed the job: %p (key %s)",
8039 (long) pthread_self(), (void*)j
, (char*)j
->key
->ptr
);
8041 listDelNode(server
.io_processing
,ln
);
8042 listAddNodeTail(server
.io_processed
,j
);
8045 /* Signal the main thread there is new stuff to process */
8046 assert(write(server
.io_ready_pipe_write
,"x",1) == 1);
8048 return NULL
; /* never reached */
8051 static void spawnIOThread(void) {
8053 sigset_t mask
, omask
;
8056 sigaddset(&mask
,SIGCHLD
);
8057 sigaddset(&mask
,SIGHUP
);
8058 sigaddset(&mask
,SIGPIPE
);
8059 pthread_sigmask(SIG_SETMASK
, &mask
, &omask
);
8060 pthread_create(&thread
,&server
.io_threads_attr
,IOThreadEntryPoint
,NULL
);
8061 pthread_sigmask(SIG_SETMASK
, &omask
, NULL
);
8062 server
.io_active_threads
++;
8065 /* We need to wait for the last thread to exit before we are able to
8066 * fork() in order to BGSAVE or BGREWRITEAOF. */
8067 static void waitEmptyIOJobsQueue(void) {
8069 int io_processed_len
;
8072 if (listLength(server
.io_newjobs
) == 0 &&
8073 listLength(server
.io_processing
) == 0 &&
8074 server
.io_active_threads
== 0)
8079 /* While waiting for empty jobs queue condition we post-process some
8080 * finshed job, as I/O threads may be hanging trying to write against
8081 * the io_ready_pipe_write FD but there are so much pending jobs that
8083 io_processed_len
= listLength(server
.io_processed
);
8085 if (io_processed_len
) {
8086 vmThreadedIOCompletedJob(NULL
,server
.io_ready_pipe_read
,NULL
,0);
8087 usleep(1000); /* 1 millisecond */
8089 usleep(10000); /* 10 milliseconds */
8094 static void vmReopenSwapFile(void) {
8095 /* Note: we don't close the old one as we are in the child process
8096 * and don't want to mess at all with the original file object. */
8097 server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b");
8098 if (server
.vm_fp
== NULL
) {
8099 redisLog(REDIS_WARNING
,"Can't re-open the VM swap file: %s. Exiting.",
8100 server
.vm_swap_file
);
8103 server
.vm_fd
= fileno(server
.vm_fp
);
8106 /* This function must be called while with threaded IO locked */
8107 static void queueIOJob(iojob
*j
) {
8108 redisLog(REDIS_DEBUG
,"Queued IO Job %p type %d about key '%s'\n",
8109 (void*)j
, j
->type
, (char*)j
->key
->ptr
);
8110 listAddNodeTail(server
.io_newjobs
,j
);
8111 if (server
.io_active_threads
< server
.vm_max_threads
)
8115 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
) {
8118 assert(key
->storage
== REDIS_VM_MEMORY
);
8119 assert(key
->refcount
== 1);
8121 j
= zmalloc(sizeof(*j
));
8122 j
->type
= REDIS_IOJOB_PREPARE_SWAP
;
8124 j
->key
= dupStringObject(key
);
8128 j
->thread
= (pthread_t
) -1;
8129 key
->storage
= REDIS_VM_SWAPPING
;
8137 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8139 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8140 * If there is not already a job loading the key, it is craeted.
8141 * The key is added to the io_keys list in the client structure, and also
8142 * in the hash table mapping swapped keys to waiting clients, that is,
8143 * server.io_waited_keys. */
8144 static int waitForSwappedKey(redisClient
*c
, robj
*key
) {
8145 struct dictEntry
*de
;
8149 /* If the key does not exist or is already in RAM we don't need to
8150 * block the client at all. */
8151 de
= dictFind(c
->db
->dict
,key
);
8152 if (de
== NULL
) return 0;
8153 o
= dictGetEntryKey(de
);
8154 if (o
->storage
== REDIS_VM_MEMORY
) {
8156 } else if (o
->storage
== REDIS_VM_SWAPPING
) {
8157 /* We were swapping the key, undo it! */
8158 vmCancelThreadedIOJob(o
);
8162 /* OK: the key is either swapped, or being loaded just now. */
8164 /* Add the key to the list of keys this client is waiting for.
8165 * This maps clients to keys they are waiting for. */
8166 listAddNodeTail(c
->io_keys
,key
);
8169 /* Add the client to the swapped keys => clients waiting map. */
8170 de
= dictFind(c
->db
->io_keys
,key
);
8174 /* For every key we take a list of clients blocked for it */
8176 retval
= dictAdd(c
->db
->io_keys
,key
,l
);
8178 assert(retval
== DICT_OK
);
8180 l
= dictGetEntryVal(de
);
8182 listAddNodeTail(l
,c
);
8184 /* Are we already loading the key from disk? If not create a job */
8185 if (o
->storage
== REDIS_VM_SWAPPED
) {
8188 o
->storage
= REDIS_VM_LOADING
;
8189 j
= zmalloc(sizeof(*j
));
8190 j
->type
= REDIS_IOJOB_LOAD
;
8192 j
->key
= dupStringObject(key
);
8193 j
->key
->vtype
= o
->vtype
;
8194 j
->page
= o
->vm
.page
;
8197 j
->thread
= (pthread_t
) -1;
8205 /* Is this client attempting to run a command against swapped keys?
8206 * If so, block it ASAP, load the keys in background, then resume it.
8208 * The important idea about this function is that it can fail! If keys will
8209 * still be swapped when the client is resumed, this key lookups will
8210 * just block loading keys from disk. In practical terms this should only
8211 * happen with SORT BY command or if there is a bug in this function.
8213 * Return 1 if the client is marked as blocked, 0 if the client can
8214 * continue as the keys it is going to access appear to be in memory. */
8215 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
) {
8218 if (cmd
->vm_firstkey
== 0) return 0;
8219 last
= cmd
->vm_lastkey
;
8220 if (last
< 0) last
= c
->argc
+last
;
8221 for (j
= cmd
->vm_firstkey
; j
<= last
; j
+= cmd
->vm_keystep
)
8222 waitForSwappedKey(c
,c
->argv
[j
]);
8223 /* If the client was blocked for at least one key, mark it as blocked. */
8224 if (listLength(c
->io_keys
)) {
8225 c
->flags
|= REDIS_IO_WAIT
;
8226 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
8227 server
.vm_blocked_clients
++;
8234 /* Remove the 'key' from the list of blocked keys for a given client.
8236 * The function returns 1 when there are no longer blocking keys after
8237 * the current one was removed (and the client can be unblocked). */
8238 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
) {
8242 struct dictEntry
*de
;
8244 /* Remove the key from the list of keys this client is waiting for. */
8245 listRewind(c
->io_keys
,&li
);
8246 while ((ln
= listNext(&li
)) != NULL
) {
8247 if (compareStringObjects(ln
->value
,key
) == 0) {
8248 listDelNode(c
->io_keys
,ln
);
8254 /* Remove the client form the key => waiting clients map. */
8255 de
= dictFind(c
->db
->io_keys
,key
);
8257 l
= dictGetEntryVal(de
);
8258 ln
= listSearchKey(l
,c
);
8261 if (listLength(l
) == 0)
8262 dictDelete(c
->db
->io_keys
,key
);
8264 return listLength(c
->io_keys
) == 0;
8267 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
) {
8268 struct dictEntry
*de
;
8273 de
= dictFind(db
->io_keys
,key
);
8276 l
= dictGetEntryVal(de
);
8277 len
= listLength(l
);
8278 /* Note: we can't use something like while(listLength(l)) as the list
8279 * can be freed by the calling function when we remove the last element. */
8282 redisClient
*c
= ln
->value
;
8284 if (dontWaitForSwappedKey(c
,key
)) {
8285 /* Put the client in the list of clients ready to go as we
8286 * loaded all the keys about it. */
8287 listAddNodeTail(server
.io_ready_clients
,c
);
8292 /* ================================= Debugging ============================== */
8294 static void debugCommand(redisClient
*c
) {
8295 if (!strcasecmp(c
->argv
[1]->ptr
,"segfault")) {
8297 } else if (!strcasecmp(c
->argv
[1]->ptr
,"reload")) {
8298 if (rdbSave(server
.dbfilename
) != REDIS_OK
) {
8299 addReply(c
,shared
.err
);
8303 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
8304 addReply(c
,shared
.err
);
8307 redisLog(REDIS_WARNING
,"DB reloaded by DEBUG RELOAD");
8308 addReply(c
,shared
.ok
);
8309 } else if (!strcasecmp(c
->argv
[1]->ptr
,"loadaof")) {
8311 if (loadAppendOnlyFile(server
.appendfilename
) != REDIS_OK
) {
8312 addReply(c
,shared
.err
);
8315 redisLog(REDIS_WARNING
,"Append Only File loaded by DEBUG LOADAOF");
8316 addReply(c
,shared
.ok
);
8317 } else if (!strcasecmp(c
->argv
[1]->ptr
,"object") && c
->argc
== 3) {
8318 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
8322 addReply(c
,shared
.nokeyerr
);
8325 key
= dictGetEntryKey(de
);
8326 val
= dictGetEntryVal(de
);
8327 if (!server
.vm_enabled
|| (key
->storage
== REDIS_VM_MEMORY
||
8328 key
->storage
== REDIS_VM_SWAPPING
)) {
8329 addReplySds(c
,sdscatprintf(sdsempty(),
8330 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8331 "encoding:%d serializedlength:%lld\r\n",
8332 (void*)key
, key
->refcount
, (void*)val
, val
->refcount
,
8333 val
->encoding
, (long long) rdbSavedObjectLen(val
,NULL
)));
8335 addReplySds(c
,sdscatprintf(sdsempty(),
8336 "+Key at:%p refcount:%d, value swapped at: page %llu "
8337 "using %llu pages\r\n",
8338 (void*)key
, key
->refcount
, (unsigned long long) key
->vm
.page
,
8339 (unsigned long long) key
->vm
.usedpages
));
8341 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapout") && c
->argc
== 3) {
8342 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
8345 if (!server
.vm_enabled
) {
8346 addReplySds(c
,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8350 addReply(c
,shared
.nokeyerr
);
8353 key
= dictGetEntryKey(de
);
8354 val
= dictGetEntryVal(de
);
8355 /* If the key is shared we want to create a copy */
8356 if (key
->refcount
> 1) {
8357 robj
*newkey
= dupStringObject(key
);
8359 key
= dictGetEntryKey(de
) = newkey
;
8362 if (key
->storage
!= REDIS_VM_MEMORY
) {
8363 addReplySds(c
,sdsnew("-ERR This key is not in memory\r\n"));
8364 } else if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
8365 dictGetEntryVal(de
) = NULL
;
8366 addReply(c
,shared
.ok
);
8368 addReply(c
,shared
.err
);
8371 addReplySds(c
,sdsnew(
8372 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
8376 static void _redisAssert(char *estr
, char *file
, int line
) {
8377 redisLog(REDIS_WARNING
,"=== ASSERTION FAILED ===");
8378 redisLog(REDIS_WARNING
,"==> %s:%d '%s' is not true\n",file
,line
,estr
);
8379 #ifdef HAVE_BACKTRACE
8380 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
8385 /* =================================== Main! ================================ */
/* Read the integer stored in /proc/sys/vm/overcommit_memory.
 *
 * Returns the value read, or -1 if the file can't be opened, can't be read,
 * or does not start with a number. The stream is closed on every path. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (fp == NULL) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    /* strtol() lets us tell "no digits at all" apart from a real 0, which
     * atoi() can't do. */
    errno = 0;
    value = strtol(buf,&end,10);
    if (end == buf || errno == ERANGE) return -1;
    return (int) value;
}
8402 void linuxOvercommitMemoryWarning(void) {
8403 if (linuxOvercommitMemoryValue() == 0) {
8404 redisLog(REDIS_WARNING
,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8407 #endif /* __linux__ */
8409 static void daemonize(void) {
8413 if (fork() != 0) exit(0); /* parent exits */
8414 setsid(); /* create a new session */
8416 /* Every output goes to /dev/null. If Redis is daemonized but
8417 * the 'logfile' is set to 'stdout' in the configuration file
8418 * it will not log at all. */
8419 if ((fd
= open("/dev/null", O_RDWR
, 0)) != -1) {
8420 dup2(fd
, STDIN_FILENO
);
8421 dup2(fd
, STDOUT_FILENO
);
8422 dup2(fd
, STDERR_FILENO
);
8423 if (fd
> STDERR_FILENO
) close(fd
);
8425 /* Try to write the pid file */
8426 fp
= fopen(server
.pidfile
,"w");
8428 fprintf(fp
,"%d\n",getpid());
8433 int main(int argc
, char **argv
) {
8438 resetServerSaveParams();
8439 loadServerConfig(argv
[1]);
8440 } else if (argc
> 2) {
8441 fprintf(stderr
,"Usage: ./redis-server [/path/to/redis.conf]\n");
8444 redisLog(REDIS_WARNING
,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8446 if (server
.daemonize
) daemonize();
8448 redisLog(REDIS_NOTICE
,"Server started, Redis version " REDIS_VERSION
);
8450 linuxOvercommitMemoryWarning();
8453 if (server
.appendonly
) {
8454 if (loadAppendOnlyFile(server
.appendfilename
) == REDIS_OK
)
8455 redisLog(REDIS_NOTICE
,"DB loaded from append only file: %ld seconds",time(NULL
)-start
);
8457 if (rdbLoad(server
.dbfilename
) == REDIS_OK
)
8458 redisLog(REDIS_NOTICE
,"DB loaded from disk: %ld seconds",time(NULL
)-start
);
8460 redisLog(REDIS_NOTICE
,"The server is now ready to accept connections on port %d", server
.port
);
8461 aeSetBeforeSleepProc(server
.el
,beforeSleep
);
8463 aeDeleteEventLoop(server
.el
);
8467 /* ============================= Backtrace support ========================= */
8469 #ifdef HAVE_BACKTRACE
8470 static char *findFuncName(void *pointer
, unsigned long *offset
);
/* Return, as a void pointer, the instruction pointer saved in the machine
 * context of 'uc'. The field that holds it differs per platform, so each
 * preprocessor branch below reads a different member of uc->uc_mcontext.
 *
 * NOTE(review): the embedded line numbers show gaps inside this function
 * (8478, 8480, 8482, 8486, 8488 and after 8492), so some preprocessor
 * lines and the end of the function are not visible in this listing. */
8472 static void *getMcontextEip(ucontext_t
*uc
) {
8473 #if defined(__FreeBSD__)
8474 return (void*) uc
->uc_mcontext
.mc_eip
;
8475 #elif defined(__dietlibc__)
8476 return (void*) uc
->uc_mcontext
.eip
;
/* Mac OS X without MAC_OS_X_VERSION_10_6: either __rip or __eip is
 * returned; the condition selecting between them is not visible here. */
8477 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
8479 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
8481 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
/* Mac OS X with MAC_OS_X_VERSION_10_6: __rip when the 64 bit thread state
 * structure is defined and the target is not i386. */
8483 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
8484 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
8485 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
8487 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
/* Remaining i386 / x86_64 targets: the REG_EIP slot of the general
 * registers array is used for both. */
8489 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
8490 return (void*) uc
->uc_mcontext
.gregs
[REG_EIP
]; /* Linux 32/64 bit */
8491 #elif defined(__ia64__) /* Linux IA64 */
8492 return (void*) uc
->uc_mcontext
.sc_ip
;
8498 static void segvHandler(int sig
, siginfo_t
*info
, void *secret
) {
8500 char **messages
= NULL
;
8501 int i
, trace_size
= 0;
8502 unsigned long offset
=0;
8503 ucontext_t
*uc
= (ucontext_t
*) secret
;
8505 REDIS_NOTUSED(info
);
8507 redisLog(REDIS_WARNING
,
8508 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION
, sig
);
8509 infostring
= genRedisInfoString();
8510 redisLog(REDIS_WARNING
, "%s",infostring
);
8511 /* It's not safe to sdsfree() the returned string under memory
8512 * corruption conditions. Let it leak as we are going to abort */
8514 trace_size
= backtrace(trace
, 100);
8515 /* overwrite sigaction with caller's address */
8516 if (getMcontextEip(uc
) != NULL
) {
8517 trace
[1] = getMcontextEip(uc
);
8519 messages
= backtrace_symbols(trace
, trace_size
);
8521 for (i
=1; i
<trace_size
; ++i
) {
8522 char *fn
= findFuncName(trace
[i
], &offset
), *p
;
8524 p
= strchr(messages
[i
],'+');
8525 if (!fn
|| (p
&& ((unsigned long)strtol(p
+1,NULL
,10)) < offset
)) {
8526 redisLog(REDIS_WARNING
,"%s", messages
[i
]);
8528 redisLog(REDIS_WARNING
,"%d redis-server %p %s + %d", i
, trace
[i
], fn
, (unsigned int)offset
);
8531 /* free(messages); Don't call free() with possibly corrupted memory. */
8535 static void setupSigSegvAction(void) {
8536 struct sigaction act
;
8538 sigemptyset (&act
.sa_mask
);
8539 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8540 * is used. Otherwise, sa_handler is used */
8541 act
.sa_flags
= SA_NODEFER
| SA_ONSTACK
| SA_RESETHAND
| SA_SIGINFO
;
8542 act
.sa_sigaction
= segvHandler
;
8543 sigaction (SIGSEGV
, &act
, NULL
);
8544 sigaction (SIGBUS
, &act
, NULL
);
8545 sigaction (SIGFPE
, &act
, NULL
);
8546 sigaction (SIGILL
, &act
, NULL
);
8547 sigaction (SIGBUS
, &act
, NULL
);
8551 #include "staticsymbols.h"
8552 /* This function try to convert a pointer into a function name. It's used in
8553 * oreder to provide a backtrace under segmentation fault that's able to
8554 * display functions declared as static (otherwise the backtrace is useless). */
8555 static char *findFuncName(void *pointer
, unsigned long *offset
){
8557 unsigned long off
, minoff
= 0;
8559 /* Try to match against the Symbol with the smallest offset */
8560 for (i
=0; symsTable
[i
].pointer
; i
++) {
8561 unsigned long lp
= (unsigned long) pointer
;
8563 if (lp
!= (unsigned long)-1 && lp
>= symsTable
[i
].pointer
) {
8564 off
=lp
-symsTable
[i
].pointer
;
8565 if (ret
< 0 || off
< minoff
) {
8571 if (ret
== -1) return NULL
;
8573 return symsTable
[ret
].name
;
8575 #else /* HAVE_BACKTRACE */
/* Variant of setupSigSegvAction() compiled when HAVE_BACKTRACE is not
 * defined. No statement is visible in its body in this listing. */
8576 static void setupSigSegvAction(void) {
8578 #endif /* HAVE_BACKTRACE */