2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
30 #define REDIS_VERSION "1.3.10"
40 #define __USE_POSIX199309
47 #endif /* HAVE_BACKTRACE */
55 #include <arpa/inet.h>
59 #include <sys/resource.h>
66 #include "solarisfixes.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
117 #define REDIS_STRING 0
123 /* Objects encoding. Some kinds of objects like Strings and Hashes can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
126 #define REDIS_ENCODING_RAW 0 /* Raw representation */
127 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
128 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
131 static char* strencoding
[] = {
132 "raw", "int", "zipmap", "hashtable"
135 /* Object types only used for dumping to disk */
136 #define REDIS_EXPIRETIME 253
137 #define REDIS_SELECTDB 254
138 #define REDIS_EOF 255
140 /* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
153 #define REDIS_RDB_6BITLEN 0
154 #define REDIS_RDB_14BITLEN 1
155 #define REDIS_RDB_32BITLEN 2
156 #define REDIS_RDB_ENCVAL 3
157 #define REDIS_RDB_LENERR UINT_MAX
159 /* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * according to the following defines: */
162 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
165 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
167 /* Virtual memory object->where field. */
168 #define REDIS_VM_MEMORY 0 /* The object is on memory */
169 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
170 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
173 /* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175 #define REDIS_VM_MAX_NEAR_PAGES 65536
176 #define REDIS_VM_MAX_RANDOM_JUMP 4096
177 #define REDIS_VM_MAX_THREADS 32
178 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
179 /* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
183 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
186 #define REDIS_SLAVE 1 /* This client is a slave server */
187 #define REDIS_MASTER 2 /* This client is a master server */
188 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189 #define REDIS_MULTI 8 /* This client is in a MULTI context */
190 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
193 /* Slave replication state - slave side */
194 #define REDIS_REPL_NONE 0 /* No active replication */
195 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
196 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
198 /* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
207 /* List related stuff */
211 /* Sort operations */
212 #define REDIS_SORT_GET 0
213 #define REDIS_SORT_ASC 1
214 #define REDIS_SORT_DESC 2
215 #define REDIS_SORTKEY_MAX 1024
218 #define REDIS_DEBUG 0
219 #define REDIS_VERBOSE 1
220 #define REDIS_NOTICE 2
221 #define REDIS_WARNING 3
223 /* Anti-warning macro... */
224 #define REDIS_NOTUSED(V) ((void) V)
226 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
229 /* Append only defines */
230 #define APPENDFSYNC_NO 0
231 #define APPENDFSYNC_ALWAYS 1
232 #define APPENDFSYNC_EVERYSEC 2
234 /* Hashes related defaults */
235 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
/* Assertion and panic helpers. We can print the stacktrace, so our assert is
 * defined this way instead of relying on the standard assert().
 *
 * redisAssert(_e): evaluates _e; when it is false, calls _redisAssert() with
 * the stringified expression, __FILE__ and __LINE__, then calls _exit(1).
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, __FILE__ and __LINE__, then calls _exit(1).
 *
 * Both expansions are fully parenthesized so each macro behaves as a single
 * expression wherever it appears. Without the outer parentheses on
 * redisPanic, `cond ? (void)0 : redisPanic("msg")` would parse as
 * `(cond ? (void)0 : _redisPanic(...)), _exit(1)` and exit on every path. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))

/* Reporters invoked by the macros above; defined elsewhere in this file.
 *   estr / msg: stringified expression or message to report
 *   file, line: source location supplied by __FILE__ / __LINE__ */
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
244 /*================================= Data types ============================== */
246 /* A redis object, that is a type able to hold a string / list / set */
248 /* The VM object structure */
249 struct redisObjectVM
{
250 off_t page
; /* the page at which the object is stored on disk */
251 off_t usedpages
; /* number of pages used on disk */
252 time_t atime
; /* Last access time */
255 /* The actual Redis Object */
256 typedef struct redisObject
{
259 unsigned char encoding
;
260 unsigned char storage
; /* If this object is a key, where is the value?
261 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
262 unsigned char vtype
; /* If this object is a key, and value is swapped out,
263 * this is the type of the swapped out object. */
265 /* VM fields, these are only allocated if VM is active, otherwise the
266 * object allocation function will just allocate
267 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
268 * Redis without VM active will not have any overhead. */
269 struct redisObjectVM vm
;
272 /* Macro used to initialize a Redis object allocated on the stack.
273 * Note that this macro is kept near the structure definition to make sure
274 * we'll update it when the structure is changed, to avoid bugs like
275 * bug #85 introduced exactly in this way. */
276 #define initStaticStringObject(_var,_ptr) do { \
278 _var.type = REDIS_STRING; \
279 _var.encoding = REDIS_ENCODING_RAW; \
281 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
284 typedef struct redisDb
{
285 dict
*dict
; /* The keyspace for this DB */
286 dict
*expires
; /* Timeout of keys with a timeout set */
287 dict
*blockingkeys
; /* Keys with clients waiting for data (BLPOP) */
288 dict
*io_keys
; /* Keys with clients waiting for VM I/O */
292 /* Client MULTI/EXEC state */
293 typedef struct multiCmd
{
296 struct redisCommand
*cmd
;
299 typedef struct multiState
{
300 multiCmd
*commands
; /* Array of MULTI commands */
301 int count
; /* Total number of MULTI commands */
304 /* With multiplexing we need to keep per-client state.
305 * Clients are kept in a linked list. */
306 typedef struct redisClient
{
311 robj
**argv
, **mbargv
;
313 int bulklen
; /* bulk read len. -1 if not in bulk read mode */
314 int multibulk
; /* multi bulk command format active */
317 time_t lastinteraction
; /* time of the last interaction, used for timeout */
318 int flags
; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
319 int slaveseldb
; /* slave selected db, if this client is a slave */
320 int authenticated
; /* when requirepass is non-NULL */
321 int replstate
; /* replication state if this is a slave */
322 int repldbfd
; /* replication DB file descriptor */
323 long repldboff
; /* replication DB file offset */
324 off_t repldbsize
; /* replication DB file size */
325 multiState mstate
; /* MULTI/EXEC state */
326 robj
**blockingkeys
; /* The key we are waiting to terminate a blocking
327 * operation such as BLPOP. Otherwise NULL. */
328 int blockingkeysnum
; /* Number of blocking keys */
329 time_t blockingto
; /* Blocking operation timeout. If UNIX current time
330 * is >= blockingto then the operation timed out. */
331 list
*io_keys
; /* Keys this client is waiting to be loaded from the
332 * swap file in order to continue. */
333 dict
*pubsub_channels
; /* channels a client is interested in (SUBSCRIBE) */
334 list
*pubsub_patterns
; /* patterns a client is interested in (SUBSCRIBE) */
342 /* Global server state structure */
347 long long dirty
; /* changes to DB from the last save */
349 list
*slaves
, *monitors
;
350 char neterr
[ANET_ERR_LEN
];
352 int cronloops
; /* number of times the cron function run */
353 list
*objfreelist
; /* A list of freed objects to avoid malloc() */
354 time_t lastsave
; /* Unix time of the last successful save */
355 /* Fields used only for stats */
356 time_t stat_starttime
; /* server start time */
357 long long stat_numcommands
; /* number of processed commands */
358 long long stat_numconnections
; /* number of connections received */
359 long long stat_expiredkeys
; /* number of expired keys */
372 pid_t bgsavechildpid
;
373 pid_t bgrewritechildpid
;
374 sds bgrewritebuf
; /* buffer taken by parent during append only rewrite */
375 struct saveparam
*saveparams
;
380 char *appendfilename
;
384 /* Replication related */
389 redisClient
*master
; /* client that is master for this slave */
391 unsigned int maxclients
;
392 unsigned long long maxmemory
;
393 unsigned int blpop_blocked_clients
;
394 unsigned int vm_blocked_clients
;
395 /* Sort parameters - qsort_r() is only available under BSD so we
396 * have to take this state global, in order to pass it to sortCompare() */
400 /* Virtual memory configuration */
405 unsigned long long vm_max_memory
;
407 size_t hash_max_zipmap_entries
;
408 size_t hash_max_zipmap_value
;
409 /* Virtual memory state */
412 off_t vm_next_page
; /* Next probably empty page */
413 off_t vm_near_pages
; /* Number of pages allocated sequentially */
414 unsigned char *vm_bitmap
; /* Bitmap of free/used pages */
415 time_t unixtime
; /* Unix time sampled every second. */
416 /* Virtual memory I/O threads stuff */
417 /* An I/O thread process an element taken from the io_jobs queue and
418 * put the result of the operation in the io_done list. While the
419 * job is being processed, it's put on io_processing queue. */
420 list
*io_newjobs
; /* List of VM I/O jobs yet to be processed */
421 list
*io_processing
; /* List of VM I/O jobs being processed */
422 list
*io_processed
; /* List of VM I/O jobs already processed */
423 list
*io_ready_clients
; /* Clients ready to be unblocked. All keys loaded */
424 pthread_mutex_t io_mutex
; /* lock to access io_jobs/io_done/io_thread_job */
425 pthread_mutex_t obj_freelist_mutex
; /* safe redis objects creation/free */
426 pthread_mutex_t io_swapfile_mutex
; /* So we can lseek + write */
427 pthread_attr_t io_threads_attr
; /* attributes for threads creation */
428 int io_active_threads
; /* Number of running I/O threads */
429 int vm_max_threads
; /* Max number of I/O threads running at the same time */
430 /* Our main thread is blocked on the event loop, looking for sockets ready
431 * to be read or written, so when a threaded I/O operation is ready to be
432 * processed by the main thread, the I/O thread will use a unix pipe to
433 * awake the main thread. The following are the two pipe FDs. */
434 int io_ready_pipe_read
;
435 int io_ready_pipe_write
;
436 /* Virtual memory stats */
437 unsigned long long vm_stats_used_pages
;
438 unsigned long long vm_stats_swapped_objects
;
439 unsigned long long vm_stats_swapouts
;
440 unsigned long long vm_stats_swapins
;
442 dict
*pubsub_channels
; /* Map channels to list of subscribed clients */
443 list
*pubsub_patterns
; /* A list of pubsub_patterns */
448 typedef struct pubsubPattern
{
453 typedef void redisCommandProc(redisClient
*c
);
454 struct redisCommand
{
456 redisCommandProc
*proc
;
459 /* Use a function to determine which keys need to be loaded
460 * in the background prior to executing this command. Takes precedence
461 * over vm_firstkey and others, ignored when NULL */
462 redisCommandProc
*vm_preload_proc
;
463 /* What keys should be loaded in background when calling this command? */
464 int vm_firstkey
; /* The first argument that's a key (0 = no keys) */
465 int vm_lastkey
; /* The last argument that's a key */
466 int vm_keystep
; /* The step between first and last key */
469 struct redisFunctionSym
{
471 unsigned long pointer
;
474 typedef struct _redisSortObject
{
482 typedef struct _redisSortOperation
{
485 } redisSortOperation
;
487 /* ZSETs use a specialized version of Skiplists */
489 typedef struct zskiplistNode
{
490 struct zskiplistNode
**forward
;
491 struct zskiplistNode
*backward
;
497 typedef struct zskiplist
{
498 struct zskiplistNode
*header
, *tail
;
499 unsigned long length
;
503 typedef struct zset
{
508 /* Our shared "common" objects */
510 #define REDIS_SHARED_INTEGERS 10000
511 struct sharedObjectsStruct
{
512 robj
*crlf
, *ok
, *err
, *emptybulk
, *czero
, *cone
, *pong
, *space
,
513 *colon
, *nullbulk
, *nullmultibulk
, *queued
,
514 *emptymultibulk
, *wrongtypeerr
, *nokeyerr
, *syntaxerr
, *sameobjecterr
,
515 *outofrangeerr
, *plus
,
516 *select0
, *select1
, *select2
, *select3
, *select4
,
517 *select5
, *select6
, *select7
, *select8
, *select9
,
518 *messagebulk
, *pmessagebulk
, *subscribebulk
, *unsubscribebulk
, *mbulk3
,
519 *mbulk4
, *psubscribebulk
, *punsubscribebulk
,
520 *integers
[REDIS_SHARED_INTEGERS
];
523 /* Global vars that are actually used as constants. The following double
524 * values are used for double on-disk serialization, and are initialized
525 * at runtime to avoid strange compiler optimizations. */
527 static double R_Zero
, R_PosInf
, R_NegInf
, R_Nan
;
529 /* VM threaded I/O request message */
530 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
531 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
532 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
533 typedef struct iojob
{
534 int type
; /* Request type, REDIS_IOJOB_* */
535 redisDb
*db
;/* Redis database */
536 robj
*key
; /* This I/O request is about swapping this key */
537 robj
*val
; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
538 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
539 off_t page
; /* Swap page where to read/write the object */
540 off_t pages
; /* Swap pages needed to save object. PREPARE_SWAP return val */
541 int canceled
; /* True if this command was canceled by blocking side of VM */
542 pthread_t thread
; /* ID of the thread processing this entry */
545 /*================================ Prototypes =============================== */
547 static void freeStringObject(robj
*o
);
548 static void freeListObject(robj
*o
);
549 static void freeSetObject(robj
*o
);
550 static void decrRefCount(void *o
);
551 static robj
*createObject(int type
, void *ptr
);
552 static void freeClient(redisClient
*c
);
553 static int rdbLoad(char *filename
);
554 static void addReply(redisClient
*c
, robj
*obj
);
555 static void addReplySds(redisClient
*c
, sds s
);
556 static void incrRefCount(robj
*o
);
557 static int rdbSaveBackground(char *filename
);
558 static robj
*createStringObject(char *ptr
, size_t len
);
559 static robj
*dupStringObject(robj
*o
);
560 static void replicationFeedSlaves(list
*slaves
, int dictid
, robj
**argv
, int argc
);
561 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
562 static int syncWithMaster(void);
563 static robj
*tryObjectEncoding(robj
*o
);
564 static robj
*getDecodedObject(robj
*o
);
565 static int removeExpire(redisDb
*db
, robj
*key
);
566 static int expireIfNeeded(redisDb
*db
, robj
*key
);
567 static int deleteIfVolatile(redisDb
*db
, robj
*key
);
568 static int deleteIfSwapped(redisDb
*db
, robj
*key
);
569 static int deleteKey(redisDb
*db
, robj
*key
);
570 static time_t getExpire(redisDb
*db
, robj
*key
);
571 static int setExpire(redisDb
*db
, robj
*key
, time_t when
);
572 static void updateSlavesWaitingBgsave(int bgsaveerr
);
573 static void freeMemoryIfNeeded(void);
574 static int processCommand(redisClient
*c
);
575 static void setupSigSegvAction(void);
576 static void rdbRemoveTempFile(pid_t childpid
);
577 static void aofRemoveTempFile(pid_t childpid
);
578 static size_t stringObjectLen(robj
*o
);
579 static void processInputBuffer(redisClient
*c
);
580 static zskiplist
*zslCreate(void);
581 static void zslFree(zskiplist
*zsl
);
582 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
);
583 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
584 static void initClientMultiState(redisClient
*c
);
585 static void freeClientMultiState(redisClient
*c
);
586 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
);
587 static void unblockClientWaitingData(redisClient
*c
);
588 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
);
589 static void vmInit(void);
590 static void vmMarkPagesFree(off_t page
, off_t count
);
591 static robj
*vmLoadObject(robj
*key
);
592 static robj
*vmPreviewObject(robj
*key
);
593 static int vmSwapOneObjectBlocking(void);
594 static int vmSwapOneObjectThreaded(void);
595 static int vmCanSwapOut(void);
596 static int tryFreeOneObjectFromFreelist(void);
597 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
598 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
599 static void vmCancelThreadedIOJob(robj
*o
);
600 static void lockThreadedIO(void);
601 static void unlockThreadedIO(void);
602 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
);
603 static void freeIOJob(iojob
*j
);
604 static void queueIOJob(iojob
*j
);
605 static int vmWriteObjectOnSwap(robj
*o
, off_t page
);
606 static robj
*vmReadObjectFromSwap(off_t page
, int type
);
607 static void waitEmptyIOJobsQueue(void);
608 static void vmReopenSwapFile(void);
609 static int vmFreePage(off_t page
);
610 static void zunionInterBlockClientOnSwappedKeys(redisClient
*c
);
611 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
);
612 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
);
613 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
);
614 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
615 static struct redisCommand
*lookupCommand(char *name
);
616 static void call(redisClient
*c
, struct redisCommand
*cmd
);
617 static void resetClient(redisClient
*c
);
618 static void convertToRealHash(robj
*o
);
619 static int pubsubUnsubscribeAllChannels(redisClient
*c
, int notify
);
620 static int pubsubUnsubscribeAllPatterns(redisClient
*c
, int notify
);
621 static void freePubsubPattern(void *p
);
622 static int listMatchPubsubPattern(void *a
, void *b
);
623 static int compareStringObjects(robj
*a
, robj
*b
);
626 static void authCommand(redisClient
*c
);
627 static void pingCommand(redisClient
*c
);
628 static void echoCommand(redisClient
*c
);
629 static void setCommand(redisClient
*c
);
630 static void setnxCommand(redisClient
*c
);
631 static void setexCommand(redisClient
*c
);
632 static void getCommand(redisClient
*c
);
633 static void delCommand(redisClient
*c
);
634 static void existsCommand(redisClient
*c
);
635 static void incrCommand(redisClient
*c
);
636 static void decrCommand(redisClient
*c
);
637 static void incrbyCommand(redisClient
*c
);
638 static void decrbyCommand(redisClient
*c
);
639 static void selectCommand(redisClient
*c
);
640 static void randomkeyCommand(redisClient
*c
);
641 static void keysCommand(redisClient
*c
);
642 static void dbsizeCommand(redisClient
*c
);
643 static void lastsaveCommand(redisClient
*c
);
644 static void saveCommand(redisClient
*c
);
645 static void bgsaveCommand(redisClient
*c
);
646 static void bgrewriteaofCommand(redisClient
*c
);
647 static void shutdownCommand(redisClient
*c
);
648 static void moveCommand(redisClient
*c
);
649 static void renameCommand(redisClient
*c
);
650 static void renamenxCommand(redisClient
*c
);
651 static void lpushCommand(redisClient
*c
);
652 static void rpushCommand(redisClient
*c
);
653 static void lpopCommand(redisClient
*c
);
654 static void rpopCommand(redisClient
*c
);
655 static void llenCommand(redisClient
*c
);
656 static void lindexCommand(redisClient
*c
);
657 static void lrangeCommand(redisClient
*c
);
658 static void ltrimCommand(redisClient
*c
);
659 static void typeCommand(redisClient
*c
);
660 static void lsetCommand(redisClient
*c
);
661 static void saddCommand(redisClient
*c
);
662 static void sremCommand(redisClient
*c
);
663 static void smoveCommand(redisClient
*c
);
664 static void sismemberCommand(redisClient
*c
);
665 static void scardCommand(redisClient
*c
);
666 static void spopCommand(redisClient
*c
);
667 static void srandmemberCommand(redisClient
*c
);
668 static void sinterCommand(redisClient
*c
);
669 static void sinterstoreCommand(redisClient
*c
);
670 static void sunionCommand(redisClient
*c
);
671 static void sunionstoreCommand(redisClient
*c
);
672 static void sdiffCommand(redisClient
*c
);
673 static void sdiffstoreCommand(redisClient
*c
);
674 static void syncCommand(redisClient
*c
);
675 static void flushdbCommand(redisClient
*c
);
676 static void flushallCommand(redisClient
*c
);
677 static void sortCommand(redisClient
*c
);
678 static void lremCommand(redisClient
*c
);
679 static void rpoplpushcommand(redisClient
*c
);
680 static void infoCommand(redisClient
*c
);
681 static void mgetCommand(redisClient
*c
);
682 static void monitorCommand(redisClient
*c
);
683 static void expireCommand(redisClient
*c
);
684 static void expireatCommand(redisClient
*c
);
685 static void getsetCommand(redisClient
*c
);
686 static void ttlCommand(redisClient
*c
);
687 static void slaveofCommand(redisClient
*c
);
688 static void debugCommand(redisClient
*c
);
689 static void msetCommand(redisClient
*c
);
690 static void msetnxCommand(redisClient
*c
);
691 static void zaddCommand(redisClient
*c
);
692 static void zincrbyCommand(redisClient
*c
);
693 static void zrangeCommand(redisClient
*c
);
694 static void zrangebyscoreCommand(redisClient
*c
);
695 static void zcountCommand(redisClient
*c
);
696 static void zrevrangeCommand(redisClient
*c
);
697 static void zcardCommand(redisClient
*c
);
698 static void zremCommand(redisClient
*c
);
699 static void zscoreCommand(redisClient
*c
);
700 static void zremrangebyscoreCommand(redisClient
*c
);
701 static void multiCommand(redisClient
*c
);
702 static void execCommand(redisClient
*c
);
703 static void discardCommand(redisClient
*c
);
704 static void blpopCommand(redisClient
*c
);
705 static void brpopCommand(redisClient
*c
);
706 static void appendCommand(redisClient
*c
);
707 static void substrCommand(redisClient
*c
);
708 static void zrankCommand(redisClient
*c
);
709 static void zrevrankCommand(redisClient
*c
);
710 static void hsetCommand(redisClient
*c
);
711 static void hsetnxCommand(redisClient
*c
);
712 static void hgetCommand(redisClient
*c
);
713 static void hmsetCommand(redisClient
*c
);
714 static void hmgetCommand(redisClient
*c
);
715 static void hdelCommand(redisClient
*c
);
716 static void hlenCommand(redisClient
*c
);
717 static void zremrangebyrankCommand(redisClient
*c
);
718 static void zunionCommand(redisClient
*c
);
719 static void zinterCommand(redisClient
*c
);
720 static void hkeysCommand(redisClient
*c
);
721 static void hvalsCommand(redisClient
*c
);
722 static void hgetallCommand(redisClient
*c
);
723 static void hexistsCommand(redisClient
*c
);
724 static void configCommand(redisClient
*c
);
725 static void hincrbyCommand(redisClient
*c
);
726 static void subscribeCommand(redisClient
*c
);
727 static void unsubscribeCommand(redisClient
*c
);
728 static void psubscribeCommand(redisClient
*c
);
729 static void punsubscribeCommand(redisClient
*c
);
730 static void publishCommand(redisClient
*c
);
732 /*================================= Globals ================================= */
735 static struct redisServer server
; /* server global state */
736 static struct redisCommand cmdTable
[] = {
737 {"get",getCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
738 {"set",setCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
739 {"setnx",setnxCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
740 {"setex",setexCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
741 {"append",appendCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
742 {"substr",substrCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
743 {"del",delCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
744 {"exists",existsCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
745 {"incr",incrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
746 {"decr",decrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
747 {"mget",mgetCommand
,-2,REDIS_CMD_INLINE
,NULL
,1,-1,1},
748 {"rpush",rpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
749 {"lpush",lpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
750 {"rpop",rpopCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
751 {"lpop",lpopCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
752 {"brpop",brpopCommand
,-3,REDIS_CMD_INLINE
,NULL
,1,1,1},
753 {"blpop",blpopCommand
,-3,REDIS_CMD_INLINE
,NULL
,1,1,1},
754 {"llen",llenCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
755 {"lindex",lindexCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
756 {"lset",lsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
757 {"lrange",lrangeCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
758 {"ltrim",ltrimCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
759 {"lrem",lremCommand
,4,REDIS_CMD_BULK
,NULL
,1,1,1},
760 {"rpoplpush",rpoplpushcommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,2,1},
761 {"sadd",saddCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
762 {"srem",sremCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
763 {"smove",smoveCommand
,4,REDIS_CMD_BULK
,NULL
,1,2,1},
764 {"sismember",sismemberCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
765 {"scard",scardCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
766 {"spop",spopCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
767 {"srandmember",srandmemberCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
768 {"sinter",sinterCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,-1,1},
769 {"sinterstore",sinterstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,2,-1,1},
770 {"sunion",sunionCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,-1,1},
771 {"sunionstore",sunionstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,2,-1,1},
772 {"sdiff",sdiffCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,-1,1},
773 {"sdiffstore",sdiffstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,2,-1,1},
774 {"smembers",sinterCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
775 {"zadd",zaddCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
776 {"zincrby",zincrbyCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
777 {"zrem",zremCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
778 {"zremrangebyscore",zremrangebyscoreCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
779 {"zremrangebyrank",zremrangebyrankCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
780 {"zunion",zunionCommand
,-4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,zunionInterBlockClientOnSwappedKeys
,0,0,0},
781 {"zinter",zinterCommand
,-4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,zunionInterBlockClientOnSwappedKeys
,0,0,0},
782 {"zrange",zrangeCommand
,-4,REDIS_CMD_INLINE
,NULL
,1,1,1},
783 {"zrangebyscore",zrangebyscoreCommand
,-4,REDIS_CMD_INLINE
,NULL
,1,1,1},
784 {"zcount",zcountCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
785 {"zrevrange",zrevrangeCommand
,-4,REDIS_CMD_INLINE
,NULL
,1,1,1},
786 {"zcard",zcardCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
787 {"zscore",zscoreCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
788 {"zrank",zrankCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
789 {"zrevrank",zrevrankCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
790 {"hset",hsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
791 {"hsetnx",hsetnxCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
792 {"hget",hgetCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
793 {"hmset",hmsetCommand
,-4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
794 {"hmget",hmgetCommand
,-3,REDIS_CMD_BULK
,NULL
,1,1,1},
795 {"hincrby",hincrbyCommand
,4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
796 {"hdel",hdelCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
797 {"hlen",hlenCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
798 {"hkeys",hkeysCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
799 {"hvals",hvalsCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
800 {"hgetall",hgetallCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
801 {"hexists",hexistsCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
802 {"incrby",incrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
803 {"decrby",decrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
804 {"getset",getsetCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
805 {"mset",msetCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,-1,2},
806 {"msetnx",msetnxCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,-1,2},
807 {"randomkey",randomkeyCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
808 {"select",selectCommand
,2,REDIS_CMD_INLINE
,NULL
,0,0,0},
809 {"move",moveCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
810 {"rename",renameCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
811 {"renamenx",renamenxCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
812 {"expire",expireCommand
,3,REDIS_CMD_INLINE
,NULL
,0,0,0},
813 {"expireat",expireatCommand
,3,REDIS_CMD_INLINE
,NULL
,0,0,0},
814 {"keys",keysCommand
,2,REDIS_CMD_INLINE
,NULL
,0,0,0},
815 {"dbsize",dbsizeCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
816 {"auth",authCommand
,2,REDIS_CMD_INLINE
,NULL
,0,0,0},
817 {"ping",pingCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
818 {"echo",echoCommand
,2,REDIS_CMD_BULK
,NULL
,0,0,0},
819 {"save",saveCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
820 {"bgsave",bgsaveCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
821 {"bgrewriteaof",bgrewriteaofCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
822 {"shutdown",shutdownCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
823 {"lastsave",lastsaveCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
824 {"type",typeCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
825 {"multi",multiCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
826 {"exec",execCommand
,1,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
827 {"discard",discardCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
828 {"sync",syncCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
829 {"flushdb",flushdbCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
830 {"flushall",flushallCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
831 {"sort",sortCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
832 {"info",infoCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
833 {"monitor",monitorCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
834 {"ttl",ttlCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
835 {"slaveof",slaveofCommand
,3,REDIS_CMD_INLINE
,NULL
,0,0,0},
836 {"debug",debugCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
837 {"config",configCommand
,-2,REDIS_CMD_BULK
,NULL
,0,0,0},
838 {"subscribe",subscribeCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
839 {"unsubscribe",unsubscribeCommand
,-1,REDIS_CMD_INLINE
,NULL
,0,0,0},
840 {"psubscribe",psubscribeCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
841 {"punsubscribe",punsubscribeCommand
,-1,REDIS_CMD_INLINE
,NULL
,0,0,0},
842 {"publish",publishCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_FORCE_REPLICATION
,NULL
,0,0,0},
843 {NULL
,NULL
,0,0,NULL
,0,0,0}
846 /*============================ Utility functions ============================ */
848 /* Glob-style pattern matching. */
849 static int stringmatchlen(const char *pattern
, int patternLen
,
850 const char *string
, int stringLen
, int nocase
)
855 while (pattern
[1] == '*') {
860 return 1; /* match */
862 if (stringmatchlen(pattern
+1, patternLen
-1,
863 string
, stringLen
, nocase
))
864 return 1; /* match */
868 return 0; /* no match */
872 return 0; /* no match */
882 not = pattern
[0] == '^';
889 if (pattern
[0] == '\\') {
892 if (pattern
[0] == string
[0])
894 } else if (pattern
[0] == ']') {
896 } else if (patternLen
== 0) {
900 } else if (pattern
[1] == '-' && patternLen
>= 3) {
901 int start
= pattern
[0];
902 int end
= pattern
[2];
910 start
= tolower(start
);
916 if (c
>= start
&& c
<= end
)
920 if (pattern
[0] == string
[0])
923 if (tolower((int)pattern
[0]) == tolower((int)string
[0]))
933 return 0; /* no match */
939 if (patternLen
>= 2) {
946 if (pattern
[0] != string
[0])
947 return 0; /* no match */
949 if (tolower((int)pattern
[0]) != tolower((int)string
[0]))
950 return 0; /* no match */
958 if (stringLen
== 0) {
959 while(*pattern
== '*') {
966 if (patternLen
== 0 && stringLen
== 0)
/* Glob-style match of two NUL-terminated strings. Thin wrapper that measures
 * both 'pattern' and 'string' with strlen() and forwards them, together with
 * 'nocase', to stringmatchlen(); the result of that call is returned
 * unchanged. */
971 static int stringmatch(const char *pattern
, const char *string
, int nocase
) {
972 return stringmatchlen(pattern
,strlen(pattern
),string
,strlen(string
),nocase
);
975 /* Convert a string representing an amount of memory into the number of
976 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
979 * On parsing error, if err is not NULL, *err is set to 1, otherwise it's
981 static long long memtoll(const char *p
, int *err
) {
984 long mul
; /* unit multiplier */
989 /* Search the first non digit character. */
992 while(*u
&& isdigit(*u
)) u
++;
993 if (*u
== '\0' || !strcasecmp(u
,"b")) {
995 } else if (!strcasecmp(u
,"k")) {
997 } else if (!strcasecmp(u
,"kb")) {
999 } else if (!strcasecmp(u
,"m")) {
1001 } else if (!strcasecmp(u
,"mb")) {
1003 } else if (!strcasecmp(u
,"g")) {
1004 mul
= 1000L*1000*1000;
1005 } else if (!strcasecmp(u
,"gb")) {
1006 mul
= 1024L*1024*1024;
1012 if (digits
>= sizeof(buf
)) {
1016 memcpy(buf
,p
,digits
);
1018 val
= strtoll(buf
,NULL
,10);
1022 static void redisLog(int level
, const char *fmt
, ...) {
1026 fp
= (server
.logfile
== NULL
) ? stdout
: fopen(server
.logfile
,"a");
1030 if (level
>= server
.verbosity
) {
1036 strftime(buf
,64,"%d %b %H:%M:%S",localtime(&now
));
1037 fprintf(fp
,"[%d] %s %c ",(int)getpid(),buf
,c
[level
]);
1038 vfprintf(fp
, fmt
, ap
);
1044 if (server
.logfile
) fclose(fp
);
1047 /*====================== Hash table type implementation ==================== */
1049 /* This is a hash table type that uses the SDS dynamic strings library as
1050 * keys and redis objects as values (objects can hold SDS strings,
1053 static void dictVanillaFree(void *privdata
, void *val
)
1055 DICT_NOTUSED(privdata
);
/* Value destructor for dictionaries whose values are lists (it is the
 * val destructor of keylistDictType): 'val' is cast to a list pointer and
 * handed to listRelease(). 'privdata' is not used. */
1059 static void dictListDestructor(void *privdata
, void *val
)
1061 DICT_NOTUSED(privdata
);
1062 listRelease((list
*)val
);
1065 static int sdsDictKeyCompare(void *privdata
, const void *key1
,
1069 DICT_NOTUSED(privdata
);
1071 l1
= sdslen((sds
)key1
);
1072 l2
= sdslen((sds
)key2
);
1073 if (l1
!= l2
) return 0;
1074 return memcmp(key1
, key2
, l1
) == 0;
1077 static void dictRedisObjectDestructor(void *privdata
, void *val
)
1079 DICT_NOTUSED(privdata
);
1081 if (val
== NULL
) return; /* Values of swapped out keys as set to NULL */
1085 static int dictObjKeyCompare(void *privdata
, const void *key1
,
1088 const robj
*o1
= key1
, *o2
= key2
;
1089 return sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
/* Hash callback for dictionaries keyed by robj. 'key' is read as an robj
 * whose ptr field is an sds string; the sdslen() bytes of that string are
 * hashed with dictGenHashFunction() and the result is returned. */
1092 static unsigned int dictObjHash(const void *key
) {
1093 const robj
*o
= key
;
1094 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1097 static int dictEncObjKeyCompare(void *privdata
, const void *key1
,
1100 robj
*o1
= (robj
*) key1
, *o2
= (robj
*) key2
;
1103 if (o1
->encoding
== REDIS_ENCODING_INT
&&
1104 o2
->encoding
== REDIS_ENCODING_INT
&&
1105 o1
->ptr
== o2
->ptr
) return 1;
1107 o1
= getDecodedObject(o1
);
1108 o2
= getDecodedObject(o2
);
1109 cmp
= sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
1115 static unsigned int dictEncObjHash(const void *key
) {
1116 robj
*o
= (robj
*) key
;
1118 if (o
->encoding
== REDIS_ENCODING_RAW
) {
1119 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1121 if (o
->encoding
== REDIS_ENCODING_INT
) {
1125 len
= snprintf(buf
,32,"%ld",(long)o
->ptr
);
1126 return dictGenHashFunction((unsigned char*)buf
, len
);
1130 o
= getDecodedObject(o
);
1131 hash
= dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1138 /* Sets type and expires */
1139 static dictType setDictType
= {
1140 dictEncObjHash
, /* hash function */
1143 dictEncObjKeyCompare
, /* key compare */
1144 dictRedisObjectDestructor
, /* key destructor */
1145 NULL
/* val destructor */
1148 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1149 static dictType zsetDictType
= {
1150 dictEncObjHash
, /* hash function */
1153 dictEncObjKeyCompare
, /* key compare */
1154 dictRedisObjectDestructor
, /* key destructor */
1155 dictVanillaFree
/* val destructor of malloc(sizeof(double)) */
1159 static dictType dbDictType
= {
1160 dictObjHash
, /* hash function */
1163 dictObjKeyCompare
, /* key compare */
1164 dictRedisObjectDestructor
, /* key destructor */
1165 dictRedisObjectDestructor
/* val destructor */
1169 static dictType keyptrDictType
= {
1170 dictObjHash
, /* hash function */
1173 dictObjKeyCompare
, /* key compare */
1174 dictRedisObjectDestructor
, /* key destructor */
1175 NULL
/* val destructor */
1178 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1179 static dictType hashDictType
= {
1180 dictEncObjHash
, /* hash function */
1183 dictEncObjKeyCompare
, /* key compare */
1184 dictRedisObjectDestructor
, /* key destructor */
1185 dictRedisObjectDestructor
/* val destructor */
1188 /* Keylist hash table type has unencoded redis objects as keys and
1189 * lists as values. It's used for blocking operations (BLPOP) and to
1190 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1191 static dictType keylistDictType
= {
1192 dictObjHash
, /* hash function */
1195 dictObjKeyCompare
, /* key compare */
1196 dictRedisObjectDestructor
, /* key destructor */
1197 dictListDestructor
/* val destructor */
1200 static void version();
1202 /* ========================= Random utility functions ======================= */
1204 /* Redis generally does not try to recover from out of memory conditions
1205 * when allocating objects or strings, it is not clear if it will be possible
1206 * to report this condition to the client since the networking layer itself
1207 * is based on heap allocation for send buffers, so we simply abort.
1208 * At least the code will be simpler to read... */
1209 static void oom(const char *msg
) {
1210 redisLog(REDIS_WARNING
, "%s: Out of memory\n",msg
);
1215 /* ====================== Redis server networking stuff ===================== */
1216 static void closeTimedoutClients(void) {
1219 time_t now
= time(NULL
);
1222 listRewind(server
.clients
,&li
);
1223 while ((ln
= listNext(&li
)) != NULL
) {
1224 c
= listNodeValue(ln
);
1225 if (server
.maxidletime
&&
1226 !(c
->flags
& REDIS_SLAVE
) && /* no timeout for slaves */
1227 !(c
->flags
& REDIS_MASTER
) && /* no timeout for masters */
1228 dictSize(c
->pubsub_channels
) == 0 && /* no timeout for pubsub */
1229 listLength(c
->pubsub_patterns
) == 0 &&
1230 (now
- c
->lastinteraction
> server
.maxidletime
))
1232 redisLog(REDIS_VERBOSE
,"Closing idle client");
1234 } else if (c
->flags
& REDIS_BLOCKED
) {
1235 if (c
->blockingto
!= 0 && c
->blockingto
< now
) {
1236 addReply(c
,shared
.nullmultibulk
);
1237 unblockClientWaitingData(c
);
/* Return non-zero when all of the following hold for 'dict': it has slots
 * (dictSlots) and entries (dictSize), it has more than DICT_HT_INITIAL_SIZE
 * slots, and its fill percentage (used*100/size) is below REDIS_HT_MINFILL.
 * tryResizeHashTables() uses the result to decide whether to call
 * dictResize(). */
1243 static int htNeedsResize(dict
*dict
) {
1244 long long size
, used
;
1246 size
= dictSlots(dict
);
1247 used
= dictSize(dict
);
1248 return (size
&& used
&& size
> DICT_HT_INITIAL_SIZE
&&
1249 (used
*100/size
< REDIS_HT_MINFILL
));
1252 /* If the percentage of used slots in the HT drops below REDIS_HT_MINFILL
1253 * we resize the hash table to save memory */
1254 static void tryResizeHashTables(void) {
1257 for (j
= 0; j
< server
.dbnum
; j
++) {
1258 if (htNeedsResize(server
.db
[j
].dict
))
1259 dictResize(server
.db
[j
].dict
);
1260 if (htNeedsResize(server
.db
[j
].expires
))
1261 dictResize(server
.db
[j
].expires
);
1265 /* Our hash table implementation performs rehashing incrementally while
1266 * we write/read from the hash table. Still if the server is idle, the hash
1267 * table will use two tables for a long time. So we try to use 1 millisecond
1268 * of CPU time at every serverCron() loop in order to rehash some key. */
1269 static void incrementallyRehash(void) {
1272 for (j
= 0; j
< server
.dbnum
; j
++) {
1273 if (dictIsRehashing(server
.db
[j
].dict
)) {
1274 dictRehashMilliseconds(server
.db
[j
].dict
,1);
1275 break; /* already used our millisecond for this loop... */
1280 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1281 void backgroundSaveDoneHandler(int statloc
) {
1282 int exitcode
= WEXITSTATUS(statloc
);
1283 int bysignal
= WIFSIGNALED(statloc
);
1285 if (!bysignal
&& exitcode
== 0) {
1286 redisLog(REDIS_NOTICE
,
1287 "Background saving terminated with success");
1289 server
.lastsave
= time(NULL
);
1290 } else if (!bysignal
&& exitcode
!= 0) {
1291 redisLog(REDIS_WARNING
, "Background saving error");
1293 redisLog(REDIS_WARNING
,
1294 "Background saving terminated by signal %d", WTERMSIG(statloc
));
1295 rdbRemoveTempFile(server
.bgsavechildpid
);
1297 server
.bgsavechildpid
= -1;
1298 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1299 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1300 updateSlavesWaitingBgsave(exitcode
== 0 ? REDIS_OK
: REDIS_ERR
);
1303 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1305 void backgroundRewriteDoneHandler(int statloc
) {
1306 int exitcode
= WEXITSTATUS(statloc
);
1307 int bysignal
= WIFSIGNALED(statloc
);
1309 if (!bysignal
&& exitcode
== 0) {
1313 redisLog(REDIS_NOTICE
,
1314 "Background append only file rewriting terminated with success");
1315 /* Now it's time to flush the differences accumulated by the parent */
1316 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) server
.bgrewritechildpid
);
1317 fd
= open(tmpfile
,O_WRONLY
|O_APPEND
);
1319 redisLog(REDIS_WARNING
, "Not able to open the temp append only file produced by the child: %s", strerror(errno
));
1322 /* Flush our data... */
1323 if (write(fd
,server
.bgrewritebuf
,sdslen(server
.bgrewritebuf
)) !=
1324 (signed) sdslen(server
.bgrewritebuf
)) {
1325 redisLog(REDIS_WARNING
, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno
));
1329 redisLog(REDIS_NOTICE
,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server
.bgrewritebuf
));
1330 /* Now our work is to rename the temp file into the stable file. And
1331 * switch the file descriptor used by the server for append only. */
1332 if (rename(tmpfile
,server
.appendfilename
) == -1) {
1333 redisLog(REDIS_WARNING
,"Can't rename the temp append only file into the stable one: %s", strerror(errno
));
1337 /* Mission completed... almost */
1338 redisLog(REDIS_NOTICE
,"Append only file successfully rewritten.");
1339 if (server
.appendfd
!= -1) {
1340 /* If append only is actually enabled... */
1341 close(server
.appendfd
);
1342 server
.appendfd
= fd
;
1344 server
.appendseldb
= -1; /* Make sure it will issue SELECT */
1345 redisLog(REDIS_NOTICE
,"The new append only file was selected for future appends.");
1347 /* If append only is disabled we just generate a dump in this
1348 * format. Why not? */
1351 } else if (!bysignal
&& exitcode
!= 0) {
1352 redisLog(REDIS_WARNING
, "Background append only file rewriting error");
1354 redisLog(REDIS_WARNING
,
1355 "Background append only file rewriting terminated by signal %d",
1359 sdsfree(server
.bgrewritebuf
);
1360 server
.bgrewritebuf
= sdsempty();
1361 aofRemoveTempFile(server
.bgrewritechildpid
);
1362 server
.bgrewritechildpid
= -1;
1365 /* This function is called once a background process of some kind terminates,
1366 * as we want to avoid resizing the hash tables when there is a child in order
1367 * to play well with copy-on-write (otherwise when a resize happens lots of
1368 * memory pages are copied). The goal of this function is to update the ability
1369 * for dict.c to resize the hash tables according to whether or not we have
1370 * running children. */
1371 static void updateDictResizePolicy(void) {
1372 if (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1)
1375 dictDisableResize();
1378 static int serverCron(struct aeEventLoop
*eventLoop
, long long id
, void *clientData
) {
1379 int j
, loops
= server
.cronloops
++;
1380 REDIS_NOTUSED(eventLoop
);
1382 REDIS_NOTUSED(clientData
);
1384 /* We take a cached value of the unix time in the global state because
1385 * with virtual memory and aging there is the need to store the current time
1386 * in objects at every object access, and accuracy is not needed.
1387 * To access a global var is faster than calling time(NULL) */
1388 server
.unixtime
= time(NULL
);
1390 /* Show some info about non-empty databases */
1391 for (j
= 0; j
< server
.dbnum
; j
++) {
1392 long long size
, used
, vkeys
;
1394 size
= dictSlots(server
.db
[j
].dict
);
1395 used
= dictSize(server
.db
[j
].dict
);
1396 vkeys
= dictSize(server
.db
[j
].expires
);
1397 if (!(loops
% 50) && (used
|| vkeys
)) {
1398 redisLog(REDIS_VERBOSE
,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j
,used
,vkeys
,size
);
1399 /* dictPrintStats(server.dict); */
1403 /* We don't want to resize the hash tables while a background saving
1404 * is in progress: the saving child is created using fork() that is
1405 * implemented with a copy-on-write semantic in most modern systems, so
1406 * if we resize the HT while there is the saving child at work actually
1407 * a lot of memory movements in the parent will cause a lot of pages
1409 if (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1) {
1410 if (!(loops
% 10)) tryResizeHashTables();
1411 if (server
.activerehashing
) incrementallyRehash();
1414 /* Show information about connected clients */
1415 if (!(loops
% 50)) {
1416 redisLog(REDIS_VERBOSE
,"%d clients connected (%d slaves), %zu bytes in use",
1417 listLength(server
.clients
)-listLength(server
.slaves
),
1418 listLength(server
.slaves
),
1419 zmalloc_used_memory());
1422 /* Close connections of timedout clients */
1423 if ((server
.maxidletime
&& !(loops
% 100)) || server
.blpop_blocked_clients
)
1424 closeTimedoutClients();
1426 /* Check if a background saving or AOF rewrite in progress terminated */
1427 if (server
.bgsavechildpid
!= -1 || server
.bgrewritechildpid
!= -1) {
1431 if ((pid
= wait3(&statloc
,WNOHANG
,NULL
)) != 0) {
1432 if (pid
== server
.bgsavechildpid
) {
1433 backgroundSaveDoneHandler(statloc
);
1435 backgroundRewriteDoneHandler(statloc
);
1437 updateDictResizePolicy();
1440 /* If there is not a background saving in progress check if
1441 * we have to save now */
1442 time_t now
= time(NULL
);
1443 for (j
= 0; j
< server
.saveparamslen
; j
++) {
1444 struct saveparam
*sp
= server
.saveparams
+j
;
1446 if (server
.dirty
>= sp
->changes
&&
1447 now
-server
.lastsave
> sp
->seconds
) {
1448 redisLog(REDIS_NOTICE
,"%d changes in %d seconds. Saving...",
1449 sp
->changes
, sp
->seconds
);
1450 rdbSaveBackground(server
.dbfilename
);
1456 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1457 * will use few CPU cycles if there are few expiring keys, otherwise
1458 * it will get more aggressive to avoid that too much memory is used by
1459 * keys that can be removed from the keyspace. */
1460 for (j
= 0; j
< server
.dbnum
; j
++) {
1462 redisDb
*db
= server
.db
+j
;
1464 /* Continue to expire if at the end of the cycle more than 25%
1465 * of the keys were expired. */
1467 long num
= dictSize(db
->expires
);
1468 time_t now
= time(NULL
);
1471 if (num
> REDIS_EXPIRELOOKUPS_PER_CRON
)
1472 num
= REDIS_EXPIRELOOKUPS_PER_CRON
;
1477 if ((de
= dictGetRandomKey(db
->expires
)) == NULL
) break;
1478 t
= (time_t) dictGetEntryVal(de
);
1480 deleteKey(db
,dictGetEntryKey(de
));
1482 server
.stat_expiredkeys
++;
1485 } while (expired
> REDIS_EXPIRELOOKUPS_PER_CRON
/4);
1488 /* Swap a few keys on disk if we are over the memory limit and VM
1489 * is enabled. Try to free objects from the free list first. */
1490 if (vmCanSwapOut()) {
1491 while (server
.vm_enabled
&& zmalloc_used_memory() >
1492 server
.vm_max_memory
)
1496 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
1497 retval
= (server
.vm_max_threads
== 0) ?
1498 vmSwapOneObjectBlocking() :
1499 vmSwapOneObjectThreaded();
1500 if (retval
== REDIS_ERR
&& !(loops
% 300) &&
1501 zmalloc_used_memory() >
1502 (server
.vm_max_memory
+server
.vm_max_memory
/10))
1504 redisLog(REDIS_WARNING
,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1506 /* Note that when using threaded I/O we free just one object,
1507 * because anyway when the I/O thread in charge to swap this
1508 * object out will finish, the handler of completed jobs
1509 * will try to swap more objects if we are still out of memory. */
1510 if (retval
== REDIS_ERR
|| server
.vm_max_threads
> 0) break;
1514 /* Check if we should connect to a MASTER */
1515 if (server
.replstate
== REDIS_REPL_CONNECT
&& !(loops
% 10)) {
1516 redisLog(REDIS_NOTICE
,"Connecting to MASTER...");
1517 if (syncWithMaster() == REDIS_OK
) {
1518 redisLog(REDIS_NOTICE
,"MASTER <-> SLAVE sync succeeded");
1524 /* This function gets called every time Redis is entering the
1525 * main loop of the event driven library, that is, before sleeping
1526 * for ready file descriptors. */
1527 static void beforeSleep(struct aeEventLoop
*eventLoop
) {
1528 REDIS_NOTUSED(eventLoop
);
1530 if (server
.vm_enabled
&& listLength(server
.io_ready_clients
)) {
1534 listRewind(server
.io_ready_clients
,&li
);
1535 while((ln
= listNext(&li
))) {
1536 redisClient
*c
= ln
->value
;
1537 struct redisCommand
*cmd
;
1539 /* Resume the client. */
1540 listDelNode(server
.io_ready_clients
,ln
);
1541 c
->flags
&= (~REDIS_IO_WAIT
);
1542 server
.vm_blocked_clients
--;
1543 aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
1544 readQueryFromClient
, c
);
1545 cmd
= lookupCommand(c
->argv
[0]->ptr
);
1546 assert(cmd
!= NULL
);
1549 /* There may be more data to process in the input buffer. */
1550 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0)
1551 processInputBuffer(c
);
1556 static void createSharedObjects(void) {
1559 shared
.crlf
= createObject(REDIS_STRING
,sdsnew("\r\n"));
1560 shared
.ok
= createObject(REDIS_STRING
,sdsnew("+OK\r\n"));
1561 shared
.err
= createObject(REDIS_STRING
,sdsnew("-ERR\r\n"));
1562 shared
.emptybulk
= createObject(REDIS_STRING
,sdsnew("$0\r\n\r\n"));
1563 shared
.czero
= createObject(REDIS_STRING
,sdsnew(":0\r\n"));
1564 shared
.cone
= createObject(REDIS_STRING
,sdsnew(":1\r\n"));
1565 shared
.nullbulk
= createObject(REDIS_STRING
,sdsnew("$-1\r\n"));
1566 shared
.nullmultibulk
= createObject(REDIS_STRING
,sdsnew("*-1\r\n"));
1567 shared
.emptymultibulk
= createObject(REDIS_STRING
,sdsnew("*0\r\n"));
1568 shared
.pong
= createObject(REDIS_STRING
,sdsnew("+PONG\r\n"));
1569 shared
.queued
= createObject(REDIS_STRING
,sdsnew("+QUEUED\r\n"));
1570 shared
.wrongtypeerr
= createObject(REDIS_STRING
,sdsnew(
1571 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1572 shared
.nokeyerr
= createObject(REDIS_STRING
,sdsnew(
1573 "-ERR no such key\r\n"));
1574 shared
.syntaxerr
= createObject(REDIS_STRING
,sdsnew(
1575 "-ERR syntax error\r\n"));
1576 shared
.sameobjecterr
= createObject(REDIS_STRING
,sdsnew(
1577 "-ERR source and destination objects are the same\r\n"));
1578 shared
.outofrangeerr
= createObject(REDIS_STRING
,sdsnew(
1579 "-ERR index out of range\r\n"));
1580 shared
.space
= createObject(REDIS_STRING
,sdsnew(" "));
1581 shared
.colon
= createObject(REDIS_STRING
,sdsnew(":"));
1582 shared
.plus
= createObject(REDIS_STRING
,sdsnew("+"));
1583 shared
.select0
= createStringObject("select 0\r\n",10);
1584 shared
.select1
= createStringObject("select 1\r\n",10);
1585 shared
.select2
= createStringObject("select 2\r\n",10);
1586 shared
.select3
= createStringObject("select 3\r\n",10);
1587 shared
.select4
= createStringObject("select 4\r\n",10);
1588 shared
.select5
= createStringObject("select 5\r\n",10);
1589 shared
.select6
= createStringObject("select 6\r\n",10);
1590 shared
.select7
= createStringObject("select 7\r\n",10);
1591 shared
.select8
= createStringObject("select 8\r\n",10);
1592 shared
.select9
= createStringObject("select 9\r\n",10);
1593 shared
.messagebulk
= createStringObject("$7\r\nmessage\r\n",13);
1594 shared
.pmessagebulk
= createStringObject("$8\r\npmessage\r\n",14);
1595 shared
.subscribebulk
= createStringObject("$9\r\nsubscribe\r\n",15);
1596 shared
.unsubscribebulk
= createStringObject("$11\r\nunsubscribe\r\n",18);
1597 shared
.psubscribebulk
= createStringObject("$10\r\npsubscribe\r\n",17);
1598 shared
.punsubscribebulk
= createStringObject("$12\r\npunsubscribe\r\n",19);
1599 shared
.mbulk3
= createStringObject("*3\r\n",4);
1600 shared
.mbulk4
= createStringObject("*4\r\n",4);
1601 for (j
= 0; j
< REDIS_SHARED_INTEGERS
; j
++) {
1602 shared
.integers
[j
] = createObject(REDIS_STRING
,(void*)(long)j
);
1603 shared
.integers
[j
]->encoding
= REDIS_ENCODING_INT
;
/* Append one (seconds, changes) save point to server.saveparams. The array
 * is grown by one element with zrealloc(), the new last slot is filled in,
 * and server.saveparamslen is incremented. serverCron() walks this array
 * and starts a background save when server.dirty >= changes and more than
 * 'seconds' seconds have passed since server.lastsave. */
1607 static void appendServerSaveParams(time_t seconds
, int changes
) {
1608 server
.saveparams
= zrealloc(server
.saveparams
,sizeof(struct saveparam
)*(server
.saveparamslen
+1));
1609 server
.saveparams
[server
.saveparamslen
].seconds
= seconds
;
1610 server
.saveparams
[server
.saveparamslen
].changes
= changes
;
1611 server
.saveparamslen
++;
/* Drop every configured save point: free server.saveparams, set the pointer
 * to NULL and reset server.saveparamslen to 0. */
1614 static void resetServerSaveParams() {
1615 zfree(server
.saveparams
);
1616 server
.saveparams
= NULL
;
1617 server
.saveparamslen
= 0;
1620 static void initServerConfig() {
1621 server
.dbnum
= REDIS_DEFAULT_DBNUM
;
1622 server
.port
= REDIS_SERVERPORT
;
1623 server
.verbosity
= REDIS_VERBOSE
;
1624 server
.maxidletime
= REDIS_MAXIDLETIME
;
1625 server
.saveparams
= NULL
;
1626 server
.logfile
= NULL
; /* NULL = log on standard output */
1627 server
.bindaddr
= NULL
;
1628 server
.glueoutputbuf
= 1;
1629 server
.daemonize
= 0;
1630 server
.appendonly
= 0;
1631 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1632 server
.lastfsync
= time(NULL
);
1633 server
.appendfd
= -1;
1634 server
.appendseldb
= -1; /* Make sure the first time will not match */
1635 server
.pidfile
= zstrdup("/var/run/redis.pid");
1636 server
.dbfilename
= zstrdup("dump.rdb");
1637 server
.appendfilename
= zstrdup("appendonly.aof");
1638 server
.requirepass
= NULL
;
1639 server
.rdbcompression
= 1;
1640 server
.activerehashing
= 1;
1641 server
.maxclients
= 0;
1642 server
.blpop_blocked_clients
= 0;
1643 server
.maxmemory
= 0;
1644 server
.vm_enabled
= 0;
1645 server
.vm_swap_file
= zstrdup("/tmp/redis-%p.vm");
1646 server
.vm_page_size
= 256; /* 256 bytes per page */
1647 server
.vm_pages
= 1024*1024*100; /* 104 millions of pages */
1648 server
.vm_max_memory
= 1024LL*1024*1024*1; /* 1 GB of RAM */
1649 server
.vm_max_threads
= 4;
1650 server
.vm_blocked_clients
= 0;
1651 server
.hash_max_zipmap_entries
= REDIS_HASH_MAX_ZIPMAP_ENTRIES
;
1652 server
.hash_max_zipmap_value
= REDIS_HASH_MAX_ZIPMAP_VALUE
;
1654 resetServerSaveParams();
1656 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1657 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1658 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1659 /* Replication related */
1661 server
.masterauth
= NULL
;
1662 server
.masterhost
= NULL
;
1663 server
.masterport
= 6379;
1664 server
.master
= NULL
;
1665 server
.replstate
= REDIS_REPL_NONE
;
1667 /* Double constants initialization */
1669 R_PosInf
= 1.0/R_Zero
;
1670 R_NegInf
= -1.0/R_Zero
;
1671 R_Nan
= R_Zero
/R_Zero
;
1674 static void initServer() {
1677 signal(SIGHUP
, SIG_IGN
);
1678 signal(SIGPIPE
, SIG_IGN
);
1679 setupSigSegvAction();
1681 server
.devnull
= fopen("/dev/null","w");
1682 if (server
.devnull
== NULL
) {
1683 redisLog(REDIS_WARNING
, "Can't open /dev/null: %s", server
.neterr
);
1686 server
.clients
= listCreate();
1687 server
.slaves
= listCreate();
1688 server
.monitors
= listCreate();
1689 server
.objfreelist
= listCreate();
1690 createSharedObjects();
1691 server
.el
= aeCreateEventLoop();
1692 server
.db
= zmalloc(sizeof(redisDb
)*server
.dbnum
);
1693 server
.fd
= anetTcpServer(server
.neterr
, server
.port
, server
.bindaddr
);
1694 if (server
.fd
== -1) {
1695 redisLog(REDIS_WARNING
, "Opening TCP port: %s", server
.neterr
);
1698 for (j
= 0; j
< server
.dbnum
; j
++) {
1699 server
.db
[j
].dict
= dictCreate(&dbDictType
,NULL
);
1700 server
.db
[j
].expires
= dictCreate(&keyptrDictType
,NULL
);
1701 server
.db
[j
].blockingkeys
= dictCreate(&keylistDictType
,NULL
);
1702 if (server
.vm_enabled
)
1703 server
.db
[j
].io_keys
= dictCreate(&keylistDictType
,NULL
);
1704 server
.db
[j
].id
= j
;
1706 server
.pubsub_channels
= dictCreate(&keylistDictType
,NULL
);
1707 server
.pubsub_patterns
= listCreate();
1708 listSetFreeMethod(server
.pubsub_patterns
,freePubsubPattern
);
1709 listSetMatchMethod(server
.pubsub_patterns
,listMatchPubsubPattern
);
1710 server
.cronloops
= 0;
1711 server
.bgsavechildpid
= -1;
1712 server
.bgrewritechildpid
= -1;
1713 server
.bgrewritebuf
= sdsempty();
1714 server
.lastsave
= time(NULL
);
1716 server
.stat_numcommands
= 0;
1717 server
.stat_numconnections
= 0;
1718 server
.stat_expiredkeys
= 0;
1719 server
.stat_starttime
= time(NULL
);
1720 server
.unixtime
= time(NULL
);
1721 aeCreateTimeEvent(server
.el
, 1, serverCron
, NULL
, NULL
);
1722 if (aeCreateFileEvent(server
.el
, server
.fd
, AE_READABLE
,
1723 acceptHandler
, NULL
) == AE_ERR
) oom("creating file event");
1725 if (server
.appendonly
) {
1726 server
.appendfd
= open(server
.appendfilename
,O_WRONLY
|O_APPEND
|O_CREAT
,0644);
1727 if (server
.appendfd
== -1) {
1728 redisLog(REDIS_WARNING
, "Can't open the append-only file: %s",
1734 if (server
.vm_enabled
) vmInit();
1737 /* Empty the whole database */
1738 static long long emptyDb() {
1740 long long removed
= 0;
1742 for (j
= 0; j
< server
.dbnum
; j
++) {
1743 removed
+= dictSize(server
.db
[j
].dict
);
1744 dictEmpty(server
.db
[j
].dict
);
1745 dictEmpty(server
.db
[j
].expires
);
1750 static int yesnotoi(char *s
) {
1751 if (!strcasecmp(s
,"yes")) return 1;
1752 else if (!strcasecmp(s
,"no")) return 0;
1756 /* I agree, this is a very rudimental way to load a configuration...
1757 will improve later if the config gets more complex */
1758 static void loadServerConfig(char *filename
) {
1760 char buf
[REDIS_CONFIGLINE_MAX
+1], *err
= NULL
;
1764 if (filename
[0] == '-' && filename
[1] == '\0')
1767 if ((fp
= fopen(filename
,"r")) == NULL
) {
1768 redisLog(REDIS_WARNING
, "Fatal error, can't open config file '%s'", filename
);
1773 while(fgets(buf
,REDIS_CONFIGLINE_MAX
+1,fp
) != NULL
) {
1779 line
= sdstrim(line
," \t\r\n");
1781 /* Skip comments and blank lines*/
1782 if (line
[0] == '#' || line
[0] == '\0') {
1787 /* Split into arguments */
1788 argv
= sdssplitlen(line
,sdslen(line
)," ",1,&argc
);
1789 sdstolower(argv
[0]);
1791 /* Execute config directives */
1792 if (!strcasecmp(argv
[0],"timeout") && argc
== 2) {
1793 server
.maxidletime
= atoi(argv
[1]);
1794 if (server
.maxidletime
< 0) {
1795 err
= "Invalid timeout value"; goto loaderr
;
1797 } else if (!strcasecmp(argv
[0],"port") && argc
== 2) {
1798 server
.port
= atoi(argv
[1]);
1799 if (server
.port
< 1 || server
.port
> 65535) {
1800 err
= "Invalid port"; goto loaderr
;
1802 } else if (!strcasecmp(argv
[0],"bind") && argc
== 2) {
1803 server
.bindaddr
= zstrdup(argv
[1]);
1804 } else if (!strcasecmp(argv
[0],"save") && argc
== 3) {
1805 int seconds
= atoi(argv
[1]);
1806 int changes
= atoi(argv
[2]);
1807 if (seconds
< 1 || changes
< 0) {
1808 err
= "Invalid save parameters"; goto loaderr
;
1810 appendServerSaveParams(seconds
,changes
);
1811 } else if (!strcasecmp(argv
[0],"dir") && argc
== 2) {
1812 if (chdir(argv
[1]) == -1) {
1813 redisLog(REDIS_WARNING
,"Can't chdir to '%s': %s",
1814 argv
[1], strerror(errno
));
1817 } else if (!strcasecmp(argv
[0],"loglevel") && argc
== 2) {
1818 if (!strcasecmp(argv
[1],"debug")) server
.verbosity
= REDIS_DEBUG
;
1819 else if (!strcasecmp(argv
[1],"verbose")) server
.verbosity
= REDIS_VERBOSE
;
1820 else if (!strcasecmp(argv
[1],"notice")) server
.verbosity
= REDIS_NOTICE
;
1821 else if (!strcasecmp(argv
[1],"warning")) server
.verbosity
= REDIS_WARNING
;
1823 err
= "Invalid log level. Must be one of debug, notice, warning";
1826 } else if (!strcasecmp(argv
[0],"logfile") && argc
== 2) {
1829 server
.logfile
= zstrdup(argv
[1]);
1830 if (!strcasecmp(server
.logfile
,"stdout")) {
1831 zfree(server
.logfile
);
1832 server
.logfile
= NULL
;
1834 if (server
.logfile
) {
1835 /* Test if we are able to open the file. The server will not
1836 * be able to abort just for this problem later... */
1837 logfp
= fopen(server
.logfile
,"a");
1838 if (logfp
== NULL
) {
1839 err
= sdscatprintf(sdsempty(),
1840 "Can't open the log file: %s", strerror(errno
));
1845 } else if (!strcasecmp(argv
[0],"databases") && argc
== 2) {
1846 server
.dbnum
= atoi(argv
[1]);
1847 if (server
.dbnum
< 1) {
1848 err
= "Invalid number of databases"; goto loaderr
;
1850 } else if (!strcasecmp(argv
[0],"include") && argc
== 2) {
1851 loadServerConfig(argv
[1]);
1852 } else if (!strcasecmp(argv
[0],"maxclients") && argc
== 2) {
1853 server
.maxclients
= atoi(argv
[1]);
1854 } else if (!strcasecmp(argv
[0],"maxmemory") && argc
== 2) {
1855 server
.maxmemory
= memtoll(argv
[1],NULL
);
1856 } else if (!strcasecmp(argv
[0],"slaveof") && argc
== 3) {
1857 server
.masterhost
= sdsnew(argv
[1]);
1858 server
.masterport
= atoi(argv
[2]);
1859 server
.replstate
= REDIS_REPL_CONNECT
;
1860 } else if (!strcasecmp(argv
[0],"masterauth") && argc
== 2) {
1861 server
.masterauth
= zstrdup(argv
[1]);
1862 } else if (!strcasecmp(argv
[0],"glueoutputbuf") && argc
== 2) {
1863 if ((server
.glueoutputbuf
= yesnotoi(argv
[1])) == -1) {
1864 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1866 } else if (!strcasecmp(argv
[0],"rdbcompression") && argc
== 2) {
1867 if ((server
.rdbcompression
= yesnotoi(argv
[1])) == -1) {
1868 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1870 } else if (!strcasecmp(argv
[0],"activerehashing") && argc
== 2) {
1871 if ((server
.activerehashing
= yesnotoi(argv
[1])) == -1) {
1872 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1874 } else if (!strcasecmp(argv
[0],"daemonize") && argc
== 2) {
1875 if ((server
.daemonize
= yesnotoi(argv
[1])) == -1) {
1876 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1878 } else if (!strcasecmp(argv
[0],"appendonly") && argc
== 2) {
1879 if ((server
.appendonly
= yesnotoi(argv
[1])) == -1) {
1880 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1882 } else if (!strcasecmp(argv
[0],"appendfsync") && argc
== 2) {
1883 if (!strcasecmp(argv
[1],"no")) {
1884 server
.appendfsync
= APPENDFSYNC_NO
;
1885 } else if (!strcasecmp(argv
[1],"always")) {
1886 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1887 } else if (!strcasecmp(argv
[1],"everysec")) {
1888 server
.appendfsync
= APPENDFSYNC_EVERYSEC
;
1890 err
= "argument must be 'no', 'always' or 'everysec'";
1893 } else if (!strcasecmp(argv
[0],"requirepass") && argc
== 2) {
1894 server
.requirepass
= zstrdup(argv
[1]);
1895 } else if (!strcasecmp(argv
[0],"pidfile") && argc
== 2) {
1896 zfree(server
.pidfile
);
1897 server
.pidfile
= zstrdup(argv
[1]);
1898 } else if (!strcasecmp(argv
[0],"dbfilename") && argc
== 2) {
1899 zfree(server
.dbfilename
);
1900 server
.dbfilename
= zstrdup(argv
[1]);
1901 } else if (!strcasecmp(argv
[0],"vm-enabled") && argc
== 2) {
1902 if ((server
.vm_enabled
= yesnotoi(argv
[1])) == -1) {
1903 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1905 } else if (!strcasecmp(argv
[0],"vm-swap-file") && argc
== 2) {
1906 zfree(server
.vm_swap_file
);
1907 server
.vm_swap_file
= zstrdup(argv
[1]);
1908 } else if (!strcasecmp(argv
[0],"vm-max-memory") && argc
== 2) {
1909 server
.vm_max_memory
= memtoll(argv
[1],NULL
);
1910 } else if (!strcasecmp(argv
[0],"vm-page-size") && argc
== 2) {
1911 server
.vm_page_size
= memtoll(argv
[1], NULL
);
1912 } else if (!strcasecmp(argv
[0],"vm-pages") && argc
== 2) {
1913 server
.vm_pages
= memtoll(argv
[1], NULL
);
1914 } else if (!strcasecmp(argv
[0],"vm-max-threads") && argc
== 2) {
1915 server
.vm_max_threads
= strtoll(argv
[1], NULL
, 10);
1916 } else if (!strcasecmp(argv
[0],"hash-max-zipmap-entries") && argc
== 2){
1917 server
.hash_max_zipmap_entries
= memtoll(argv
[1], NULL
);
1918 } else if (!strcasecmp(argv
[0],"hash-max-zipmap-value") && argc
== 2){
1919 server
.hash_max_zipmap_value
= memtoll(argv
[1], NULL
);
1921 err
= "Bad directive or wrong number of arguments"; goto loaderr
;
1923 for (j
= 0; j
< argc
; j
++)
1928 if (fp
!= stdin
) fclose(fp
);
1932 fprintf(stderr
, "\n*** FATAL CONFIG FILE ERROR ***\n");
1933 fprintf(stderr
, "Reading the configuration file, at line %d\n", linenum
);
1934 fprintf(stderr
, ">>> '%s'\n", line
);
1935 fprintf(stderr
, "%s\n", err
);
1939 static void freeClientArgv(redisClient
*c
) {
1942 for (j
= 0; j
< c
->argc
; j
++)
1943 decrRefCount(c
->argv
[j
]);
1944 for (j
= 0; j
< c
->mbargc
; j
++)
1945 decrRefCount(c
->mbargv
[j
]);
1950 static void freeClient(redisClient
*c
) {
1953 /* Note that if the client we are freeing is blocked into a blocking
1954 * call, we have to set querybuf to NULL *before* to call
1955 * unblockClientWaitingData() to avoid processInputBuffer() will get
1956 * called. Also it is important to remove the file events after
1957 * this, because this call adds the READABLE event. */
1958 sdsfree(c
->querybuf
);
1960 if (c
->flags
& REDIS_BLOCKED
)
1961 unblockClientWaitingData(c
);
1963 /* Unsubscribe from all the pubsub channels */
1964 pubsubUnsubscribeAllChannels(c
,0);
1965 pubsubUnsubscribeAllPatterns(c
,0);
1966 dictRelease(c
->pubsub_channels
);
1967 listRelease(c
->pubsub_patterns
);
1968 /* Obvious cleanup */
1969 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
1970 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1971 listRelease(c
->reply
);
1974 /* Remove from the list of clients */
1975 ln
= listSearchKey(server
.clients
,c
);
1976 redisAssert(ln
!= NULL
);
1977 listDelNode(server
.clients
,ln
);
1978 /* Remove from the list of clients waiting for swapped keys */
1979 if (c
->flags
& REDIS_IO_WAIT
&& listLength(c
->io_keys
) == 0) {
1980 ln
= listSearchKey(server
.io_ready_clients
,c
);
1982 listDelNode(server
.io_ready_clients
,ln
);
1983 server
.vm_blocked_clients
--;
1986 while (server
.vm_enabled
&& listLength(c
->io_keys
)) {
1987 ln
= listFirst(c
->io_keys
);
1988 dontWaitForSwappedKey(c
,ln
->value
);
1990 listRelease(c
->io_keys
);
1991 /* Master/slave cleanup */
1992 if (c
->flags
& REDIS_SLAVE
) {
1993 if (c
->replstate
== REDIS_REPL_SEND_BULK
&& c
->repldbfd
!= -1)
1995 list
*l
= (c
->flags
& REDIS_MONITOR
) ? server
.monitors
: server
.slaves
;
1996 ln
= listSearchKey(l
,c
);
1997 redisAssert(ln
!= NULL
);
2000 if (c
->flags
& REDIS_MASTER
) {
2001 server
.master
= NULL
;
2002 server
.replstate
= REDIS_REPL_CONNECT
;
2004 /* Release memory */
2007 freeClientMultiState(c
);
2011 #define GLUEREPLY_UP_TO (1024)
2012 static void glueReplyBuffersIfNeeded(redisClient
*c
) {
2014 char buf
[GLUEREPLY_UP_TO
];
2019 listRewind(c
->reply
,&li
);
2020 while((ln
= listNext(&li
))) {
2024 objlen
= sdslen(o
->ptr
);
2025 if (copylen
+ objlen
<= GLUEREPLY_UP_TO
) {
2026 memcpy(buf
+copylen
,o
->ptr
,objlen
);
2028 listDelNode(c
->reply
,ln
);
2030 if (copylen
== 0) return;
2034 /* Now the output buffer is empty, add the new single element */
2035 o
= createObject(REDIS_STRING
,sdsnewlen(buf
,copylen
));
2036 listAddNodeHead(c
->reply
,o
);
2039 static void sendReplyToClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2040 redisClient
*c
= privdata
;
2041 int nwritten
= 0, totwritten
= 0, objlen
;
2044 REDIS_NOTUSED(mask
);
2046 /* Use writev() if we have enough buffers to send */
2047 if (!server
.glueoutputbuf
&&
2048 listLength(c
->reply
) > REDIS_WRITEV_THRESHOLD
&&
2049 !(c
->flags
& REDIS_MASTER
))
2051 sendReplyToClientWritev(el
, fd
, privdata
, mask
);
2055 while(listLength(c
->reply
)) {
2056 if (server
.glueoutputbuf
&& listLength(c
->reply
) > 1)
2057 glueReplyBuffersIfNeeded(c
);
2059 o
= listNodeValue(listFirst(c
->reply
));
2060 objlen
= sdslen(o
->ptr
);
2063 listDelNode(c
->reply
,listFirst(c
->reply
));
2067 if (c
->flags
& REDIS_MASTER
) {
2068 /* Don't reply to a master */
2069 nwritten
= objlen
- c
->sentlen
;
2071 nwritten
= write(fd
, ((char*)o
->ptr
)+c
->sentlen
, objlen
- c
->sentlen
);
2072 if (nwritten
<= 0) break;
2074 c
->sentlen
+= nwritten
;
2075 totwritten
+= nwritten
;
2076 /* If we fully sent the object on head go to the next one */
2077 if (c
->sentlen
== objlen
) {
2078 listDelNode(c
->reply
,listFirst(c
->reply
));
2081 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2082 * bytes, in a single threaded server it's a good idea to serve
2083 * other clients as well, even if a very large request comes from
2084 * super fast link that is always able to accept data (in real world
2085 * scenario think about 'KEYS *' against the loopback interfae) */
2086 if (totwritten
> REDIS_MAX_WRITE_PER_EVENT
) break;
2088 if (nwritten
== -1) {
2089 if (errno
== EAGAIN
) {
2092 redisLog(REDIS_VERBOSE
,
2093 "Error writing to client: %s", strerror(errno
));
2098 if (totwritten
> 0) c
->lastinteraction
= time(NULL
);
2099 if (listLength(c
->reply
) == 0) {
2101 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
2105 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
)
2107 redisClient
*c
= privdata
;
2108 int nwritten
= 0, totwritten
= 0, objlen
, willwrite
;
2110 struct iovec iov
[REDIS_WRITEV_IOVEC_COUNT
];
2111 int offset
, ion
= 0;
2113 REDIS_NOTUSED(mask
);
2116 while (listLength(c
->reply
)) {
2117 offset
= c
->sentlen
;
2121 /* fill-in the iov[] array */
2122 for(node
= listFirst(c
->reply
); node
; node
= listNextNode(node
)) {
2123 o
= listNodeValue(node
);
2124 objlen
= sdslen(o
->ptr
);
2126 if (totwritten
+ objlen
- offset
> REDIS_MAX_WRITE_PER_EVENT
)
2129 if(ion
== REDIS_WRITEV_IOVEC_COUNT
)
2130 break; /* no more iovecs */
2132 iov
[ion
].iov_base
= ((char*)o
->ptr
) + offset
;
2133 iov
[ion
].iov_len
= objlen
- offset
;
2134 willwrite
+= objlen
- offset
;
2135 offset
= 0; /* just for the first item */
2142 /* write all collected blocks at once */
2143 if((nwritten
= writev(fd
, iov
, ion
)) < 0) {
2144 if (errno
!= EAGAIN
) {
2145 redisLog(REDIS_VERBOSE
,
2146 "Error writing to client: %s", strerror(errno
));
2153 totwritten
+= nwritten
;
2154 offset
= c
->sentlen
;
2156 /* remove written robjs from c->reply */
2157 while (nwritten
&& listLength(c
->reply
)) {
2158 o
= listNodeValue(listFirst(c
->reply
));
2159 objlen
= sdslen(o
->ptr
);
2161 if(nwritten
>= objlen
- offset
) {
2162 listDelNode(c
->reply
, listFirst(c
->reply
));
2163 nwritten
-= objlen
- offset
;
2167 c
->sentlen
+= nwritten
;
2175 c
->lastinteraction
= time(NULL
);
2177 if (listLength(c
->reply
) == 0) {
2179 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
2183 static struct redisCommand
*lookupCommand(char *name
) {
2185 while(cmdTable
[j
].name
!= NULL
) {
2186 if (!strcasecmp(name
,cmdTable
[j
].name
)) return &cmdTable
[j
];
2192 /* resetClient prepare the client to process the next command */
2193 static void resetClient(redisClient
*c
) {
2199 /* Call() is the core of Redis execution of a command */
2200 static void call(redisClient
*c
, struct redisCommand
*cmd
) {
2203 dirty
= server
.dirty
;
2205 dirty
= server
.dirty
-dirty
;
2207 if (server
.appendonly
&& dirty
)
2208 feedAppendOnlyFile(cmd
,c
->db
->id
,c
->argv
,c
->argc
);
2209 if ((dirty
|| cmd
->flags
& REDIS_CMD_FORCE_REPLICATION
) &&
2210 listLength(server
.slaves
))
2211 replicationFeedSlaves(server
.slaves
,c
->db
->id
,c
->argv
,c
->argc
);
2212 if (listLength(server
.monitors
))
2213 replicationFeedSlaves(server
.monitors
,c
->db
->id
,c
->argv
,c
->argc
);
2214 server
.stat_numcommands
++;
2217 /* If this function gets called we already read a whole
2218 * command, argments are in the client argv/argc fields.
2219 * processCommand() execute the command or prepare the
2220 * server for a bulk read from the client.
2222 * If 1 is returned the client is still alive and valid and
2223 * and other operations can be performed by the caller. Otherwise
2224 * if 0 is returned the client was destroied (i.e. after QUIT). */
2225 static int processCommand(redisClient
*c
) {
2226 struct redisCommand
*cmd
;
2228 /* Free some memory if needed (maxmemory setting) */
2229 if (server
.maxmemory
) freeMemoryIfNeeded();
2231 /* Handle the multi bulk command type. This is an alternative protocol
2232 * supported by Redis in order to receive commands that are composed of
2233 * multiple binary-safe "bulk" arguments. The latency of processing is
2234 * a bit higher but this allows things like multi-sets, so if this
2235 * protocol is used only for MSET and similar commands this is a big win. */
2236 if (c
->multibulk
== 0 && c
->argc
== 1 && ((char*)(c
->argv
[0]->ptr
))[0] == '*') {
2237 c
->multibulk
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2238 if (c
->multibulk
<= 0) {
2242 decrRefCount(c
->argv
[c
->argc
-1]);
2246 } else if (c
->multibulk
) {
2247 if (c
->bulklen
== -1) {
2248 if (((char*)c
->argv
[0]->ptr
)[0] != '$') {
2249 addReplySds(c
,sdsnew("-ERR multi bulk protocol error\r\n"));
2253 int bulklen
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2254 decrRefCount(c
->argv
[0]);
2255 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2257 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2262 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2266 c
->mbargv
= zrealloc(c
->mbargv
,(sizeof(robj
*))*(c
->mbargc
+1));
2267 c
->mbargv
[c
->mbargc
] = c
->argv
[0];
2271 if (c
->multibulk
== 0) {
2275 /* Here we need to swap the multi-bulk argc/argv with the
2276 * normal argc/argv of the client structure. */
2278 c
->argv
= c
->mbargv
;
2279 c
->mbargv
= auxargv
;
2282 c
->argc
= c
->mbargc
;
2283 c
->mbargc
= auxargc
;
2285 /* We need to set bulklen to something different than -1
2286 * in order for the code below to process the command without
2287 * to try to read the last argument of a bulk command as
2288 * a special argument. */
2290 /* continue below and process the command */
2297 /* -- end of multi bulk commands processing -- */
2299 /* The QUIT command is handled as a special case. Normal command
2300 * procs are unable to close the client connection safely */
2301 if (!strcasecmp(c
->argv
[0]->ptr
,"quit")) {
2306 /* Now lookup the command and check ASAP about trivial error conditions
2307 * such wrong arity, bad command name and so forth. */
2308 cmd
= lookupCommand(c
->argv
[0]->ptr
);
2311 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2312 (char*)c
->argv
[0]->ptr
));
2315 } else if ((cmd
->arity
> 0 && cmd
->arity
!= c
->argc
) ||
2316 (c
->argc
< -cmd
->arity
)) {
2318 sdscatprintf(sdsempty(),
2319 "-ERR wrong number of arguments for '%s' command\r\n",
2323 } else if (cmd
->flags
& REDIS_CMD_BULK
&& c
->bulklen
== -1) {
2324 /* This is a bulk command, we have to read the last argument yet. */
2325 int bulklen
= atoi(c
->argv
[c
->argc
-1]->ptr
);
2327 decrRefCount(c
->argv
[c
->argc
-1]);
2328 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2330 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2335 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2336 /* It is possible that the bulk read is already in the
2337 * buffer. Check this condition and handle it accordingly.
2338 * This is just a fast path, alternative to call processInputBuffer().
2339 * It's a good idea since the code is small and this condition
2340 * happens most of the times. */
2341 if ((signed)sdslen(c
->querybuf
) >= c
->bulklen
) {
2342 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2344 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2346 /* Otherwise return... there is to read the last argument
2347 * from the socket. */
2351 /* Let's try to encode the bulk object to save space. */
2352 if (cmd
->flags
& REDIS_CMD_BULK
)
2353 c
->argv
[c
->argc
-1] = tryObjectEncoding(c
->argv
[c
->argc
-1]);
2355 /* Check if the user is authenticated */
2356 if (server
.requirepass
&& !c
->authenticated
&& cmd
->proc
!= authCommand
) {
2357 addReplySds(c
,sdsnew("-ERR operation not permitted\r\n"));
2362 /* Handle the maxmemory directive */
2363 if (server
.maxmemory
&& (cmd
->flags
& REDIS_CMD_DENYOOM
) &&
2364 zmalloc_used_memory() > server
.maxmemory
)
2366 addReplySds(c
,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2371 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2372 if ((dictSize(c
->pubsub_channels
) > 0 || listLength(c
->pubsub_patterns
) > 0)
2374 cmd
->proc
!= subscribeCommand
&& cmd
->proc
!= unsubscribeCommand
&&
2375 cmd
->proc
!= psubscribeCommand
&& cmd
->proc
!= punsubscribeCommand
) {
2376 addReplySds(c
,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2381 /* Exec the command */
2382 if (c
->flags
& REDIS_MULTI
&& cmd
->proc
!= execCommand
&& cmd
->proc
!= discardCommand
) {
2383 queueMultiCommand(c
,cmd
);
2384 addReply(c
,shared
.queued
);
2386 if (server
.vm_enabled
&& server
.vm_max_threads
> 0 &&
2387 blockClientOnSwappedKeys(cmd
,c
)) return 1;
2391 /* Prepare the client for the next command */
2396 static void replicationFeedSlaves(list
*slaves
, int dictid
, robj
**argv
, int argc
) {
2401 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2402 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2403 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2404 robj
*static_outv
[REDIS_STATIC_ARGS
*3+1];
2407 if (argc
<= REDIS_STATIC_ARGS
) {
2410 outv
= zmalloc(sizeof(robj
*)*(argc
*3+1));
2413 lenobj
= createObject(REDIS_STRING
,
2414 sdscatprintf(sdsempty(), "*%d\r\n", argc
));
2415 lenobj
->refcount
= 0;
2416 outv
[outc
++] = lenobj
;
2417 for (j
= 0; j
< argc
; j
++) {
2418 lenobj
= createObject(REDIS_STRING
,
2419 sdscatprintf(sdsempty(),"$%lu\r\n",
2420 (unsigned long) stringObjectLen(argv
[j
])));
2421 lenobj
->refcount
= 0;
2422 outv
[outc
++] = lenobj
;
2423 outv
[outc
++] = argv
[j
];
2424 outv
[outc
++] = shared
.crlf
;
2427 /* Increment all the refcounts at start and decrement at end in order to
2428 * be sure to free objects if there is no slave in a replication state
2429 * able to be feed with commands */
2430 for (j
= 0; j
< outc
; j
++) incrRefCount(outv
[j
]);
2431 listRewind(slaves
,&li
);
2432 while((ln
= listNext(&li
))) {
2433 redisClient
*slave
= ln
->value
;
2435 /* Don't feed slaves that are still waiting for BGSAVE to start */
2436 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) continue;
2438 /* Feed all the other slaves, MONITORs and so on */
2439 if (slave
->slaveseldb
!= dictid
) {
2443 case 0: selectcmd
= shared
.select0
; break;
2444 case 1: selectcmd
= shared
.select1
; break;
2445 case 2: selectcmd
= shared
.select2
; break;
2446 case 3: selectcmd
= shared
.select3
; break;
2447 case 4: selectcmd
= shared
.select4
; break;
2448 case 5: selectcmd
= shared
.select5
; break;
2449 case 6: selectcmd
= shared
.select6
; break;
2450 case 7: selectcmd
= shared
.select7
; break;
2451 case 8: selectcmd
= shared
.select8
; break;
2452 case 9: selectcmd
= shared
.select9
; break;
2454 selectcmd
= createObject(REDIS_STRING
,
2455 sdscatprintf(sdsempty(),"select %d\r\n",dictid
));
2456 selectcmd
->refcount
= 0;
2459 addReply(slave
,selectcmd
);
2460 slave
->slaveseldb
= dictid
;
2462 for (j
= 0; j
< outc
; j
++) addReply(slave
,outv
[j
]);
2464 for (j
= 0; j
< outc
; j
++) decrRefCount(outv
[j
]);
2465 if (outv
!= static_outv
) zfree(outv
);
2468 static void processInputBuffer(redisClient
*c
) {
2470 /* Before to process the input buffer, make sure the client is not
2471 * waitig for a blocking operation such as BLPOP. Note that the first
2472 * iteration the client is never blocked, otherwise the processInputBuffer
2473 * would not be called at all, but after the execution of the first commands
2474 * in the input buffer the client may be blocked, and the "goto again"
2475 * will try to reiterate. The following line will make it return asap. */
2476 if (c
->flags
& REDIS_BLOCKED
|| c
->flags
& REDIS_IO_WAIT
) return;
2477 if (c
->bulklen
== -1) {
2478 /* Read the first line of the query */
2479 char *p
= strchr(c
->querybuf
,'\n');
2486 query
= c
->querybuf
;
2487 c
->querybuf
= sdsempty();
2488 querylen
= 1+(p
-(query
));
2489 if (sdslen(query
) > querylen
) {
2490 /* leave data after the first line of the query in the buffer */
2491 c
->querybuf
= sdscatlen(c
->querybuf
,query
+querylen
,sdslen(query
)-querylen
);
2493 *p
= '\0'; /* remove "\n" */
2494 if (*(p
-1) == '\r') *(p
-1) = '\0'; /* and "\r" if any */
2495 sdsupdatelen(query
);
2497 /* Now we can split the query in arguments */
2498 argv
= sdssplitlen(query
,sdslen(query
)," ",1,&argc
);
2501 if (c
->argv
) zfree(c
->argv
);
2502 c
->argv
= zmalloc(sizeof(robj
*)*argc
);
2504 for (j
= 0; j
< argc
; j
++) {
2505 if (sdslen(argv
[j
])) {
2506 c
->argv
[c
->argc
] = createObject(REDIS_STRING
,argv
[j
]);
2514 /* Execute the command. If the client is still valid
2515 * after processCommand() return and there is something
2516 * on the query buffer try to process the next command. */
2517 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2519 /* Nothing to process, argc == 0. Just process the query
2520 * buffer if it's not empty or return to the caller */
2521 if (sdslen(c
->querybuf
)) goto again
;
2524 } else if (sdslen(c
->querybuf
) >= REDIS_REQUEST_MAX_SIZE
) {
2525 redisLog(REDIS_VERBOSE
, "Client protocol error");
2530 /* Bulk read handling. Note that if we are at this point
2531 the client already sent a command terminated with a newline,
2532 we are reading the bulk data that is actually the last
2533 argument of the command. */
2534 int qbl
= sdslen(c
->querybuf
);
2536 if (c
->bulklen
<= qbl
) {
2537 /* Copy everything but the final CRLF as final argument */
2538 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2540 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2541 /* Process the command. If the client is still valid after
2542 * the processing and there is more data in the buffer
2543 * try to parse it. */
2544 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2550 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2551 redisClient
*c
= (redisClient
*) privdata
;
2552 char buf
[REDIS_IOBUF_LEN
];
2555 REDIS_NOTUSED(mask
);
2557 nread
= read(fd
, buf
, REDIS_IOBUF_LEN
);
2559 if (errno
== EAGAIN
) {
2562 redisLog(REDIS_VERBOSE
, "Reading from client: %s",strerror(errno
));
2566 } else if (nread
== 0) {
2567 redisLog(REDIS_VERBOSE
, "Client closed connection");
2572 c
->querybuf
= sdscatlen(c
->querybuf
, buf
, nread
);
2573 c
->lastinteraction
= time(NULL
);
2577 processInputBuffer(c
);
2580 static int selectDb(redisClient
*c
, int id
) {
2581 if (id
< 0 || id
>= server
.dbnum
)
2583 c
->db
= &server
.db
[id
];
2587 static void *dupClientReplyValue(void *o
) {
2588 incrRefCount((robj
*)o
);
/* Match method for lists of string objects: non-zero when
 * compareStringObjects() reports the two objects as equal. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2596 static redisClient
*createClient(int fd
) {
2597 redisClient
*c
= zmalloc(sizeof(*c
));
2599 anetNonBlock(NULL
,fd
);
2600 anetTcpNoDelay(NULL
,fd
);
2601 if (!c
) return NULL
;
2604 c
->querybuf
= sdsempty();
2613 c
->lastinteraction
= time(NULL
);
2614 c
->authenticated
= 0;
2615 c
->replstate
= REDIS_REPL_NONE
;
2616 c
->reply
= listCreate();
2617 listSetFreeMethod(c
->reply
,decrRefCount
);
2618 listSetDupMethod(c
->reply
,dupClientReplyValue
);
2619 c
->blockingkeys
= NULL
;
2620 c
->blockingkeysnum
= 0;
2621 c
->io_keys
= listCreate();
2622 listSetFreeMethod(c
->io_keys
,decrRefCount
);
2623 c
->pubsub_channels
= dictCreate(&setDictType
,NULL
);
2624 c
->pubsub_patterns
= listCreate();
2625 listSetFreeMethod(c
->pubsub_patterns
,decrRefCount
);
2626 listSetMatchMethod(c
->pubsub_patterns
,listMatchObjects
);
2627 if (aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
2628 readQueryFromClient
, c
) == AE_ERR
) {
2632 listAddNodeTail(server
.clients
,c
);
2633 initClientMultiState(c
);
2637 static void addReply(redisClient
*c
, robj
*obj
) {
2638 if (listLength(c
->reply
) == 0 &&
2639 (c
->replstate
== REDIS_REPL_NONE
||
2640 c
->replstate
== REDIS_REPL_ONLINE
) &&
2641 aeCreateFileEvent(server
.el
, c
->fd
, AE_WRITABLE
,
2642 sendReplyToClient
, c
) == AE_ERR
) return;
2644 if (server
.vm_enabled
&& obj
->storage
!= REDIS_VM_MEMORY
) {
2645 obj
= dupStringObject(obj
);
2646 obj
->refcount
= 0; /* getDecodedObject() will increment the refcount */
2648 listAddNodeTail(c
->reply
,getDecodedObject(obj
));
2651 static void addReplySds(redisClient
*c
, sds s
) {
2652 robj
*o
= createObject(REDIS_STRING
,s
);
2657 static void addReplyDouble(redisClient
*c
, double d
) {
2660 snprintf(buf
,sizeof(buf
),"%.17g",d
);
2661 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2662 (unsigned long) strlen(buf
),buf
));
2665 static void addReplyLong(redisClient
*c
, long l
) {
2670 addReply(c
,shared
.czero
);
2672 } else if (l
== 1) {
2673 addReply(c
,shared
.cone
);
2676 len
= snprintf(buf
,sizeof(buf
),":%ld\r\n",l
);
2677 addReplySds(c
,sdsnewlen(buf
,len
));
2680 static void addReplyLongLong(redisClient
*c
, long long ll
) {
2685 addReply(c
,shared
.czero
);
2687 } else if (ll
== 1) {
2688 addReply(c
,shared
.cone
);
2691 len
= snprintf(buf
,sizeof(buf
),":%lld\r\n",ll
);
2692 addReplySds(c
,sdsnewlen(buf
,len
));
2695 static void addReplyUlong(redisClient
*c
, unsigned long ul
) {
2700 addReply(c
,shared
.czero
);
2702 } else if (ul
== 1) {
2703 addReply(c
,shared
.cone
);
2706 len
= snprintf(buf
,sizeof(buf
),":%lu\r\n",ul
);
2707 addReplySds(c
,sdsnewlen(buf
,len
));
2710 static void addReplyBulkLen(redisClient
*c
, robj
*obj
) {
2713 if (obj
->encoding
== REDIS_ENCODING_RAW
) {
2714 len
= sdslen(obj
->ptr
);
2716 long n
= (long)obj
->ptr
;
2718 /* Compute how many bytes will take this integer as a radix 10 string */
2724 while((n
= n
/10) != 0) {
2728 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len
));
2731 static void addReplyBulk(redisClient
*c
, robj
*obj
) {
2732 addReplyBulkLen(c
,obj
);
2734 addReply(c
,shared
.crlf
);
2737 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2738 static void addReplyBulkCString(redisClient
*c
, char *s
) {
2740 addReply(c
,shared
.nullbulk
);
2742 robj
*o
= createStringObject(s
,strlen(s
));
2748 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2753 REDIS_NOTUSED(mask
);
2754 REDIS_NOTUSED(privdata
);
2756 cfd
= anetAccept(server
.neterr
, fd
, cip
, &cport
);
2757 if (cfd
== AE_ERR
) {
2758 redisLog(REDIS_VERBOSE
,"Accepting client connection: %s", server
.neterr
);
2761 redisLog(REDIS_VERBOSE
,"Accepted %s:%d", cip
, cport
);
2762 if ((c
= createClient(cfd
)) == NULL
) {
2763 redisLog(REDIS_WARNING
,"Error allocating resoures for the client");
2764 close(cfd
); /* May be already closed, just ingore errors */
2767 /* If maxclient directive is set and this is one client more... close the
2768 * connection. Note that we create the client instead to check before
2769 * for this condition, since now the socket is already set in nonblocking
2770 * mode and we can send an error for free using the Kernel I/O */
2771 if (server
.maxclients
&& listLength(server
.clients
) > server
.maxclients
) {
2772 char *err
= "-ERR max number of clients reached\r\n";
2774 /* That's a best effort error message, don't check write errors */
2775 if (write(c
->fd
,err
,strlen(err
)) == -1) {
2776 /* Nothing to do, Just to avoid the warning... */
2781 server
.stat_numconnections
++;
2784 /* ======================= Redis objects implementation ===================== */
2786 static robj
*createObject(int type
, void *ptr
) {
2789 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2790 if (listLength(server
.objfreelist
)) {
2791 listNode
*head
= listFirst(server
.objfreelist
);
2792 o
= listNodeValue(head
);
2793 listDelNode(server
.objfreelist
,head
);
2794 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2796 if (server
.vm_enabled
) {
2797 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2798 o
= zmalloc(sizeof(*o
));
2800 o
= zmalloc(sizeof(*o
)-sizeof(struct redisObjectVM
));
2804 o
->encoding
= REDIS_ENCODING_RAW
;
2807 if (server
.vm_enabled
) {
2808 /* Note that this code may run in the context of an I/O thread
2809 * and accessing to server.unixtime in theory is an error
2810 * (no locks). But in practice this is safe, and even if we read
2811 * garbage Redis will not fail, as it's just a statistical info */
2812 o
->vm
.atime
= server
.unixtime
;
2813 o
->storage
= REDIS_VM_MEMORY
;
2818 static robj
*createStringObject(char *ptr
, size_t len
) {
2819 return createObject(REDIS_STRING
,sdsnewlen(ptr
,len
));
2822 static robj
*createStringObjectFromLongLong(long long value
) {
2824 if (value
>= 0 && value
< REDIS_SHARED_INTEGERS
) {
2825 incrRefCount(shared
.integers
[value
]);
2826 o
= shared
.integers
[value
];
2828 o
= createObject(REDIS_STRING
, NULL
);
2829 if (value
>= LONG_MIN
&& value
<= LONG_MAX
) {
2830 o
->encoding
= REDIS_ENCODING_INT
;
2831 o
->ptr
= (void*)((long)value
);
2833 o
->ptr
= sdscatprintf(sdsempty(),"%lld",value
);
2839 static robj
*dupStringObject(robj
*o
) {
2840 assert(o
->encoding
== REDIS_ENCODING_RAW
);
2841 return createStringObject(o
->ptr
,sdslen(o
->ptr
));
2844 static robj
*createListObject(void) {
2845 list
*l
= listCreate();
2847 listSetFreeMethod(l
,decrRefCount
);
2848 return createObject(REDIS_LIST
,l
);
2851 static robj
*createSetObject(void) {
2852 dict
*d
= dictCreate(&setDictType
,NULL
);
2853 return createObject(REDIS_SET
,d
);
2856 static robj
*createHashObject(void) {
2857 /* All the Hashes start as zipmaps. Will be automatically converted
2858 * into hash tables if there are enough elements or big elements
2860 unsigned char *zm
= zipmapNew();
2861 robj
*o
= createObject(REDIS_HASH
,zm
);
2862 o
->encoding
= REDIS_ENCODING_ZIPMAP
;
2866 static robj
*createZsetObject(void) {
2867 zset
*zs
= zmalloc(sizeof(*zs
));
2869 zs
->dict
= dictCreate(&zsetDictType
,NULL
);
2870 zs
->zsl
= zslCreate();
2871 return createObject(REDIS_ZSET
,zs
);
2874 static void freeStringObject(robj
*o
) {
2875 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2880 static void freeListObject(robj
*o
) {
2881 listRelease((list
*) o
->ptr
);
2884 static void freeSetObject(robj
*o
) {
2885 dictRelease((dict
*) o
->ptr
);
2888 static void freeZsetObject(robj
*o
) {
2891 dictRelease(zs
->dict
);
2896 static void freeHashObject(robj
*o
) {
2897 switch (o
->encoding
) {
2898 case REDIS_ENCODING_HT
:
2899 dictRelease((dict
*) o
->ptr
);
2901 case REDIS_ENCODING_ZIPMAP
:
2905 redisPanic("Unknown hash encoding type");
2910 static void incrRefCount(robj
*o
) {
2914 static void decrRefCount(void *obj
) {
2917 if (o
->refcount
<= 0) redisPanic("decrRefCount against refcount <= 0");
2918 /* Object is a key of a swapped out value, or in the process of being
2920 if (server
.vm_enabled
&&
2921 (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
))
2923 if (o
->storage
== REDIS_VM_LOADING
) vmCancelThreadedIOJob(obj
);
2924 redisAssert(o
->type
== REDIS_STRING
);
2925 freeStringObject(o
);
2926 vmMarkPagesFree(o
->vm
.page
,o
->vm
.usedpages
);
2927 pthread_mutex_lock(&server
.obj_freelist_mutex
);
2928 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2929 !listAddNodeHead(server
.objfreelist
,o
))
2931 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2932 server
.vm_stats_swapped_objects
--;
2935 /* Object is in memory, or in the process of being swapped out. */
2936 if (--(o
->refcount
) == 0) {
2937 if (server
.vm_enabled
&& o
->storage
== REDIS_VM_SWAPPING
)
2938 vmCancelThreadedIOJob(obj
);
2940 case REDIS_STRING
: freeStringObject(o
); break;
2941 case REDIS_LIST
: freeListObject(o
); break;
2942 case REDIS_SET
: freeSetObject(o
); break;
2943 case REDIS_ZSET
: freeZsetObject(o
); break;
2944 case REDIS_HASH
: freeHashObject(o
); break;
2945 default: redisPanic("Unknown object type"); break;
2947 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2948 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2949 !listAddNodeHead(server
.objfreelist
,o
))
2951 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2955 static robj
*lookupKey(redisDb
*db
, robj
*key
) {
2956 dictEntry
*de
= dictFind(db
->dict
,key
);
2958 robj
*key
= dictGetEntryKey(de
);
2959 robj
*val
= dictGetEntryVal(de
);
2961 if (server
.vm_enabled
) {
2962 if (key
->storage
== REDIS_VM_MEMORY
||
2963 key
->storage
== REDIS_VM_SWAPPING
)
2965 /* If we were swapping the object out, stop it, this key
2967 if (key
->storage
== REDIS_VM_SWAPPING
)
2968 vmCancelThreadedIOJob(key
);
2969 /* Update the access time of the key for the aging algorithm. */
2970 key
->vm
.atime
= server
.unixtime
;
2972 int notify
= (key
->storage
== REDIS_VM_LOADING
);
2974 /* Our value was swapped on disk. Bring it at home. */
2975 redisAssert(val
== NULL
);
2976 val
= vmLoadObject(key
);
2977 dictGetEntryVal(de
) = val
;
2979 /* Clients blocked by the VM subsystem may be waiting for
2981 if (notify
) handleClientsBlockedOnSwappedKey(db
,key
);
2990 static robj
*lookupKeyRead(redisDb
*db
, robj
*key
) {
2991 expireIfNeeded(db
,key
);
2992 return lookupKey(db
,key
);
2995 static robj
*lookupKeyWrite(redisDb
*db
, robj
*key
) {
2996 deleteIfVolatile(db
,key
);
2997 return lookupKey(db
,key
);
/* Look up 'key' for reading in c->db through lookupKeyRead(). When the
 * lookup yields NULL, 'reply' is sent to the client with addReply().
 * NOTE(review): the return statement is not visible in this excerpt;
 * getGenericCommand() compares the result against NULL, so presumably the
 * looked-up object (or NULL) is returned -- confirm. */
3000 static robj
*lookupKeyReadOrReply(redisClient
*c
, robj
*key
, robj
*reply
) {
3001 robj
*o
= lookupKeyRead(c
->db
, key
);
3002 if (!o
) addReply(c
,reply
);
/* Look up 'key' for writing in c->db through lookupKeyWrite(). When the
 * lookup yields NULL, 'reply' is sent to the client with addReply().
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the looked-up object (or NULL) is returned -- confirm. */
3006 static robj
*lookupKeyWriteOrReply(redisClient
*c
, robj
*key
, robj
*reply
) {
3007 robj
*o
= lookupKeyWrite(c
->db
, key
);
3008 if (!o
) addReply(c
,reply
);
3012 static int checkType(redisClient
*c
, robj
*o
, int type
) {
3013 if (o
->type
!= type
) {
3014 addReply(c
,shared
.wrongtypeerr
);
3020 static int deleteKey(redisDb
*db
, robj
*key
) {
3023 /* We need to protect key from destruction: after the first dictDelete()
3024 * it may happen that 'key' is no longer valid if we don't increment
3025 * its count. This may happen when we get the object reference directly
3026 * from the hash table with dictRandomKey() or dict iterators */
3028 if (dictSize(db
->expires
)) dictDelete(db
->expires
,key
);
3029 retval
= dictDelete(db
->dict
,key
);
3032 return retval
== DICT_OK
;
3035 /* Check if the nul-terminated string 's' can be represented by a long
3036 * (that is, is a number that fits into long without any other space or
3037 * character before or after the digits).
3039 * If so, the function returns REDIS_OK and *longval is set to the value
3040 * of the number. Otherwise REDIS_ERR is returned */
3041 static int isStringRepresentableAsLong(sds s
, long *longval
) {
3042 char buf
[32], *endptr
;
3046 value
= strtol(s
, &endptr
, 10);
3047 if (endptr
[0] != '\0') return REDIS_ERR
;
3048 slen
= snprintf(buf
,32,"%ld",value
);
3050 /* If the number converted back into a string is not identical
3051 * then it's not possible to encode the string as integer */
3052 if (sdslen(s
) != (unsigned)slen
|| memcmp(buf
,s
,slen
)) return REDIS_ERR
;
3053 if (longval
) *longval
= value
;
3057 /* Try to encode a string object in order to save space */
3058 static robj
*tryObjectEncoding(robj
*o
) {
3062 if (o
->encoding
!= REDIS_ENCODING_RAW
)
3063 return o
; /* Already encoded */
3065 /* It's not safe to encode shared objects: shared objects can be shared
3066 * everywhere in the "object space" of Redis. Encoded objects can only
3067 * appear as "values" (and not, for instance, as keys) */
3068 if (o
->refcount
> 1) return o
;
3070 /* Currently we try to encode only strings */
3071 redisAssert(o
->type
== REDIS_STRING
);
3073 /* Check if we can represent this string as a long integer */
3074 if (isStringRepresentableAsLong(s
,&value
) == REDIS_ERR
) return o
;
3076 /* Ok, this object can be encoded */
3077 if (value
>= 0 && value
< REDIS_SHARED_INTEGERS
) {
3079 incrRefCount(shared
.integers
[value
]);
3080 return shared
.integers
[value
];
3082 o
->encoding
= REDIS_ENCODING_INT
;
3084 o
->ptr
= (void*) value
;
3089 /* Get a decoded version of an encoded object (returned as a new object).
3090 * If the object is already raw-encoded just increment the ref count. */
3091 static robj
*getDecodedObject(robj
*o
) {
3094 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3098 if (o
->type
== REDIS_STRING
&& o
->encoding
== REDIS_ENCODING_INT
) {
3101 snprintf(buf
,32,"%ld",(long)o
->ptr
);
3102 dec
= createStringObject(buf
,strlen(buf
));
3105 redisPanic("Unknown encoding type");
3109 /* Compare two string objects via strcmp() or alike.
3110 * Note that the objects may be integer-encoded. In such a case we
3111 * use snprintf() to get a string representation of the numbers on the stack
3112 * and compare the strings, it's much faster than calling getDecodedObject().
3114 * Important note: if objects are not integer encoded, but binary-safe strings,
3115 * sdscmp() from sds.c will apply memcmp() so this function can be considered
3117 static int compareStringObjects(robj
*a
, robj
*b
) {
3118 redisAssert(a
->type
== REDIS_STRING
&& b
->type
== REDIS_STRING
);
3119 char bufa
[128], bufb
[128], *astr
, *bstr
;
3122 if (a
== b
) return 0;
3123 if (a
->encoding
!= REDIS_ENCODING_RAW
) {
3124 snprintf(bufa
,sizeof(bufa
),"%ld",(long) a
->ptr
);
3130 if (b
->encoding
!= REDIS_ENCODING_RAW
) {
3131 snprintf(bufb
,sizeof(bufb
),"%ld",(long) b
->ptr
);
3137 return bothsds
? sdscmp(astr
,bstr
) : strcmp(astr
,bstr
);
3140 static size_t stringObjectLen(robj
*o
) {
3141 redisAssert(o
->type
== REDIS_STRING
);
3142 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3143 return sdslen(o
->ptr
);
3147 return snprintf(buf
,32,"%ld",(long)o
->ptr
);
3151 static int getDoubleFromObject(robj
*o
, double *target
) {
3158 redisAssert(o
->type
== REDIS_STRING
);
3159 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3160 value
= strtod(o
->ptr
, &eptr
);
3161 if (eptr
[0] != '\0') return REDIS_ERR
;
3162 } else if (o
->encoding
== REDIS_ENCODING_INT
) {
3163 value
= (long)o
->ptr
;
3165 redisAssert(1 != 1);
3173 static int getDoubleFromObjectOrReply(redisClient
*c
, robj
*o
, double *target
, const char *msg
) {
3175 if (getDoubleFromObject(o
, &value
) != REDIS_OK
) {
3177 addReplySds(c
, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg
));
3179 addReplySds(c
, sdsnew("-ERR value is not a double\r\n"));
3188 static int getLongLongFromObject(robj
*o
, long long *target
) {
3195 redisAssert(o
->type
== REDIS_STRING
);
3196 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3197 value
= strtoll(o
->ptr
, &eptr
, 10);
3198 if (eptr
[0] != '\0') return REDIS_ERR
;
3199 } else if (o
->encoding
== REDIS_ENCODING_INT
) {
3200 value
= (long)o
->ptr
;
3202 redisAssert(1 != 1);
3210 static int getLongLongFromObjectOrReply(redisClient
*c
, robj
*o
, long long *target
, const char *msg
) {
3212 if (getLongLongFromObject(o
, &value
) != REDIS_OK
) {
3214 addReplySds(c
, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg
));
3216 addReplySds(c
, sdsnew("-ERR value is not an integer\r\n"));
/* Parse 'o' as an integer through getLongLongFromObjectOrReply() and check
 * that the result lies within [LONG_MIN, LONG_MAX]. If the parsing helper
 * does not return REDIS_OK, REDIS_ERR is returned right away (the helper is
 * given 'c' and 'msg', so it is the one replying in that case). For an out
 * of range value an error reply is queued: either "-ERR <msg>" or the
 * default "-ERR value is out of range".
 * NOTE(review): the lines choosing between the two replies, the store into
 * '*target' and the final returns are not visible in this excerpt --
 * presumably 'msg' is used when it is not NULL; confirm. */
3225 static int getLongFromObjectOrReply(redisClient
*c
, robj
*o
, long *target
, const char *msg
) {
3228 if (getLongLongFromObjectOrReply(c
, o
, &value
, msg
) != REDIS_OK
) return REDIS_ERR
;
3229 if (value
< LONG_MIN
|| value
> LONG_MAX
) {
3231 addReplySds(c
, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg
));
3233 addReplySds(c
, sdsnew("-ERR value is out of range\r\n"));
3242 /*============================ RDB saving/loading =========================== */
/* Write the single byte 'type' to 'fp'. Returns -1 when fwrite() reports
 * that no item was written.
 * NOTE(review): the success return is not visible in this excerpt. */
3244 static int rdbSaveType(FILE *fp
, unsigned char type
) {
3245 if (fwrite(&type
,1,1,fp
) == 0) return -1;
/* Write the time 't' to 'fp' as a 32 bit value. 't' is first narrowed to
 * int32_t, then the 4 bytes of that variable are written exactly as they
 * are laid out in memory. Returns -1 when fwrite() reports that no item
 * was written.
 * NOTE(review): a time_t that does not fit in int32_t is altered by the
 * cast; the success return is not visible in this excerpt. */
3249 static int rdbSaveTime(FILE *fp
, time_t t
) {
3250 int32_t t32
= (int32_t) t
;
3251 if (fwrite(&t32
,4,1,fp
) == 0) return -1;
3255 /* check rdbLoadLen() comments for more info */
3256 static int rdbSaveLen(FILE *fp
, uint32_t len
) {
3257 unsigned char buf
[2];
3260 /* Save a 6 bit len */
3261 buf
[0] = (len
&0xFF)|(REDIS_RDB_6BITLEN
<<6);
3262 if (fwrite(buf
,1,1,fp
) == 0) return -1;
3263 } else if (len
< (1<<14)) {
3264 /* Save a 14 bit len */
3265 buf
[0] = ((len
>>8)&0xFF)|(REDIS_RDB_14BITLEN
<<6);
3267 if (fwrite(buf
,2,1,fp
) == 0) return -1;
3269 /* Save a 32 bit len */
3270 buf
[0] = (REDIS_RDB_32BITLEN
<<6);
3271 if (fwrite(buf
,1,1,fp
) == 0) return -1;
3273 if (fwrite(&len
,4,1,fp
) == 0) return -1;
3278 /* String objects in the form "2391" "-100" without any space and with a
3279 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3280 * encoded as integers to save space */
3281 static int rdbTryIntegerEncoding(char *s
, size_t len
, unsigned char *enc
) {
3283 char *endptr
, buf
[32];
3285 /* Check if it's possible to encode this value as a number */
3286 value
= strtoll(s
, &endptr
, 10);
3287 if (endptr
[0] != '\0') return 0;
3288 snprintf(buf
,32,"%lld",value
);
3290 /* If the number converted back into a string is not identical
3291 * then it's not possible to encode the string as integer */
3292 if (strlen(buf
) != len
|| memcmp(buf
,s
,len
)) return 0;
3294 /* Finally check if it fits in our ranges */
3295 if (value
>= -(1<<7) && value
<= (1<<7)-1) {
3296 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT8
;
3297 enc
[1] = value
&0xFF;
3299 } else if (value
>= -(1<<15) && value
<= (1<<15)-1) {
3300 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT16
;
3301 enc
[1] = value
&0xFF;
3302 enc
[2] = (value
>>8)&0xFF;
3304 } else if (value
>= -((long long)1<<31) && value
<= ((long long)1<<31)-1) {
3305 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT32
;
3306 enc
[1] = value
&0xFF;
3307 enc
[2] = (value
>>8)&0xFF;
3308 enc
[3] = (value
>>16)&0xFF;
3309 enc
[4] = (value
>>24)&0xFF;
3316 static int rdbSaveLzfStringObject(FILE *fp
, unsigned char *s
, size_t len
) {
3317 size_t comprlen
, outlen
;
3321 /* We require at least four bytes compression for this to be worth it */
3322 if (len
<= 4) return 0;
3324 if ((out
= zmalloc(outlen
+1)) == NULL
) return 0;
3325 comprlen
= lzf_compress(s
, len
, out
, outlen
);
3326 if (comprlen
== 0) {
3330 /* Data compressed! Let's save it on disk */
3331 byte
= (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_LZF
;
3332 if (fwrite(&byte
,1,1,fp
) == 0) goto writeerr
;
3333 if (rdbSaveLen(fp
,comprlen
) == -1) goto writeerr
;
3334 if (rdbSaveLen(fp
,len
) == -1) goto writeerr
;
3335 if (fwrite(out
,comprlen
,1,fp
) == 0) goto writeerr
;
3344 /* Save a string object as [len][data] on disk. If the object is a string
3345 * representation of an integer value we try to save it in a special form */
3346 static int rdbSaveRawString(FILE *fp
, unsigned char *s
, size_t len
) {
3349 /* Try integer encoding */
3351 unsigned char buf
[5];
3352 if ((enclen
= rdbTryIntegerEncoding((char*)s
,len
,buf
)) > 0) {
3353 if (fwrite(buf
,enclen
,1,fp
) == 0) return -1;
3358 /* Try LZF compression - under 20 bytes it's unable to compress even
3359 * aaaaaaaaaaaaaaaaaa so skip it */
3360 if (server
.rdbcompression
&& len
> 20) {
3363 retval
= rdbSaveLzfStringObject(fp
,s
,len
);
3364 if (retval
== -1) return -1;
3365 if (retval
> 0) return 0;
3366 /* retval == 0 means data can't be compressed, save the old way */
3369 /* Store verbatim */
3370 if (rdbSaveLen(fp
,len
) == -1) return -1;
3371 if (len
&& fwrite(s
,len
,1,fp
) == 0) return -1;
3375 /* Like rdbSaveRawString() but handle encoded objects */
3376 static int rdbSaveStringObject(FILE *fp
, robj
*obj
) {
3379 /* Avoid incr/decr ref count business when possible.
3380 * This plays well with copy-on-write given that we are probably
3381 * in a child process (BGSAVE). Also this makes sure key objects
3382 * of swapped objects are not incRefCount-ed (an assert does not allow
3383 * this in order to avoid bugs) */
3384 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
3385 obj
= getDecodedObject(obj
);
3386 retval
= rdbSaveRawString(fp
,obj
->ptr
,sdslen(obj
->ptr
));
3389 retval
= rdbSaveRawString(fp
,obj
->ptr
,sdslen(obj
->ptr
));
3394 /* Save a double value. Doubles are saved as strings prefixed by an unsigned
3395 * 8 bit integer specifying the length of the representation.
3396 * This 8 bit integer has special values in order to specify the following
3402 static int rdbSaveDoubleValue(FILE *fp
, double val
) {
3403 unsigned char buf
[128];
3409 } else if (!isfinite(val
)) {
3411 buf
[0] = (val
< 0) ? 255 : 254;
3413 snprintf((char*)buf
+1,sizeof(buf
)-1,"%.17g",val
);
3414 buf
[0] = strlen((char*)buf
+1);
3417 if (fwrite(buf
,len
,1,fp
) == 0) return -1;
3421 /* Save a Redis object. */
3422 static int rdbSaveObject(FILE *fp
, robj
*o
) {
3423 if (o
->type
== REDIS_STRING
) {
3424 /* Save a string value */
3425 if (rdbSaveStringObject(fp
,o
) == -1) return -1;
3426 } else if (o
->type
== REDIS_LIST
) {
3427 /* Save a list value */
3428 list
*list
= o
->ptr
;
3432 if (rdbSaveLen(fp
,listLength(list
)) == -1) return -1;
3433 listRewind(list
,&li
);
3434 while((ln
= listNext(&li
))) {
3435 robj
*eleobj
= listNodeValue(ln
);
3437 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3439 } else if (o
->type
== REDIS_SET
) {
3440 /* Save a set value */
3442 dictIterator
*di
= dictGetIterator(set
);
3445 if (rdbSaveLen(fp
,dictSize(set
)) == -1) return -1;
3446 while((de
= dictNext(di
)) != NULL
) {
3447 robj
*eleobj
= dictGetEntryKey(de
);
3449 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3451 dictReleaseIterator(di
);
3452 } else if (o
->type
== REDIS_ZSET
) {
3453 /* Save a sorted set value */
3455 dictIterator
*di
= dictGetIterator(zs
->dict
);
3458 if (rdbSaveLen(fp
,dictSize(zs
->dict
)) == -1) return -1;
3459 while((de
= dictNext(di
)) != NULL
) {
3460 robj
*eleobj
= dictGetEntryKey(de
);
3461 double *score
= dictGetEntryVal(de
);
3463 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3464 if (rdbSaveDoubleValue(fp
,*score
) == -1) return -1;
3466 dictReleaseIterator(di
);
3467 } else if (o
->type
== REDIS_HASH
) {
3468 /* Save a hash value */
3469 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
3470 unsigned char *p
= zipmapRewind(o
->ptr
);
3471 unsigned int count
= zipmapLen(o
->ptr
);
3472 unsigned char *key
, *val
;
3473 unsigned int klen
, vlen
;
3475 if (rdbSaveLen(fp
,count
) == -1) return -1;
3476 while((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) != NULL
) {
3477 if (rdbSaveRawString(fp
,key
,klen
) == -1) return -1;
3478 if (rdbSaveRawString(fp
,val
,vlen
) == -1) return -1;
3481 dictIterator
*di
= dictGetIterator(o
->ptr
);
3484 if (rdbSaveLen(fp
,dictSize((dict
*)o
->ptr
)) == -1) return -1;
3485 while((de
= dictNext(di
)) != NULL
) {
3486 robj
*key
= dictGetEntryKey(de
);
3487 robj
*val
= dictGetEntryVal(de
);
3489 if (rdbSaveStringObject(fp
,key
) == -1) return -1;
3490 if (rdbSaveStringObject(fp
,val
) == -1) return -1;
3492 dictReleaseIterator(di
);
3495 redisPanic("Unknown object type");
3500 /* Return the length the object will have on disk if saved with
3501 * the rdbSaveObject() function. Currently we use a trick to get
3502 * this length with very little changes to the code. In the future
3503 * we could switch to a faster solution. */
/* 'fp' is the stream the object is serialized into in order to measure it;
 * when it is NULL, server.devnull is used instead.
 * NOTE(review): the rdbSaveObject() call sits inside assert(), so the whole
 * serialization disappears from the build if NDEBUG is defined. Moreover
 * every failure return visible in rdbSaveObject() is -1, while the check
 * here is "!= 1" -- this looks like a typo for "!= -1"; confirm, and move
 * the call out of the assert() expression. */
3504 static off_t
rdbSavedObjectLen(robj
*o
, FILE *fp
) {
3505 if (fp
== NULL
) fp
= server
.devnull
;
3507 assert(rdbSaveObject(fp
,o
) != 1);
3511 /* Return the number of pages required to save this object in the swap file */
3512 static off_t
rdbSavedObjectPages(robj
*o
, FILE *fp
) {
3513 off_t bytes
= rdbSavedObjectLen(o
,fp
);
3515 return (bytes
+(server
.vm_page_size
-1))/server
.vm_page_size
;
3518 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3519 static int rdbSave(char *filename
) {
3520 dictIterator
*di
= NULL
;
3525 time_t now
= time(NULL
);
3527 /* Wait for I/O threads to terminate, just in case this is a
3528 * foreground-saving, to avoid seeking the swap file descriptor at the
3530 if (server
.vm_enabled
)
3531 waitEmptyIOJobsQueue();
3533 snprintf(tmpfile
,256,"temp-%d.rdb", (int) getpid());
3534 fp
= fopen(tmpfile
,"w");
3536 redisLog(REDIS_WARNING
, "Failed saving the DB: %s", strerror(errno
));
3539 if (fwrite("REDIS0001",9,1,fp
) == 0) goto werr
;
3540 for (j
= 0; j
< server
.dbnum
; j
++) {
3541 redisDb
*db
= server
.db
+j
;
3543 if (dictSize(d
) == 0) continue;
3544 di
= dictGetIterator(d
);
3550 /* Write the SELECT DB opcode */
3551 if (rdbSaveType(fp
,REDIS_SELECTDB
) == -1) goto werr
;
3552 if (rdbSaveLen(fp
,j
) == -1) goto werr
;
3554 /* Iterate this DB writing every entry */
3555 while((de
= dictNext(di
)) != NULL
) {
3556 robj
*key
= dictGetEntryKey(de
);
3557 robj
*o
= dictGetEntryVal(de
);
3558 time_t expiretime
= getExpire(db
,key
);
3560 /* Save the expire time */
3561 if (expiretime
!= -1) {
3562 /* If this key is already expired skip it */
3563 if (expiretime
< now
) continue;
3564 if (rdbSaveType(fp
,REDIS_EXPIRETIME
) == -1) goto werr
;
3565 if (rdbSaveTime(fp
,expiretime
) == -1) goto werr
;
3567 /* Save the key and associated value. This requires special
3568 * handling if the value is swapped out. */
3569 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
3570 key
->storage
== REDIS_VM_SWAPPING
) {
3571 /* Save type, key, value */
3572 if (rdbSaveType(fp
,o
->type
) == -1) goto werr
;
3573 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3574 if (rdbSaveObject(fp
,o
) == -1) goto werr
;
3576 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3578 /* Get a preview of the object in memory */
3579 po
= vmPreviewObject(key
);
3580 /* Save type, key, value */
3581 if (rdbSaveType(fp
,key
->vtype
) == -1) goto werr
;
3582 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3583 if (rdbSaveObject(fp
,po
) == -1) goto werr
;
3584 /* Remove the loaded object from memory */
3588 dictReleaseIterator(di
);
3591 if (rdbSaveType(fp
,REDIS_EOF
) == -1) goto werr
;
3593 /* Make sure data will not remain on the OS's output buffers */
3598 /* Use RENAME to make sure the DB file is changed atomically only
3599 * if the generated DB file is ok. */
3600 if (rename(tmpfile
,filename
) == -1) {
3601 redisLog(REDIS_WARNING
,"Error moving temp DB file on the final destination: %s", strerror(errno
));
3605 redisLog(REDIS_NOTICE
,"DB saved on disk");
3607 server
.lastsave
= time(NULL
);
3613 redisLog(REDIS_WARNING
,"Write error saving DB on disk: %s", strerror(errno
));
3614 if (di
) dictReleaseIterator(di
);
3618 static int rdbSaveBackground(char *filename
) {
3621 if (server
.bgsavechildpid
!= -1) return REDIS_ERR
;
3622 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
3623 if ((childpid
= fork()) == 0) {
3625 if (server
.vm_enabled
) vmReopenSwapFile();
3627 if (rdbSave(filename
) == REDIS_OK
) {
3634 if (childpid
== -1) {
3635 redisLog(REDIS_WARNING
,"Can't save in background: fork: %s",
3639 redisLog(REDIS_NOTICE
,"Background saving started by pid %d",childpid
);
3640 server
.bgsavechildpid
= childpid
;
3641 updateDictResizePolicy();
3644 return REDIS_OK
; /* unreached */
3647 static void rdbRemoveTempFile(pid_t childpid
) {
3650 snprintf(tmpfile
,256,"temp-%d.rdb", (int) childpid
);
3654 static int rdbLoadType(FILE *fp
) {
3656 if (fread(&type
,1,1,fp
) == 0) return -1;
3660 static time_t rdbLoadTime(FILE *fp
) {
3662 if (fread(&t32
,4,1,fp
) == 0) return -1;
3663 return (time_t) t32
;
3666 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3667 * of this file for a description of how these are stored on disk.
3669 * isencoded is set to 1 if the value read is not actually a length but
3670 * an "encoding type", check the above comments for more info */
3671 static uint32_t rdbLoadLen(FILE *fp
, int *isencoded
) {
3672 unsigned char buf
[2];
3676 if (isencoded
) *isencoded
= 0;
3677 if (fread(buf
,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3678 type
= (buf
[0]&0xC0)>>6;
3679 if (type
== REDIS_RDB_6BITLEN
) {
3680 /* Read a 6 bit len */
3682 } else if (type
== REDIS_RDB_ENCVAL
) {
3683 /* Read a 6 bit len encoding type */
3684 if (isencoded
) *isencoded
= 1;
3686 } else if (type
== REDIS_RDB_14BITLEN
) {
3687 /* Read a 14 bit len */
3688 if (fread(buf
+1,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3689 return ((buf
[0]&0x3F)<<8)|buf
[1];
3691 /* Read a 32 bit len */
3692 if (fread(&len
,4,1,fp
) == 0) return REDIS_RDB_LENERR
;
/* Load an integer saved with one of the REDIS_RDB_ENC_INT8/INT16/INT32
 * encodings from 'fp' and return it as a string object whose payload is
 * the decimal representation produced with "%lld".
 *
 * 1, 2 or 4 bytes are read depending on 'enctype'; multi-byte values are
 * assembled with enc[0] as the least significant byte. NULL is returned if
 * fread() cannot deliver the bytes. Any other 'enctype' ends up in
 * redisPanic().
 * NOTE(review): in the INT32 case enc[3] is promoted to int before
 * "<<24", so a byte >= 0x80 is shifted into the sign bit; the declarations
 * of 'v' and 'val' and the conversions between them are not visible in
 * this excerpt -- confirm the intended sign handling. */
3697 static robj
*rdbLoadIntegerObject(FILE *fp
, int enctype
) {
3698 unsigned char enc
[4];
3701 if (enctype
== REDIS_RDB_ENC_INT8
) {
3702 if (fread(enc
,1,1,fp
) == 0) return NULL
;
3703 val
= (signed char)enc
[0];
3704 } else if (enctype
== REDIS_RDB_ENC_INT16
) {
3706 if (fread(enc
,2,1,fp
) == 0) return NULL
;
3707 v
= enc
[0]|(enc
[1]<<8);
3709 } else if (enctype
== REDIS_RDB_ENC_INT32
) {
3711 if (fread(enc
,4,1,fp
) == 0) return NULL
;
3712 v
= enc
[0]|(enc
[1]<<8)|(enc
[2]<<16)|(enc
[3]<<24);
3715 val
= 0; /* anti-warning */
3716 redisPanic("Unknown RDB integer encoding type");
3718 return createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",val
));
3721 static robj
*rdbLoadLzfStringObject(FILE*fp
) {
3722 unsigned int len
, clen
;
3723 unsigned char *c
= NULL
;
3726 if ((clen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3727 if ((len
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3728 if ((c
= zmalloc(clen
)) == NULL
) goto err
;
3729 if ((val
= sdsnewlen(NULL
,len
)) == NULL
) goto err
;
3730 if (fread(c
,clen
,1,fp
) == 0) goto err
;
3731 if (lzf_decompress(c
,clen
,val
,len
) == 0) goto err
;
3733 return createObject(REDIS_STRING
,val
);
3740 static robj
*rdbLoadStringObject(FILE*fp
) {
3745 len
= rdbLoadLen(fp
,&isencoded
);
3748 case REDIS_RDB_ENC_INT8
:
3749 case REDIS_RDB_ENC_INT16
:
3750 case REDIS_RDB_ENC_INT32
:
3751 return rdbLoadIntegerObject(fp
,len
);
3752 case REDIS_RDB_ENC_LZF
:
3753 return rdbLoadLzfStringObject(fp
);
3755 redisPanic("Unknown RDB encoding type");
3759 if (len
== REDIS_RDB_LENERR
) return NULL
;
3760 val
= sdsnewlen(NULL
,len
);
3761 if (len
&& fread(val
,len
,1,fp
) == 0) {
3765 return createObject(REDIS_STRING
,val
);
3768 /* For information about double serialization check rdbSaveDoubleValue() */
3769 static int rdbLoadDoubleValue(FILE *fp
, double *val
) {
3773 if (fread(&len
,1,1,fp
) == 0) return -1;
3775 case 255: *val
= R_NegInf
; return 0;
3776 case 254: *val
= R_PosInf
; return 0;
3777 case 253: *val
= R_Nan
; return 0;
3779 if (fread(buf
,len
,1,fp
) == 0) return -1;
3781 sscanf(buf
, "%lg", val
);
3786 /* Load a Redis object of the specified type from the specified file.
3787 * On success a newly allocated object is returned, otherwise NULL. */
3788 static robj
*rdbLoadObject(int type
, FILE *fp
) {
3791 redisLog(REDIS_DEBUG
,"LOADING OBJECT %d (at %d)\n",type
,ftell(fp
));
3792 if (type
== REDIS_STRING
) {
3793 /* Read string value */
3794 if ((o
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3795 o
= tryObjectEncoding(o
);
3796 } else if (type
== REDIS_LIST
|| type
== REDIS_SET
) {
3797 /* Read list/set value */
3800 if ((listlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3801 o
= (type
== REDIS_LIST
) ? createListObject() : createSetObject();
3802 /* It's faster to expand the dict to the right size asap in order
3803 * to avoid rehashing */
3804 if (type
== REDIS_SET
&& listlen
> DICT_HT_INITIAL_SIZE
)
3805 dictExpand(o
->ptr
,listlen
);
3806 /* Load every single element of the list/set */
3810 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3811 ele
= tryObjectEncoding(ele
);
3812 if (type
== REDIS_LIST
) {
3813 listAddNodeTail((list
*)o
->ptr
,ele
);
3815 dictAdd((dict
*)o
->ptr
,ele
,NULL
);
3818 } else if (type
== REDIS_ZSET
) {
3819 /* Read sorted set value */
3823 if ((zsetlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3824 o
= createZsetObject();
3826 /* Load every single element of the sorted set */
3829 double *score
= zmalloc(sizeof(double));
3831 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3832 ele
= tryObjectEncoding(ele
);
3833 if (rdbLoadDoubleValue(fp
,score
) == -1) return NULL
;
3834 dictAdd(zs
->dict
,ele
,score
);
3835 zslInsert(zs
->zsl
,*score
,ele
);
3836 incrRefCount(ele
); /* added to skiplist */
3838 } else if (type
== REDIS_HASH
) {
3841 if ((hashlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3842 o
= createHashObject();
3843 /* Too many entries? Use a hash table. */
3844 if (hashlen
> server
.hash_max_zipmap_entries
)
3845 convertToRealHash(o
);
3846 /* Load every key/value, then set it into the zipmap or hash
3847 * table, as needed. */
3851 if ((key
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3852 if ((val
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3853 /* If we are using a zipmap and there are too big values
3854 * the object is converted to real hash table encoding. */
3855 if (o
->encoding
!= REDIS_ENCODING_HT
&&
3856 (sdslen(key
->ptr
) > server
.hash_max_zipmap_value
||
3857 sdslen(val
->ptr
) > server
.hash_max_zipmap_value
))
3859 convertToRealHash(o
);
3862 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
3863 unsigned char *zm
= o
->ptr
;
3865 zm
= zipmapSet(zm
,key
->ptr
,sdslen(key
->ptr
),
3866 val
->ptr
,sdslen(val
->ptr
),NULL
);
3871 key
= tryObjectEncoding(key
);
3872 val
= tryObjectEncoding(val
);
3873 dictAdd((dict
*)o
->ptr
,key
,val
);
3877 redisPanic("Unknown object type");
3882 static int rdbLoad(char *filename
) {
3884 robj
*keyobj
= NULL
;
3886 int type
, retval
, rdbver
;
3887 dict
*d
= server
.db
[0].dict
;
3888 redisDb
*db
= server
.db
+0;
3890 time_t expiretime
= -1, now
= time(NULL
);
3891 long long loadedkeys
= 0;
3893 fp
= fopen(filename
,"r");
3894 if (!fp
) return REDIS_ERR
;
3895 if (fread(buf
,9,1,fp
) == 0) goto eoferr
;
3897 if (memcmp(buf
,"REDIS",5) != 0) {
3899 redisLog(REDIS_WARNING
,"Wrong signature trying to load DB from file");
3902 rdbver
= atoi(buf
+5);
3905 redisLog(REDIS_WARNING
,"Can't handle RDB format version %d",rdbver
);
3912 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3913 if (type
== REDIS_EXPIRETIME
) {
3914 if ((expiretime
= rdbLoadTime(fp
)) == -1) goto eoferr
;
3915 /* We read the time so we need to read the object type again */
3916 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3918 if (type
== REDIS_EOF
) break;
3919 /* Handle SELECT DB opcode as a special case */
3920 if (type
== REDIS_SELECTDB
) {
3921 if ((dbid
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
)
3923 if (dbid
>= (unsigned)server
.dbnum
) {
3924 redisLog(REDIS_WARNING
,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server
.dbnum
);
3927 db
= server
.db
+dbid
;
3932 if ((keyobj
= rdbLoadStringObject(fp
)) == NULL
) goto eoferr
;
3934 if ((o
= rdbLoadObject(type
,fp
)) == NULL
) goto eoferr
;
3935 /* Add the new object in the hash table */
3936 retval
= dictAdd(d
,keyobj
,o
);
3937 if (retval
== DICT_ERR
) {
3938 redisLog(REDIS_WARNING
,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj
->ptr
);
3941 /* Set the expire time if needed */
3942 if (expiretime
!= -1) {
3943 setExpire(db
,keyobj
,expiretime
);
3944 /* Delete this key if already expired */
3945 if (expiretime
< now
) deleteKey(db
,keyobj
);
3949 /* Handle swapping while loading big datasets when VM is on */
3951 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
3952 while (zmalloc_used_memory() > server
.vm_max_memory
) {
3953 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
3960 eoferr
: /* unexpected end of file is handled here with a fatal exit */
3961 if (keyobj
) decrRefCount(keyobj
);
3962 redisLog(REDIS_WARNING
,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3964 return REDIS_ERR
; /* Just to avoid warning */
3967 /*================================== Commands =============================== */
/* AUTH command. The client is marked as authenticated and receives
 * shared.ok when no password is configured (server.requirepass is NULL)
 * or when argv[1] compares equal, via strcmp(), to server.requirepass.
 * The last two statements clear c->authenticated and reply with
 * "-ERR invalid password".
 * NOTE(review): the line separating the two groups of statements is not
 * visible in this excerpt -- presumably an "else"; confirm. */
3969 static void authCommand(redisClient
*c
) {
3970 if (!server
.requirepass
|| !strcmp(c
->argv
[1]->ptr
, server
.requirepass
)) {
3971 c
->authenticated
= 1;
3972 addReply(c
,shared
.ok
);
3974 c
->authenticated
= 0;
3975 addReplySds(c
,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3979 static void pingCommand(redisClient
*c
) {
3980 addReply(c
,shared
.pong
);
3983 static void echoCommand(redisClient
*c
) {
3984 addReplyBulk(c
,c
->argv
[1]);
3987 /*=================================== Strings =============================== */
3989 static void setGenericCommand(redisClient
*c
, int nx
, robj
*key
, robj
*val
, robj
*expire
) {
3994 if (getLongFromObjectOrReply(c
, expire
, &seconds
, NULL
) != REDIS_OK
)
3997 addReplySds(c
,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4002 if (nx
) deleteIfVolatile(c
->db
,key
);
4003 retval
= dictAdd(c
->db
->dict
,key
,val
);
4004 if (retval
== DICT_ERR
) {
4006 /* If the key is about a swapped value, we want a new key object
4007 * to overwrite the old. So we delete the old key in the database.
4008 * This will also make sure that swap pages about the old object
4009 * will be marked as free. */
4010 if (server
.vm_enabled
&& deleteIfSwapped(c
->db
,key
))
4012 dictReplace(c
->db
->dict
,key
,val
);
4015 addReply(c
,shared
.czero
);
4023 removeExpire(c
->db
,key
);
4024 if (expire
) setExpire(c
->db
,key
,time(NULL
)+seconds
);
4025 addReply(c
, nx
? shared
.cone
: shared
.ok
);
4028 static void setCommand(redisClient
*c
) {
4029 setGenericCommand(c
,0,c
->argv
[1],c
->argv
[2],NULL
);
4032 static void setnxCommand(redisClient
*c
) {
4033 setGenericCommand(c
,1,c
->argv
[1],c
->argv
[2],NULL
);
4036 static void setexCommand(redisClient
*c
) {
4037 setGenericCommand(c
,0,c
->argv
[1],c
->argv
[3],c
->argv
[2]);
4040 static int getGenericCommand(redisClient
*c
) {
4043 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
)
4046 if (o
->type
!= REDIS_STRING
) {
4047 addReply(c
,shared
.wrongtypeerr
);
4055 static void getCommand(redisClient
*c
) {
4056 getGenericCommand(c
);
4059 static void getsetCommand(redisClient
*c
) {
4060 if (getGenericCommand(c
) == REDIS_ERR
) return;
4061 if (dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]) == DICT_ERR
) {
4062 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
4064 incrRefCount(c
->argv
[1]);
4066 incrRefCount(c
->argv
[2]);
4068 removeExpire(c
->db
,c
->argv
[1]);
4071 static void mgetCommand(redisClient
*c
) {
4074 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-1));
4075 for (j
= 1; j
< c
->argc
; j
++) {
4076 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[j
]);
4078 addReply(c
,shared
.nullbulk
);
4080 if (o
->type
!= REDIS_STRING
) {
4081 addReply(c
,shared
.nullbulk
);
4089 static void msetGenericCommand(redisClient
*c
, int nx
) {
4090 int j
, busykeys
= 0;
4092 if ((c
->argc
% 2) == 0) {
4093 addReplySds(c
,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4096 /* Handle the NX flag. The MSETNX semantic is to return zero and set
4097 * nothing at all if at least one key already exists. */
4099 for (j
= 1; j
< c
->argc
; j
+= 2) {
4100 if (lookupKeyWrite(c
->db
,c
->argv
[j
]) != NULL
) {
4106 addReply(c
, shared
.czero
);
4110 for (j
= 1; j
< c
->argc
; j
+= 2) {
4113 c
->argv
[j
+1] = tryObjectEncoding(c
->argv
[j
+1]);
4114 retval
= dictAdd(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
4115 if (retval
== DICT_ERR
) {
4116 dictReplace(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
4117 incrRefCount(c
->argv
[j
+1]);
4119 incrRefCount(c
->argv
[j
]);
4120 incrRefCount(c
->argv
[j
+1]);
4122 removeExpire(c
->db
,c
->argv
[j
]);
4124 server
.dirty
+= (c
->argc
-1)/2;
4125 addReply(c
, nx
? shared
.cone
: shared
.ok
);
4128 static void msetCommand(redisClient
*c
) {
4129 msetGenericCommand(c
,0);
4132 static void msetnxCommand(redisClient
*c
) {
4133 msetGenericCommand(c
,1);
4136 static void incrDecrCommand(redisClient
*c
, long long incr
) {
4141 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4143 if (getLongLongFromObjectOrReply(c
, o
, &value
, NULL
) != REDIS_OK
) return;
4146 o
= createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",value
));
4147 o
= tryObjectEncoding(o
);
4148 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],o
);
4149 if (retval
== DICT_ERR
) {
4150 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
4151 removeExpire(c
->db
,c
->argv
[1]);
4153 incrRefCount(c
->argv
[1]);
4156 addReply(c
,shared
.colon
);
4158 addReply(c
,shared
.crlf
);
4161 static void incrCommand(redisClient
*c
) {
4162 incrDecrCommand(c
,1);
4165 static void decrCommand(redisClient
*c
) {
4166 incrDecrCommand(c
,-1);
4169 static void incrbyCommand(redisClient
*c
) {
4172 if (getLongLongFromObjectOrReply(c
, c
->argv
[2], &incr
, NULL
) != REDIS_OK
) return;
4173 incrDecrCommand(c
,incr
);
4176 static void decrbyCommand(redisClient
*c
) {
4179 if (getLongLongFromObjectOrReply(c
, c
->argv
[2], &incr
, NULL
) != REDIS_OK
) return;
4180 incrDecrCommand(c
,-incr
);
4183 static void appendCommand(redisClient
*c
) {
4188 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4190 /* Create the key */
4191 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
4192 incrRefCount(c
->argv
[1]);
4193 incrRefCount(c
->argv
[2]);
4194 totlen
= stringObjectLen(c
->argv
[2]);
4198 de
= dictFind(c
->db
->dict
,c
->argv
[1]);
4201 o
= dictGetEntryVal(de
);
4202 if (o
->type
!= REDIS_STRING
) {
4203 addReply(c
,shared
.wrongtypeerr
);
4206 /* If the object is specially encoded or shared we have to make
4208 if (o
->refcount
!= 1 || o
->encoding
!= REDIS_ENCODING_RAW
) {
4209 robj
*decoded
= getDecodedObject(o
);
4211 o
= createStringObject(decoded
->ptr
, sdslen(decoded
->ptr
));
4212 decrRefCount(decoded
);
4213 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
4216 if (c
->argv
[2]->encoding
== REDIS_ENCODING_RAW
) {
4217 o
->ptr
= sdscatlen(o
->ptr
,
4218 c
->argv
[2]->ptr
, sdslen(c
->argv
[2]->ptr
));
4220 o
->ptr
= sdscatprintf(o
->ptr
, "%ld",
4221 (unsigned long) c
->argv
[2]->ptr
);
4223 totlen
= sdslen(o
->ptr
);
4226 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen
));
4229 static void substrCommand(redisClient
*c
) {
4231 long start
= atoi(c
->argv
[2]->ptr
);
4232 long end
= atoi(c
->argv
[3]->ptr
);
4233 size_t rangelen
, strlen
;
4236 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4237 checkType(c
,o
,REDIS_STRING
)) return;
4239 o
= getDecodedObject(o
);
4240 strlen
= sdslen(o
->ptr
);
4242 /* convert negative indexes */
4243 if (start
< 0) start
= strlen
+start
;
4244 if (end
< 0) end
= strlen
+end
;
4245 if (start
< 0) start
= 0;
4246 if (end
< 0) end
= 0;
4248 /* indexes sanity checks */
4249 if (start
> end
|| (size_t)start
>= strlen
) {
4250 /* Out of range start or start > end result in null reply */
4251 addReply(c
,shared
.nullbulk
);
4255 if ((size_t)end
>= strlen
) end
= strlen
-1;
4256 rangelen
= (end
-start
)+1;
4258 /* Return the result */
4259 addReplySds(c
,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen
));
4260 range
= sdsnewlen((char*)o
->ptr
+start
,rangelen
);
4261 addReplySds(c
,range
);
4262 addReply(c
,shared
.crlf
);
4266 /* ========================= Type agnostic commands ========================= */
4268 static void delCommand(redisClient
*c
) {
4271 for (j
= 1; j
< c
->argc
; j
++) {
4272 if (deleteKey(c
->db
,c
->argv
[j
])) {
4277 addReplyLong(c
,deleted
);
4280 static void existsCommand(redisClient
*c
) {
4281 addReply(c
,lookupKeyRead(c
->db
,c
->argv
[1]) ? shared
.cone
: shared
.czero
);
4284 static void selectCommand(redisClient
*c
) {
4285 int id
= atoi(c
->argv
[1]->ptr
);
4287 if (selectDb(c
,id
) == REDIS_ERR
) {
4288 addReplySds(c
,sdsnew("-ERR invalid DB index\r\n"));
4290 addReply(c
,shared
.ok
);
4294 static void randomkeyCommand(redisClient
*c
) {
4299 de
= dictGetRandomKey(c
->db
->dict
);
4300 if (!de
|| expireIfNeeded(c
->db
,dictGetEntryKey(de
)) == 0) break;
4304 addReply(c
,shared
.nullbulk
);
4308 key
= dictGetEntryKey(de
);
4309 if (server
.vm_enabled
) {
4310 key
= dupStringObject(key
);
4311 addReplyBulk(c
,key
);
4314 addReplyBulk(c
,key
);
4318 static void keysCommand(redisClient
*c
) {
4321 sds pattern
= c
->argv
[1]->ptr
;
4322 int plen
= sdslen(pattern
);
4323 unsigned long numkeys
= 0;
4324 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
4326 di
= dictGetIterator(c
->db
->dict
);
4328 decrRefCount(lenobj
);
4329 while((de
= dictNext(di
)) != NULL
) {
4330 robj
*keyobj
= dictGetEntryKey(de
);
4332 sds key
= keyobj
->ptr
;
4333 if ((pattern
[0] == '*' && pattern
[1] == '\0') ||
4334 stringmatchlen(pattern
,plen
,key
,sdslen(key
),0)) {
4335 if (expireIfNeeded(c
->db
,keyobj
) == 0) {
4336 addReplyBulk(c
,keyobj
);
4341 dictReleaseIterator(di
);
4342 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",numkeys
);
4345 static void dbsizeCommand(redisClient
*c
) {
4347 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c
->db
->dict
)));
4350 static void lastsaveCommand(redisClient
*c
) {
4352 sdscatprintf(sdsempty(),":%lu\r\n",server
.lastsave
));
4355 static void typeCommand(redisClient
*c
) {
4359 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4364 case REDIS_STRING
: type
= "+string"; break;
4365 case REDIS_LIST
: type
= "+list"; break;
4366 case REDIS_SET
: type
= "+set"; break;
4367 case REDIS_ZSET
: type
= "+zset"; break;
4368 case REDIS_HASH
: type
= "+hash"; break;
4369 default: type
= "+unknown"; break;
4372 addReplySds(c
,sdsnew(type
));
4373 addReply(c
,shared
.crlf
);
4376 static void saveCommand(redisClient
*c
) {
4377 if (server
.bgsavechildpid
!= -1) {
4378 addReplySds(c
,sdsnew("-ERR background save in progress\r\n"));
4381 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
4382 addReply(c
,shared
.ok
);
4384 addReply(c
,shared
.err
);
4388 static void bgsaveCommand(redisClient
*c
) {
4389 if (server
.bgsavechildpid
!= -1) {
4390 addReplySds(c
,sdsnew("-ERR background save already in progress\r\n"));
4393 if (rdbSaveBackground(server
.dbfilename
) == REDIS_OK
) {
4394 char *status
= "+Background saving started\r\n";
4395 addReplySds(c
,sdsnew(status
));
4397 addReply(c
,shared
.err
);
4401 static void shutdownCommand(redisClient
*c
) {
4402 redisLog(REDIS_WARNING
,"User requested shutdown, saving DB...");
4403 /* Kill the saving child if there is a background saving in progress.
4404 We want to avoid race conditions, for instance our saving child may
4405 overwrite the synchronous saving done by SHUTDOWN. */
4406 if (server
.bgsavechildpid
!= -1) {
4407 redisLog(REDIS_WARNING
,"There is a live saving child. Killing it!");
4408 kill(server
.bgsavechildpid
,SIGKILL
);
4409 rdbRemoveTempFile(server
.bgsavechildpid
);
4411 if (server
.appendonly
) {
4412 /* Append only file: fsync() the AOF and exit */
4413 fsync(server
.appendfd
);
4414 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
4417 /* Snapshotting. Perform a SYNC SAVE and exit */
4418 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
4419 if (server
.daemonize
)
4420 unlink(server
.pidfile
);
4421 redisLog(REDIS_WARNING
,"%zu bytes used at exit",zmalloc_used_memory());
4422 redisLog(REDIS_WARNING
,"Server exit now, bye bye...");
4423 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
4426 /* Ooops.. error saving! The best we can do is to continue
4427 * operating. Note that if there was a background saving process,
4428 * in the next cron() Redis will be notified that the background
4429 * saving aborted, handling special stuff like slaves pending for
4430 * synchronization... */
4431 redisLog(REDIS_WARNING
,"Error trying to save the DB, can't exit");
4433 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4438 static void renameGenericCommand(redisClient
*c
, int nx
) {
4441 /* To use the same key as src and dst is probably an error */
4442 if (sdscmp(c
->argv
[1]->ptr
,c
->argv
[2]->ptr
) == 0) {
4443 addReply(c
,shared
.sameobjecterr
);
4447 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nokeyerr
)) == NULL
)
4451 deleteIfVolatile(c
->db
,c
->argv
[2]);
4452 if (dictAdd(c
->db
->dict
,c
->argv
[2],o
) == DICT_ERR
) {
4455 addReply(c
,shared
.czero
);
4458 dictReplace(c
->db
->dict
,c
->argv
[2],o
);
4460 incrRefCount(c
->argv
[2]);
4462 deleteKey(c
->db
,c
->argv
[1]);
4464 addReply(c
,nx
? shared
.cone
: shared
.ok
);
4467 static void renameCommand(redisClient
*c
) {
4468 renameGenericCommand(c
,0);
4471 static void renamenxCommand(redisClient
*c
) {
4472 renameGenericCommand(c
,1);
4475 static void moveCommand(redisClient
*c
) {
4480 /* Obtain source and target DB pointers */
4483 if (selectDb(c
,atoi(c
->argv
[2]->ptr
)) == REDIS_ERR
) {
4484 addReply(c
,shared
.outofrangeerr
);
4488 selectDb(c
,srcid
); /* Back to the source DB */
4490 /* If the user is moving using as target the same
4491 * DB as the source DB it is probably an error. */
4493 addReply(c
,shared
.sameobjecterr
);
4497 /* Check if the element exists and get a reference */
4498 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4500 addReply(c
,shared
.czero
);
4504 /* Try to add the element to the target DB */
4505 deleteIfVolatile(dst
,c
->argv
[1]);
4506 if (dictAdd(dst
->dict
,c
->argv
[1],o
) == DICT_ERR
) {
4507 addReply(c
,shared
.czero
);
4510 incrRefCount(c
->argv
[1]);
4513 /* OK! key moved, free the entry in the source DB */
4514 deleteKey(src
,c
->argv
[1]);
4516 addReply(c
,shared
.cone
);
4519 /* =================================== Lists ================================ */
4520 static void pushGenericCommand(redisClient
*c
, int where
) {
4524 lobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4526 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4527 addReply(c
,shared
.cone
);
4530 lobj
= createListObject();
4532 if (where
== REDIS_HEAD
) {
4533 listAddNodeHead(list
,c
->argv
[2]);
4535 listAddNodeTail(list
,c
->argv
[2]);
4537 dictAdd(c
->db
->dict
,c
->argv
[1],lobj
);
4538 incrRefCount(c
->argv
[1]);
4539 incrRefCount(c
->argv
[2]);
4541 if (lobj
->type
!= REDIS_LIST
) {
4542 addReply(c
,shared
.wrongtypeerr
);
4545 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4546 addReply(c
,shared
.cone
);
4550 if (where
== REDIS_HEAD
) {
4551 listAddNodeHead(list
,c
->argv
[2]);
4553 listAddNodeTail(list
,c
->argv
[2]);
4555 incrRefCount(c
->argv
[2]);
4558 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",listLength(list
)));
4561 static void lpushCommand(redisClient
*c
) {
4562 pushGenericCommand(c
,REDIS_HEAD
);
4565 static void rpushCommand(redisClient
*c
) {
4566 pushGenericCommand(c
,REDIS_TAIL
);
4569 static void llenCommand(redisClient
*c
) {
4573 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
4574 checkType(c
,o
,REDIS_LIST
)) return;
4577 addReplyUlong(c
,listLength(l
));
4580 static void lindexCommand(redisClient
*c
) {
4582 int index
= atoi(c
->argv
[2]->ptr
);
4586 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4587 checkType(c
,o
,REDIS_LIST
)) return;
4590 ln
= listIndex(list
, index
);
4592 addReply(c
,shared
.nullbulk
);
4594 robj
*ele
= listNodeValue(ln
);
4595 addReplyBulk(c
,ele
);
4599 static void lsetCommand(redisClient
*c
) {
4601 int index
= atoi(c
->argv
[2]->ptr
);
4605 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nokeyerr
)) == NULL
||
4606 checkType(c
,o
,REDIS_LIST
)) return;
4609 ln
= listIndex(list
, index
);
4611 addReply(c
,shared
.outofrangeerr
);
4613 robj
*ele
= listNodeValue(ln
);
4616 listNodeValue(ln
) = c
->argv
[3];
4617 incrRefCount(c
->argv
[3]);
4618 addReply(c
,shared
.ok
);
4623 static void popGenericCommand(redisClient
*c
, int where
) {
4628 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4629 checkType(c
,o
,REDIS_LIST
)) return;
4632 if (where
== REDIS_HEAD
)
4633 ln
= listFirst(list
);
4635 ln
= listLast(list
);
4638 addReply(c
,shared
.nullbulk
);
4640 robj
*ele
= listNodeValue(ln
);
4641 addReplyBulk(c
,ele
);
4642 listDelNode(list
,ln
);
4643 if (listLength(list
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4648 static void lpopCommand(redisClient
*c
) {
4649 popGenericCommand(c
,REDIS_HEAD
);
4652 static void rpopCommand(redisClient
*c
) {
4653 popGenericCommand(c
,REDIS_TAIL
);
4656 static void lrangeCommand(redisClient
*c
) {
4658 int start
= atoi(c
->argv
[2]->ptr
);
4659 int end
= atoi(c
->argv
[3]->ptr
);
4666 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.emptymultibulk
)) == NULL
4667 || checkType(c
,o
,REDIS_LIST
)) return;
4669 llen
= listLength(list
);
4671 /* convert negative indexes */
4672 if (start
< 0) start
= llen
+start
;
4673 if (end
< 0) end
= llen
+end
;
4674 if (start
< 0) start
= 0;
4675 if (end
< 0) end
= 0;
4677 /* indexes sanity checks */
4678 if (start
> end
|| start
>= llen
) {
4679 /* Out of range start or start > end result in empty list */
4680 addReply(c
,shared
.emptymultibulk
);
4683 if (end
>= llen
) end
= llen
-1;
4684 rangelen
= (end
-start
)+1;
4686 /* Return the result in form of a multi-bulk reply */
4687 ln
= listIndex(list
, start
);
4688 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",rangelen
));
4689 for (j
= 0; j
< rangelen
; j
++) {
4690 ele
= listNodeValue(ln
);
4691 addReplyBulk(c
,ele
);
4696 static void ltrimCommand(redisClient
*c
) {
4698 int start
= atoi(c
->argv
[2]->ptr
);
4699 int end
= atoi(c
->argv
[3]->ptr
);
4701 int j
, ltrim
, rtrim
;
4705 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.ok
)) == NULL
||
4706 checkType(c
,o
,REDIS_LIST
)) return;
4708 llen
= listLength(list
);
4710 /* convert negative indexes */
4711 if (start
< 0) start
= llen
+start
;
4712 if (end
< 0) end
= llen
+end
;
4713 if (start
< 0) start
= 0;
4714 if (end
< 0) end
= 0;
4716 /* indexes sanity checks */
4717 if (start
> end
|| start
>= llen
) {
4718 /* Out of range start or start > end result in empty list */
4722 if (end
>= llen
) end
= llen
-1;
4727 /* Remove list elements to perform the trim */
4728 for (j
= 0; j
< ltrim
; j
++) {
4729 ln
= listFirst(list
);
4730 listDelNode(list
,ln
);
4732 for (j
= 0; j
< rtrim
; j
++) {
4733 ln
= listLast(list
);
4734 listDelNode(list
,ln
);
4736 if (listLength(list
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4738 addReply(c
,shared
.ok
);
4741 static void lremCommand(redisClient
*c
) {
4744 listNode
*ln
, *next
;
4745 int toremove
= atoi(c
->argv
[2]->ptr
);
4749 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
4750 checkType(c
,o
,REDIS_LIST
)) return;
4754 toremove
= -toremove
;
4757 ln
= fromtail
? list
->tail
: list
->head
;
4759 robj
*ele
= listNodeValue(ln
);
4761 next
= fromtail
? ln
->prev
: ln
->next
;
4762 if (compareStringObjects(ele
,c
->argv
[3]) == 0) {
4763 listDelNode(list
,ln
);
4766 if (toremove
&& removed
== toremove
) break;
4770 if (listLength(list
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4771 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",removed
));
4774 /* This is the semantic of this command:
4775 * RPOPLPUSH srclist dstlist:
4776 * IF LLEN(srclist) > 0
4777 * element = RPOP srclist
4778 * LPUSH dstlist element
4785 * The idea is to be able to get an element from a list in a reliable way
4786 * since the element is not just returned but pushed against another list
4787 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4789 static void rpoplpushcommand(redisClient
*c
) {
4794 if ((sobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4795 checkType(c
,sobj
,REDIS_LIST
)) return;
4796 srclist
= sobj
->ptr
;
4797 ln
= listLast(srclist
);
4800 addReply(c
,shared
.nullbulk
);
4802 robj
*dobj
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4803 robj
*ele
= listNodeValue(ln
);
4806 if (dobj
&& dobj
->type
!= REDIS_LIST
) {
4807 addReply(c
,shared
.wrongtypeerr
);
4811 /* Add the element to the target list (unless it's directly
4812 * passed to some BLPOP-ing client) */
4813 if (!handleClientsWaitingListPush(c
,c
->argv
[2],ele
)) {
4815 /* Create the list if the key does not exist */
4816 dobj
= createListObject();
4817 dictAdd(c
->db
->dict
,c
->argv
[2],dobj
);
4818 incrRefCount(c
->argv
[2]);
4820 dstlist
= dobj
->ptr
;
4821 listAddNodeHead(dstlist
,ele
);
4825 /* Send the element to the client as reply as well */
4826 addReplyBulk(c
,ele
);
4828 /* Finally remove the element from the source list */
4829 listDelNode(srclist
,ln
);
4830 if (listLength(srclist
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4835 /* ==================================== Sets ================================ */
4837 static void saddCommand(redisClient
*c
) {
4840 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4842 set
= createSetObject();
4843 dictAdd(c
->db
->dict
,c
->argv
[1],set
);
4844 incrRefCount(c
->argv
[1]);
4846 if (set
->type
!= REDIS_SET
) {
4847 addReply(c
,shared
.wrongtypeerr
);
4851 if (dictAdd(set
->ptr
,c
->argv
[2],NULL
) == DICT_OK
) {
4852 incrRefCount(c
->argv
[2]);
4854 addReply(c
,shared
.cone
);
4856 addReply(c
,shared
.czero
);
4860 static void sremCommand(redisClient
*c
) {
4863 if ((set
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
4864 checkType(c
,set
,REDIS_SET
)) return;
4866 if (dictDelete(set
->ptr
,c
->argv
[2]) == DICT_OK
) {
4868 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4869 if (dictSize((dict
*)set
->ptr
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4870 addReply(c
,shared
.cone
);
4872 addReply(c
,shared
.czero
);
4876 static void smoveCommand(redisClient
*c
) {
4877 robj
*srcset
, *dstset
;
4879 srcset
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4880 dstset
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4882 /* If the source key does not exist return 0, if it's of the wrong type
4884 if (srcset
== NULL
|| srcset
->type
!= REDIS_SET
) {
4885 addReply(c
, srcset
? shared
.wrongtypeerr
: shared
.czero
);
4888 /* Error if the destination key is not a set as well */
4889 if (dstset
&& dstset
->type
!= REDIS_SET
) {
4890 addReply(c
,shared
.wrongtypeerr
);
4893 /* Remove the element from the source set */
4894 if (dictDelete(srcset
->ptr
,c
->argv
[3]) == DICT_ERR
) {
4895 /* Key not found in the src set! return zero */
4896 addReply(c
,shared
.czero
);
4899 if (dictSize((dict
*)srcset
->ptr
) == 0 && srcset
!= dstset
)
4900 deleteKey(c
->db
,c
->argv
[1]);
4902 /* Add the element to the destination set */
4904 dstset
= createSetObject();
4905 dictAdd(c
->db
->dict
,c
->argv
[2],dstset
);
4906 incrRefCount(c
->argv
[2]);
4908 if (dictAdd(dstset
->ptr
,c
->argv
[3],NULL
) == DICT_OK
)
4909 incrRefCount(c
->argv
[3]);
4910 addReply(c
,shared
.cone
);
4913 static void sismemberCommand(redisClient
*c
) {
4916 if ((set
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
4917 checkType(c
,set
,REDIS_SET
)) return;
4919 if (dictFind(set
->ptr
,c
->argv
[2]))
4920 addReply(c
,shared
.cone
);
4922 addReply(c
,shared
.czero
);
4925 static void scardCommand(redisClient
*c
) {
4929 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
4930 checkType(c
,o
,REDIS_SET
)) return;
4933 addReplyUlong(c
,dictSize(s
));
4936 static void spopCommand(redisClient
*c
) {
4940 if ((set
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4941 checkType(c
,set
,REDIS_SET
)) return;
4943 de
= dictGetRandomKey(set
->ptr
);
4945 addReply(c
,shared
.nullbulk
);
4947 robj
*ele
= dictGetEntryKey(de
);
4949 addReplyBulk(c
,ele
);
4950 dictDelete(set
->ptr
,ele
);
4951 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4952 if (dictSize((dict
*)set
->ptr
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4957 static void srandmemberCommand(redisClient
*c
) {
4961 if ((set
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4962 checkType(c
,set
,REDIS_SET
)) return;
4964 de
= dictGetRandomKey(set
->ptr
);
4966 addReply(c
,shared
.nullbulk
);
4968 robj
*ele
= dictGetEntryKey(de
);
4970 addReplyBulk(c
,ele
);
4974 static int qsortCompareSetsByCardinality(const void *s1
, const void *s2
) {
4975 dict
**d1
= (void*) s1
, **d2
= (void*) s2
;
4977 return dictSize(*d1
)-dictSize(*d2
);
4980 static void sinterGenericCommand(redisClient
*c
, robj
**setskeys
, unsigned long setsnum
, robj
*dstkey
) {
4981 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
4984 robj
*lenobj
= NULL
, *dstset
= NULL
;
4985 unsigned long j
, cardinality
= 0;
4987 for (j
= 0; j
< setsnum
; j
++) {
4991 lookupKeyWrite(c
->db
,setskeys
[j
]) :
4992 lookupKeyRead(c
->db
,setskeys
[j
]);
4996 if (deleteKey(c
->db
,dstkey
))
4998 addReply(c
,shared
.czero
);
5000 addReply(c
,shared
.emptymultibulk
);
5004 if (setobj
->type
!= REDIS_SET
) {
5006 addReply(c
,shared
.wrongtypeerr
);
5009 dv
[j
] = setobj
->ptr
;
5011 /* Sort sets from the smallest to largest, this will improve our
5012 * algorithm's performance */
5013 qsort(dv
,setsnum
,sizeof(dict
*),qsortCompareSetsByCardinality
);
5015 /* The first thing we should output is the total number of elements...
5016 * since this is a multi-bulk write, but at this stage we don't know
5017 * the intersection set size, so we use a trick, append an empty object
5018 * to the output list and save the pointer to later modify it with the
5021 lenobj
= createObject(REDIS_STRING
,NULL
);
5023 decrRefCount(lenobj
);
5025 /* If we have a target key where to store the resulting set
5026 * create this key with an empty set inside */
5027 dstset
= createSetObject();
5030 /* Iterate all the elements of the first (smallest) set, and test
5031 * the element against all the other sets, if at least one set does
5032 * not include the element it is discarded */
5033 di
= dictGetIterator(dv
[0]);
5035 while((de
= dictNext(di
)) != NULL
) {
5038 for (j
= 1; j
< setsnum
; j
++)
5039 if (dictFind(dv
[j
],dictGetEntryKey(de
)) == NULL
) break;
5041 continue; /* at least one set does not contain the member */
5042 ele
= dictGetEntryKey(de
);
5044 addReplyBulk(c
,ele
);
5047 dictAdd(dstset
->ptr
,ele
,NULL
);
5051 dictReleaseIterator(di
);
5054 /* Store the resulting set into the target, if the intersection
5055 * is not an empty set. */
5056 deleteKey(c
->db
,dstkey
);
5057 if (dictSize((dict
*)dstset
->ptr
) > 0) {
5058 dictAdd(c
->db
->dict
,dstkey
,dstset
);
5059 incrRefCount(dstkey
);
5060 addReplyLong(c
,dictSize((dict
*)dstset
->ptr
));
5062 decrRefCount(dstset
);
5063 addReply(c
,shared
.czero
);
5067 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",cardinality
);
5072 static void sinterCommand(redisClient
*c
) {
5073 sinterGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
);
5076 static void sinterstoreCommand(redisClient
*c
) {
5077 sinterGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1]);
5080 #define REDIS_OP_UNION 0
5081 #define REDIS_OP_DIFF 1
5082 #define REDIS_OP_INTER 2
5084 static void sunionDiffGenericCommand(redisClient
*c
, robj
**setskeys
, int setsnum
, robj
*dstkey
, int op
) {
5085 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
5088 robj
*dstset
= NULL
;
5089 int j
, cardinality
= 0;
5091 for (j
= 0; j
< setsnum
; j
++) {
5095 lookupKeyWrite(c
->db
,setskeys
[j
]) :
5096 lookupKeyRead(c
->db
,setskeys
[j
]);
5101 if (setobj
->type
!= REDIS_SET
) {
5103 addReply(c
,shared
.wrongtypeerr
);
5106 dv
[j
] = setobj
->ptr
;
5109 /* We need a temp set object to store our union. If the dstkey
5110 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5111 * this set object will be the resulting object to set into the target key*/
5112 dstset
= createSetObject();
5114 /* Iterate all the elements of all the sets, add every element a single
5115 * time to the result set */
5116 for (j
= 0; j
< setsnum
; j
++) {
5117 if (op
== REDIS_OP_DIFF
&& j
== 0 && !dv
[j
]) break; /* result set is empty */
5118 if (!dv
[j
]) continue; /* non existing keys are like empty sets */
5120 di
= dictGetIterator(dv
[j
]);
5122 while((de
= dictNext(di
)) != NULL
) {
5125 /* dictAdd will not add the same element multiple times */
5126 ele
= dictGetEntryKey(de
);
5127 if (op
== REDIS_OP_UNION
|| j
== 0) {
5128 if (dictAdd(dstset
->ptr
,ele
,NULL
) == DICT_OK
) {
5132 } else if (op
== REDIS_OP_DIFF
) {
5133 if (dictDelete(dstset
->ptr
,ele
) == DICT_OK
) {
5138 dictReleaseIterator(di
);
5140 /* result set is empty? Exit asap. */
5141 if (op
== REDIS_OP_DIFF
&& cardinality
== 0) break;
5144 /* Output the content of the resulting set, if not in STORE mode */
5146 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",cardinality
));
5147 di
= dictGetIterator(dstset
->ptr
);
5148 while((de
= dictNext(di
)) != NULL
) {
5151 ele
= dictGetEntryKey(de
);
5152 addReplyBulk(c
,ele
);
5154 dictReleaseIterator(di
);
5155 decrRefCount(dstset
);
5157 /* If we have a target key where to store the resulting set
5158 * create this key with the result set inside */
5159 deleteKey(c
->db
,dstkey
);
5160 if (dictSize((dict
*)dstset
->ptr
) > 0) {
5161 dictAdd(c
->db
->dict
,dstkey
,dstset
);
5162 incrRefCount(dstkey
);
5163 addReplyLong(c
,dictSize((dict
*)dstset
->ptr
));
5165 decrRefCount(dstset
);
5166 addReply(c
,shared
.czero
);
5173 static void sunionCommand(redisClient
*c
) {
5174 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_UNION
);
5177 static void sunionstoreCommand(redisClient
*c
) {
5178 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_UNION
);
5181 static void sdiffCommand(redisClient
*c
) {
5182 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_DIFF
);
5185 static void sdiffstoreCommand(redisClient
*c
) {
5186 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_DIFF
);
5189 /* ==================================== ZSets =============================== */
5191 /* ZSETs are ordered sets using two data structures to hold the same elements
5192 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5195 * The elements are added to an hash table mapping Redis objects to scores.
5196 * At the same time the elements are added to a skip list mapping scores
5197 * to Redis objects (so objects are sorted by scores in this "view"). */
5199 /* This skiplist implementation is almost a C translation of the original
5200 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5201 * Alternative to Balanced Trees", modified in three ways:
5202 * a) this implementation allows for repeated values.
5203 * b) the comparison is not just by key (our 'score') but by satellite data.
5204 * c) there is a back pointer, so it's a doubly linked list with the back
5205 * pointers being only at "level 1". This allows to traverse the list
5206 * from tail to head, useful for ZREVRANGE. */
5208 static zskiplistNode
*zslCreateNode(int level
, double score
, robj
*obj
) {
5209 zskiplistNode
*zn
= zmalloc(sizeof(*zn
));
5211 zn
->forward
= zmalloc(sizeof(zskiplistNode
*) * level
);
5213 zn
->span
= zmalloc(sizeof(unsigned int) * (level
- 1));
5219 static zskiplist
*zslCreate(void) {
5223 zsl
= zmalloc(sizeof(*zsl
));
5226 zsl
->header
= zslCreateNode(ZSKIPLIST_MAXLEVEL
,0,NULL
);
5227 for (j
= 0; j
< ZSKIPLIST_MAXLEVEL
; j
++) {
5228 zsl
->header
->forward
[j
] = NULL
;
5230 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5231 if (j
< ZSKIPLIST_MAXLEVEL
-1)
5232 zsl
->header
->span
[j
] = 0;
5234 zsl
->header
->backward
= NULL
;
5239 static void zslFreeNode(zskiplistNode
*node
) {
5240 decrRefCount(node
->obj
);
5241 zfree(node
->forward
);
5246 static void zslFree(zskiplist
*zsl
) {
5247 zskiplistNode
*node
= zsl
->header
->forward
[0], *next
;
5249 zfree(zsl
->header
->forward
);
5250 zfree(zsl
->header
->span
);
5253 next
= node
->forward
[0];
5260 static int zslRandomLevel(void) {
5262 while ((random()&0xFFFF) < (ZSKIPLIST_P
* 0xFFFF))
5264 return (level
<ZSKIPLIST_MAXLEVEL
) ? level
: ZSKIPLIST_MAXLEVEL
;
5267 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
) {
5268 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5269 unsigned int rank
[ZSKIPLIST_MAXLEVEL
];
5273 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5274 /* store rank that is crossed to reach the insert position */
5275 rank
[i
] = i
== (zsl
->level
-1) ? 0 : rank
[i
+1];
5277 while (x
->forward
[i
] &&
5278 (x
->forward
[i
]->score
< score
||
5279 (x
->forward
[i
]->score
== score
&&
5280 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0))) {
5281 rank
[i
] += i
> 0 ? x
->span
[i
-1] : 1;
5286 /* we assume the key is not already inside, since we allow duplicated
5287 * scores, and the re-insertion of score and redis object should never
5288 * happen since the caller of zslInsert() should test in the hash table
5289 * if the element is already inside or not. */
5290 level
= zslRandomLevel();
5291 if (level
> zsl
->level
) {
5292 for (i
= zsl
->level
; i
< level
; i
++) {
5294 update
[i
] = zsl
->header
;
5295 update
[i
]->span
[i
-1] = zsl
->length
;
5299 x
= zslCreateNode(level
,score
,obj
);
5300 for (i
= 0; i
< level
; i
++) {
5301 x
->forward
[i
] = update
[i
]->forward
[i
];
5302 update
[i
]->forward
[i
] = x
;
5304 /* update span covered by update[i] as x is inserted here */
5306 x
->span
[i
-1] = update
[i
]->span
[i
-1] - (rank
[0] - rank
[i
]);
5307 update
[i
]->span
[i
-1] = (rank
[0] - rank
[i
]) + 1;
5311 /* increment span for untouched levels */
5312 for (i
= level
; i
< zsl
->level
; i
++) {
5313 update
[i
]->span
[i
-1]++;
5316 x
->backward
= (update
[0] == zsl
->header
) ? NULL
: update
[0];
5318 x
->forward
[0]->backward
= x
;
5324 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5325 void zslDeleteNode(zskiplist
*zsl
, zskiplistNode
*x
, zskiplistNode
**update
) {
5327 for (i
= 0; i
< zsl
->level
; i
++) {
5328 if (update
[i
]->forward
[i
] == x
) {
5330 update
[i
]->span
[i
-1] += x
->span
[i
-1] - 1;
5332 update
[i
]->forward
[i
] = x
->forward
[i
];
5334 /* invariant: i > 0, because update[0]->forward[0]
5335 * is always equal to x */
5336 update
[i
]->span
[i
-1] -= 1;
5339 if (x
->forward
[0]) {
5340 x
->forward
[0]->backward
= x
->backward
;
5342 zsl
->tail
= x
->backward
;
5344 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
5349 /* Delete an element with matching score/object from the skiplist. */
5350 static int zslDelete(zskiplist
*zsl
, double score
, robj
*obj
) {
5351 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5355 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5356 while (x
->forward
[i
] &&
5357 (x
->forward
[i
]->score
< score
||
5358 (x
->forward
[i
]->score
== score
&&
5359 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0)))
5363 /* We may have multiple elements with the same score, what we need
5364 * is to find the element with both the right score and object. */
5366 if (x
&& score
== x
->score
&& compareStringObjects(x
->obj
,obj
) == 0) {
5367 zslDeleteNode(zsl
, x
, update
);
5371 return 0; /* not found */
5373 return 0; /* not found */
5376 /* Delete all the elements with score between min and max from the skiplist.
5377 * Min and max are inclusive, so an element with min <= score <= max is deleted.
5378 * Note that this function takes the reference to the hash table view of the
5379 * sorted set, in order to remove the elements from the hash table too. */
5380 static unsigned long zslDeleteRangeByScore(zskiplist
*zsl
, double min
, double max
, dict
*dict
) {
5381 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5382 unsigned long removed
= 0;
5386 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5387 while (x
->forward
[i
] && x
->forward
[i
]->score
< min
)
5391 /* We may have multiple elements with the same score, what we need
5392 * is to find the element with both the right score and object. */
5394 while (x
&& x
->score
<= max
) {
5395 zskiplistNode
*next
= x
->forward
[0];
5396 zslDeleteNode(zsl
, x
, update
);
5397 dictDelete(dict
,x
->obj
);
5402 return removed
; /* not found */
5405 /* Delete all the elements with rank between start and end from the skiplist.
5406 * Start and end are inclusive. Note that start and end need to be 1-based */
5407 static unsigned long zslDeleteRangeByRank(zskiplist
*zsl
, unsigned int start
, unsigned int end
, dict
*dict
) {
5408 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5409 unsigned long traversed
= 0, removed
= 0;
5413 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5414 while (x
->forward
[i
] && (traversed
+ (i
> 0 ? x
->span
[i
-1] : 1)) < start
) {
5415 traversed
+= i
> 0 ? x
->span
[i
-1] : 1;
5423 while (x
&& traversed
<= end
) {
5424 zskiplistNode
*next
= x
->forward
[0];
5425 zslDeleteNode(zsl
, x
, update
);
5426 dictDelete(dict
,x
->obj
);
5435 /* Find the first node having a score equal or greater than the specified one.
5436 * Returns NULL if there is no match. */
5437 static zskiplistNode
*zslFirstWithScore(zskiplist
*zsl
, double score
) {
5442 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5443 while (x
->forward
[i
] && x
->forward
[i
]->score
< score
)
5446 /* We may have multiple elements with the same score, what we need
5447 * is to find the element with both the right score and object. */
5448 return x
->forward
[0];
5451 /* Find the rank for an element by both score and key.
5452 * Returns 0 when the element cannot be found, rank otherwise.
5453 * Note that the rank is 1-based due to the span of zsl->header to the
5455 static unsigned long zslGetRank(zskiplist
*zsl
, double score
, robj
*o
) {
5457 unsigned long rank
= 0;
5461 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5462 while (x
->forward
[i
] &&
5463 (x
->forward
[i
]->score
< score
||
5464 (x
->forward
[i
]->score
== score
&&
5465 compareStringObjects(x
->forward
[i
]->obj
,o
) <= 0))) {
5466 rank
+= i
> 0 ? x
->span
[i
-1] : 1;
5470 /* x might be equal to zsl->header, so test if obj is non-NULL */
5471 if (x
->obj
&& compareStringObjects(x
->obj
,o
) == 0) {
5478 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5479 zskiplistNode
* zslGetElementByRank(zskiplist
*zsl
, unsigned long rank
) {
5481 unsigned long traversed
= 0;
5485 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5486 while (x
->forward
[i
] && (traversed
+ (i
>0 ? x
->span
[i
-1] : 1)) <= rank
)
5488 traversed
+= i
> 0 ? x
->span
[i
-1] : 1;
5491 if (traversed
== rank
) {
5498 /* The actual Z-commands implementations */
5500 /* This generic command implements both ZADD and ZINCRBY.
5501 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5502 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5503 static void zaddGenericCommand(redisClient
*c
, robj
*key
, robj
*ele
, double scoreval
, int doincrement
) {
5508 zsetobj
= lookupKeyWrite(c
->db
,key
);
5509 if (zsetobj
== NULL
) {
5510 zsetobj
= createZsetObject();
5511 dictAdd(c
->db
->dict
,key
,zsetobj
);
5514 if (zsetobj
->type
!= REDIS_ZSET
) {
5515 addReply(c
,shared
.wrongtypeerr
);
5521 /* Ok now since we implement both ZADD and ZINCRBY here the code
5522 * needs to handle the two different conditions. It's all about setting
5523 * '*score', that is, the new score to set, to the right value. */
5524 score
= zmalloc(sizeof(double));
5528 /* Read the old score. If the element was not present starts from 0 */
5529 de
= dictFind(zs
->dict
,ele
);
5531 double *oldscore
= dictGetEntryVal(de
);
5532 *score
= *oldscore
+ scoreval
;
5540 /* What follows is a simple remove and re-insert operation that is common
5541 * to both ZADD and ZINCRBY... */
5542 if (dictAdd(zs
->dict
,ele
,score
) == DICT_OK
) {
5543 /* case 1: New element */
5544 incrRefCount(ele
); /* added to hash */
5545 zslInsert(zs
->zsl
,*score
,ele
);
5546 incrRefCount(ele
); /* added to skiplist */
5549 addReplyDouble(c
,*score
);
5551 addReply(c
,shared
.cone
);
5556 /* case 2: Score update operation */
5557 de
= dictFind(zs
->dict
,ele
);
5558 redisAssert(de
!= NULL
);
5559 oldscore
= dictGetEntryVal(de
);
5560 if (*score
!= *oldscore
) {
5563 /* Remove and insert the element in the skip list with new score */
5564 deleted
= zslDelete(zs
->zsl
,*oldscore
,ele
);
5565 redisAssert(deleted
!= 0);
5566 zslInsert(zs
->zsl
,*score
,ele
);
5568 /* Update the score in the hash table */
5569 dictReplace(zs
->dict
,ele
,score
);
5575 addReplyDouble(c
,*score
);
5577 addReply(c
,shared
.czero
);
5581 static void zaddCommand(redisClient
*c
) {
5584 if (getDoubleFromObjectOrReply(c
, c
->argv
[2], &scoreval
, NULL
) != REDIS_OK
) return;
5585 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,0);
5588 static void zincrbyCommand(redisClient
*c
) {
5591 if (getDoubleFromObjectOrReply(c
, c
->argv
[2], &scoreval
, NULL
) != REDIS_OK
) return;
5592 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,1);
5595 static void zremCommand(redisClient
*c
) {
5602 if ((zsetobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5603 checkType(c
,zsetobj
,REDIS_ZSET
)) return;
5606 de
= dictFind(zs
->dict
,c
->argv
[2]);
5608 addReply(c
,shared
.czero
);
5611 /* Delete from the skiplist */
5612 oldscore
= dictGetEntryVal(de
);
5613 deleted
= zslDelete(zs
->zsl
,*oldscore
,c
->argv
[2]);
5614 redisAssert(deleted
!= 0);
5616 /* Delete from the hash table */
5617 dictDelete(zs
->dict
,c
->argv
[2]);
5618 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5619 if (dictSize(zs
->dict
) == 0) deleteKey(c
->db
,c
->argv
[1]);
5621 addReply(c
,shared
.cone
);
5624 static void zremrangebyscoreCommand(redisClient
*c
) {
5631 if ((getDoubleFromObjectOrReply(c
, c
->argv
[2], &min
, NULL
) != REDIS_OK
) ||
5632 (getDoubleFromObjectOrReply(c
, c
->argv
[3], &max
, NULL
) != REDIS_OK
)) return;
5634 if ((zsetobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5635 checkType(c
,zsetobj
,REDIS_ZSET
)) return;
5638 deleted
= zslDeleteRangeByScore(zs
->zsl
,min
,max
,zs
->dict
);
5639 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5640 if (dictSize(zs
->dict
) == 0) deleteKey(c
->db
,c
->argv
[1]);
5641 server
.dirty
+= deleted
;
5642 addReplyLong(c
,deleted
);
5645 static void zremrangebyrankCommand(redisClient
*c
) {
5653 if ((getLongFromObjectOrReply(c
, c
->argv
[2], &start
, NULL
) != REDIS_OK
) ||
5654 (getLongFromObjectOrReply(c
, c
->argv
[3], &end
, NULL
) != REDIS_OK
)) return;
5656 if ((zsetobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5657 checkType(c
,zsetobj
,REDIS_ZSET
)) return;
5659 llen
= zs
->zsl
->length
;
5661 /* convert negative indexes */
5662 if (start
< 0) start
= llen
+start
;
5663 if (end
< 0) end
= llen
+end
;
5664 if (start
< 0) start
= 0;
5665 if (end
< 0) end
= 0;
5667 /* indexes sanity checks */
5668 if (start
> end
|| start
>= llen
) {
5669 addReply(c
,shared
.czero
);
5672 if (end
>= llen
) end
= llen
-1;
5674 /* increment start and end because zsl*Rank functions
5675 * use 1-based rank */
5676 deleted
= zslDeleteRangeByRank(zs
->zsl
,start
+1,end
+1,zs
->dict
);
5677 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5678 if (dictSize(zs
->dict
) == 0) deleteKey(c
->db
,c
->argv
[1]);
5679 server
.dirty
+= deleted
;
5680 addReplyLong(c
, deleted
);
5688 static int qsortCompareZsetopsrcByCardinality(const void *s1
, const void *s2
) {
5689 zsetopsrc
*d1
= (void*) s1
, *d2
= (void*) s2
;
5690 unsigned long size1
, size2
;
5691 size1
= d1
->dict
? dictSize(d1
->dict
) : 0;
5692 size2
= d2
->dict
? dictSize(d2
->dict
) : 0;
5693 return size1
- size2
;
5696 #define REDIS_AGGR_SUM 1
5697 #define REDIS_AGGR_MIN 2
5698 #define REDIS_AGGR_MAX 3
5700 inline static void zunionInterAggregate(double *target
, double val
, int aggregate
) {
5701 if (aggregate
== REDIS_AGGR_SUM
) {
5702 *target
= *target
+ val
;
5703 } else if (aggregate
== REDIS_AGGR_MIN
) {
5704 *target
= val
< *target
? val
: *target
;
5705 } else if (aggregate
== REDIS_AGGR_MAX
) {
5706 *target
= val
> *target
? val
: *target
;
5709 redisPanic("Unknown ZUNION/INTER aggregate type");
5713 static void zunionInterGenericCommand(redisClient
*c
, robj
*dstkey
, int op
) {
5715 int aggregate
= REDIS_AGGR_SUM
;
5722 /* expect zsetnum input keys to be given */
5723 zsetnum
= atoi(c
->argv
[2]->ptr
);
5725 addReplySds(c
,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5729 /* test if the expected number of keys would overflow */
5730 if (3+zsetnum
> c
->argc
) {
5731 addReply(c
,shared
.syntaxerr
);
5735 /* read keys to be used for input */
5736 src
= zmalloc(sizeof(zsetopsrc
) * zsetnum
);
5737 for (i
= 0, j
= 3; i
< zsetnum
; i
++, j
++) {
5738 robj
*zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
5742 if (zsetobj
->type
!= REDIS_ZSET
) {
5744 addReply(c
,shared
.wrongtypeerr
);
5747 src
[i
].dict
= ((zset
*)zsetobj
->ptr
)->dict
;
5750 /* default all weights to 1 */
5751 src
[i
].weight
= 1.0;
5754 /* parse optional extra arguments */
5756 int remaining
= c
->argc
- j
;
5759 if (remaining
>= (zsetnum
+ 1) && !strcasecmp(c
->argv
[j
]->ptr
,"weights")) {
5761 for (i
= 0; i
< zsetnum
; i
++, j
++, remaining
--) {
5762 if (getDoubleFromObjectOrReply(c
, c
->argv
[j
], &src
[i
].weight
, NULL
) != REDIS_OK
)
5765 } else if (remaining
>= 2 && !strcasecmp(c
->argv
[j
]->ptr
,"aggregate")) {
5767 if (!strcasecmp(c
->argv
[j
]->ptr
,"sum")) {
5768 aggregate
= REDIS_AGGR_SUM
;
5769 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"min")) {
5770 aggregate
= REDIS_AGGR_MIN
;
5771 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"max")) {
5772 aggregate
= REDIS_AGGR_MAX
;
5775 addReply(c
,shared
.syntaxerr
);
5781 addReply(c
,shared
.syntaxerr
);
5787 /* sort sets from the smallest to largest, this will improve our
5788 * algorithm's performance */
5789 qsort(src
,zsetnum
,sizeof(zsetopsrc
), qsortCompareZsetopsrcByCardinality
);
5791 dstobj
= createZsetObject();
5792 dstzset
= dstobj
->ptr
;
5794 if (op
== REDIS_OP_INTER
) {
5795 /* skip going over all entries if the smallest zset is NULL or empty */
5796 if (src
[0].dict
&& dictSize(src
[0].dict
) > 0) {
5797 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5798 * from small to large, all src[i > 0].dict are non-empty too */
5799 di
= dictGetIterator(src
[0].dict
);
5800 while((de
= dictNext(di
)) != NULL
) {
5801 double *score
= zmalloc(sizeof(double)), value
;
5802 *score
= src
[0].weight
* (*(double*)dictGetEntryVal(de
));
5804 for (j
= 1; j
< zsetnum
; j
++) {
5805 dictEntry
*other
= dictFind(src
[j
].dict
,dictGetEntryKey(de
));
5807 value
= src
[j
].weight
* (*(double*)dictGetEntryVal(other
));
5808 zunionInterAggregate(score
, value
, aggregate
);
5814 /* skip entry when not present in every source dict */
5818 robj
*o
= dictGetEntryKey(de
);
5819 dictAdd(dstzset
->dict
,o
,score
);
5820 incrRefCount(o
); /* added to dictionary */
5821 zslInsert(dstzset
->zsl
,*score
,o
);
5822 incrRefCount(o
); /* added to skiplist */
5825 dictReleaseIterator(di
);
5827 } else if (op
== REDIS_OP_UNION
) {
5828 for (i
= 0; i
< zsetnum
; i
++) {
5829 if (!src
[i
].dict
) continue;
5831 di
= dictGetIterator(src
[i
].dict
);
5832 while((de
= dictNext(di
)) != NULL
) {
5833 /* skip key when already processed */
5834 if (dictFind(dstzset
->dict
,dictGetEntryKey(de
)) != NULL
) continue;
5836 double *score
= zmalloc(sizeof(double)), value
;
5837 *score
= src
[i
].weight
* (*(double*)dictGetEntryVal(de
));
5839 /* because the zsets are sorted by size, its only possible
5840 * for sets at larger indices to hold this entry */
5841 for (j
= (i
+1); j
< zsetnum
; j
++) {
5842 dictEntry
*other
= dictFind(src
[j
].dict
,dictGetEntryKey(de
));
5844 value
= src
[j
].weight
* (*(double*)dictGetEntryVal(other
));
5845 zunionInterAggregate(score
, value
, aggregate
);
5849 robj
*o
= dictGetEntryKey(de
);
5850 dictAdd(dstzset
->dict
,o
,score
);
5851 incrRefCount(o
); /* added to dictionary */
5852 zslInsert(dstzset
->zsl
,*score
,o
);
5853 incrRefCount(o
); /* added to skiplist */
5855 dictReleaseIterator(di
);
5858 /* unknown operator */
5859 redisAssert(op
== REDIS_OP_INTER
|| op
== REDIS_OP_UNION
);
5862 deleteKey(c
->db
,dstkey
);
5863 if (dstzset
->zsl
->length
) {
5864 dictAdd(c
->db
->dict
,dstkey
,dstobj
);
5865 incrRefCount(dstkey
);
5866 addReplyLong(c
, dstzset
->zsl
->length
);
5869 decrRefCount(dstobj
);
5870 addReply(c
, shared
.czero
);
5875 static void zunionCommand(redisClient
*c
) {
5876 zunionInterGenericCommand(c
,c
->argv
[1], REDIS_OP_UNION
);
5879 static void zinterCommand(redisClient
*c
) {
5880 zunionInterGenericCommand(c
,c
->argv
[1], REDIS_OP_INTER
);
5883 static void zrangeGenericCommand(redisClient
*c
, int reverse
) {
5895 if ((getLongFromObjectOrReply(c
, c
->argv
[2], &start
, NULL
) != REDIS_OK
) ||
5896 (getLongFromObjectOrReply(c
, c
->argv
[3], &end
, NULL
) != REDIS_OK
)) return;
5898 if (c
->argc
== 5 && !strcasecmp(c
->argv
[4]->ptr
,"withscores")) {
5900 } else if (c
->argc
>= 5) {
5901 addReply(c
,shared
.syntaxerr
);
5905 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.emptymultibulk
)) == NULL
5906 || checkType(c
,o
,REDIS_ZSET
)) return;
5911 /* convert negative indexes */
5912 if (start
< 0) start
= llen
+start
;
5913 if (end
< 0) end
= llen
+end
;
5914 if (start
< 0) start
= 0;
5915 if (end
< 0) end
= 0;
5917 /* indexes sanity checks */
5918 if (start
> end
|| start
>= llen
) {
5919 /* Out of range start or start > end result in empty list */
5920 addReply(c
,shared
.emptymultibulk
);
5923 if (end
>= llen
) end
= llen
-1;
5924 rangelen
= (end
-start
)+1;
5926 /* check if starting point is trivial, before searching
5927 * the element in log(N) time */
5929 ln
= start
== 0 ? zsl
->tail
: zslGetElementByRank(zsl
, llen
-start
);
5932 zsl
->header
->forward
[0] : zslGetElementByRank(zsl
, start
+1);
5935 /* Return the result in form of a multi-bulk reply */
5936 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",
5937 withscores
? (rangelen
*2) : rangelen
));
5938 for (j
= 0; j
< rangelen
; j
++) {
5940 addReplyBulk(c
,ele
);
5942 addReplyDouble(c
,ln
->score
);
5943 ln
= reverse
? ln
->backward
: ln
->forward
[0];
5947 static void zrangeCommand(redisClient
*c
) {
5948 zrangeGenericCommand(c
,0);
5951 static void zrevrangeCommand(redisClient
*c
) {
5952 zrangeGenericCommand(c
,1);
5955 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5956 * If justcount is non-zero, just the count is returned. */
5957 static void genericZrangebyscoreCommand(redisClient
*c
, int justcount
) {
5960 int minex
= 0, maxex
= 0; /* are min or max exclusive? */
5961 int offset
= 0, limit
= -1;
5965 /* Parse the min-max interval. If one of the values is prefixed
5966 * by the "(" character, it's considered "open". For instance
5967 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5968 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5969 if (((char*)c
->argv
[2]->ptr
)[0] == '(') {
5970 min
= strtod((char*)c
->argv
[2]->ptr
+1,NULL
);
5973 min
= strtod(c
->argv
[2]->ptr
,NULL
);
5975 if (((char*)c
->argv
[3]->ptr
)[0] == '(') {
5976 max
= strtod((char*)c
->argv
[3]->ptr
+1,NULL
);
5979 max
= strtod(c
->argv
[3]->ptr
,NULL
);
5982 /* Parse "WITHSCORES": note that if the command was called with
5983 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5984 * enter the following paths to parse WITHSCORES and LIMIT. */
5985 if (c
->argc
== 5 || c
->argc
== 8) {
5986 if (strcasecmp(c
->argv
[c
->argc
-1]->ptr
,"withscores") == 0)
5991 if (c
->argc
!= (4 + withscores
) && c
->argc
!= (7 + withscores
))
5995 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6000 if (c
->argc
== (7 + withscores
) && strcasecmp(c
->argv
[4]->ptr
,"limit")) {
6001 addReply(c
,shared
.syntaxerr
);
6003 } else if (c
->argc
== (7 + withscores
)) {
6004 offset
= atoi(c
->argv
[5]->ptr
);
6005 limit
= atoi(c
->argv
[6]->ptr
);
6006 if (offset
< 0) offset
= 0;
6009 /* Ok, lookup the key and get the range */
6010 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
6012 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
6014 if (o
->type
!= REDIS_ZSET
) {
6015 addReply(c
,shared
.wrongtypeerr
);
6017 zset
*zsetobj
= o
->ptr
;
6018 zskiplist
*zsl
= zsetobj
->zsl
;
6020 robj
*ele
, *lenobj
= NULL
;
6021 unsigned long rangelen
= 0;
6023 /* Get the first node with the score >= min, or with
6024 * score > min if 'minex' is true. */
6025 ln
= zslFirstWithScore(zsl
,min
);
6026 while (minex
&& ln
&& ln
->score
== min
) ln
= ln
->forward
[0];
6029 /* No element matching the specified interval */
6030 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
6034 /* We don't know in advance how many matching elements there
6035 * are in the list, so we push this object that will represent
6036 * the multi-bulk length in the output buffer, and will "fix"
6039 lenobj
= createObject(REDIS_STRING
,NULL
);
6041 decrRefCount(lenobj
);
6044 while(ln
&& (maxex
? (ln
->score
< max
) : (ln
->score
<= max
))) {
6047 ln
= ln
->forward
[0];
6050 if (limit
== 0) break;
6053 addReplyBulk(c
,ele
);
6055 addReplyDouble(c
,ln
->score
);
6057 ln
= ln
->forward
[0];
6059 if (limit
> 0) limit
--;
6062 addReplyLong(c
,(long)rangelen
);
6064 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",
6065 withscores
? (rangelen
*2) : rangelen
);
6071 static void zrangebyscoreCommand(redisClient
*c
) {
6072 genericZrangebyscoreCommand(c
,0);
6075 static void zcountCommand(redisClient
*c
) {
6076 genericZrangebyscoreCommand(c
,1);
6079 static void zcardCommand(redisClient
*c
) {
6083 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6084 checkType(c
,o
,REDIS_ZSET
)) return;
6087 addReplyUlong(c
,zs
->zsl
->length
);
6090 static void zscoreCommand(redisClient
*c
) {
6095 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
6096 checkType(c
,o
,REDIS_ZSET
)) return;
6099 de
= dictFind(zs
->dict
,c
->argv
[2]);
6101 addReply(c
,shared
.nullbulk
);
6103 double *score
= dictGetEntryVal(de
);
6105 addReplyDouble(c
,*score
);
6109 static void zrankGenericCommand(redisClient
*c
, int reverse
) {
6117 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
6118 checkType(c
,o
,REDIS_ZSET
)) return;
6122 de
= dictFind(zs
->dict
,c
->argv
[2]);
6124 addReply(c
,shared
.nullbulk
);
6128 score
= dictGetEntryVal(de
);
6129 rank
= zslGetRank(zsl
, *score
, c
->argv
[2]);
6132 addReplyLong(c
, zsl
->length
- rank
);
6134 addReplyLong(c
, rank
-1);
6137 addReply(c
,shared
.nullbulk
);
6141 static void zrankCommand(redisClient
*c
) {
6142 zrankGenericCommand(c
, 0);
6145 static void zrevrankCommand(redisClient
*c
) {
6146 zrankGenericCommand(c
, 1);
6149 /* ========================= Hashes utility functions ======================= */
6150 #define REDIS_HASH_KEY 1
6151 #define REDIS_HASH_VALUE 2
6153 /* Check the length of a number of objects to see if we need to convert a
6154 * zipmap to a real hash. Note that we only check string encoded objects
6155 * as their string length can be queried in constant time. */
6156 static void hashTryConversion(robj
*subject
, robj
**argv
, int start
, int end
) {
6158 if (subject
->encoding
!= REDIS_ENCODING_ZIPMAP
) return;
6160 for (i
= start
; i
<= end
; i
++) {
6161 if (argv
[i
]->encoding
== REDIS_ENCODING_RAW
&&
6162 sdslen(argv
[i
]->ptr
) > server
.hash_max_zipmap_value
)
6164 convertToRealHash(subject
);
6170 /* Encode given objects in-place when the hash uses a dict. */
6171 static void hashTryObjectEncoding(robj
*subject
, robj
**o1
, robj
**o2
) {
6172 if (subject
->encoding
== REDIS_ENCODING_HT
) {
6173 if (o1
) *o1
= tryObjectEncoding(*o1
);
6174 if (o2
) *o2
= tryObjectEncoding(*o2
);
6178 /* Get the value from a hash identified by key. Returns either a string
6179 * object or NULL if the value cannot be found. The refcount of the object
6180 * is always increased by 1 when the value was found. */
6181 static robj
*hashGet(robj
*o
, robj
*key
) {
6183 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6186 key
= getDecodedObject(key
);
6187 if (zipmapGet(o
->ptr
,key
->ptr
,sdslen(key
->ptr
),&v
,&vlen
)) {
6188 value
= createStringObject((char*)v
,vlen
);
6192 dictEntry
*de
= dictFind(o
->ptr
,key
);
6194 value
= dictGetEntryVal(de
);
6195 incrRefCount(value
);
6201 /* Test if the key exists in the given hash. Returns 1 if the key
6202 * exists and 0 when it doesn't. */
6203 static int hashExists(robj
*o
, robj
*key
) {
6204 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6205 key
= getDecodedObject(key
);
6206 if (zipmapExists(o
->ptr
,key
->ptr
,sdslen(key
->ptr
))) {
6212 if (dictFind(o
->ptr
,key
) != NULL
) {
6219 /* Add an element, discard the old if the key already exists.
6220 * Return 0 on insert and 1 on update. */
6221 static int hashSet(robj
*o
, robj
*key
, robj
*value
) {
6223 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6224 key
= getDecodedObject(key
);
6225 value
= getDecodedObject(value
);
6226 o
->ptr
= zipmapSet(o
->ptr
,
6227 key
->ptr
,sdslen(key
->ptr
),
6228 value
->ptr
,sdslen(value
->ptr
), &update
);
6230 decrRefCount(value
);
6232 /* Check if the zipmap needs to be upgraded to a real hash table */
6233 if (zipmapLen(o
->ptr
) > server
.hash_max_zipmap_entries
)
6234 convertToRealHash(o
);
6236 if (dictReplace(o
->ptr
,key
,value
)) {
6243 incrRefCount(value
);
6248 /* Delete an element from a hash.
6249 * Return 1 on deleted and 0 on not found. */
6250 static int hashDelete(robj
*o
, robj
*key
) {
6252 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6253 key
= getDecodedObject(key
);
6254 o
->ptr
= zipmapDel(o
->ptr
,key
->ptr
,sdslen(key
->ptr
), &deleted
);
6257 deleted
= dictDelete((dict
*)o
->ptr
,key
) == DICT_OK
;
6258 /* Always check if the dictionary needs a resize after a delete. */
6259 if (deleted
&& htNeedsResize(o
->ptr
)) dictResize(o
->ptr
);
6264 /* Return the number of elements in a hash. */
6265 static unsigned long hashLength(robj
*o
) {
6266 return (o
->encoding
== REDIS_ENCODING_ZIPMAP
) ?
6267 zipmapLen((unsigned char*)o
->ptr
) : dictSize((dict
*)o
->ptr
);
6270 /* Structure to hold hash iteration abstraction. Note that iteration over
6271 * hashes involves both fields and values. Because it is possible that
6272 * not both are required, store pointers in the iterator to avoid
6273 * unnecessary memory allocation for fields/values. */
6277 unsigned char *zk
, *zv
;
6278 unsigned int zklen
, zvlen
;
6284 static hashIterator
*hashInitIterator(robj
*subject
) {
6285 hashIterator
*hi
= zmalloc(sizeof(hashIterator
));
6286 hi
->encoding
= subject
->encoding
;
6287 if (hi
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6288 hi
->zi
= zipmapRewind(subject
->ptr
);
6289 } else if (hi
->encoding
== REDIS_ENCODING_HT
) {
6290 hi
->di
= dictGetIterator(subject
->ptr
);
6297 static void hashReleaseIterator(hashIterator
*hi
) {
6298 if (hi
->encoding
== REDIS_ENCODING_HT
) {
6299 dictReleaseIterator(hi
->di
);
6304 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6305 * could be found and REDIS_ERR when the iterator reaches the end. */
6306 static int hashNext(hashIterator
*hi
) {
6307 if (hi
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6308 if ((hi
->zi
= zipmapNext(hi
->zi
, &hi
->zk
, &hi
->zklen
,
6309 &hi
->zv
, &hi
->zvlen
)) == NULL
) return REDIS_ERR
;
6311 if ((hi
->de
= dictNext(hi
->di
)) == NULL
) return REDIS_ERR
;
6316 /* Get key or value object at current iteration position.
6317 * This increases the refcount of the field object by 1. */
6318 static robj
*hashCurrent(hashIterator
*hi
, int what
) {
6320 if (hi
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6321 if (what
& REDIS_HASH_KEY
) {
6322 o
= createStringObject((char*)hi
->zk
,hi
->zklen
);
6324 o
= createStringObject((char*)hi
->zv
,hi
->zvlen
);
6327 if (what
& REDIS_HASH_KEY
) {
6328 o
= dictGetEntryKey(hi
->de
);
6330 o
= dictGetEntryVal(hi
->de
);
6337 static robj
*hashLookupWriteOrCreate(redisClient
*c
, robj
*key
) {
6338 robj
*o
= lookupKeyWrite(c
->db
,key
);
6340 o
= createHashObject();
6341 dictAdd(c
->db
->dict
,key
,o
);
6344 if (o
->type
!= REDIS_HASH
) {
6345 addReply(c
,shared
.wrongtypeerr
);
6352 /* ============================= Hash commands ============================== */
6353 static void hsetCommand(redisClient
*c
) {
6357 if ((o
= hashLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
6358 hashTryConversion(o
,c
->argv
,2,3);
6359 hashTryObjectEncoding(o
,&c
->argv
[2], &c
->argv
[3]);
6360 update
= hashSet(o
,c
->argv
[2],c
->argv
[3]);
6361 addReply(c
, update
? shared
.czero
: shared
.cone
);
6365 static void hsetnxCommand(redisClient
*c
) {
6367 if ((o
= hashLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
6368 hashTryConversion(o
,c
->argv
,2,3);
6370 if (hashExists(o
, c
->argv
[2])) {
6371 addReply(c
, shared
.czero
);
6373 hashTryObjectEncoding(o
,&c
->argv
[2], &c
->argv
[3]);
6374 hashSet(o
,c
->argv
[2],c
->argv
[3]);
6375 addReply(c
, shared
.cone
);
6380 static void hmsetCommand(redisClient
*c
) {
6384 if ((c
->argc
% 2) == 1) {
6385 addReplySds(c
,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6389 if ((o
= hashLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
6390 hashTryConversion(o
,c
->argv
,2,c
->argc
-1);
6391 for (i
= 2; i
< c
->argc
; i
+= 2) {
6392 hashTryObjectEncoding(o
,&c
->argv
[i
], &c
->argv
[i
+1]);
6393 hashSet(o
,c
->argv
[i
],c
->argv
[i
+1]);
6395 addReply(c
, shared
.ok
);
6399 static void hincrbyCommand(redisClient
*c
) {
6400 long long value
, incr
;
6401 robj
*o
, *current
, *new;
6403 if (getLongLongFromObjectOrReply(c
,c
->argv
[3],&incr
,NULL
) != REDIS_OK
) return;
6404 if ((o
= hashLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
6405 if ((current
= hashGet(o
,c
->argv
[2])) != NULL
) {
6406 if (current
->encoding
== REDIS_ENCODING_RAW
)
6407 value
= strtoll(current
->ptr
,NULL
,10);
6408 else if (current
->encoding
== REDIS_ENCODING_INT
)
6409 value
= (long)current
->ptr
;
6411 redisAssert(1 != 1);
6412 decrRefCount(current
);
6418 new = createStringObjectFromLongLong(value
);
6419 hashTryObjectEncoding(o
,&c
->argv
[2],NULL
);
6420 hashSet(o
,c
->argv
[2],new);
6422 addReplyLongLong(c
,value
);
6426 static void hgetCommand(redisClient
*c
) {
6428 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
6429 checkType(c
,o
,REDIS_HASH
)) return;
6431 if ((value
= hashGet(o
,c
->argv
[2])) != NULL
) {
6432 addReplyBulk(c
,value
);
6433 decrRefCount(value
);
6435 addReply(c
,shared
.nullbulk
);
6439 static void hmgetCommand(redisClient
*c
) {
6442 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
6443 if (o
!= NULL
&& o
->type
!= REDIS_HASH
) {
6444 addReply(c
,shared
.wrongtypeerr
);
6447 /* Note the check for o != NULL happens inside the loop. This is
6448 * done because objects that cannot be found are considered to be
6449 * an empty hash. The reply should then be a series of NULLs. */
6450 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-2));
6451 for (i
= 2; i
< c
->argc
; i
++) {
6452 if (o
!= NULL
&& (value
= hashGet(o
,c
->argv
[i
])) != NULL
) {
6453 addReplyBulk(c
,value
);
6454 decrRefCount(value
);
6456 addReply(c
,shared
.nullbulk
);
6461 static void hdelCommand(redisClient
*c
) {
6463 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6464 checkType(c
,o
,REDIS_HASH
)) return;
6466 if (hashDelete(o
,c
->argv
[2])) {
6467 if (hashLength(o
) == 0) deleteKey(c
->db
,c
->argv
[1]);
6468 addReply(c
,shared
.cone
);
6471 addReply(c
,shared
.czero
);
6475 static void hlenCommand(redisClient
*c
) {
6477 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6478 checkType(c
,o
,REDIS_HASH
)) return;
6480 addReplyUlong(c
,hashLength(o
));
6483 static void genericHgetallCommand(redisClient
*c
, int flags
) {
6484 robj
*o
, *lenobj
, *obj
;
6485 unsigned long count
= 0;
6488 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.emptymultibulk
)) == NULL
6489 || checkType(c
,o
,REDIS_HASH
)) return;
6491 lenobj
= createObject(REDIS_STRING
,NULL
);
6493 decrRefCount(lenobj
);
6495 hi
= hashInitIterator(o
);
6496 while (hashNext(hi
) != REDIS_ERR
) {
6497 if (flags
& REDIS_HASH_KEY
) {
6498 obj
= hashCurrent(hi
,REDIS_HASH_KEY
);
6499 addReplyBulk(c
,obj
);
6503 if (flags
& REDIS_HASH_VALUE
) {
6504 obj
= hashCurrent(hi
,REDIS_HASH_VALUE
);
6505 addReplyBulk(c
,obj
);
6510 hashReleaseIterator(hi
);
6512 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",count
);
6515 static void hkeysCommand(redisClient
*c
) {
6516 genericHgetallCommand(c
,REDIS_HASH_KEY
);
6519 static void hvalsCommand(redisClient
*c
) {
6520 genericHgetallCommand(c
,REDIS_HASH_VALUE
);
6523 static void hgetallCommand(redisClient
*c
) {
6524 genericHgetallCommand(c
,REDIS_HASH_KEY
|REDIS_HASH_VALUE
);
6527 static void hexistsCommand(redisClient
*c
) {
6529 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6530 checkType(c
,o
,REDIS_HASH
)) return;
6532 addReply(c
, hashExists(o
,c
->argv
[2]) ? shared
.cone
: shared
.czero
);
6535 static void convertToRealHash(robj
*o
) {
6536 unsigned char *key
, *val
, *p
, *zm
= o
->ptr
;
6537 unsigned int klen
, vlen
;
6538 dict
*dict
= dictCreate(&hashDictType
,NULL
);
6540 assert(o
->type
== REDIS_HASH
&& o
->encoding
!= REDIS_ENCODING_HT
);
6541 p
= zipmapRewind(zm
);
6542 while((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) != NULL
) {
6543 robj
*keyobj
, *valobj
;
6545 keyobj
= createStringObject((char*)key
,klen
);
6546 valobj
= createStringObject((char*)val
,vlen
);
6547 keyobj
= tryObjectEncoding(keyobj
);
6548 valobj
= tryObjectEncoding(valobj
);
6549 dictAdd(dict
,keyobj
,valobj
);
6551 o
->encoding
= REDIS_ENCODING_HT
;
6556 /* ========================= Non type-specific commands ==================== */
6558 static void flushdbCommand(redisClient
*c
) {
6559 server
.dirty
+= dictSize(c
->db
->dict
);
6560 dictEmpty(c
->db
->dict
);
6561 dictEmpty(c
->db
->expires
);
6562 addReply(c
,shared
.ok
);
6565 static void flushallCommand(redisClient
*c
) {
6566 server
.dirty
+= emptyDb();
6567 addReply(c
,shared
.ok
);
6568 if (server
.bgsavechildpid
!= -1) {
6569 kill(server
.bgsavechildpid
,SIGKILL
);
6570 rdbRemoveTempFile(server
.bgsavechildpid
);
6572 rdbSave(server
.dbfilename
);
6576 static redisSortOperation
*createSortOperation(int type
, robj
*pattern
) {
6577 redisSortOperation
*so
= zmalloc(sizeof(*so
));
6579 so
->pattern
= pattern
;
6583 /* Return the value associated to the key with a name obtained
6584 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6585 * The returned object will always have its refcount increased by 1
6586 * when it is non-NULL. */
6587 static robj
*lookupKeyByPattern(redisDb
*db
, robj
*pattern
, robj
*subst
) {
6590 robj keyobj
, fieldobj
, *o
;
6591 int prefixlen
, sublen
, postfixlen
, fieldlen
;
6592 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6596 char buf
[REDIS_SORTKEY_MAX
+1];
6597 } keyname
, fieldname
;
6599 /* If the pattern is "#" return the substitution object itself in order
6600 * to implement the "SORT ... GET #" feature. */
6601 spat
= pattern
->ptr
;
6602 if (spat
[0] == '#' && spat
[1] == '\0') {
6603 incrRefCount(subst
);
6607 /* The substitution object may be specially encoded. If so we create
6608 * a decoded object on the fly. Otherwise getDecodedObject will just
6609 * increment the ref count, that we'll decrement later. */
6610 subst
= getDecodedObject(subst
);
6613 if (sdslen(spat
)+sdslen(ssub
)-1 > REDIS_SORTKEY_MAX
) return NULL
;
6614 p
= strchr(spat
,'*');
6616 decrRefCount(subst
);
6620 /* Find out if we're dealing with a hash dereference. */
6621 if ((f
= strstr(p
+1, "->")) != NULL
) {
6622 fieldlen
= sdslen(spat
)-(f
-spat
);
6623 /* this also copies \0 character */
6624 memcpy(fieldname
.buf
,f
+2,fieldlen
-1);
6625 fieldname
.len
= fieldlen
-2;
6631 sublen
= sdslen(ssub
);
6632 postfixlen
= sdslen(spat
)-(prefixlen
+1)-fieldlen
;
6633 memcpy(keyname
.buf
,spat
,prefixlen
);
6634 memcpy(keyname
.buf
+prefixlen
,ssub
,sublen
);
6635 memcpy(keyname
.buf
+prefixlen
+sublen
,p
+1,postfixlen
);
6636 keyname
.buf
[prefixlen
+sublen
+postfixlen
] = '\0';
6637 keyname
.len
= prefixlen
+sublen
+postfixlen
;
6638 decrRefCount(subst
);
6640 /* Lookup substituted key */
6641 initStaticStringObject(keyobj
,((char*)&keyname
)+(sizeof(long)*2));
6642 o
= lookupKeyRead(db
,&keyobj
);
6643 if (o
== NULL
) return NULL
;
6646 if (o
->type
!= REDIS_HASH
|| fieldname
.len
< 1) return NULL
;
6648 /* Retrieve value from hash by the field name. This operation
6649 * already increases the refcount of the returned object. */
6650 initStaticStringObject(fieldobj
,((char*)&fieldname
)+(sizeof(long)*2));
6651 o
= hashGet(o
, &fieldobj
);
6653 if (o
->type
!= REDIS_STRING
) return NULL
;
6655 /* Every object that this function returns needs to have its refcount
6656 * increased. sortCommand decreases it again. */
6663 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6664 * the additional parameter is not standard but BSD-specific, we have to
6665 * pass sorting parameters via the global 'server' structure */
6666 static int sortCompare(const void *s1
, const void *s2
) {
6667 const redisSortObject
*so1
= s1
, *so2
= s2
;
6670 if (!server
.sort_alpha
) {
6671 /* Numeric sorting. Here it's trivial as we precomputed scores */
6672 if (so1
->u
.score
> so2
->u
.score
) {
6674 } else if (so1
->u
.score
< so2
->u
.score
) {
6680 /* Alphanumeric sorting */
6681 if (server
.sort_bypattern
) {
6682 if (!so1
->u
.cmpobj
|| !so2
->u
.cmpobj
) {
6683 /* At least one compare object is NULL */
6684 if (so1
->u
.cmpobj
== so2
->u
.cmpobj
)
6686 else if (so1
->u
.cmpobj
== NULL
)
6691 /* We have both the objects, use strcoll */
6692 cmp
= strcoll(so1
->u
.cmpobj
->ptr
,so2
->u
.cmpobj
->ptr
);
6695 /* Compare elements directly. */
6696 cmp
= compareStringObjects(so1
->obj
,so2
->obj
);
6699 return server
.sort_desc
? -cmp
: cmp
;
6702 /* The SORT command is the most complex command in Redis. Warning: this code
6703 * is optimized for speed and a bit less for readability */
6704 static void sortCommand(redisClient
*c
) {
6707 int desc
= 0, alpha
= 0;
6708 int limit_start
= 0, limit_count
= -1, start
, end
;
6709 int j
, dontsort
= 0, vectorlen
;
6710 int getop
= 0; /* GET operation counter */
6711 robj
*sortval
, *sortby
= NULL
, *storekey
= NULL
;
6712 redisSortObject
*vector
; /* Resulting vector to sort */
6714 /* Lookup the key to sort. It must be of the right types */
6715 sortval
= lookupKeyRead(c
->db
,c
->argv
[1]);
6716 if (sortval
== NULL
) {
6717 addReply(c
,shared
.emptymultibulk
);
6720 if (sortval
->type
!= REDIS_SET
&& sortval
->type
!= REDIS_LIST
&&
6721 sortval
->type
!= REDIS_ZSET
)
6723 addReply(c
,shared
.wrongtypeerr
);
6727 /* Create a list of operations to perform for every sorted element.
6728 * Operations can be GET/DEL/INCR/DECR */
6729 operations
= listCreate();
6730 listSetFreeMethod(operations
,zfree
);
6733 /* Now we need to protect sortval incrementing its count, in the future
6734 * SORT may have options able to overwrite/delete keys during the sorting
6735 * and the sorted key itself may get destroyed */
6736 incrRefCount(sortval
);
6738 /* The SORT command has an SQL-alike syntax, parse it */
6739 while(j
< c
->argc
) {
6740 int leftargs
= c
->argc
-j
-1;
6741 if (!strcasecmp(c
->argv
[j
]->ptr
,"asc")) {
6743 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"desc")) {
6745 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"alpha")) {
6747 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"limit") && leftargs
>= 2) {
6748 limit_start
= atoi(c
->argv
[j
+1]->ptr
);
6749 limit_count
= atoi(c
->argv
[j
+2]->ptr
);
6751 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"store") && leftargs
>= 1) {
6752 storekey
= c
->argv
[j
+1];
6754 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"by") && leftargs
>= 1) {
6755 sortby
= c
->argv
[j
+1];
6756 /* If the BY pattern does not contain '*', i.e. it is constant,
6757 * we don't need to sort nor to lookup the weight keys. */
6758 if (strchr(c
->argv
[j
+1]->ptr
,'*') == NULL
) dontsort
= 1;
6760 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"get") && leftargs
>= 1) {
6761 listAddNodeTail(operations
,createSortOperation(
6762 REDIS_SORT_GET
,c
->argv
[j
+1]));
6766 decrRefCount(sortval
);
6767 listRelease(operations
);
6768 addReply(c
,shared
.syntaxerr
);
6774 /* Load the sorting vector with all the objects to sort */
6775 switch(sortval
->type
) {
6776 case REDIS_LIST
: vectorlen
= listLength((list
*)sortval
->ptr
); break;
6777 case REDIS_SET
: vectorlen
= dictSize((dict
*)sortval
->ptr
); break;
6778 case REDIS_ZSET
: vectorlen
= dictSize(((zset
*)sortval
->ptr
)->dict
); break;
6779 default: vectorlen
= 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6781 vector
= zmalloc(sizeof(redisSortObject
)*vectorlen
);
6784 if (sortval
->type
== REDIS_LIST
) {
6785 list
*list
= sortval
->ptr
;
6789 listRewind(list
,&li
);
6790 while((ln
= listNext(&li
))) {
6791 robj
*ele
= ln
->value
;
6792 vector
[j
].obj
= ele
;
6793 vector
[j
].u
.score
= 0;
6794 vector
[j
].u
.cmpobj
= NULL
;
6802 if (sortval
->type
== REDIS_SET
) {
6805 zset
*zs
= sortval
->ptr
;
6809 di
= dictGetIterator(set
);
6810 while((setele
= dictNext(di
)) != NULL
) {
6811 vector
[j
].obj
= dictGetEntryKey(setele
);
6812 vector
[j
].u
.score
= 0;
6813 vector
[j
].u
.cmpobj
= NULL
;
6816 dictReleaseIterator(di
);
6818 redisAssert(j
== vectorlen
);
6820 /* Now it's time to load the right scores in the sorting vector */
6821 if (dontsort
== 0) {
6822 for (j
= 0; j
< vectorlen
; j
++) {
6825 /* lookup value to sort by */
6826 byval
= lookupKeyByPattern(c
->db
,sortby
,vector
[j
].obj
);
6827 if (!byval
) continue;
6829 /* use object itself to sort by */
6830 byval
= vector
[j
].obj
;
6834 if (sortby
) vector
[j
].u
.cmpobj
= getDecodedObject(byval
);
6836 if (byval
->encoding
== REDIS_ENCODING_RAW
) {
6837 vector
[j
].u
.score
= strtod(byval
->ptr
,NULL
);
6838 } else if (byval
->encoding
== REDIS_ENCODING_INT
) {
6839 /* Don't need to decode the object if it's
6840 * integer-encoded (the only encoding supported) so
6841 * far. We can just cast it */
6842 vector
[j
].u
.score
= (long)byval
->ptr
;
6844 redisAssert(1 != 1);
6848 /* when the object was retrieved using lookupKeyByPattern,
6849 * its refcount needs to be decreased. */
6851 decrRefCount(byval
);
6856 /* We are ready to sort the vector... perform a bit of sanity check
6857 * on the LIMIT option too. We'll use a partial version of quicksort. */
6858 start
= (limit_start
< 0) ? 0 : limit_start
;
6859 end
= (limit_count
< 0) ? vectorlen
-1 : start
+limit_count
-1;
6860 if (start
>= vectorlen
) {
6861 start
= vectorlen
-1;
6864 if (end
>= vectorlen
) end
= vectorlen
-1;
6866 if (dontsort
== 0) {
6867 server
.sort_desc
= desc
;
6868 server
.sort_alpha
= alpha
;
6869 server
.sort_bypattern
= sortby
? 1 : 0;
6870 if (sortby
&& (start
!= 0 || end
!= vectorlen
-1))
6871 pqsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
, start
,end
);
6873 qsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
);
6876 /* Send command output to the output buffer, performing the specified
6877 * GET/DEL/INCR/DECR operations if any. */
6878 outputlen
= getop
? getop
*(end
-start
+1) : end
-start
+1;
6879 if (storekey
== NULL
) {
6880 /* STORE option not specified, send the sorting result to client */
6881 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",outputlen
));
6882 for (j
= start
; j
<= end
; j
++) {
6886 if (!getop
) addReplyBulk(c
,vector
[j
].obj
);
6887 listRewind(operations
,&li
);
6888 while((ln
= listNext(&li
))) {
6889 redisSortOperation
*sop
= ln
->value
;
6890 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
6893 if (sop
->type
== REDIS_SORT_GET
) {
6895 addReply(c
,shared
.nullbulk
);
6897 addReplyBulk(c
,val
);
6901 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
6906 robj
*listObject
= createListObject();
6907 list
*listPtr
= (list
*) listObject
->ptr
;
6909 /* STORE option specified, set the sorting result as a List object */
6910 for (j
= start
; j
<= end
; j
++) {
6915 listAddNodeTail(listPtr
,vector
[j
].obj
);
6916 incrRefCount(vector
[j
].obj
);
6918 listRewind(operations
,&li
);
6919 while((ln
= listNext(&li
))) {
6920 redisSortOperation
*sop
= ln
->value
;
6921 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
6924 if (sop
->type
== REDIS_SORT_GET
) {
6926 listAddNodeTail(listPtr
,createStringObject("",0));
6928 /* We should do a incrRefCount on val because it is
6929 * added to the list, but also a decrRefCount because
6930 * it is returned by lookupKeyByPattern. This results
6931 * in doing nothing at all. */
6932 listAddNodeTail(listPtr
,val
);
6935 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
6939 if (dictReplace(c
->db
->dict
,storekey
,listObject
)) {
6940 incrRefCount(storekey
);
6942 /* Note: we add 1 because the DB is dirty anyway since even if the
6943 * SORT result is empty a new key is set and maybe the old content
6945 server
.dirty
+= 1+outputlen
;
6946 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",outputlen
));
6950 decrRefCount(sortval
);
6951 listRelease(operations
);
6952 for (j
= 0; j
< vectorlen
; j
++) {
6953 if (alpha
&& vector
[j
].u
.cmpobj
)
6954 decrRefCount(vector
[j
].u
.cmpobj
);
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, 1.50T, and so forth.
 *
 * 's' is the destination buffer, 'n' the number of bytes to describe.
 * Values below 1024 are printed as an integer followed by 'B'; larger
 * values are divided by the matching power of 1024 and printed with two
 * decimal digits followed by the unit letter.
 *
 * The longest possible output is 12 characters plus the nul terminator
 * (e.g. "16777216.00T"), so 's' must be at least 13 bytes long.
 *
 * Amounts of one terabyte or more used to fall through every branch,
 * leaving 's' untouched (and possibly uninitialized in the caller): they
 * are now reported in terabytes. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kb = 1024ULL;
    const unsigned long long mb = kb*1024;
    const unsigned long long gb = mb*1024;
    const unsigned long long tb = gb*1024;

    if (n < kb) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mb) {
        sprintf(s,"%.2fK",(double)n/kb);
    } else if (n < gb) {
        sprintf(s,"%.2fM",(double)n/mb);
    } else if (n < tb) {
        sprintf(s,"%.2fG",(double)n/gb);
    } else {
        /* Everything else: always write something into 's'. */
        sprintf(s,"%.2fT",(double)n/tb);
    }
}
6980 /* Create the string returned by the INFO command. This is decoupled
6981 * from the INFO command itself as we need to report the same information
6982 * on memory corruption problems. */
6983 static sds
genRedisInfoString(void) {
6985 time_t uptime
= time(NULL
)-server
.stat_starttime
;
6989 bytesToHuman(hmem
,zmalloc_used_memory());
6990 info
= sdscatprintf(sdsempty(),
6991 "redis_version:%s\r\n"
6993 "multiplexing_api:%s\r\n"
6994 "process_id:%ld\r\n"
6995 "uptime_in_seconds:%ld\r\n"
6996 "uptime_in_days:%ld\r\n"
6997 "connected_clients:%d\r\n"
6998 "connected_slaves:%d\r\n"
6999 "blocked_clients:%d\r\n"
7000 "used_memory:%zu\r\n"
7001 "used_memory_human:%s\r\n"
7002 "changes_since_last_save:%lld\r\n"
7003 "bgsave_in_progress:%d\r\n"
7004 "last_save_time:%ld\r\n"
7005 "bgrewriteaof_in_progress:%d\r\n"
7006 "total_connections_received:%lld\r\n"
7007 "total_commands_processed:%lld\r\n"
7008 "expired_keys:%lld\r\n"
7009 "hash_max_zipmap_entries:%ld\r\n"
7010 "hash_max_zipmap_value:%ld\r\n"
7011 "pubsub_channels:%ld\r\n"
7012 "pubsub_patterns:%u\r\n"
7016 (sizeof(long) == 8) ? "64" : "32",
7021 listLength(server
.clients
)-listLength(server
.slaves
),
7022 listLength(server
.slaves
),
7023 server
.blpop_blocked_clients
,
7024 zmalloc_used_memory(),
7027 server
.bgsavechildpid
!= -1,
7029 server
.bgrewritechildpid
!= -1,
7030 server
.stat_numconnections
,
7031 server
.stat_numcommands
,
7032 server
.stat_expiredkeys
,
7033 server
.hash_max_zipmap_entries
,
7034 server
.hash_max_zipmap_value
,
7035 dictSize(server
.pubsub_channels
),
7036 listLength(server
.pubsub_patterns
),
7037 server
.vm_enabled
!= 0,
7038 server
.masterhost
== NULL
? "master" : "slave"
7040 if (server
.masterhost
) {
7041 info
= sdscatprintf(info
,
7042 "master_host:%s\r\n"
7043 "master_port:%d\r\n"
7044 "master_link_status:%s\r\n"
7045 "master_last_io_seconds_ago:%d\r\n"
7048 (server
.replstate
== REDIS_REPL_CONNECTED
) ?
7050 server
.master
? ((int)(time(NULL
)-server
.master
->lastinteraction
)) : -1
7053 if (server
.vm_enabled
) {
7055 info
= sdscatprintf(info
,
7056 "vm_conf_max_memory:%llu\r\n"
7057 "vm_conf_page_size:%llu\r\n"
7058 "vm_conf_pages:%llu\r\n"
7059 "vm_stats_used_pages:%llu\r\n"
7060 "vm_stats_swapped_objects:%llu\r\n"
7061 "vm_stats_swappin_count:%llu\r\n"
7062 "vm_stats_swappout_count:%llu\r\n"
7063 "vm_stats_io_newjobs_len:%lu\r\n"
7064 "vm_stats_io_processing_len:%lu\r\n"
7065 "vm_stats_io_processed_len:%lu\r\n"
7066 "vm_stats_io_active_threads:%lu\r\n"
7067 "vm_stats_blocked_clients:%lu\r\n"
7068 ,(unsigned long long) server
.vm_max_memory
,
7069 (unsigned long long) server
.vm_page_size
,
7070 (unsigned long long) server
.vm_pages
,
7071 (unsigned long long) server
.vm_stats_used_pages
,
7072 (unsigned long long) server
.vm_stats_swapped_objects
,
7073 (unsigned long long) server
.vm_stats_swapins
,
7074 (unsigned long long) server
.vm_stats_swapouts
,
7075 (unsigned long) listLength(server
.io_newjobs
),
7076 (unsigned long) listLength(server
.io_processing
),
7077 (unsigned long) listLength(server
.io_processed
),
7078 (unsigned long) server
.io_active_threads
,
7079 (unsigned long) server
.vm_blocked_clients
7083 for (j
= 0; j
< server
.dbnum
; j
++) {
7084 long long keys
, vkeys
;
7086 keys
= dictSize(server
.db
[j
].dict
);
7087 vkeys
= dictSize(server
.db
[j
].expires
);
7088 if (keys
|| vkeys
) {
7089 info
= sdscatprintf(info
, "db%d:keys=%lld,expires=%lld\r\n",
7096 static void infoCommand(redisClient
*c
) {
7097 sds info
= genRedisInfoString();
7098 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",
7099 (unsigned long)sdslen(info
)));
7100 addReplySds(c
,info
);
7101 addReply(c
,shared
.crlf
);
7104 static void monitorCommand(redisClient
*c
) {
7105 /* ignore MONITOR if already slave or in monitor mode */
7106 if (c
->flags
& REDIS_SLAVE
) return;
7108 c
->flags
|= (REDIS_SLAVE
|REDIS_MONITOR
);
7110 listAddNodeTail(server
.monitors
,c
);
7111 addReply(c
,shared
.ok
);
7114 /* ================================= Expire ================================= */
7115 static int removeExpire(redisDb
*db
, robj
*key
) {
7116 if (dictDelete(db
->expires
,key
) == DICT_OK
) {
7123 static int setExpire(redisDb
*db
, robj
*key
, time_t when
) {
7124 if (dictAdd(db
->expires
,key
,(void*)when
) == DICT_ERR
) {
7132 /* Return the expire time of the specified key, or -1 if no expire
7133 * is associated with this key (i.e. the key is non volatile) */
7134 static time_t getExpire(redisDb
*db
, robj
*key
) {
7137 /* No expire? return ASAP */
7138 if (dictSize(db
->expires
) == 0 ||
7139 (de
= dictFind(db
->expires
,key
)) == NULL
) return -1;
7141 return (time_t) dictGetEntryVal(de
);
7144 static int expireIfNeeded(redisDb
*db
, robj
*key
) {
7148 /* No expire? return ASAP */
7149 if (dictSize(db
->expires
) == 0 ||
7150 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
7152 /* Lookup the expire */
7153 when
= (time_t) dictGetEntryVal(de
);
7154 if (time(NULL
) <= when
) return 0;
7156 /* Delete the key */
7157 dictDelete(db
->expires
,key
);
7158 server
.stat_expiredkeys
++;
7159 return dictDelete(db
->dict
,key
) == DICT_OK
;
7162 static int deleteIfVolatile(redisDb
*db
, robj
*key
) {
7165 /* No expire? return ASAP */
7166 if (dictSize(db
->expires
) == 0 ||
7167 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
7169 /* Delete the key */
7171 server
.stat_expiredkeys
++;
7172 dictDelete(db
->expires
,key
);
7173 return dictDelete(db
->dict
,key
) == DICT_OK
;
7176 static void expireGenericCommand(redisClient
*c
, robj
*key
, robj
*param
, long offset
) {
7180 if (getLongFromObjectOrReply(c
, param
, &seconds
, NULL
) != REDIS_OK
) return;
7184 de
= dictFind(c
->db
->dict
,key
);
7186 addReply(c
,shared
.czero
);
7190 if (deleteKey(c
->db
,key
)) server
.dirty
++;
7191 addReply(c
, shared
.cone
);
7194 time_t when
= time(NULL
)+seconds
;
7195 if (setExpire(c
->db
,key
,when
)) {
7196 addReply(c
,shared
.cone
);
7199 addReply(c
,shared
.czero
);
7205 static void expireCommand(redisClient
*c
) {
7206 expireGenericCommand(c
,c
->argv
[1],c
->argv
[2],0);
7209 static void expireatCommand(redisClient
*c
) {
7210 expireGenericCommand(c
,c
->argv
[1],c
->argv
[2],time(NULL
));
7213 static void ttlCommand(redisClient
*c
) {
7217 expire
= getExpire(c
->db
,c
->argv
[1]);
7219 ttl
= (int) (expire
-time(NULL
));
7220 if (ttl
< 0) ttl
= -1;
7222 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",ttl
));
7225 /* ================================ MULTI/EXEC ============================== */
7227 /* Client state initialization for MULTI/EXEC */
7228 static void initClientMultiState(redisClient
*c
) {
7229 c
->mstate
.commands
= NULL
;
7230 c
->mstate
.count
= 0;
7233 /* Release all the resources associated with MULTI/EXEC state */
7234 static void freeClientMultiState(redisClient
*c
) {
7237 for (j
= 0; j
< c
->mstate
.count
; j
++) {
7239 multiCmd
*mc
= c
->mstate
.commands
+j
;
7241 for (i
= 0; i
< mc
->argc
; i
++)
7242 decrRefCount(mc
->argv
[i
]);
7245 zfree(c
->mstate
.commands
);
7248 /* Add a new command into the MULTI commands queue */
7249 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
) {
7253 c
->mstate
.commands
= zrealloc(c
->mstate
.commands
,
7254 sizeof(multiCmd
)*(c
->mstate
.count
+1));
7255 mc
= c
->mstate
.commands
+c
->mstate
.count
;
7258 mc
->argv
= zmalloc(sizeof(robj
*)*c
->argc
);
7259 memcpy(mc
->argv
,c
->argv
,sizeof(robj
*)*c
->argc
);
7260 for (j
= 0; j
< c
->argc
; j
++)
7261 incrRefCount(mc
->argv
[j
]);
7265 static void multiCommand(redisClient
*c
) {
7266 c
->flags
|= REDIS_MULTI
;
7267 addReply(c
,shared
.ok
);
7270 static void discardCommand(redisClient
*c
) {
7271 if (!(c
->flags
& REDIS_MULTI
)) {
7272 addReplySds(c
,sdsnew("-ERR DISCARD without MULTI\r\n"));
7276 freeClientMultiState(c
);
7277 initClientMultiState(c
);
7278 c
->flags
&= (~REDIS_MULTI
);
7279 addReply(c
,shared
.ok
);
7282 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7283 * implementation for more information. */
7284 static void execCommandReplicateMulti(redisClient
*c
) {
7285 struct redisCommand
*cmd
;
7286 robj
*multistring
= createStringObject("MULTI",5);
7288 cmd
= lookupCommand("multi");
7289 if (server
.appendonly
)
7290 feedAppendOnlyFile(cmd
,c
->db
->id
,&multistring
,1);
7291 if (listLength(server
.slaves
))
7292 replicationFeedSlaves(server
.slaves
,c
->db
->id
,&multistring
,1);
7293 decrRefCount(multistring
);
7296 static void execCommand(redisClient
*c
) {
7301 if (!(c
->flags
& REDIS_MULTI
)) {
7302 addReplySds(c
,sdsnew("-ERR EXEC without MULTI\r\n"));
7306 /* Replicate a MULTI request now that we are sure the block is executed.
7307 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7308 * both the AOF and the replication link will have the same consistency
7309 * and atomicity guarantees. */
7310 execCommandReplicateMulti(c
);
7312 /* Exec all the queued commands */
7313 orig_argv
= c
->argv
;
7314 orig_argc
= c
->argc
;
7315 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->mstate
.count
));
7316 for (j
= 0; j
< c
->mstate
.count
; j
++) {
7317 c
->argc
= c
->mstate
.commands
[j
].argc
;
7318 c
->argv
= c
->mstate
.commands
[j
].argv
;
7319 call(c
,c
->mstate
.commands
[j
].cmd
);
7321 c
->argv
= orig_argv
;
7322 c
->argc
= orig_argc
;
7323 freeClientMultiState(c
);
7324 initClientMultiState(c
);
7325 c
->flags
&= (~REDIS_MULTI
);
7326 /* Make sure the EXEC command is always replicated / AOF, since we
7327 * always send the MULTI command (we can't know beforehand if the
7328 * next operations will contain at least a modification to the DB). */
7332 /* =========================== Blocking Operations ========================= */
7334 /* Currently Redis blocking operations support is limited to list POP ops,
7335 * so the current implementation is not fully generic, but it is also not
7336 * completely specific so it will not require a rewrite to support new
7337 * kind of blocking operations in the future.
7339 * Still it's important to note that list blocking operations can be already
7340 * used as a notification mechanism in order to implement other blocking
7341 * operations at application level, so there must be a very strong evidence
7342 * of usefulness and generality before new blocking operations are implemented.
7344 * This is how the current blocking POP works, we use BLPOP as example:
7345 * - If the user calls BLPOP and the key exists and contains a non empty list
7346 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7347 * if there is no need to block.
7348 * - If instead BLPOP is called and the key does not exist or the list is
7349 * empty we need to block. In order to do so we remove the notification for
7350 * new data to read in the client socket (so that we'll not serve new
7351 * requests if the blocking request is not served). Also we put the client
7352 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
7353 * blocking for these keys.
7354 * - If a PUSH operation against a key with blocked clients waiting is
7355 * performed, we serve the first in the list: basically instead of pushing
7356 * the new element inside the list we return it to the (first / oldest)
7357 * blocking client, unblock the client, and remove it from the list.
7359 * The above comment and the source code should be enough in order to understand
7360 * the implementation and modify / fix it later.
7363 /* Set a client in blocking mode for the specified key, with the specified
7365 static void blockForKeys(redisClient
*c
, robj
**keys
, int numkeys
, time_t timeout
) {
7370 c
->blockingkeys
= zmalloc(sizeof(robj
*)*numkeys
);
7371 c
->blockingkeysnum
= numkeys
;
7372 c
->blockingto
= timeout
;
7373 for (j
= 0; j
< numkeys
; j
++) {
7374 /* Add the key in the client structure, to map clients -> keys */
7375 c
->blockingkeys
[j
] = keys
[j
];
7376 incrRefCount(keys
[j
]);
7378 /* And in the other "side", to map keys -> clients */
7379 de
= dictFind(c
->db
->blockingkeys
,keys
[j
]);
7383 /* For every key we take a list of clients blocked for it */
7385 retval
= dictAdd(c
->db
->blockingkeys
,keys
[j
],l
);
7386 incrRefCount(keys
[j
]);
7387 assert(retval
== DICT_OK
);
7389 l
= dictGetEntryVal(de
);
7391 listAddNodeTail(l
,c
);
7393 /* Mark the client as a blocked client */
7394 c
->flags
|= REDIS_BLOCKED
;
7395 server
.blpop_blocked_clients
++;
7398 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7399 static void unblockClientWaitingData(redisClient
*c
) {
7404 assert(c
->blockingkeys
!= NULL
);
7405 /* The client may wait for multiple keys, so unblock it for every key. */
7406 for (j
= 0; j
< c
->blockingkeysnum
; j
++) {
7407 /* Remove this client from the list of clients waiting for this key. */
7408 de
= dictFind(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
7410 l
= dictGetEntryVal(de
);
7411 listDelNode(l
,listSearchKey(l
,c
));
7412 /* If the list is empty we need to remove it to avoid wasting memory */
7413 if (listLength(l
) == 0)
7414 dictDelete(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
7415 decrRefCount(c
->blockingkeys
[j
]);
7417 /* Cleanup the client structure */
7418 zfree(c
->blockingkeys
);
7419 c
->blockingkeys
= NULL
;
7420 c
->flags
&= (~REDIS_BLOCKED
);
7421 server
.blpop_blocked_clients
--;
7422 /* We want to process data if there is some command waiting
7423 * in the input buffer. Note that this is safe even if
7424 * unblockClientWaitingData() gets called from freeClient() because
7425 * freeClient() will be smart enough to call this function
7426 * *after* c->querybuf was set to NULL. */
7427 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0) processInputBuffer(c
);
7430 /* This should be called from any function PUSHing into lists.
7431 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7432 * 'ele' is the element pushed.
7434 * If the function returns 0 there was no client waiting for a list push
7437 * If the function returns 1 there was a client waiting for a list push
7438 * against this key, the element was passed to this client thus it's not
7439 * needed to actually add it to the list and the caller should return asap. */
7440 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
) {
7441 struct dictEntry
*de
;
7442 redisClient
*receiver
;
7446 de
= dictFind(c
->db
->blockingkeys
,key
);
7447 if (de
== NULL
) return 0;
7448 l
= dictGetEntryVal(de
);
7451 receiver
= ln
->value
;
7453 addReplySds(receiver
,sdsnew("*2\r\n"));
7454 addReplyBulk(receiver
,key
);
7455 addReplyBulk(receiver
,ele
);
7456 unblockClientWaitingData(receiver
);
7460 /* Blocking RPOP/LPOP */
7461 static void blockingPopGenericCommand(redisClient
*c
, int where
) {
7466 for (j
= 1; j
< c
->argc
-1; j
++) {
7467 o
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
7469 if (o
->type
!= REDIS_LIST
) {
7470 addReply(c
,shared
.wrongtypeerr
);
7473 list
*list
= o
->ptr
;
7474 if (listLength(list
) != 0) {
7475 /* If the list contains elements fall back to the usual
7476 * non-blocking POP operation */
7477 robj
*argv
[2], **orig_argv
;
7480 /* We need to alter the command arguments before calling
7481 * popGenericCommand() as the command takes a single key. */
7482 orig_argv
= c
->argv
;
7483 orig_argc
= c
->argc
;
7484 argv
[1] = c
->argv
[j
];
7488 /* Also the return value is different, we need to output
7489 * the multi bulk reply header and the key name. The
7490 * "real" command will add the last element (the value)
7491 * for us. If this sounds like a hack to you it's just
7492 * because it is... */
7493 addReplySds(c
,sdsnew("*2\r\n"));
7494 addReplyBulk(c
,argv
[1]);
7495 popGenericCommand(c
,where
);
7497 /* Fix the client structure with the original stuff */
7498 c
->argv
= orig_argv
;
7499 c
->argc
= orig_argc
;
7505 /* If the list is empty or the key does not exist we must block */
7506 timeout
= strtol(c
->argv
[c
->argc
-1]->ptr
,NULL
,10);
7507 if (timeout
> 0) timeout
+= time(NULL
);
7508 blockForKeys(c
,c
->argv
+1,c
->argc
-2,timeout
);
7511 static void blpopCommand(redisClient
*c
) {
7512 blockingPopGenericCommand(c
,REDIS_HEAD
);
7515 static void brpopCommand(redisClient
*c
) {
7516 blockingPopGenericCommand(c
,REDIS_TAIL
);
7519 /* =============================== Replication ============================= */
7521 static int syncWrite(int fd
, char *ptr
, ssize_t size
, int timeout
) {
7522 ssize_t nwritten
, ret
= size
;
7523 time_t start
= time(NULL
);
7527 if (aeWait(fd
,AE_WRITABLE
,1000) & AE_WRITABLE
) {
7528 nwritten
= write(fd
,ptr
,size
);
7529 if (nwritten
== -1) return -1;
7533 if ((time(NULL
)-start
) > timeout
) {
7541 static int syncRead(int fd
, char *ptr
, ssize_t size
, int timeout
) {
7542 ssize_t nread
, totread
= 0;
7543 time_t start
= time(NULL
);
7547 if (aeWait(fd
,AE_READABLE
,1000) & AE_READABLE
) {
7548 nread
= read(fd
,ptr
,size
);
7549 if (nread
== -1) return -1;
7554 if ((time(NULL
)-start
) > timeout
) {
7562 static int syncReadLine(int fd
, char *ptr
, ssize_t size
, int timeout
) {
7569 if (syncRead(fd
,&c
,1,timeout
) == -1) return -1;
7572 if (nread
&& *(ptr
-1) == '\r') *(ptr
-1) = '\0';
7583 static void syncCommand(redisClient
*c
) {
7584 /* ignore SYNC if already slave or in monitor mode */
7585 if (c
->flags
& REDIS_SLAVE
) return;
7587 /* SYNC can't be issued when the server has pending data to send to
7588 * the client about already issued commands. We need a fresh reply
7589 * buffer registering the differences between the BGSAVE and the current
7590 * dataset, so that we can copy to other slaves if needed. */
7591 if (listLength(c
->reply
) != 0) {
7592 addReplySds(c
,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7596 redisLog(REDIS_NOTICE
,"Slave ask for synchronization");
7597 /* Here we need to check if there is a background saving operation
7598 * in progress, or if it is required to start one */
7599 if (server
.bgsavechildpid
!= -1) {
7600 /* Ok a background save is in progress. Let's check if it is a good
7601 * one for replication, i.e. if there is another slave that is
7602 * registering differences since the server forked to save */
7607 listRewind(server
.slaves
,&li
);
7608 while((ln
= listNext(&li
))) {
7610 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) break;
7613 /* Perfect, the server is already registering differences for
7614 * another slave. Set the right state, and copy the buffer. */
7615 listRelease(c
->reply
);
7616 c
->reply
= listDup(slave
->reply
);
7617 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
7618 redisLog(REDIS_NOTICE
,"Waiting for end of BGSAVE for SYNC");
7620 /* No way, we need to wait for the next BGSAVE in order to
7621 * register differences */
7622 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
7623 redisLog(REDIS_NOTICE
,"Waiting for next BGSAVE for SYNC");
7626 /* Ok we don't have a BGSAVE in progress, let's start one */
7627 redisLog(REDIS_NOTICE
,"Starting BGSAVE for SYNC");
7628 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
7629 redisLog(REDIS_NOTICE
,"Replication failed, can't BGSAVE");
7630 addReplySds(c
,sdsnew("-ERR Unalbe to perform background save\r\n"));
7633 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
7636 c
->flags
|= REDIS_SLAVE
;
7638 listAddNodeTail(server
.slaves
,c
);
7642 static void sendBulkToSlave(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
7643 redisClient
*slave
= privdata
;
7645 REDIS_NOTUSED(mask
);
7646 char buf
[REDIS_IOBUF_LEN
];
7647 ssize_t nwritten
, buflen
;
7649 if (slave
->repldboff
== 0) {
7650 /* Write the bulk write count before transferring the DB. In theory here
7651 * we don't know how much room there is in the output buffer of the
7652 * socket, but in practice SO_SNDLOWAT (the minimum count for output
7653 * operations) will never be smaller than the few bytes we need. */
7656 bulkcount
= sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7658 if (write(fd
,bulkcount
,sdslen(bulkcount
)) != (signed)sdslen(bulkcount
))
7666 lseek(slave
->repldbfd
,slave
->repldboff
,SEEK_SET
);
7667 buflen
= read(slave
->repldbfd
,buf
,REDIS_IOBUF_LEN
);
7669 redisLog(REDIS_WARNING
,"Read error sending DB to slave: %s",
7670 (buflen
== 0) ? "premature EOF" : strerror(errno
));
7674 if ((nwritten
= write(fd
,buf
,buflen
)) == -1) {
7675 redisLog(REDIS_VERBOSE
,"Write error sending DB to slave: %s",
7680 slave
->repldboff
+= nwritten
;
7681 if (slave
->repldboff
== slave
->repldbsize
) {
7682 close(slave
->repldbfd
);
7683 slave
->repldbfd
= -1;
7684 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
7685 slave
->replstate
= REDIS_REPL_ONLINE
;
7686 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
,
7687 sendReplyToClient
, slave
) == AE_ERR
) {
7691 addReplySds(slave
,sdsempty());
7692 redisLog(REDIS_NOTICE
,"Synchronization with slave succeeded");
7696 /* This function is called at the end of every background saving.
7697 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7698 * otherwise REDIS_ERR is passed to the function.
7700 * The goal of this function is to handle slaves waiting for a successful
7701 * background saving in order to perform non-blocking synchronization. */
7702 static void updateSlavesWaitingBgsave(int bgsaveerr
) {
7704 int startbgsave
= 0;
7707 listRewind(server
.slaves
,&li
);
7708 while((ln
= listNext(&li
))) {
7709 redisClient
*slave
= ln
->value
;
7711 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) {
7713 slave
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
7714 } else if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) {
7715 struct redis_stat buf
;
7717 if (bgsaveerr
!= REDIS_OK
) {
7719 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE child returned an error");
7722 if ((slave
->repldbfd
= open(server
.dbfilename
,O_RDONLY
)) == -1 ||
7723 redis_fstat(slave
->repldbfd
,&buf
) == -1) {
7725 redisLog(REDIS_WARNING
,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno
));
7728 slave
->repldboff
= 0;
7729 slave
->repldbsize
= buf
.st_size
;
7730 slave
->replstate
= REDIS_REPL_SEND_BULK
;
7731 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
7732 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
, sendBulkToSlave
, slave
) == AE_ERR
) {
7739 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
7742 listRewind(server
.slaves
,&li
);
7743 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE failed");
7744 while((ln
= listNext(&li
))) {
7745 redisClient
*slave
= ln
->value
;
7747 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
)
7754 static int syncWithMaster(void) {
7755 char buf
[1024], tmpfile
[256], authcmd
[1024];
7757 int fd
= anetTcpConnect(NULL
,server
.masterhost
,server
.masterport
);
7758 int dfd
, maxtries
= 5;
7761 redisLog(REDIS_WARNING
,"Unable to connect to MASTER: %s",
7766 /* AUTH with the master if required. */
7767 if(server
.masterauth
) {
7768 snprintf(authcmd
, 1024, "AUTH %s\r\n", server
.masterauth
);
7769 if (syncWrite(fd
, authcmd
, strlen(server
.masterauth
)+7, 5) == -1) {
7771 redisLog(REDIS_WARNING
,"Unable to AUTH to MASTER: %s",
7775 /* Read the AUTH result. */
7776 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
7778 redisLog(REDIS_WARNING
,"I/O error reading auth result from MASTER: %s",
7782 if (buf
[0] != '+') {
7784 redisLog(REDIS_WARNING
,"Cannot AUTH to MASTER, is the masterauth password correct?");
7789 /* Issue the SYNC command */
7790 if (syncWrite(fd
,"SYNC \r\n",7,5) == -1) {
7792 redisLog(REDIS_WARNING
,"I/O error writing to MASTER: %s",
7796 /* Read the bulk write count */
7797 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
7799 redisLog(REDIS_WARNING
,"I/O error reading bulk count from MASTER: %s",
7803 if (buf
[0] != '$') {
7805 redisLog(REDIS_WARNING
,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7808 dumpsize
= strtol(buf
+1,NULL
,10);
7809 redisLog(REDIS_NOTICE
,"Receiving %ld bytes data dump from MASTER",dumpsize
);
7810 /* Read the bulk write data on a temp file */
7812 snprintf(tmpfile
,256,
7813 "temp-%d.%ld.rdb",(int)time(NULL
),(long int)getpid());
7814 dfd
= open(tmpfile
,O_CREAT
|O_WRONLY
|O_EXCL
,0644);
7815 if (dfd
!= -1) break;
7820 redisLog(REDIS_WARNING
,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno
));
7824 int nread
, nwritten
;
7826 nread
= read(fd
,buf
,(dumpsize
< 1024)?dumpsize
:1024);
7828 redisLog(REDIS_WARNING
,"I/O error trying to sync with MASTER: %s",
7834 nwritten
= write(dfd
,buf
,nread
);
7835 if (nwritten
== -1) {
7836 redisLog(REDIS_WARNING
,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno
));
7844 if (rename(tmpfile
,server
.dbfilename
) == -1) {
7845 redisLog(REDIS_WARNING
,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno
));
7851 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
7852 redisLog(REDIS_WARNING
,"Failed trying to load the MASTER synchronization DB from disk");
7856 server
.master
= createClient(fd
);
7857 server
.master
->flags
|= REDIS_MASTER
;
7858 server
.master
->authenticated
= 1;
7859 server
.replstate
= REDIS_REPL_CONNECTED
;
7863 static void slaveofCommand(redisClient
*c
) {
7864 if (!strcasecmp(c
->argv
[1]->ptr
,"no") &&
7865 !strcasecmp(c
->argv
[2]->ptr
,"one")) {
7866 if (server
.masterhost
) {
7867 sdsfree(server
.masterhost
);
7868 server
.masterhost
= NULL
;
7869 if (server
.master
) freeClient(server
.master
);
7870 server
.replstate
= REDIS_REPL_NONE
;
7871 redisLog(REDIS_NOTICE
,"MASTER MODE enabled (user request)");
7874 sdsfree(server
.masterhost
);
7875 server
.masterhost
= sdsdup(c
->argv
[1]->ptr
);
7876 server
.masterport
= atoi(c
->argv
[2]->ptr
);
7877 if (server
.master
) freeClient(server
.master
);
7878 server
.replstate
= REDIS_REPL_CONNECT
;
7879 redisLog(REDIS_NOTICE
,"SLAVE OF %s:%d enabled (user request)",
7880 server
.masterhost
, server
.masterport
);
7882 addReply(c
,shared
.ok
);
7885 /* ============================ Maxmemory directive ======================== */
7887 /* Try to free one object from the pre-allocated objects free list.
7888 * This is useful under low mem conditions as by default we take 1 million
7889 * free objects allocated. On success REDIS_OK is returned, otherwise
7891 static int tryFreeOneObjectFromFreelist(void) {
7894 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
7895 if (listLength(server
.objfreelist
)) {
7896 listNode
*head
= listFirst(server
.objfreelist
);
7897 o
= listNodeValue(head
);
7898 listDelNode(server
.objfreelist
,head
);
7899 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
7903 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
7908 /* This function gets called when 'maxmemory' is set on the config file to limit
7909 * the max memory used by the server, and we are out of memory.
7910 * This function will try to, in order:
7912 * - Free objects from the free list
7913 * - Try to remove keys with an EXPIRE set
7915 * If it is not possible to free enough memory to reach used-memory < maxmemory
7916 * the server will start refusing commands that will enlarge even more the
7919 static void freeMemoryIfNeeded(void) {
7920 while (server
.maxmemory
&& zmalloc_used_memory() > server
.maxmemory
) {
7921 int j
, k
, freed
= 0;
7923 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
7924 for (j
= 0; j
< server
.dbnum
; j
++) {
7926 robj
*minkey
= NULL
;
7927 struct dictEntry
*de
;
7929 if (dictSize(server
.db
[j
].expires
)) {
7931 /* From a sample of three keys drop the one nearest to
7932 * the natural expire */
7933 for (k
= 0; k
< 3; k
++) {
7936 de
= dictGetRandomKey(server
.db
[j
].expires
);
7937 t
= (time_t) dictGetEntryVal(de
);
7938 if (minttl
== -1 || t
< minttl
) {
7939 minkey
= dictGetEntryKey(de
);
7943 deleteKey(server
.db
+j
,minkey
);
7946 if (!freed
) return; /* nothing to free... */
7950 /* ============================== Append Only file ========================== */
7952 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
7953 sds buf
= sdsempty();
7959 /* The DB this command was targeting is not the same as the last command
7960 * we appended. Issuing a SELECT command is needed. */
7961 if (dictid
!= server
.appendseldb
) {
7964 snprintf(seldb
,sizeof(seldb
),"%d",dictid
);
7965 buf
= sdscatprintf(buf
,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7966 (unsigned long)strlen(seldb
),seldb
);
7967 server
.appendseldb
= dictid
;
7970 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7971 * EXPIREs into EXPIREATs calls */
7972 if (cmd
->proc
== expireCommand
) {
7975 tmpargv
[0] = createStringObject("EXPIREAT",8);
7976 tmpargv
[1] = argv
[1];
7977 incrRefCount(argv
[1]);
7978 when
= time(NULL
)+strtol(argv
[2]->ptr
,NULL
,10);
7979 tmpargv
[2] = createObject(REDIS_STRING
,
7980 sdscatprintf(sdsempty(),"%ld",when
));
7984 /* Append the actual command */
7985 buf
= sdscatprintf(buf
,"*%d\r\n",argc
);
7986 for (j
= 0; j
< argc
; j
++) {
7989 o
= getDecodedObject(o
);
7990 buf
= sdscatprintf(buf
,"$%lu\r\n",(unsigned long)sdslen(o
->ptr
));
7991 buf
= sdscatlen(buf
,o
->ptr
,sdslen(o
->ptr
));
7992 buf
= sdscatlen(buf
,"\r\n",2);
7996 /* Free the objects from the modified argv for EXPIREAT */
7997 if (cmd
->proc
== expireCommand
) {
7998 for (j
= 0; j
< 3; j
++)
7999 decrRefCount(argv
[j
]);
8002 /* We want to perform a single write. This should be guaranteed atomic
8003 * at least if the filesystem we are writing is a real physical one.
8004 * While this will save us against the server being killed I don't think
8005 * there is much to do about the whole server stopping for power problems
8007 nwritten
= write(server
.appendfd
,buf
,sdslen(buf
));
8008 if (nwritten
!= (signed)sdslen(buf
)) {
8009 /* Ooops, we are in troubles. The best thing to do for now is
8010 * to simply exit instead to give the illusion that everything is
8011 * working as expected. */
8012 if (nwritten
== -1) {
8013 redisLog(REDIS_WARNING
,"Exiting on error writing to the append-only file: %s",strerror(errno
));
8015 redisLog(REDIS_WARNING
,"Exiting on short write while writing to the append-only file: %s",strerror(errno
));
8019 /* If a background append only file rewriting is in progress we want to
8020 * accumulate the differences between the child DB and the current one
8021 * in a buffer, so that when the child process will do its work we
8022 * can append the differences to the new append only file. */
8023 if (server
.bgrewritechildpid
!= -1)
8024 server
.bgrewritebuf
= sdscatlen(server
.bgrewritebuf
,buf
,sdslen(buf
));
8028 if (server
.appendfsync
== APPENDFSYNC_ALWAYS
||
8029 (server
.appendfsync
== APPENDFSYNC_EVERYSEC
&&
8030 now
-server
.lastfsync
> 1))
8032 fsync(server
.appendfd
); /* Let's try to get this data on the disk */
8033 server
.lastfsync
= now
;
8037 /* In Redis commands are always executed in the context of a client, so in
8038 * order to load the append only file we need to create a fake client. */
8039 static struct redisClient
*createFakeClient(void) {
8040 struct redisClient
*c
= zmalloc(sizeof(*c
));
8044 c
->querybuf
= sdsempty();
8048 /* We set the fake client as a slave waiting for the synchronization
8049 * so that Redis will not try to send replies to this client. */
8050 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
8051 c
->reply
= listCreate();
8052 listSetFreeMethod(c
->reply
,decrRefCount
);
8053 listSetDupMethod(c
->reply
,dupClientReplyValue
);
8057 static void freeFakeClient(struct redisClient
*c
) {
8058 sdsfree(c
->querybuf
);
8059 listRelease(c
->reply
);
8063 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8064 * error (the append only file is zero-length) REDIS_ERR is returned. On
8065 * fatal error an error message is logged and the program exists. */
8066 int loadAppendOnlyFile(char *filename
) {
8067 struct redisClient
*fakeClient
;
8068 FILE *fp
= fopen(filename
,"r");
8069 struct redis_stat sb
;
8070 unsigned long long loadedkeys
= 0;
8072 if (redis_fstat(fileno(fp
),&sb
) != -1 && sb
.st_size
== 0)
8076 redisLog(REDIS_WARNING
,"Fatal error: can't open the append log file for reading: %s",strerror(errno
));
8080 fakeClient
= createFakeClient();
8087 struct redisCommand
*cmd
;
8089 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) {
8095 if (buf
[0] != '*') goto fmterr
;
8097 argv
= zmalloc(sizeof(robj
*)*argc
);
8098 for (j
= 0; j
< argc
; j
++) {
8099 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) goto readerr
;
8100 if (buf
[0] != '$') goto fmterr
;
8101 len
= strtol(buf
+1,NULL
,10);
8102 argsds
= sdsnewlen(NULL
,len
);
8103 if (len
&& fread(argsds
,len
,1,fp
) == 0) goto fmterr
;
8104 argv
[j
] = createObject(REDIS_STRING
,argsds
);
8105 if (fread(buf
,2,1,fp
) == 0) goto fmterr
; /* discard CRLF */
8108 /* Command lookup */
8109 cmd
= lookupCommand(argv
[0]->ptr
);
8111 redisLog(REDIS_WARNING
,"Unknown command '%s' reading the append only file", argv
[0]->ptr
);
8114 /* Try object encoding */
8115 if (cmd
->flags
& REDIS_CMD_BULK
)
8116 argv
[argc
-1] = tryObjectEncoding(argv
[argc
-1]);
8117 /* Run the command in the context of a fake client */
8118 fakeClient
->argc
= argc
;
8119 fakeClient
->argv
= argv
;
8120 cmd
->proc(fakeClient
);
8121 /* Discard the reply objects list from the fake client */
8122 while(listLength(fakeClient
->reply
))
8123 listDelNode(fakeClient
->reply
,listFirst(fakeClient
->reply
));
8124 /* Clean up, ready for the next command */
8125 for (j
= 0; j
< argc
; j
++) decrRefCount(argv
[j
]);
8127 /* Handle swapping while loading big datasets when VM is on */
8129 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
8130 while (zmalloc_used_memory() > server
.vm_max_memory
) {
8131 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
8136 freeFakeClient(fakeClient
);
8141 redisLog(REDIS_WARNING
,"Unexpected end of file reading the append only file");
8143 redisLog(REDIS_WARNING
,"Unrecoverable error reading the append only file: %s", strerror(errno
));
8147 redisLog(REDIS_WARNING
,"Bad file format reading the append only file");
8151 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8152 static int fwriteBulkObject(FILE *fp
, robj
*obj
) {
8156 /* Avoid the incr/decr ref count business if possible to help
8157 * copy-on-write (we are often in a child process when this function
8159 * Also makes sure that key objects don't get incrRefCount-ed when VM
8161 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
8162 obj
= getDecodedObject(obj
);
8165 snprintf(buf
,sizeof(buf
),"$%ld\r\n",(long)sdslen(obj
->ptr
));
8166 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) goto err
;
8167 if (sdslen(obj
->ptr
) && fwrite(obj
->ptr
,sdslen(obj
->ptr
),1,fp
) == 0)
8169 if (fwrite("\r\n",2,1,fp
) == 0) goto err
;
8170 if (decrrc
) decrRefCount(obj
);
8173 if (decrrc
) decrRefCount(obj
);
8177 /* Write binary-safe string into a file in the bulkformat
8178 * $<count>\r\n<payload>\r\n */
8179 static int fwriteBulkString(FILE *fp
, char *s
, unsigned long len
) {
8182 snprintf(buf
,sizeof(buf
),"$%ld\r\n",(unsigned long)len
);
8183 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) return 0;
8184 if (len
&& fwrite(s
,len
,1,fp
) == 0) return 0;
8185 if (fwrite("\r\n",2,1,fp
) == 0) return 0;
8189 /* Write a double value in bulk format $<count>\r\n<payload>\r\n */
8190 static int fwriteBulkDouble(FILE *fp
, double d
) {
8191 char buf
[128], dbuf
[128];
8193 snprintf(dbuf
,sizeof(dbuf
),"%.17g\r\n",d
);
8194 snprintf(buf
,sizeof(buf
),"$%lu\r\n",(unsigned long)strlen(dbuf
)-2);
8195 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) return 0;
8196 if (fwrite(dbuf
,strlen(dbuf
),1,fp
) == 0) return 0;
8200 /* Write a long value in bulk format $<count>\r\n<payload>\r\n */
8201 static int fwriteBulkLong(FILE *fp
, long l
) {
8202 char buf
[128], lbuf
[128];
8204 snprintf(lbuf
,sizeof(lbuf
),"%ld\r\n",l
);
8205 snprintf(buf
,sizeof(buf
),"$%lu\r\n",(unsigned long)strlen(lbuf
)-2);
8206 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) return 0;
8207 if (fwrite(lbuf
,strlen(lbuf
),1,fp
) == 0) return 0;
8211 /* Write a sequence of commands able to fully rebuild the dataset into
8212 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8213 static int rewriteAppendOnlyFile(char *filename
) {
8214 dictIterator
*di
= NULL
;
8219 time_t now
= time(NULL
);
8221 /* Note that we have to use a different temp name here compared to the
8222 * one used by rewriteAppendOnlyFileBackground() function. */
8223 snprintf(tmpfile
,256,"temp-rewriteaof-%d.aof", (int) getpid());
8224 fp
= fopen(tmpfile
,"w");
8226 redisLog(REDIS_WARNING
, "Failed rewriting the append only file: %s", strerror(errno
));
8229 for (j
= 0; j
< server
.dbnum
; j
++) {
8230 char selectcmd
[] = "*2\r\n$6\r\nSELECT\r\n";
8231 redisDb
*db
= server
.db
+j
;
8233 if (dictSize(d
) == 0) continue;
8234 di
= dictGetIterator(d
);
8240 /* SELECT the new DB */
8241 if (fwrite(selectcmd
,sizeof(selectcmd
)-1,1,fp
) == 0) goto werr
;
8242 if (fwriteBulkLong(fp
,j
) == 0) goto werr
;
8244 /* Iterate this DB writing every entry */
8245 while((de
= dictNext(di
)) != NULL
) {
8250 key
= dictGetEntryKey(de
);
8251 /* If the value for this key is swapped, load a preview in memory.
8252 * We use a "swapped" flag to remember if we need to free the
8253 * value object instead to just increment the ref count anyway
8254 * in order to avoid copy-on-write of pages if we are forked() */
8255 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
8256 key
->storage
== REDIS_VM_SWAPPING
) {
8257 o
= dictGetEntryVal(de
);
8260 o
= vmPreviewObject(key
);
8263 expiretime
= getExpire(db
,key
);
8265 /* Save the key and associated value */
8266 if (o
->type
== REDIS_STRING
) {
8267 /* Emit a SET command */
8268 char cmd
[]="*3\r\n$3\r\nSET\r\n";
8269 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8271 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8272 if (fwriteBulkObject(fp
,o
) == 0) goto werr
;
8273 } else if (o
->type
== REDIS_LIST
) {
8274 /* Emit the RPUSHes needed to rebuild the list */
8275 list
*list
= o
->ptr
;
8279 listRewind(list
,&li
);
8280 while((ln
= listNext(&li
))) {
8281 char cmd
[]="*3\r\n$5\r\nRPUSH\r\n";
8282 robj
*eleobj
= listNodeValue(ln
);
8284 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8285 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8286 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
8288 } else if (o
->type
== REDIS_SET
) {
8289 /* Emit the SADDs needed to rebuild the set */
8291 dictIterator
*di
= dictGetIterator(set
);
8294 while((de
= dictNext(di
)) != NULL
) {
8295 char cmd
[]="*3\r\n$4\r\nSADD\r\n";
8296 robj
*eleobj
= dictGetEntryKey(de
);
8298 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8299 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8300 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
8302 dictReleaseIterator(di
);
8303 } else if (o
->type
== REDIS_ZSET
) {
8304 /* Emit the ZADDs needed to rebuild the sorted set */
8306 dictIterator
*di
= dictGetIterator(zs
->dict
);
8309 while((de
= dictNext(di
)) != NULL
) {
8310 char cmd
[]="*4\r\n$4\r\nZADD\r\n";
8311 robj
*eleobj
= dictGetEntryKey(de
);
8312 double *score
= dictGetEntryVal(de
);
8314 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8315 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8316 if (fwriteBulkDouble(fp
,*score
) == 0) goto werr
;
8317 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
8319 dictReleaseIterator(di
);
8320 } else if (o
->type
== REDIS_HASH
) {
8321 char cmd
[]="*4\r\n$4\r\nHSET\r\n";
8323 /* Emit the HSETs needed to rebuild the hash */
8324 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
8325 unsigned char *p
= zipmapRewind(o
->ptr
);
8326 unsigned char *field
, *val
;
8327 unsigned int flen
, vlen
;
8329 while((p
= zipmapNext(p
,&field
,&flen
,&val
,&vlen
)) != NULL
) {
8330 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8331 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8332 if (fwriteBulkString(fp
,(char*)field
,flen
) == -1)
8334 if (fwriteBulkString(fp
,(char*)val
,vlen
) == -1)
8338 dictIterator
*di
= dictGetIterator(o
->ptr
);
8341 while((de
= dictNext(di
)) != NULL
) {
8342 robj
*field
= dictGetEntryKey(de
);
8343 robj
*val
= dictGetEntryVal(de
);
8345 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8346 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8347 if (fwriteBulkObject(fp
,field
) == -1) return -1;
8348 if (fwriteBulkObject(fp
,val
) == -1) return -1;
8350 dictReleaseIterator(di
);
8353 redisPanic("Unknown object type");
8355 /* Save the expire time */
8356 if (expiretime
!= -1) {
8357 char cmd
[]="*3\r\n$8\r\nEXPIREAT\r\n";
8358 /* If this key is already expired skip it */
8359 if (expiretime
< now
) continue;
8360 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8361 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8362 if (fwriteBulkLong(fp
,expiretime
) == 0) goto werr
;
8364 if (swapped
) decrRefCount(o
);
8366 dictReleaseIterator(di
);
8369 /* Make sure data will not remain on the OS's output buffers */
8374 /* Use RENAME to make sure the DB file is changed atomically only
8375 * if the generate DB file is ok. */
8376 if (rename(tmpfile
,filename
) == -1) {
8377 redisLog(REDIS_WARNING
,"Error moving temp append only file on the final destination: %s", strerror(errno
));
8381 redisLog(REDIS_NOTICE
,"SYNC append only file rewrite performed");
8387 redisLog(REDIS_WARNING
,"Write error writing append only file on disk: %s", strerror(errno
));
8388 if (di
) dictReleaseIterator(di
);
8392 /* This is how rewriting of the append only file in background works:
8394 * 1) The user calls BGREWRITEAOF
8395 * 2) Redis calls this function, that forks():
8396 * 2a) the child rewrite the append only file in a temp file.
8397 * 2b) the parent accumulates differences in server.bgrewritebuf.
8398 * 3) When the child finishes '2a' it exits.
8399 * 4) The parent will trap the exit code, if it's OK, will append the
8400 * data accumulated into server.bgrewritebuf into the temp file, and
8401 * finally will rename(2) the temp file in the actual file name.
8402 * Then the new file is reopened as the new append only file. Profit!
8404 static int rewriteAppendOnlyFileBackground(void) {
8407 if (server
.bgrewritechildpid
!= -1) return REDIS_ERR
;
8408 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
8409 if ((childpid
= fork()) == 0) {
8413 if (server
.vm_enabled
) vmReopenSwapFile();
8415 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8416 if (rewriteAppendOnlyFile(tmpfile
) == REDIS_OK
) {
8423 if (childpid
== -1) {
8424 redisLog(REDIS_WARNING
,
8425 "Can't rewrite append only file in background: fork: %s",
8429 redisLog(REDIS_NOTICE
,
8430 "Background append only file rewriting started by pid %d",childpid
);
8431 server
.bgrewritechildpid
= childpid
;
8432 updateDictResizePolicy();
8433 /* We set appendseldb to -1 in order to force the next call to the
8434 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8435 * accumulated by the parent into server.bgrewritebuf will start
8436 * with a SELECT statement and it will be safe to merge. */
8437 server
.appendseldb
= -1;
8440 return REDIS_OK
; /* unreached */
8443 static void bgrewriteaofCommand(redisClient
*c
) {
8444 if (server
.bgrewritechildpid
!= -1) {
8445 addReplySds(c
,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8448 if (rewriteAppendOnlyFileBackground() == REDIS_OK
) {
8449 char *status
= "+Background append only file rewriting started\r\n";
8450 addReplySds(c
,sdsnew(status
));
8452 addReply(c
,shared
.err
);
8456 static void aofRemoveTempFile(pid_t childpid
) {
8459 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) childpid
);
8463 /* Virtual Memory is composed mainly of two subsystems:
8464 * - Blocking Virtual Memory
8465 * - Threaded Virtual Memory I/O
8466 * The two parts are not fully decoupled, but functions are split among two
8467 * different sections of the source code (delimited by comments) in order to
8468 * make more clear what functionality is about the blocking VM and what about
8469 * the threaded (not blocking) VM.
8473 * Redis VM is a blocking VM (one that blocks reading swapped values from
8474 * disk into memory when a value swapped out is needed in memory) that is made
8475 * unblocking by trying to examine the command argument vector in order to
8476 * load in background values that will likely be needed in order to exec
8477 * the command. The command is executed only once all the relevant keys
8478 * are loaded into memory.
8480 * This basically is almost as simple as a blocking VM, but almost as parallel
8481 * as a fully non-blocking VM.
8484 /* =================== Virtual Memory - Blocking Side ====================== */
8486 /* substitute the first occurrence of '%p' with the process pid in the
8487 * swap file name. */
8488 static void expandVmSwapFilename(void) {
8489 char *p
= strstr(server
.vm_swap_file
,"%p");
8495 new = sdscat(new,server
.vm_swap_file
);
8496 new = sdscatprintf(new,"%ld",(long) getpid());
8497 new = sdscat(new,p
+2);
8498 zfree(server
.vm_swap_file
);
8499 server
.vm_swap_file
= new;
8502 static void vmInit(void) {
8507 if (server
.vm_max_threads
!= 0)
8508 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8510 expandVmSwapFilename();
8511 redisLog(REDIS_NOTICE
,"Using '%s' as swap file",server
.vm_swap_file
);
8512 if ((server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b")) == NULL
) {
8513 server
.vm_fp
= fopen(server
.vm_swap_file
,"w+b");
8515 if (server
.vm_fp
== NULL
) {
8516 redisLog(REDIS_WARNING
,
8517 "Impossible to open the swap file: %s. Exiting.",
8521 server
.vm_fd
= fileno(server
.vm_fp
);
8522 server
.vm_next_page
= 0;
8523 server
.vm_near_pages
= 0;
8524 server
.vm_stats_used_pages
= 0;
8525 server
.vm_stats_swapped_objects
= 0;
8526 server
.vm_stats_swapouts
= 0;
8527 server
.vm_stats_swapins
= 0;
8528 totsize
= server
.vm_pages
*server
.vm_page_size
;
8529 redisLog(REDIS_NOTICE
,"Allocating %lld bytes of swap file",totsize
);
8530 if (ftruncate(server
.vm_fd
,totsize
) == -1) {
8531 redisLog(REDIS_WARNING
,"Can't ftruncate swap file: %s. Exiting.",
8535 redisLog(REDIS_NOTICE
,"Swap file allocated with success");
8537 server
.vm_bitmap
= zmalloc((server
.vm_pages
+7)/8);
8538 redisLog(REDIS_VERBOSE
,"Allocated %lld bytes page table for %lld pages",
8539 (long long) (server
.vm_pages
+7)/8, server
.vm_pages
);
8540 memset(server
.vm_bitmap
,0,(server
.vm_pages
+7)/8);
8542 /* Initialize threaded I/O (used by Virtual Memory) */
8543 server
.io_newjobs
= listCreate();
8544 server
.io_processing
= listCreate();
8545 server
.io_processed
= listCreate();
8546 server
.io_ready_clients
= listCreate();
8547 pthread_mutex_init(&server
.io_mutex
,NULL
);
8548 pthread_mutex_init(&server
.obj_freelist_mutex
,NULL
);
8549 pthread_mutex_init(&server
.io_swapfile_mutex
,NULL
);
8550 server
.io_active_threads
= 0;
8551 if (pipe(pipefds
) == -1) {
8552 redisLog(REDIS_WARNING
,"Unable to intialized VM: pipe(2): %s. Exiting."
8556 server
.io_ready_pipe_read
= pipefds
[0];
8557 server
.io_ready_pipe_write
= pipefds
[1];
8558 redisAssert(anetNonBlock(NULL
,server
.io_ready_pipe_read
) != ANET_ERR
);
8559 /* LZF requires a lot of stack */
8560 pthread_attr_init(&server
.io_threads_attr
);
8561 pthread_attr_getstacksize(&server
.io_threads_attr
, &stacksize
);
8562 while (stacksize
< REDIS_THREAD_STACK_SIZE
) stacksize
*= 2;
8563 pthread_attr_setstacksize(&server
.io_threads_attr
, stacksize
);
8564 /* Listen for events in the threaded I/O pipe */
8565 if (aeCreateFileEvent(server
.el
, server
.io_ready_pipe_read
, AE_READABLE
,
8566 vmThreadedIOCompletedJob
, NULL
) == AE_ERR
)
8567 oom("creating file event");
8570 /* Mark the page as used */
8571 static void vmMarkPageUsed(off_t page
) {
8572 off_t byte
= page
/8;
8574 redisAssert(vmFreePage(page
) == 1);
8575 server
.vm_bitmap
[byte
] |= 1<<bit
;
8578 /* Mark N contiguous pages as used, with 'page' being the first. */
8579 static void vmMarkPagesUsed(off_t page
, off_t count
) {
8582 for (j
= 0; j
< count
; j
++)
8583 vmMarkPageUsed(page
+j
);
8584 server
.vm_stats_used_pages
+= count
;
8585 redisLog(REDIS_DEBUG
,"Mark USED pages: %lld pages at %lld\n",
8586 (long long)count
, (long long)page
);
8589 /* Mark the page as free */
8590 static void vmMarkPageFree(off_t page
) {
8591 off_t byte
= page
/8;
8593 redisAssert(vmFreePage(page
) == 0);
8594 server
.vm_bitmap
[byte
] &= ~(1<<bit
);
8597 /* Mark N contiguous pages as free, with 'page' being the first. */
8598 static void vmMarkPagesFree(off_t page
, off_t count
) {
8601 for (j
= 0; j
< count
; j
++)
8602 vmMarkPageFree(page
+j
);
8603 server
.vm_stats_used_pages
-= count
;
8604 redisLog(REDIS_DEBUG
,"Mark FREE pages: %lld pages at %lld\n",
8605 (long long)count
, (long long)page
);
8608 /* Test if the page is free */
8609 static int vmFreePage(off_t page
) {
8610 off_t byte
= page
/8;
8612 return (server
.vm_bitmap
[byte
] & (1<<bit
)) == 0;
8615 /* Find N contiguous free pages storing the first page of the cluster in *first.
8616 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8617 * REDIS_ERR is returned.
8619 * This function uses a simple algorithm: we try to allocate
8620 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8621 * again from the start of the swap file searching for free spaces.
8623 * If it looks pretty clear that there are no free pages near our offset
8624 * we try to find less populated places doing a forward jump of
8625 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8626 * without hurry, and then we jump again and so forth...
8628 * This function can be improved using a free list to avoid to guess
8629 * too much, since we could collect data about freed pages.
8631 * note: I implemented this function just after watching an episode of
8632 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8634 static int vmFindContiguousPages(off_t
*first
, off_t n
) {
8635 off_t base
, offset
= 0, since_jump
= 0, numfree
= 0;
8637 if (server
.vm_near_pages
== REDIS_VM_MAX_NEAR_PAGES
) {
8638 server
.vm_near_pages
= 0;
8639 server
.vm_next_page
= 0;
8641 server
.vm_near_pages
++; /* Yet another try for pages near to the old ones */
8642 base
= server
.vm_next_page
;
8644 while(offset
< server
.vm_pages
) {
8645 off_t
this = base
+offset
;
8647 /* If we overflow, restart from page zero */
8648 if (this >= server
.vm_pages
) {
8649 this -= server
.vm_pages
;
8651 /* Just overflowed, what we found on tail is no longer
8652 * interesting, as it's no longer contiguous. */
8656 if (vmFreePage(this)) {
8657 /* This is a free page */
8659 /* Already got N free pages? Return to the caller, with success */
8661 *first
= this-(n
-1);
8662 server
.vm_next_page
= this+1;
8663 redisLog(REDIS_DEBUG
, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n
, (long long) *first
);
8667 /* The current one is not a free page */
8671 /* Fast-forward if the current page is not free and we already
8672 * searched enough near this place. */
8674 if (!numfree
&& since_jump
>= REDIS_VM_MAX_RANDOM_JUMP
/4) {
8675 offset
+= random() % REDIS_VM_MAX_RANDOM_JUMP
;
8677 /* Note that even if we rewind after the jump, we don't need
8678 * to make sure numfree is set to zero as we only jump *if* it
8679 * is set to zero. */
8681 /* Otherwise just check the next page */
8688 /* Write the specified object at the specified page of the swap file */
8689 static int vmWriteObjectOnSwap(robj
*o
, off_t page
) {
8690 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
8691 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
8692 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
8693 redisLog(REDIS_WARNING
,
8694 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8698 rdbSaveObject(server
.vm_fp
,o
);
8699 fflush(server
.vm_fp
);
8700 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
8704 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8705 * needed to later retrieve the object into the key object.
8706 * If we can't find enough contiguous empty pages to swap the object on disk
8707 * REDIS_ERR is returned. */
8708 static int vmSwapObjectBlocking(robj
*key
, robj
*val
) {
8709 off_t pages
= rdbSavedObjectPages(val
,NULL
);
8712 assert(key
->storage
== REDIS_VM_MEMORY
);
8713 assert(key
->refcount
== 1);
8714 if (vmFindContiguousPages(&page
,pages
) == REDIS_ERR
) return REDIS_ERR
;
8715 if (vmWriteObjectOnSwap(val
,page
) == REDIS_ERR
) return REDIS_ERR
;
8716 key
->vm
.page
= page
;
8717 key
->vm
.usedpages
= pages
;
8718 key
->storage
= REDIS_VM_SWAPPED
;
8719 key
->vtype
= val
->type
;
8720 decrRefCount(val
); /* Deallocate the object from memory. */
8721 vmMarkPagesUsed(page
,pages
);
8722 redisLog(REDIS_DEBUG
,"VM: object %s swapped out at %lld (%lld pages)",
8723 (unsigned char*) key
->ptr
,
8724 (unsigned long long) page
, (unsigned long long) pages
);
8725 server
.vm_stats_swapped_objects
++;
8726 server
.vm_stats_swapouts
++;
8730 static robj
*vmReadObjectFromSwap(off_t page
, int type
) {
8733 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
8734 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
8735 redisLog(REDIS_WARNING
,
8736 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8740 o
= rdbLoadObject(type
,server
.vm_fp
);
8742 redisLog(REDIS_WARNING
, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno
));
8745 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
8749 /* Load the value object relative to the 'key' object from swap to memory.
8750 * The newly allocated object is returned.
8752 * If preview is true the unserialized object is returned to the caller but
8753 * no changes are made to the key object, nor the pages are marked as freed */
8754 static robj
*vmGenericLoadObject(robj
*key
, int preview
) {
8757 redisAssert(key
->storage
== REDIS_VM_SWAPPED
|| key
->storage
== REDIS_VM_LOADING
);
8758 val
= vmReadObjectFromSwap(key
->vm
.page
,key
->vtype
);
8760 key
->storage
= REDIS_VM_MEMORY
;
8761 key
->vm
.atime
= server
.unixtime
;
8762 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
8763 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk",
8764 (unsigned char*) key
->ptr
);
8765 server
.vm_stats_swapped_objects
--;
8767 redisLog(REDIS_DEBUG
, "VM: object %s previewed from disk",
8768 (unsigned char*) key
->ptr
);
8770 server
.vm_stats_swapins
++;
8774 /* Plain object loading, from swap to memory */
8775 static robj
*vmLoadObject(robj
*key
) {
8776 /* If we are loading the object in background, stop it, we
8777 * need to load this object synchronously ASAP. */
8778 if (key
->storage
== REDIS_VM_LOADING
)
8779 vmCancelThreadedIOJob(key
);
8780 return vmGenericLoadObject(key
,0);
8783 /* Just load the value on disk, without to modify the key.
8784 * This is useful when we want to perform some operation on the value
8785 * without to really bring it from swap to memory, like while saving the
8786 * dataset or rewriting the append only log. */
8787 static robj
*vmPreviewObject(robj
*key
) {
8788 return vmGenericLoadObject(key
,1);
8791 /* How good a candidate is this object for swapping?
8792 * The better candidate it is, the greater the returned value.
8794 * Currently we try to perform a fast estimation of the object size in
8795 * memory, and combine it with aging information.
8797 * Basically swappability = idle-time * log(estimated size)
8799 * Bigger objects are preferred over smaller objects, but not
8800 * proportionally, this is why we use the logarithm. This algorithm is
8801 * just a first try and will probably be tuned later. */
8802 static double computeObjectSwappability(robj
*o
) {
8803 time_t age
= server
.unixtime
- o
->vm
.atime
;
8807 struct dictEntry
*de
;
8810 if (age
<= 0) return 0;
8813 if (o
->encoding
!= REDIS_ENCODING_RAW
) {
8816 asize
= sdslen(o
->ptr
)+sizeof(*o
)+sizeof(long)*2;
8821 listNode
*ln
= listFirst(l
);
8823 asize
= sizeof(list
);
8825 robj
*ele
= ln
->value
;
8828 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8829 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8831 asize
+= (sizeof(listNode
)+elesize
)*listLength(l
);
8836 z
= (o
->type
== REDIS_ZSET
);
8837 d
= z
? ((zset
*)o
->ptr
)->dict
: o
->ptr
;
8839 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
8840 if (z
) asize
+= sizeof(zset
)-sizeof(dict
);
8845 de
= dictGetRandomKey(d
);
8846 ele
= dictGetEntryKey(de
);
8847 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8848 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8850 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
8851 if (z
) asize
+= sizeof(zskiplistNode
)*dictSize(d
);
8855 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
8856 unsigned char *p
= zipmapRewind((unsigned char*)o
->ptr
);
8857 unsigned int len
= zipmapLen((unsigned char*)o
->ptr
);
8858 unsigned int klen
, vlen
;
8859 unsigned char *key
, *val
;
8861 if ((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) == NULL
) {
8865 asize
= len
*(klen
+vlen
+3);
8866 } else if (o
->encoding
== REDIS_ENCODING_HT
) {
8868 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
8873 de
= dictGetRandomKey(d
);
8874 ele
= dictGetEntryKey(de
);
8875 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8876 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8878 ele
= dictGetEntryVal(de
);
8879 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8880 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8882 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
8887 return (double)age
*log(1+asize
);
8890 /* Try to swap an object that's a good candidate for swapping.
8891 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8892 * to swap any object at all.
8894 * If 'usethreads' is true, Redis will try to swap the object in background
8895 * using I/O threads. */
8896 static int vmSwapOneObject(int usethreads
) {
8898 struct dictEntry
*best
= NULL
;
8899 double best_swappability
= 0;
8900 redisDb
*best_db
= NULL
;
8903 for (j
= 0; j
< server
.dbnum
; j
++) {
8904 redisDb
*db
= server
.db
+j
;
8905 /* Why maxtries is set to 100?
8906 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8907 * are swappable objects */
8910 if (dictSize(db
->dict
) == 0) continue;
8911 for (i
= 0; i
< 5; i
++) {
8913 double swappability
;
8915 if (maxtries
) maxtries
--;
8916 de
= dictGetRandomKey(db
->dict
);
8917 key
= dictGetEntryKey(de
);
8918 val
= dictGetEntryVal(de
);
8919 /* Only swap objects that are currently in memory.
8921 * Also don't swap shared objects if threaded VM is on, as we
8922 * try to ensure that the main thread does not touch the
8923 * object while the I/O thread is using it, but we can't
8924 * control other keys without adding additional mutex. */
8925 if (key
->storage
!= REDIS_VM_MEMORY
||
8926 (server
.vm_max_threads
!= 0 && val
->refcount
!= 1)) {
8927 if (maxtries
) i
--; /* don't count this try */
8930 swappability
= computeObjectSwappability(val
);
8931 if (!best
|| swappability
> best_swappability
) {
8933 best_swappability
= swappability
;
8938 if (best
== NULL
) return REDIS_ERR
;
8939 key
= dictGetEntryKey(best
);
8940 val
= dictGetEntryVal(best
);
8942 redisLog(REDIS_DEBUG
,"Key with best swappability: %s, %f",
8943 key
->ptr
, best_swappability
);
8945 /* Unshare the key if needed */
8946 if (key
->refcount
> 1) {
8947 robj
*newkey
= dupStringObject(key
);
8949 key
= dictGetEntryKey(best
) = newkey
;
8953 vmSwapObjectThreaded(key
,val
,best_db
);
8956 if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
8957 dictGetEntryVal(best
) = NULL
;
/* Try to swap out one object without using the I/O threads: this is
 * vmSwapOneObject() called with 'usethreads' set to 0, and its return
 * value is passed through unchanged. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
/* Try to swap out one object using the I/O threads: this is
 * vmSwapOneObject() called with 'usethreads' set to 1, and its return
 * value is passed through unchanged. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8973 /* Return true if it's safe to swap out objects in a given moment.
8974 * Basically we don't want to swap objects out while there is a BGSAVE
8975 * or a BGAEOREWRITE running in backgroud. */
8976 static int vmCanSwapOut(void) {
8977 return (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1);
8980 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8981 * and was deleted. Otherwise 0 is returned. */
8982 static int deleteIfSwapped(redisDb
*db
, robj
*key
) {
8986 if ((de
= dictFind(db
->dict
,key
)) == NULL
) return 0;
8987 foundkey
= dictGetEntryKey(de
);
8988 if (foundkey
->storage
== REDIS_VM_MEMORY
) return 0;
8993 /* =================== Virtual Memory - Threaded I/O ======================= */
8995 static void freeIOJob(iojob
*j
) {
8996 if ((j
->type
== REDIS_IOJOB_PREPARE_SWAP
||
8997 j
->type
== REDIS_IOJOB_DO_SWAP
||
8998 j
->type
== REDIS_IOJOB_LOAD
) && j
->val
!= NULL
)
8999 decrRefCount(j
->val
);
9000 /* We don't decrRefCount the j->key field as we didn't increment
9001 * the count when creating IO Jobs. This is because the key field here is
9002 * just used as an identifier and if a key is removed the Job should
9003 * never be touched again. */
9007 /* Every time a thread finished a Job, it writes a byte into the write side
9008 * of an unix pipe in order to "awake" the main thread, and this function
9010 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
,
9014 int retval
, processed
= 0, toprocess
= -1, trytoswap
= 1;
9016 REDIS_NOTUSED(mask
);
9017 REDIS_NOTUSED(privdata
);
9019 /* For every byte we read in the read side of the pipe, there is one
9020 * I/O job completed to process. */
9021 while((retval
= read(fd
,buf
,1)) == 1) {
9025 struct dictEntry
*de
;
9027 redisLog(REDIS_DEBUG
,"Processing I/O completed job");
9029 /* Get the processed element (the oldest one) */
9031 assert(listLength(server
.io_processed
) != 0);
9032 if (toprocess
== -1) {
9033 toprocess
= (listLength(server
.io_processed
)*REDIS_MAX_COMPLETED_JOBS_PROCESSED
)/100;
9034 if (toprocess
<= 0) toprocess
= 1;
9036 ln
= listFirst(server
.io_processed
);
9038 listDelNode(server
.io_processed
,ln
);
9040 /* If this job is marked as canceled, just ignore it */
9045 /* Post process it in the main thread, as there are things we
9046 * can do just here to avoid race conditions and/or invasive locks */
9047 redisLog(REDIS_DEBUG
,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j
, j
->type
, (void*)j
->key
, (char*)j
->key
->ptr
, j
->key
->refcount
);
9048 de
= dictFind(j
->db
->dict
,j
->key
);
9050 key
= dictGetEntryKey(de
);
9051 if (j
->type
== REDIS_IOJOB_LOAD
) {
9054 /* Key loaded, bring it at home */
9055 key
->storage
= REDIS_VM_MEMORY
;
9056 key
->vm
.atime
= server
.unixtime
;
9057 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
9058 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk (threaded)",
9059 (unsigned char*) key
->ptr
);
9060 server
.vm_stats_swapped_objects
--;
9061 server
.vm_stats_swapins
++;
9062 dictGetEntryVal(de
) = j
->val
;
9063 incrRefCount(j
->val
);
9066 /* Handle clients waiting for this key to be loaded. */
9067 handleClientsBlockedOnSwappedKey(db
,key
);
9068 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
9069 /* Now we know the amount of pages required to swap this object.
9070 * Let's find some space for it, and queue this task again
9071 * rebranded as REDIS_IOJOB_DO_SWAP. */
9072 if (!vmCanSwapOut() ||
9073 vmFindContiguousPages(&j
->page
,j
->pages
) == REDIS_ERR
)
9075 /* Ooops... no space or we can't swap as there is
9076 * a fork()ed Redis trying to save stuff on disk. */
9078 key
->storage
= REDIS_VM_MEMORY
; /* undo operation */
9080 /* Note that we need to mark this pages as used now,
9081 * if the job will be canceled, we'll mark them as freed
9083 vmMarkPagesUsed(j
->page
,j
->pages
);
9084 j
->type
= REDIS_IOJOB_DO_SWAP
;
9089 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
9092 /* Key swapped. We can finally free some memory. */
9093 if (key
->storage
!= REDIS_VM_SWAPPING
) {
9094 printf("key->storage: %d\n",key
->storage
);
9095 printf("key->name: %s\n",(char*)key
->ptr
);
9096 printf("key->refcount: %d\n",key
->refcount
);
9097 printf("val: %p\n",(void*)j
->val
);
9098 printf("val->type: %d\n",j
->val
->type
);
9099 printf("val->ptr: %s\n",(char*)j
->val
->ptr
);
9101 redisAssert(key
->storage
== REDIS_VM_SWAPPING
);
9102 val
= dictGetEntryVal(de
);
9103 key
->vm
.page
= j
->page
;
9104 key
->vm
.usedpages
= j
->pages
;
9105 key
->storage
= REDIS_VM_SWAPPED
;
9106 key
->vtype
= j
->val
->type
;
9107 decrRefCount(val
); /* Deallocate the object from memory. */
9108 dictGetEntryVal(de
) = NULL
;
9109 redisLog(REDIS_DEBUG
,
9110 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9111 (unsigned char*) key
->ptr
,
9112 (unsigned long long) j
->page
, (unsigned long long) j
->pages
);
9113 server
.vm_stats_swapped_objects
++;
9114 server
.vm_stats_swapouts
++;
9116 /* Put a few more swap requests in queue if we are still
9118 if (trytoswap
&& vmCanSwapOut() &&
9119 zmalloc_used_memory() > server
.vm_max_memory
)
9124 more
= listLength(server
.io_newjobs
) <
9125 (unsigned) server
.vm_max_threads
;
9127 /* Don't waste CPU time if swappable objects are rare. */
9128 if (vmSwapOneObjectThreaded() == REDIS_ERR
) {
9136 if (processed
== toprocess
) return;
9138 if (retval
< 0 && errno
!= EAGAIN
) {
9139 redisLog(REDIS_WARNING
,
9140 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9145 static void lockThreadedIO(void) {
9146 pthread_mutex_lock(&server
.io_mutex
);
9149 static void unlockThreadedIO(void) {
9150 pthread_mutex_unlock(&server
.io_mutex
);
9153 /* Remove the specified object from the threaded I/O queue if still not
9154 * processed, otherwise make sure to flag it as canceled. */
9155 static void vmCancelThreadedIOJob(robj
*o
) {
9157 server
.io_newjobs
, /* 0 */
9158 server
.io_processing
, /* 1 */
9159 server
.io_processed
/* 2 */
9163 assert(o
->storage
== REDIS_VM_LOADING
|| o
->storage
== REDIS_VM_SWAPPING
);
9166 /* Search for a matching key in one of the queues */
9167 for (i
= 0; i
< 3; i
++) {
9171 listRewind(lists
[i
],&li
);
9172 while ((ln
= listNext(&li
)) != NULL
) {
9173 iojob
*job
= ln
->value
;
9175 if (job
->canceled
) continue; /* Skip this, already canceled. */
9176 if (job
->key
== o
) {
9177 redisLog(REDIS_DEBUG
,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9178 (void*)job
, (char*)o
->ptr
, job
->type
, i
);
9179 /* Mark the pages as free since the swap didn't happen
9180 * or happened but is now discarded. */
9181 if (i
!= 1 && job
->type
== REDIS_IOJOB_DO_SWAP
)
9182 vmMarkPagesFree(job
->page
,job
->pages
);
9183 /* Cancel the job. It depends on the list the job is
9186 case 0: /* io_newjobs */
9187 /* If the job was not yet processed the best thing to do
9188 * is to remove it from the queue altogether */
9190 listDelNode(lists
[i
],ln
);
9192 case 1: /* io_processing */
9193 /* Oh Shi- the thread is messing with the Job:
9195 * Probably it's accessing the object if this is a
9196 * PREPARE_SWAP or DO_SWAP job.
9197 * If it's a LOAD job it may be reading from disk and
9198 * if we don't wait for the job to terminate before to
9199 * cancel it, maybe in a few microseconds data can be
9200 * corrupted in this pages. So the short story is:
9202 * Better to wait for the job to move into the
9203 * next queue (processed)... */
9205 /* We try again and again until the job is completed. */
9207 /* But let's wait some time for the I/O thread
9208 * to finish with this job. After all this condition
9209 * should be very rare. */
9212 case 2: /* io_processed */
9213 /* The job was already processed, that's easy...
9214 * just mark it as canceled so that we'll ignore it
9215 * when processing completed jobs. */
9219 /* Finally we have to adjust the storage type of the object
9220 * in order to "UNDO" the operation. */
9221 if (o
->storage
== REDIS_VM_LOADING
)
9222 o
->storage
= REDIS_VM_SWAPPED
;
9223 else if (o
->storage
== REDIS_VM_SWAPPING
)
9224 o
->storage
= REDIS_VM_MEMORY
;
9231 assert(1 != 1); /* We should never reach this */
9234 static void *IOThreadEntryPoint(void *arg
) {
9239 pthread_detach(pthread_self());
9241 /* Get a new job to process */
9243 if (listLength(server
.io_newjobs
) == 0) {
9244 /* No new jobs in queue, exit. */
9245 redisLog(REDIS_DEBUG
,"Thread %ld exiting, nothing to do",
9246 (long) pthread_self());
9247 server
.io_active_threads
--;
9251 ln
= listFirst(server
.io_newjobs
);
9253 listDelNode(server
.io_newjobs
,ln
);
9254 /* Add the job in the processing queue */
9255 j
->thread
= pthread_self();
9256 listAddNodeTail(server
.io_processing
,j
);
9257 ln
= listLast(server
.io_processing
); /* We use ln later to remove it */
9259 redisLog(REDIS_DEBUG
,"Thread %ld got a new job (type %d): %p about key '%s'",
9260 (long) pthread_self(), j
->type
, (void*)j
, (char*)j
->key
->ptr
);
9262 /* Process the Job */
9263 if (j
->type
== REDIS_IOJOB_LOAD
) {
9264 j
->val
= vmReadObjectFromSwap(j
->page
,j
->key
->vtype
);
9265 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
9266 FILE *fp
= fopen("/dev/null","w+");
9267 j
->pages
= rdbSavedObjectPages(j
->val
,fp
);
9269 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
9270 if (vmWriteObjectOnSwap(j
->val
,j
->page
) == REDIS_ERR
)
9274 /* Done: insert the job into the processed queue */
9275 redisLog(REDIS_DEBUG
,"Thread %ld completed the job: %p (key %s)",
9276 (long) pthread_self(), (void*)j
, (char*)j
->key
->ptr
);
9278 listDelNode(server
.io_processing
,ln
);
9279 listAddNodeTail(server
.io_processed
,j
);
9282 /* Signal the main thread there is new stuff to process */
9283 assert(write(server
.io_ready_pipe_write
,"x",1) == 1);
9285 return NULL
; /* never reached */
9288 static void spawnIOThread(void) {
9290 sigset_t mask
, omask
;
9294 sigaddset(&mask
,SIGCHLD
);
9295 sigaddset(&mask
,SIGHUP
);
9296 sigaddset(&mask
,SIGPIPE
);
9297 pthread_sigmask(SIG_SETMASK
, &mask
, &omask
);
9298 while ((err
= pthread_create(&thread
,&server
.io_threads_attr
,IOThreadEntryPoint
,NULL
)) != 0) {
9299 redisLog(REDIS_WARNING
,"Unable to spawn an I/O thread: %s",
9303 pthread_sigmask(SIG_SETMASK
, &omask
, NULL
);
9304 server
.io_active_threads
++;
9307 /* We need to wait for the last thread to exit before we are able to
9308 * fork() in order to BGSAVE or BGREWRITEAOF. */
9309 static void waitEmptyIOJobsQueue(void) {
9311 int io_processed_len
;
9314 if (listLength(server
.io_newjobs
) == 0 &&
9315 listLength(server
.io_processing
) == 0 &&
9316 server
.io_active_threads
== 0)
9321 /* While waiting for empty jobs queue condition we post-process some
9322 * finished jobs, as I/O threads may be hanging trying to write against
9323 * the io_ready_pipe_write FD but there are so much pending jobs that
9325 io_processed_len
= listLength(server
.io_processed
);
9327 if (io_processed_len
) {
9328 vmThreadedIOCompletedJob(NULL
,server
.io_ready_pipe_read
,NULL
,0);
9329 usleep(1000); /* 1 millisecond */
9331 usleep(10000); /* 10 milliseconds */
9336 static void vmReopenSwapFile(void) {
9337 /* Note: we don't close the old one as we are in the child process
9338 * and don't want to mess at all with the original file object. */
9339 server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b");
9340 if (server
.vm_fp
== NULL
) {
9341 redisLog(REDIS_WARNING
,"Can't re-open the VM swap file: %s. Exiting.",
9342 server
.vm_swap_file
);
9345 server
.vm_fd
= fileno(server
.vm_fp
);
9348 /* This function must be called while with threaded IO locked */
9349 static void queueIOJob(iojob
*j
) {
9350 redisLog(REDIS_DEBUG
,"Queued IO Job %p type %d about key '%s'\n",
9351 (void*)j
, j
->type
, (char*)j
->key
->ptr
);
9352 listAddNodeTail(server
.io_newjobs
,j
);
9353 if (server
.io_active_threads
< server
.vm_max_threads
)
9357 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
) {
9360 assert(key
->storage
== REDIS_VM_MEMORY
);
9361 assert(key
->refcount
== 1);
9363 j
= zmalloc(sizeof(*j
));
9364 j
->type
= REDIS_IOJOB_PREPARE_SWAP
;
9370 j
->thread
= (pthread_t
) -1;
9371 key
->storage
= REDIS_VM_SWAPPING
;
9379 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9381 /* This function makes the client 'c' wait for the key 'key' to be loaded.
9382 * If there is not already a job loading the key, it is created.
9383 * The key is added to the io_keys list in the client structure, and also
9384 * in the hash table mapping swapped keys to waiting clients, that is,
9385 * server.io_waited_keys. */
9386 static int waitForSwappedKey(redisClient
*c
, robj
*key
) {
9387 struct dictEntry
*de
;
9391 /* If the key does not exist or is already in RAM we don't need to
9392 * block the client at all. */
9393 de
= dictFind(c
->db
->dict
,key
);
9394 if (de
== NULL
) return 0;
9395 o
= dictGetEntryKey(de
);
9396 if (o
->storage
== REDIS_VM_MEMORY
) {
9398 } else if (o
->storage
== REDIS_VM_SWAPPING
) {
9399 /* We were swapping the key, undo it! */
9400 vmCancelThreadedIOJob(o
);
9404 /* OK: the key is either swapped, or being loaded just now. */
9406 /* Add the key to the list of keys this client is waiting for.
9407 * This maps clients to keys they are waiting for. */
9408 listAddNodeTail(c
->io_keys
,key
);
9411 /* Add the client to the swapped keys => clients waiting map. */
9412 de
= dictFind(c
->db
->io_keys
,key
);
9416 /* For every key we take a list of clients blocked for it */
9418 retval
= dictAdd(c
->db
->io_keys
,key
,l
);
9420 assert(retval
== DICT_OK
);
9422 l
= dictGetEntryVal(de
);
9424 listAddNodeTail(l
,c
);
9426 /* Are we already loading the key from disk? If not create a job */
9427 if (o
->storage
== REDIS_VM_SWAPPED
) {
9430 o
->storage
= REDIS_VM_LOADING
;
9431 j
= zmalloc(sizeof(*j
));
9432 j
->type
= REDIS_IOJOB_LOAD
;
9435 j
->key
->vtype
= o
->vtype
;
9436 j
->page
= o
->vm
.page
;
9439 j
->thread
= (pthread_t
) -1;
9447 /* Preload keys needed for the ZUNION and ZINTER commands. */
9448 static void zunionInterBlockClientOnSwappedKeys(redisClient
*c
) {
9450 num
= atoi(c
->argv
[2]->ptr
);
9451 for (i
= 0; i
< num
; i
++) {
9452 waitForSwappedKey(c
,c
->argv
[3+i
]);
9456 /* Is this client attempting to run a command against swapped keys?
9457 * If so, block it ASAP, load the keys in background, then resume it.
9459 * The important idea about this function is that it can fail! If keys will
9460 * still be swapped when the client is resumed, this key lookups will
9461 * just block loading keys from disk. In practical terms this should only
9462 * happen with SORT BY command or if there is a bug in this function.
9464 * Return 1 if the client is marked as blocked, 0 if the client can
9465 * continue as the keys it is going to access appear to be in memory. */
9466 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
) {
9469 if (cmd
->vm_preload_proc
!= NULL
) {
9470 cmd
->vm_preload_proc(c
);
9472 if (cmd
->vm_firstkey
== 0) return 0;
9473 last
= cmd
->vm_lastkey
;
9474 if (last
< 0) last
= c
->argc
+last
;
9475 for (j
= cmd
->vm_firstkey
; j
<= last
; j
+= cmd
->vm_keystep
)
9476 waitForSwappedKey(c
,c
->argv
[j
]);
9479 /* If the client was blocked for at least one key, mark it as blocked. */
9480 if (listLength(c
->io_keys
)) {
9481 c
->flags
|= REDIS_IO_WAIT
;
9482 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
9483 server
.vm_blocked_clients
++;
9490 /* Remove the 'key' from the list of blocked keys for a given client.
9492 * The function returns 1 when there are no longer blocking keys after
9493 * the current one was removed (and the client can be unblocked). */
9494 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
) {
9498 struct dictEntry
*de
;
9500 /* Remove the key from the list of keys this client is waiting for. */
9501 listRewind(c
->io_keys
,&li
);
9502 while ((ln
= listNext(&li
)) != NULL
) {
9503 if (compareStringObjects(ln
->value
,key
) == 0) {
9504 listDelNode(c
->io_keys
,ln
);
9510 /* Remove the client from the key => waiting clients map. */
9511 de
= dictFind(c
->db
->io_keys
,key
);
9513 l
= dictGetEntryVal(de
);
9514 ln
= listSearchKey(l
,c
);
9517 if (listLength(l
) == 0)
9518 dictDelete(c
->db
->io_keys
,key
);
9520 return listLength(c
->io_keys
) == 0;
9523 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
) {
9524 struct dictEntry
*de
;
9529 de
= dictFind(db
->io_keys
,key
);
9532 l
= dictGetEntryVal(de
);
9533 len
= listLength(l
);
9534 /* Note: we can't use something like while(listLength(l)) as the list
9535 * can be freed by the calling function when we remove the last element. */
9538 redisClient
*c
= ln
->value
;
9540 if (dontWaitForSwappedKey(c
,key
)) {
9541 /* Put the client in the list of clients ready to go as we
9542 * loaded all the keys about it. */
9543 listAddNodeTail(server
.io_ready_clients
,c
);
9548 /* =========================== Remote Configuration ========================= */
9550 static void configSetCommand(redisClient
*c
) {
9551 robj
*o
= getDecodedObject(c
->argv
[3]);
9552 if (!strcasecmp(c
->argv
[2]->ptr
,"dbfilename")) {
9553 zfree(server
.dbfilename
);
9554 server
.dbfilename
= zstrdup(o
->ptr
);
9555 } else if (!strcasecmp(c
->argv
[2]->ptr
,"requirepass")) {
9556 zfree(server
.requirepass
);
9557 server
.requirepass
= zstrdup(o
->ptr
);
9558 } else if (!strcasecmp(c
->argv
[2]->ptr
,"masterauth")) {
9559 zfree(server
.masterauth
);
9560 server
.masterauth
= zstrdup(o
->ptr
);
9561 } else if (!strcasecmp(c
->argv
[2]->ptr
,"maxmemory")) {
9562 server
.maxmemory
= strtoll(o
->ptr
, NULL
, 10);
9564 addReplySds(c
,sdscatprintf(sdsempty(),
9565 "-ERR not supported CONFIG parameter %s\r\n",
9566 (char*)c
->argv
[2]->ptr
));
9571 addReply(c
,shared
.ok
);
9574 static void configGetCommand(redisClient
*c
) {
9575 robj
*o
= getDecodedObject(c
->argv
[2]);
9576 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
9577 char *pattern
= o
->ptr
;
9581 decrRefCount(lenobj
);
9583 if (stringmatch(pattern
,"dbfilename",0)) {
9584 addReplyBulkCString(c
,"dbfilename");
9585 addReplyBulkCString(c
,server
.dbfilename
);
9588 if (stringmatch(pattern
,"requirepass",0)) {
9589 addReplyBulkCString(c
,"requirepass");
9590 addReplyBulkCString(c
,server
.requirepass
);
9593 if (stringmatch(pattern
,"masterauth",0)) {
9594 addReplyBulkCString(c
,"masterauth");
9595 addReplyBulkCString(c
,server
.masterauth
);
9598 if (stringmatch(pattern
,"maxmemory",0)) {
9601 snprintf(buf
,128,"%llu\n",server
.maxmemory
);
9602 addReplyBulkCString(c
,"maxmemory");
9603 addReplyBulkCString(c
,buf
);
9607 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%d\r\n",matches
*2);
9610 static void configCommand(redisClient
*c
) {
9611 if (!strcasecmp(c
->argv
[1]->ptr
,"set")) {
9612 if (c
->argc
!= 4) goto badarity
;
9613 configSetCommand(c
);
9614 } else if (!strcasecmp(c
->argv
[1]->ptr
,"get")) {
9615 if (c
->argc
!= 3) goto badarity
;
9616 configGetCommand(c
);
9617 } else if (!strcasecmp(c
->argv
[1]->ptr
,"resetstat")) {
9618 if (c
->argc
!= 2) goto badarity
;
9619 server
.stat_numcommands
= 0;
9620 server
.stat_numconnections
= 0;
9621 server
.stat_expiredkeys
= 0;
9622 server
.stat_starttime
= time(NULL
);
9623 addReply(c
,shared
.ok
);
9625 addReplySds(c
,sdscatprintf(sdsempty(),
9626 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9631 addReplySds(c
,sdscatprintf(sdsempty(),
9632 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9633 (char*) c
->argv
[1]->ptr
));
9636 /* =========================== Pubsub implementation ======================== */
9638 static void freePubsubPattern(void *p
) {
9639 pubsubPattern
*pat
= p
;
9641 decrRefCount(pat
->pattern
);
9645 static int listMatchPubsubPattern(void *a
, void *b
) {
9646 pubsubPattern
*pa
= a
, *pb
= b
;
9648 return (pa
->client
== pb
->client
) &&
9649 (compareStringObjects(pa
->pattern
,pb
->pattern
) == 0);
9652 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9653 * 0 if the client was already subscribed to that channel. */
9654 static int pubsubSubscribeChannel(redisClient
*c
, robj
*channel
) {
9655 struct dictEntry
*de
;
9656 list
*clients
= NULL
;
9659 /* Add the channel to the client -> channels hash table */
9660 if (dictAdd(c
->pubsub_channels
,channel
,NULL
) == DICT_OK
) {
9662 incrRefCount(channel
);
9663 /* Add the client to the channel -> list of clients hash table */
9664 de
= dictFind(server
.pubsub_channels
,channel
);
9666 clients
= listCreate();
9667 dictAdd(server
.pubsub_channels
,channel
,clients
);
9668 incrRefCount(channel
);
9670 clients
= dictGetEntryVal(de
);
9672 listAddNodeTail(clients
,c
);
9674 /* Notify the client */
9675 addReply(c
,shared
.mbulk3
);
9676 addReply(c
,shared
.subscribebulk
);
9677 addReplyBulk(c
,channel
);
9678 addReplyLong(c
,dictSize(c
->pubsub_channels
)+listLength(c
->pubsub_patterns
));
9682 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9683 * 0 if the client was not subscribed to the specified channel. */
9684 static int pubsubUnsubscribeChannel(redisClient
*c
, robj
*channel
, int notify
) {
9685 struct dictEntry
*de
;
9690 /* Remove the channel from the client -> channels hash table */
9691 incrRefCount(channel
); /* channel may be just a pointer to the same object
9692 we have in the hash tables. Protect it... */
9693 if (dictDelete(c
->pubsub_channels
,channel
) == DICT_OK
) {
9695 /* Remove the client from the channel -> clients list hash table */
9696 de
= dictFind(server
.pubsub_channels
,channel
);
9698 clients
= dictGetEntryVal(de
);
9699 ln
= listSearchKey(clients
,c
);
9701 listDelNode(clients
,ln
);
9702 if (listLength(clients
) == 0) {
9703 /* Free the list and associated hash entry altogether if this was
9704 * the last client, so that it will not be possible to abuse
9705 * Redis PUBSUB creating millions of channels. */
9706 dictDelete(server
.pubsub_channels
,channel
);
9709 /* Notify the client */
9711 addReply(c
,shared
.mbulk3
);
9712 addReply(c
,shared
.unsubscribebulk
);
9713 addReplyBulk(c
,channel
);
9714 addReplyLong(c
,dictSize(c
->pubsub_channels
)+
9715 listLength(c
->pubsub_patterns
));
9718 decrRefCount(channel
); /* it is finally safe to release it */
9722 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the client was already subscribed to that pattern. */
9723 static int pubsubSubscribePattern(redisClient
*c
, robj
*pattern
) {
9726 if (listSearchKey(c
->pubsub_patterns
,pattern
) == NULL
) {
9729 listAddNodeTail(c
->pubsub_patterns
,pattern
);
9730 incrRefCount(pattern
);
9731 pat
= zmalloc(sizeof(*pat
));
9732 pat
->pattern
= getDecodedObject(pattern
);
9734 listAddNodeTail(server
.pubsub_patterns
,pat
);
9736 /* Notify the client */
9737 addReply(c
,shared
.mbulk3
);
9738 addReply(c
,shared
.psubscribebulk
);
9739 addReplyBulk(c
,pattern
);
9740 addReplyLong(c
,dictSize(c
->pubsub_channels
)+listLength(c
->pubsub_patterns
));
9744 /* Unsubscribe a client from a pattern. Returns 1 if the operation succeeded, or
9745 * 0 if the client was not subscribed to the specified pattern. */
9746 static int pubsubUnsubscribePattern(redisClient
*c
, robj
*pattern
, int notify
) {
9751 incrRefCount(pattern
); /* Protect the object. May be the same we remove */
9752 if ((ln
= listSearchKey(c
->pubsub_patterns
,pattern
)) != NULL
) {
9754 listDelNode(c
->pubsub_patterns
,ln
);
9756 pat
.pattern
= pattern
;
9757 ln
= listSearchKey(server
.pubsub_patterns
,&pat
);
9758 listDelNode(server
.pubsub_patterns
,ln
);
9760 /* Notify the client */
9762 addReply(c
,shared
.mbulk3
);
9763 addReply(c
,shared
.punsubscribebulk
);
9764 addReplyBulk(c
,pattern
);
9765 addReplyLong(c
,dictSize(c
->pubsub_channels
)+
9766 listLength(c
->pubsub_patterns
));
9768 decrRefCount(pattern
);
9772 /* Unsubscribe from all the channels. Return the number of channels the
9773 * client was subscribed to. */
9774 static int pubsubUnsubscribeAllChannels(redisClient
*c
, int notify
) {
9775 dictIterator
*di
= dictGetIterator(c
->pubsub_channels
);
9779 while((de
= dictNext(di
)) != NULL
) {
9780 robj
*channel
= dictGetEntryKey(de
);
9782 count
+= pubsubUnsubscribeChannel(c
,channel
,notify
);
9784 dictReleaseIterator(di
);
9788 /* Unsubscribe from all the patterns. Return the number of patterns the
9789 * client was subscribed to. */
9790 static int pubsubUnsubscribeAllPatterns(redisClient
*c
, int notify
) {
9795 listRewind(c
->pubsub_patterns
,&li
);
9796 while ((ln
= listNext(&li
)) != NULL
) {
9797 robj
*pattern
= ln
->value
;
9799 count
+= pubsubUnsubscribePattern(c
,pattern
,notify
);
9804 /* Publish a message */
9805 static int pubsubPublishMessage(robj
*channel
, robj
*message
) {
9807 struct dictEntry
*de
;
9811 /* Send to clients listening for that channel */
9812 de
= dictFind(server
.pubsub_channels
,channel
);
9814 list
*list
= dictGetEntryVal(de
);
9818 listRewind(list
,&li
);
9819 while ((ln
= listNext(&li
)) != NULL
) {
9820 redisClient
*c
= ln
->value
;
9822 addReply(c
,shared
.mbulk3
);
9823 addReply(c
,shared
.messagebulk
);
9824 addReplyBulk(c
,channel
);
9825 addReplyBulk(c
,message
);
9829 /* Send to clients listening to matching channels */
9830 if (listLength(server
.pubsub_patterns
)) {
9831 listRewind(server
.pubsub_patterns
,&li
);
9832 channel
= getDecodedObject(channel
);
9833 while ((ln
= listNext(&li
)) != NULL
) {
9834 pubsubPattern
*pat
= ln
->value
;
9836 if (stringmatchlen((char*)pat
->pattern
->ptr
,
9837 sdslen(pat
->pattern
->ptr
),
9838 (char*)channel
->ptr
,
9839 sdslen(channel
->ptr
),0)) {
9840 addReply(pat
->client
,shared
.mbulk4
);
9841 addReply(pat
->client
,shared
.pmessagebulk
);
9842 addReplyBulk(pat
->client
,pat
->pattern
);
9843 addReplyBulk(pat
->client
,channel
);
9844 addReplyBulk(pat
->client
,message
);
9848 decrRefCount(channel
);
9853 static void subscribeCommand(redisClient
*c
) {
9856 for (j
= 1; j
< c
->argc
; j
++)
9857 pubsubSubscribeChannel(c
,c
->argv
[j
]);
/* Command handler with two visible paths, both passing notify = 1:
 *  - pubsubUnsubscribeAllChannels(c,1);
 *  - pubsubUnsubscribeChannel() for each of c->argv[1] .. c->argv[c->argc-1].
 * NOTE(review): the condition that selects between the two paths is not part
 * of this listing -- presumably "no channel arguments were given"; confirm. */
9860 static void unsubscribeCommand(redisClient
*c
) {
9862 pubsubUnsubscribeAllChannels(c
,1);
9867 for (j
= 1; j
< c
->argc
; j
++)
9868 pubsubUnsubscribeChannel(c
,c
->argv
[j
],1);
/* Command handler: calls pubsubSubscribePattern() once for every argument
 * after the command name, i.e. c->argv[1] .. c->argv[c->argc-1]. */
9872 static void psubscribeCommand(redisClient
*c
) {
9875 for (j
= 1; j
< c
->argc
; j
++)
9876 pubsubSubscribePattern(c
,c
->argv
[j
]);
/* Command handler with two visible paths, both passing notify = 1:
 *  - pubsubUnsubscribeAllPatterns(c,1);
 *  - pubsubUnsubscribePattern() for each of c->argv[1] .. c->argv[c->argc-1].
 * NOTE(review): the condition that selects between the two paths is not part
 * of this listing -- presumably "no pattern arguments were given"; confirm. */
9879 static void punsubscribeCommand(redisClient
*c
) {
9881 pubsubUnsubscribeAllPatterns(c
,1);
9886 for (j
= 1; j
< c
->argc
; j
++)
9887 pubsubUnsubscribePattern(c
,c
->argv
[j
],1);
/* Command handler: publishes c->argv[2] on the channel named by c->argv[1]
 * via pubsubPublishMessage() and replies to the caller with the integer that
 * call returned. */
9891 static void publishCommand(redisClient
*c
) {
9892 int receivers
= pubsubPublishMessage(c
->argv
[1],c
->argv
[2]);
9893 addReplyLong(c
,receivers
);
9896 /* ================================= Debugging ============================== */
/* DEBUG command dispatcher. c->argv[1] is compared case-insensitively
 * (strcasecmp) against the sub-commands "segfault", "reload", "loadaof",
 * "object", "swapin" and "swapout"; the last three additionally require
 * c->argc == 3. The final reply in this function is a syntax-error message.
 * NOTE(review): several lines (local declarations, else lines, early returns,
 * closing braces, the body of the "segfault" branch) are not part of this
 * listing -- read the comments below with that in mind. */
9898 static void debugCommand(redisClient
*c
) {
9899 if (!strcasecmp(c
->argv
[1]->ptr
,"segfault")) {
9901 } else if (!strcasecmp(c
->argv
[1]->ptr
,"reload")) {
/* RELOAD: rdbSave() followed by rdbLoad(), both on server.dbfilename; a
 * result other than REDIS_OK from either one is answered with shared.err. */
9902 if (rdbSave(server
.dbfilename
) != REDIS_OK
) {
9903 addReply(c
,shared
.err
);
9907 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
9908 addReply(c
,shared
.err
);
9911 redisLog(REDIS_WARNING
,"DB reloaded by DEBUG RELOAD");
9912 addReply(c
,shared
.ok
);
9913 } else if (!strcasecmp(c
->argv
[1]->ptr
,"loadaof")) {
9915 if (loadAppendOnlyFile(server
.appendfilename
) != REDIS_OK
) {
9916 addReply(c
,shared
.err
);
9919 redisLog(REDIS_WARNING
,"Append Only File loaded by DEBUG LOADAOF");
9920 addReply(c
,shared
.ok
);
9921 } else if (!strcasecmp(c
->argv
[1]->ptr
,"object") && c
->argc
== 3) {
9922 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
9926 addReply(c
,shared
.nokeyerr
);
9929 key
= dictGetEntryKey(de
);
9930 val
= dictGetEntryVal(de
);
/* Value details (address, refcount, encoding, serialized length) are reported
 * when VM is disabled or the key's storage is REDIS_VM_MEMORY or
 * REDIS_VM_SWAPPING. The second addReplySds() further down reports the swap
 * page and page count instead -- presumably the else branch; confirm. */
9931 if (!server
.vm_enabled
|| (key
->storage
== REDIS_VM_MEMORY
||
9932 key
->storage
== REDIS_VM_SWAPPING
)) {
/* val->encoding is bounds-checked against the number of entries in the
 * strencoding table before being used as an index; the snprintf() below
 * formats an "unknown encoding" text into buf (presumably the fallback). */
9936 if (val
->encoding
< (sizeof(strencoding
)/sizeof(char*))) {
9937 strenc
= strencoding
[val
->encoding
];
9939 snprintf(buf
,64,"unknown encoding %d\n", val
->encoding
);
9942 addReplySds(c
,sdscatprintf(sdsempty(),
9943 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9944 "encoding:%s serializedlength:%lld\r\n",
9945 (void*)key
, key
->refcount
, (void*)val
, val
->refcount
,
9946 strenc
, (long long) rdbSavedObjectLen(val
,NULL
)));
9948 addReplySds(c
,sdscatprintf(sdsempty(),
9949 "+Key at:%p refcount:%d, value swapped at: page %llu "
9950 "using %llu pages\r\n",
9951 (void*)key
, key
->refcount
, (unsigned long long) key
->vm
.page
,
9952 (unsigned long long) key
->vm
.usedpages
));
9954 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapin") && c
->argc
== 3) {
9955 lookupKeyRead(c
->db
,c
->argv
[2]);
9956 addReply(c
,shared
.ok
);
9957 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapout") && c
->argc
== 3) {
9958 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
9961 if (!server
.vm_enabled
) {
9962 addReplySds(c
,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9966 addReply(c
,shared
.nokeyerr
);
9969 key
= dictGetEntryKey(de
);
9970 val
= dictGetEntryVal(de
);
9971 /* If the key is shared we want to create a copy */
9972 if (key
->refcount
> 1) {
9973 robj
*newkey
= dupStringObject(key
);
9975 key
= dictGetEntryKey(de
) = newkey
;
/* Only a key whose storage is REDIS_VM_MEMORY is swapped out. When
 * vmSwapObjectBlocking() returns REDIS_OK the dict entry's value is set to
 * NULL and shared.ok is sent; shared.err is the remaining reply. */
9978 if (key
->storage
!= REDIS_VM_MEMORY
) {
9979 addReplySds(c
,sdsnew("-ERR This key is not in memory\r\n"));
9980 } else if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
9981 dictGetEntryVal(de
) = NULL
;
9982 addReply(c
,shared
.ok
);
9984 addReply(c
,shared
.err
);
/* NOTE(review): the usage text below does not list LOADAOF although that
 * sub-command is handled above. "object", "swapin" and "swapout" only match
 * when argc == 3, so a call with the wrong argument count presumably ends up
 * here as well. */
9987 addReplySds(c
,sdsnew(
9988 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
/* Assertion-failure reporter: logs a banner and then the failing expression
 * text ('estr') together with the source 'file' and 'line', all at
 * REDIS_WARNING level. When HAVE_BACKTRACE is defined it additionally logs
 * that a SIGSEGV is being forced to obtain a stack trace.
 * NOTE(review): the statement that actually forces the signal and the
 * matching #endif are not part of this listing -- confirm against the file. */
9992 static void _redisAssert(char *estr
, char *file
, int line
) {
9993 redisLog(REDIS_WARNING
,"=== ASSERTION FAILED ===");
9994 redisLog(REDIS_WARNING
,"==> %s:%d '%s' is not true\n",file
,line
,estr
);
9995 #ifdef HAVE_BACKTRACE
9996 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
/* Panic reporter: logs a banner followed by 'msg' with the source 'file' and
 * 'line' at REDIS_WARNING level. When HAVE_BACKTRACE is defined it announces
 * a forced SIGSEGV and then performs the faulting store below.
 * NOTE(review): the matching #endif and the end of the function are not part
 * of this listing. */
10001 static void _redisPanic(char *msg
, char *file
, int line
) {
10002 redisLog(REDIS_WARNING
,"!!! Software Failure. Press left mouse button to continue");
10003 redisLog(REDIS_WARNING
,"Guru Meditation: %s #%s:%d",msg
,file
,line
);
10004 #ifdef HAVE_BACKTRACE
10005 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
/* Deliberate store through the invalid address (char*)-1: this is the forced
 * fault announced by the log line just above. */
10006 *((char*)-1) = 'x';
10010 /* =================================== Main! ================================ */
/* Opens /proc/sys/vm/overcommit_memory for reading and returns -1 when it
 * cannot be opened. One line (at most 63 characters plus the terminator) is
 * then read into 'buf' with fgets().
 * NOTE(review): the declaration of 'buf', the handling of an fgets() failure,
 * the fclose() and the conversion of the text to the returned int are not
 * part of this listing -- confirm against the full file. */
10013 int linuxOvercommitMemoryValue(void) {
10014 FILE *fp
= fopen("/proc/sys/vm/overcommit_memory","r");
10017 if (!fp
) return -1;
10018 if (fgets(buf
,64,fp
) == NULL
) {
/* Logs a REDIS_WARNING message explaining how to set vm.overcommit_memory to
 * 1 when linuxOvercommitMemoryValue() returns 0. Any other return value
 * (including -1) produces no message in the visible code. */
10027 void linuxOvercommitMemoryWarning(void) {
10028 if (linuxOvercommitMemoryValue() == 0) {
10029 redisLog(REDIS_WARNING
,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10032 #endif /* __linux__ */
/* Turns the process into a daemon. Visible steps: fork() with the parent
 * exiting, setsid(), redirection of stdin/stdout/stderr to /dev/null, and
 * writing the process id (getpid()) followed by a newline to server.pidfile.
 * NOTE(review): the declarations of 'fd' and 'fp', any NULL check on 'fp' and
 * the fclose() are not part of this listing -- confirm. */
10034 static void daemonize(void) {
/* NOTE(review): fork() returns -1 on failure, which also satisfies "!= 0", so
 * a failed fork makes the process exit with status 0 without daemonizing. */
10038 if (fork() != 0) exit(0); /* parent exits */
10039 setsid(); /* create a new session */
10041 /* Every output goes to /dev/null. If Redis is daemonized but
10042 * the 'logfile' is set to 'stdout' in the configuration file
10043 * it will not log at all. */
10044 if ((fd
= open("/dev/null", O_RDWR
, 0)) != -1) {
10045 dup2(fd
, STDIN_FILENO
);
10046 dup2(fd
, STDOUT_FILENO
);
10047 dup2(fd
, STDERR_FILENO
);
/* The /dev/null descriptor is closed only when its number is above
 * STDERR_FILENO, i.e. when it is not itself one of the three standard
 * descriptors that were just redirected. */
10048 if (fd
> STDERR_FILENO
) close(fd
);
10050 /* Try to write the pid file */
10051 fp
= fopen(server
.pidfile
,"w");
10053 fprintf(fp
,"%d\n",getpid());
/* Prints "Redis server version <REDIS_VERSION>" to stdout.
 * NOTE(review): the remainder of the function is not part of this listing;
 * main() calls this for -v / --version, so it presumably terminates the
 * process afterwards -- confirm. */
10058 static void version() {
10059 printf("Redis server version %s\n", REDIS_VERSION
);
/* Prints the two-line command-line usage summary to stderr.
 * NOTE(review): the remainder of the function is not part of this listing;
 * it presumably terminates the process afterwards -- confirm. */
10063 static void usage() {
10064 fprintf(stderr
,"Usage: ./redis-server [/path/to/redis.conf]\n");
10065 fprintf(stderr
," ./redis-server - (read config from stdin)\n");
/* Server entry point. Visible sequence:
 *   1. initServerConfig();
 *   2. argv[1] handling: "-v"/"--version" -> version(), "--help" -> usage(),
 *      otherwise resetServerSaveParams() + loadServerConfig(argv[1]); a
 *      separate branch handles argc > 2, and a warning is logged when no
 *      config file was given;
 *   3. daemonize() when server.daemonize is set;
 *   4. startup log line and linuxOvercommitMemoryWarning();
 *   5. dataset load: the append only file when server.appendonly is set,
 *      the RDB file (server.dbfilename) in the other visible call; the
 *      elapsed seconds are logged on REDIS_OK;
 *   6. "ready to accept connections" log, aeSetBeforeSleepProc() with
 *      beforeSleep, and finally aeDeleteEventLoop().
 * NOTE(review): the argc test guarding the argv[1] accesses, the server
 * initialisation, the event-loop run call, the declaration of 'start' and the
 * return statement are not part of this listing -- confirm against the file. */
10069 int main(int argc
, char **argv
) {
10072 initServerConfig();
10074 if (strcmp(argv
[1], "-v") == 0 ||
10075 strcmp(argv
[1], "--version") == 0) version();
10076 if (strcmp(argv
[1], "--help") == 0) usage();
10077 resetServerSaveParams();
10078 loadServerConfig(argv
[1]);
10079 } else if ((argc
> 2)) {
10082 redisLog(REDIS_WARNING
,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10084 if (server
.daemonize
) daemonize();
10086 redisLog(REDIS_NOTICE
,"Server started, Redis version " REDIS_VERSION
);
10088 linuxOvercommitMemoryWarning();
/* NOTE(review): the elapsed time below is printed with %ld; this assumes the
 * type of time(NULL)-start is long -- TODO confirm the declaration of 'start'
 * and the platform's time_t. */
10090 start
= time(NULL
);
10091 if (server
.appendonly
) {
10092 if (loadAppendOnlyFile(server
.appendfilename
) == REDIS_OK
)
10093 redisLog(REDIS_NOTICE
,"DB loaded from append only file: %ld seconds",time(NULL
)-start
);
10095 if (rdbLoad(server
.dbfilename
) == REDIS_OK
)
10096 redisLog(REDIS_NOTICE
,"DB loaded from disk: %ld seconds",time(NULL
)-start
);
10098 redisLog(REDIS_NOTICE
,"The server is now ready to accept connections on port %d", server
.port
);
10099 aeSetBeforeSleepProc(server
.el
,beforeSleep
);
10101 aeDeleteEventLoop(server
.el
);
10105 /* ============================= Backtrace support ========================= */
10107 #ifdef HAVE_BACKTRACE
10108 static char *findFuncName(void *pointer
, unsigned long *offset
);
/* Returns, as a void pointer, the instruction-pointer field stored in the
 * machine context of 'uc'. The field is selected by preprocessor conditionals:
 *   - __FreeBSD__:                       uc_mcontext.mc_eip
 *   - __dietlibc__:                      uc_mcontext.eip
 *   - __APPLE__ (with or without
 *     MAC_OS_X_VERSION_10_6):            uc_mcontext->__ss.__rip or .__eip
 *   - __i386__ / __X86_64__ / __x86_64__: uc_mcontext.gregs[REG_EIP]
 *   - __ia64__:                          uc_mcontext.sc_ip
 * NOTE(review): some #if/#else/#endif lines and any final fallback are not
 * part of this listing. Also confirm that REG_EIP is defined on x86_64 Linux
 * builds -- glibc names the 64-bit register index REG_RIP. */
10110 static void *getMcontextEip(ucontext_t
*uc
) {
10111 #if defined(__FreeBSD__)
10112 return (void*) uc
->uc_mcontext
.mc_eip
;
10113 #elif defined(__dietlibc__)
10114 return (void*) uc
->uc_mcontext
.eip
;
10115 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10117 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
10119 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
10121 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10122 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10123 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
10125 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
10127 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10128 return (void*) uc
->uc_mcontext
.gregs
[REG_EIP
]; /* Linux 32/64 bit */
10129 #elif defined(__ia64__) /* Linux IA64 */
10130 return (void*) uc
->uc_mcontext
.sc_ip
;
/* SA_SIGINFO-style signal handler (installed by setupSigSegvAction()).
 * Visible steps:
 *   1. log the Redis version and the signal number;
 *   2. log the string returned by genRedisInfoString() (intentionally leaked);
 *   3. collect up to 100 frames with backtrace();
 *   4. when getMcontextEip() is non-NULL, overwrite frame 1 with it;
 *   5. resolve the frames with backtrace_symbols() and log frames 1..n-1,
 *      choosing between the backtrace_symbols() text and the name found by
 *      findFuncName().
 * 'info' is unused; 'secret' is interpreted as a ucontext_t pointer.
 * NOTE(review): the declarations of 'trace' and 'infostring' and whatever
 * terminates the process at the end are not part of this listing. */
10136 static void segvHandler(int sig
, siginfo_t
*info
, void *secret
) {
10138 char **messages
= NULL
;
10139 int i
, trace_size
= 0;
10140 unsigned long offset
=0;
10141 ucontext_t
*uc
= (ucontext_t
*) secret
;
10143 REDIS_NOTUSED(info
);
10145 redisLog(REDIS_WARNING
,
10146 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION
, sig
);
10147 infostring
= genRedisInfoString();
10148 redisLog(REDIS_WARNING
, "%s",infostring
);
10149 /* It's not safe to sdsfree() the returned string under memory
10150 * corruption conditions. Let it leak as we are going to abort */
10152 trace_size
= backtrace(trace
, 100);
10153 /* overwrite sigaction with caller's address */
10154 if (getMcontextEip(uc
) != NULL
) {
10155 trace
[1] = getMcontextEip(uc
);
10157 messages
= backtrace_symbols(trace
, trace_size
);
10159 for (i
=1; i
<trace_size
; ++i
) {
10160 char *fn
= findFuncName(trace
[i
], &offset
), *p
;
10162 p
= strchr(messages
[i
],'+');
/* The backtrace_symbols() text is logged as-is when findFuncName() found
 * nothing, or when the number following '+' in that text is smaller than the
 * offset reported by findFuncName(). The second redisLog() below prints the
 * findFuncName() result instead (presumably the else branch).
 * NOTE(review): the number after '+' is parsed in base 10; some
 * backtrace_symbols() implementations print it as hex ("+0x1a"), which would
 * parse as 0 here -- confirm for the target platforms. The offset is also
 * cast to unsigned int but printed with %d. */
10163 if (!fn
|| (p
&& ((unsigned long)strtol(p
+1,NULL
,10)) < offset
)) {
10164 redisLog(REDIS_WARNING
,"%s", messages
[i
]);
10166 redisLog(REDIS_WARNING
,"%d redis-server %p %s + %d", i
, trace
[i
], fn
, (unsigned int)offset
);
10169 /* free(messages); Don't call free() with possibly corrupted memory. */
/* Installs segvHandler as the sa_sigaction handler for SIGSEGV, SIGBUS,
 * SIGFPE and SIGILL, with an empty signal mask and the flags
 * SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO. The return values of
 * sigaction() are not checked in the visible code. */
10173 static void setupSigSegvAction(void) {
10174 struct sigaction act
;
10176 sigemptyset (&act
.sa_mask
);
10177 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10178 * is used. Otherwise, sa_handler is used */
10179 act
.sa_flags
= SA_NODEFER
| SA_ONSTACK
| SA_RESETHAND
| SA_SIGINFO
;
10180 act
.sa_sigaction
= segvHandler
;
10181 sigaction (SIGSEGV
, &act
, NULL
);
10182 sigaction (SIGBUS
, &act
, NULL
);
10183 sigaction (SIGFPE
, &act
, NULL
);
10184 sigaction (SIGILL
, &act
, NULL
);
/* NOTE(review): SIGBUS was already registered three calls above with the same
 * 'act'; this second registration is redundant. */
10185 sigaction (SIGBUS
, &act
, NULL
);
10189 #include "staticsymbols.h"
10190 /* This function tries to convert a pointer into a function name. It's used in
10191 * order to provide a backtrace under segmentation fault that's able to
10192 * display functions declared as static (otherwise the backtrace is useless). */
/* Looks 'pointer' up in symsTable, scanning entries until one whose .pointer
 * field is zero. An entry is a candidate when 'pointer' is not
 * (unsigned long)-1 and is >= the entry's address; 'off' is the distance from
 * that address. A candidate is selected when nothing has been selected yet
 * (ret < 0) or when its distance is smaller than 'minoff'.
 * Returns NULL when ret is -1, otherwise the .name of symsTable[ret].
 * NOTE(review): the declarations of 'i' and 'ret', the statements that update
 * 'ret'/'minoff' and the store into *offset are not part of this listing --
 * confirm against the full file. */
10193 static char *findFuncName(void *pointer
, unsigned long *offset
){
10195 unsigned long off
, minoff
= 0;
10197 /* Try to match against the Symbol with the smallest offset */
10198 for (i
=0; symsTable
[i
].pointer
; i
++) {
10199 unsigned long lp
= (unsigned long) pointer
;
10201 if (lp
!= (unsigned long)-1 && lp
>= symsTable
[i
].pointer
) {
10202 off
=lp
-symsTable
[i
].pointer
;
10203 if (ret
< 0 || off
< minoff
) {
10209 if (ret
== -1) return NULL
;
10211 return symsTable
[ret
].name
;
10213 #else /* HAVE_BACKTRACE */
/* Fallback definition of setupSigSegvAction(), compiled when HAVE_BACKTRACE
 * is not defined (this is the #else side of that conditional).
 * NOTE(review): the body is not part of this listing -- presumably empty, so
 * no crash handler is installed in that configuration; confirm. */
10214 static void setupSigSegvAction(void) {
10216 #endif /* HAVE_BACKTRACE */