2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
30 #define REDIS_VERSION "2.1.1"
45 #endif /* HAVE_BACKTRACE */
53 #include <arpa/inet.h>
57 #include <sys/resource.h>
65 #include "solarisfixes.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "ziplist.h" /* Compact list data structure */
79 #include "intset.h" /* Compact integer set structure */
80 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
81 #include "release.h" /* Release and/or git repository information */
87 /* Static server configuration */
88 #define REDIS_SERVERPORT 6379 /* TCP port */
89 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
90 #define REDIS_IOBUF_LEN 1024
91 #define REDIS_LOADBUF_LEN 1024
92 #define REDIS_STATIC_ARGS 8
93 #define REDIS_DEFAULT_DBNUM 16
94 #define REDIS_CONFIGLINE_MAX 1024
95 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
96 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
97 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
98 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
99 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
101 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
102 #define REDIS_WRITEV_THRESHOLD 3
103 /* Max number of iovecs used for each writev call */
104 #define REDIS_WRITEV_IOVEC_COUNT 256
106 /* Hash table parameters */
107 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
110 #define REDIS_CMD_BULK 1 /* Bulk write command */
111 #define REDIS_CMD_INLINE 2 /* Inline command */
112 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
113 this flag will return an error when the 'maxmemory' option is set in the
114 config file and the server is using more than maxmemory bytes of memory.
115 In short these commands are denied on low memory conditions. */
116 #define REDIS_CMD_DENYOOM 4
117 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
120 #define REDIS_STRING 0
125 #define REDIS_VMPOINTER 8
127 /* Objects encoding. Some kinds of objects like Strings and Hashes can be
128 * internally represented in multiple ways. The 'encoding' field of the object
129 * is set to one of these values for this object. The numeric values are
 * also the indexes of the matching names in the strencoding[] table. */
130 #define REDIS_ENCODING_RAW 0 /* Raw representation */
131 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
132 #define REDIS_ENCODING_HT 2 /* Encoded as hash table */
133 #define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
134 #define REDIS_ENCODING_LIST 4 /* Encoded as list (non-ziplist; "list" in strencoding) */
135 #define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
136 #define REDIS_ENCODING_INTSET 6 /* Encoded as intset */
138 static char* strencoding
[] = {
139 "raw", "int", "hashtable", "zipmap", "list", "ziplist", "intset"
142 /* Object types only used for dumping to disk */
143 #define REDIS_EXPIRETIME 253
144 #define REDIS_SELECTDB 254
145 #define REDIS_EOF 255
147 /* Defines related to the dump file format. To store 32 bits lengths for short
148 * keys requires a lot of space, so we check the most significant 2 bits of
149 * the first byte to interpret the length:
151 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
152 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
153 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
154 * 11|000000 this means: specially encoded object will follow. The six bits
155 * number specify the kind of object that follows.
156 * See the REDIS_RDB_ENC_* defines.
158 * Lengths up to 63 are stored using a single byte, most DB keys, and many
159 * values, will fit inside. */
160 #define REDIS_RDB_6BITLEN 0
161 #define REDIS_RDB_14BITLEN 1
162 #define REDIS_RDB_32BITLEN 2
163 #define REDIS_RDB_ENCVAL 3
164 #define REDIS_RDB_LENERR UINT_MAX
166 /* When a length of a string object stored on disk has the first two bits
167 * set, the remaining six bits specify a special encoding for the object
168 * according to the following defines: */
169 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
170 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
171 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
172 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
174 /* Virtual memory object->where field. */
175 #define REDIS_VM_MEMORY 0 /* The object is on memory */
176 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
177 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
178 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
180 /* Virtual memory static configuration stuff.
181 * Check vmFindContiguousPages() to know more about these magic numbers. */
182 #define REDIS_VM_MAX_NEAR_PAGES 65536
183 #define REDIS_VM_MAX_RANDOM_JUMP 4096
184 #define REDIS_VM_MAX_THREADS 32
185 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
186 /* The following is the *percentage* of completed I/O jobs to process when the
187 * handler is called. While Virtual Memory I/O operations are performed by
188 * threads, these operations must be processed by the main thread when completed
189 * in order to take effect. */
190 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
193 #define REDIS_SLAVE 1 /* This client is a slave server */
194 #define REDIS_MASTER 2 /* This client is a master server */
195 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
196 #define REDIS_MULTI 8 /* This client is in a MULTI context */
197 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
198 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
199 #define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
201 /* Slave replication state - slave side */
202 #define REDIS_REPL_NONE 0 /* No active replication */
203 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
204 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
206 /* Slave replication state - from the point of view of master
207 * Note that in SEND_BULK and ONLINE state the slave receives new updates
208 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
209 * to start the next background saving in order to send updates to it. */
210 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
211 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
212 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
213 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
215 /* List related stuff */
219 /* Sort operations */
220 #define REDIS_SORT_GET 0
221 #define REDIS_SORT_ASC 1
222 #define REDIS_SORT_DESC 2
223 #define REDIS_SORTKEY_MAX 1024
226 #define REDIS_DEBUG 0
227 #define REDIS_VERBOSE 1
228 #define REDIS_NOTICE 2
229 #define REDIS_WARNING 3
231 /* Anti-warning macro... */
232 #define REDIS_NOTUSED(V) ((void) V)
234 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
235 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
237 /* Append only defines */
238 #define APPENDFSYNC_NO 0
239 #define APPENDFSYNC_ALWAYS 1
240 #define APPENDFSYNC_EVERYSEC 2
242 /* Zip structure related defaults */
243 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
244 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
245 #define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
246 #define REDIS_LIST_MAX_ZIPLIST_VALUE 32
247 #define REDIS_SET_MAX_INTSET_ENTRIES 4096
249 /* We can print the stacktrace, so our assert is defined this way.
 * redisAssert(_e): evaluates _e; when it is false, passes the stringified
 * expression plus __FILE__/__LINE__ to _redisAssert() and then calls _exit(1).
 * redisPanic(_e): passes the stringified argument plus __FILE__/__LINE__ to
 * _redisPanic() and then calls _exit(1).
 * Both macros expand to a single parenthesized expression, so the comma
 * operator inside them cannot split apart when the macro is used within a
 * larger expression or an argument list. */
250 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
251 #define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
252 static void _redisAssert(char *estr
, char *file
, int line
);
253 static void _redisPanic(char *msg
, char *file
, int line
);
255 /*================================= Data types ============================== */
257 /* A redis object, that is a type able to hold a string / list / set */
259 /* The actual Redis Object */
260 typedef struct redisObject
{
262 unsigned storage
:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
264 unsigned lru
:22; /* lru time (relative to server.lruclock) */
267 /* VM fields, these are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
273 /* The VM pointer structure - identifies an object in the swap file.
275 * This object is stored in place of the value
276 * object in the main key->value hash table representing a database.
277 * Note that the first fields (type, storage) are the same as the redisObject
278 * structure so that vmPointer structures can be accessed even when cast
279 * as redisObject structures.
281 * This is useful as we don't know if a value object is on disk or not, but we
282 * are always able to read obj->storage to check this. For vmPointer
283 * structures "type" is set to REDIS_VMPOINTER (even if without this field
284 * it is still possible to check the kind of object from the value of 'storage').*/
285 typedef struct vmPointer
{
287 unsigned storage
:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
289 unsigned int vtype
; /* type of the object stored in the swap file */
290 off_t page
; /* the page at which the object is stored on disk */
291 off_t usedpages
; /* number of pages used on disk */
294 /* Macro used to initialize a Redis object allocated on the stack.
295 * Note that this macro is kept near the structure definition to make sure
296 * we'll update it when the structure is changed, to avoid bugs like
297 * bug #85 introduced exactly in this way. */
298 #define initStaticStringObject(_var,_ptr) do { \
300 _var.type = REDIS_STRING; \
301 _var.encoding = REDIS_ENCODING_RAW; \
303 _var.storage = REDIS_VM_MEMORY; \
306 typedef struct redisDb
{
307 dict
*dict
; /* The keyspace for this DB */
308 dict
*expires
; /* Timeout of keys with a timeout set */
309 dict
*blocking_keys
; /* Keys with clients waiting for data (BLPOP) */
310 dict
*io_keys
; /* Keys with clients waiting for VM I/O */
311 dict
*watched_keys
; /* WATCHED keys for MULTI/EXEC CAS */
315 /* Client MULTI/EXEC state */
316 typedef struct multiCmd
{
319 struct redisCommand
*cmd
;
322 typedef struct multiState
{
323 multiCmd
*commands
; /* Array of MULTI commands */
324 int count
; /* Total number of MULTI commands */
327 /* With multiplexing we need to keep per-client state.
328 * Clients are kept in a linked list. */
329 typedef struct redisClient
{
334 robj
**argv
, **mbargv
;
336 int bulklen
; /* bulk read len. -1 if not in bulk read mode */
337 int multibulk
; /* multi bulk command format active */
340 time_t lastinteraction
; /* time of the last interaction, used for timeout */
341 int flags
; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
342 int slaveseldb
; /* slave selected db, if this client is a slave */
343 int authenticated
; /* when requirepass is non-NULL */
344 int replstate
; /* replication state if this is a slave */
345 int repldbfd
; /* replication DB file descriptor */
346 long repldboff
; /* replication DB file offset */
347 off_t repldbsize
; /* replication DB file size */
348 multiState mstate
; /* MULTI/EXEC state */
349 robj
**blocking_keys
; /* The key we are waiting to terminate a blocking
350 * operation such as BLPOP. Otherwise NULL. */
351 int blocking_keys_num
; /* Number of blocking keys */
352 time_t blockingto
; /* Blocking operation timeout. If UNIX current time
353 * is >= blockingto then the operation timed out. */
354 list
*io_keys
; /* Keys this client is waiting to be loaded from the
355 * swap file in order to continue. */
356 list
*watched_keys
; /* Keys WATCHED for MULTI/EXEC CAS */
357 dict
*pubsub_channels
; /* channels a client is interested in (SUBSCRIBE) */
358 list
*pubsub_patterns
; /* patterns a client is interested in (SUBSCRIBE) */
366 /* Global server state structure */
371 long long dirty
; /* changes to DB from the last save */
373 list
*slaves
, *monitors
;
374 char neterr
[ANET_ERR_LEN
];
376 int cronloops
; /* number of times the cron function run */
377 list
*objfreelist
; /* A list of freed objects to avoid malloc() */
378 time_t lastsave
; /* Unix time of last successful save */
379 /* Fields used only for stats */
380 time_t stat_starttime
; /* server start time */
381 long long stat_numcommands
; /* number of processed commands */
382 long long stat_numconnections
; /* number of connections received */
383 long long stat_expiredkeys
; /* number of expired keys */
392 int no_appendfsync_on_rewrite
;
398 pid_t bgsavechildpid
;
399 pid_t bgrewritechildpid
;
400 sds bgrewritebuf
; /* buffer taken by parent during append only rewrite */
401 sds aofbuf
; /* AOF buffer, written before entering the event loop */
402 struct saveparam
*saveparams
;
407 char *appendfilename
;
411 /* Replication related */
416 redisClient
*master
; /* client that is master for this slave */
418 unsigned int maxclients
;
419 unsigned long long maxmemory
;
420 unsigned int blpop_blocked_clients
;
421 unsigned int vm_blocked_clients
;
422 /* Sort parameters - qsort_r() is only available under BSD so we
423 * have to take this state global, in order to pass it to sortCompare() */
427 /* Virtual memory configuration */
432 unsigned long long vm_max_memory
;
433 /* Zip structure config */
434 size_t hash_max_zipmap_entries
;
435 size_t hash_max_zipmap_value
;
436 size_t list_max_ziplist_entries
;
437 size_t list_max_ziplist_value
;
438 size_t set_max_intset_entries
;
439 /* Virtual memory state */
442 off_t vm_next_page
; /* Next probably empty page */
443 off_t vm_near_pages
; /* Number of pages allocated sequentially */
444 unsigned char *vm_bitmap
; /* Bitmap of free/used pages */
445 time_t unixtime
; /* Unix time sampled every second. */
446 /* Virtual memory I/O threads stuff */
447 /* An I/O thread process an element taken from the io_jobs queue and
448 * put the result of the operation in the io_done list. While the
449 * job is being processed, it's put on io_processing queue. */
450 list
*io_newjobs
; /* List of VM I/O jobs yet to be processed */
451 list
*io_processing
; /* List of VM I/O jobs being processed */
452 list
*io_processed
; /* List of VM I/O jobs already processed */
453 list
*io_ready_clients
; /* Clients ready to be unblocked. All keys loaded */
454 pthread_mutex_t io_mutex
; /* lock to access io_jobs/io_done/io_thread_job */
455 pthread_mutex_t obj_freelist_mutex
; /* safe redis objects creation/free */
456 pthread_mutex_t io_swapfile_mutex
; /* So we can lseek + write */
457 pthread_attr_t io_threads_attr
; /* attributes for threads creation */
458 int io_active_threads
; /* Number of running I/O threads */
459 int vm_max_threads
; /* Max number of I/O threads running at the same time */
460 /* Our main thread is blocked on the event loop, looking for sockets ready
461 * to be read or written, so when a threaded I/O operation is ready to be
462 * processed by the main thread, the I/O thread will use a unix pipe to
463 * awake the main thread. The following are the two pipe FDs. */
464 int io_ready_pipe_read
;
465 int io_ready_pipe_write
;
466 /* Virtual memory stats */
467 unsigned long long vm_stats_used_pages
;
468 unsigned long long vm_stats_swapped_objects
;
469 unsigned long long vm_stats_swapouts
;
470 unsigned long long vm_stats_swapins
;
472 dict
*pubsub_channels
; /* Map channels to list of subscribed clients */
473 list
*pubsub_patterns
; /* A list of pubsub_patterns */
476 unsigned lruclock
:22; /* clock incrementing every minute, for LRU */
477 unsigned lruclock_padding
:10;
480 typedef struct pubsubPattern
{
485 typedef void redisCommandProc(redisClient
*c
);
486 typedef void redisVmPreloadProc(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
);
487 struct redisCommand
{
489 redisCommandProc
*proc
;
492 /* Use a function to determine which keys need to be loaded
493 * in the background prior to executing this command. Takes precedence
494 * over vm_firstkey and others, ignored when NULL */
495 redisVmPreloadProc
*vm_preload_proc
;
496 /* What keys should be loaded in background when calling this command? */
497 int vm_firstkey
; /* The first argument that's a key (0 = no keys) */
498 int vm_lastkey
; /* The last argument that's a key */
499 int vm_keystep
; /* The step between first and last key */
502 struct redisFunctionSym
{
504 unsigned long pointer
;
507 typedef struct _redisSortObject
{
515 typedef struct _redisSortOperation
{
518 } redisSortOperation
;
520 /* ZSETs use a specialized version of Skiplists */
522 typedef struct zskiplistNode
{
523 struct zskiplistNode
**forward
;
524 struct zskiplistNode
*backward
;
530 typedef struct zskiplist
{
531 struct zskiplistNode
*header
, *tail
;
532 unsigned long length
;
536 typedef struct zset
{
541 /* Our shared "common" objects */
543 #define REDIS_SHARED_INTEGERS 10000
544 struct sharedObjectsStruct
{
545 robj
*crlf
, *ok
, *err
, *emptybulk
, *czero
, *cone
, *pong
, *space
,
546 *colon
, *nullbulk
, *nullmultibulk
, *queued
,
547 *emptymultibulk
, *wrongtypeerr
, *nokeyerr
, *syntaxerr
, *sameobjecterr
,
548 *outofrangeerr
, *plus
,
549 *select0
, *select1
, *select2
, *select3
, *select4
,
550 *select5
, *select6
, *select7
, *select8
, *select9
,
551 *messagebulk
, *pmessagebulk
, *subscribebulk
, *unsubscribebulk
, *mbulk3
,
552 *mbulk4
, *psubscribebulk
, *punsubscribebulk
,
553 *integers
[REDIS_SHARED_INTEGERS
];
556 /* Global vars that are actually used as constants. The following double
557 * values are used for double on-disk serialization, and are initialized
558 * at runtime to avoid strange compiler optimizations. */
560 static double R_Zero
, R_PosInf
, R_NegInf
, R_Nan
;
562 /* VM threaded I/O request message */
563 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
564 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
565 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
566 typedef struct iojob
{
567 int type
; /* Request type, REDIS_IOJOB_* */
568 redisDb
*db
;/* Redis database */
569 robj
*key
; /* This I/O request is about swapping this key */
570 robj
*id
; /* Unique identifier of this job:
571 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
572 vmpointer object for REDIS_IOJOB_LOAD. */
573 robj
*val
; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
574 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
575 off_t page
; /* Swap page where to read/write the object */
576 off_t pages
; /* Swap pages needed to save object. PREPARE_SWAP return val */
577 int canceled
; /* True if this command was canceled by blocking side of VM */
578 pthread_t thread
; /* ID of the thread processing this entry */
581 /*================================ Prototypes =============================== */
583 static void freeStringObject(robj
*o
);
584 static void freeListObject(robj
*o
);
585 static void freeSetObject(robj
*o
);
586 static void decrRefCount(void *o
);
587 static robj
*createObject(int type
, void *ptr
);
588 static void freeClient(redisClient
*c
);
589 static int rdbLoad(char *filename
);
590 static void addReply(redisClient
*c
, robj
*obj
);
591 static void addReplySds(redisClient
*c
, sds s
);
592 static void incrRefCount(robj
*o
);
593 static int rdbSaveBackground(char *filename
);
594 static robj
*createStringObject(char *ptr
, size_t len
);
595 static robj
*dupStringObject(robj
*o
);
596 static void replicationFeedSlaves(list
*slaves
, int dictid
, robj
**argv
, int argc
);
597 static void replicationFeedMonitors(list
*monitors
, int dictid
, robj
**argv
, int argc
);
598 static void flushAppendOnlyFile(void);
599 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
600 static int syncWithMaster(void);
601 static robj
*tryObjectEncoding(robj
*o
);
602 static robj
*getDecodedObject(robj
*o
);
603 static int removeExpire(redisDb
*db
, robj
*key
);
604 static int expireIfNeeded(redisDb
*db
, robj
*key
);
605 static int deleteIfVolatile(redisDb
*db
, robj
*key
);
606 static int dbDelete(redisDb
*db
, robj
*key
);
607 static time_t getExpire(redisDb
*db
, robj
*key
);
608 static int setExpire(redisDb
*db
, robj
*key
, time_t when
);
609 static void updateSlavesWaitingBgsave(int bgsaveerr
);
610 static void freeMemoryIfNeeded(void);
611 static int processCommand(redisClient
*c
);
612 static void setupSigSegvAction(void);
613 static void rdbRemoveTempFile(pid_t childpid
);
614 static void aofRemoveTempFile(pid_t childpid
);
615 static size_t stringObjectLen(robj
*o
);
616 static void processInputBuffer(redisClient
*c
);
617 static zskiplist
*zslCreate(void);
618 static void zslFree(zskiplist
*zsl
);
619 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
);
620 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
621 static void initClientMultiState(redisClient
*c
);
622 static void freeClientMultiState(redisClient
*c
);
623 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
);
624 static void unblockClientWaitingData(redisClient
*c
);
625 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
);
626 static void vmInit(void);
627 static void vmMarkPagesFree(off_t page
, off_t count
);
628 static robj
*vmLoadObject(robj
*o
);
629 static robj
*vmPreviewObject(robj
*o
);
630 static int vmSwapOneObjectBlocking(void);
631 static int vmSwapOneObjectThreaded(void);
632 static int vmCanSwapOut(void);
633 static int tryFreeOneObjectFromFreelist(void);
634 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
635 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
636 static void vmCancelThreadedIOJob(robj
*o
);
637 static void lockThreadedIO(void);
638 static void unlockThreadedIO(void);
639 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
);
640 static void freeIOJob(iojob
*j
);
641 static void queueIOJob(iojob
*j
);
642 static int vmWriteObjectOnSwap(robj
*o
, off_t page
);
643 static robj
*vmReadObjectFromSwap(off_t page
, int type
);
644 static void waitEmptyIOJobsQueue(void);
645 static void vmReopenSwapFile(void);
646 static int vmFreePage(off_t page
);
647 static void zunionInterBlockClientOnSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
);
648 static void execBlockClientOnSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
);
649 static int blockClientOnSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
);
650 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
);
651 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
);
652 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
653 static struct redisCommand
*lookupCommand(char *name
);
654 static void call(redisClient
*c
, struct redisCommand
*cmd
);
655 static void resetClient(redisClient
*c
);
656 static void convertToRealHash(robj
*o
);
657 static void listTypeConvert(robj
*o
, int enc
);
658 static void setTypeConvert(robj
*o
, int enc
);
659 static int pubsubUnsubscribeAllChannels(redisClient
*c
, int notify
);
660 static int pubsubUnsubscribeAllPatterns(redisClient
*c
, int notify
);
661 static void freePubsubPattern(void *p
);
662 static int listMatchPubsubPattern(void *a
, void *b
);
663 static int compareStringObjects(robj
*a
, robj
*b
);
664 static int equalStringObjects(robj
*a
, robj
*b
);
666 static int rewriteAppendOnlyFileBackground(void);
667 static vmpointer
*vmSwapObjectBlocking(robj
*val
);
/* Declared with an explicit (void) parameter list, like the other
 * no-argument prototypes in this section, so that a call passing arguments
 * is diagnosed instead of silently accepted. */
668 static int prepareForShutdown(void);
669 static void touchWatchedKey(redisDb
*db
, robj
*key
);
670 static void touchWatchedKeysOnFlush(int dbid
);
671 static void unwatchAllKeys(redisClient
*c
);
673 static void authCommand(redisClient
*c
);
674 static void pingCommand(redisClient
*c
);
675 static void echoCommand(redisClient
*c
);
676 static void setCommand(redisClient
*c
);
677 static void setnxCommand(redisClient
*c
);
678 static void setexCommand(redisClient
*c
);
679 static void getCommand(redisClient
*c
);
680 static void delCommand(redisClient
*c
);
681 static void existsCommand(redisClient
*c
);
682 static void incrCommand(redisClient
*c
);
683 static void decrCommand(redisClient
*c
);
684 static void incrbyCommand(redisClient
*c
);
685 static void decrbyCommand(redisClient
*c
);
686 static void selectCommand(redisClient
*c
);
687 static void randomkeyCommand(redisClient
*c
);
688 static void keysCommand(redisClient
*c
);
689 static void dbsizeCommand(redisClient
*c
);
690 static void lastsaveCommand(redisClient
*c
);
691 static void saveCommand(redisClient
*c
);
692 static void bgsaveCommand(redisClient
*c
);
693 static void bgrewriteaofCommand(redisClient
*c
);
694 static void shutdownCommand(redisClient
*c
);
695 static void moveCommand(redisClient
*c
);
696 static void renameCommand(redisClient
*c
);
697 static void renamenxCommand(redisClient
*c
);
698 static void lpushCommand(redisClient
*c
);
699 static void rpushCommand(redisClient
*c
);
700 static void lpopCommand(redisClient
*c
);
701 static void rpopCommand(redisClient
*c
);
702 static void llenCommand(redisClient
*c
);
703 static void lindexCommand(redisClient
*c
);
704 static void lrangeCommand(redisClient
*c
);
705 static void ltrimCommand(redisClient
*c
);
706 static void typeCommand(redisClient
*c
);
707 static void lsetCommand(redisClient
*c
);
708 static void saddCommand(redisClient
*c
);
709 static void sremCommand(redisClient
*c
);
710 static void smoveCommand(redisClient
*c
);
711 static void sismemberCommand(redisClient
*c
);
712 static void scardCommand(redisClient
*c
);
713 static void spopCommand(redisClient
*c
);
714 static void srandmemberCommand(redisClient
*c
);
715 static void sinterCommand(redisClient
*c
);
716 static void sinterstoreCommand(redisClient
*c
);
717 static void sunionCommand(redisClient
*c
);
718 static void sunionstoreCommand(redisClient
*c
);
719 static void sdiffCommand(redisClient
*c
);
720 static void sdiffstoreCommand(redisClient
*c
);
721 static void syncCommand(redisClient
*c
);
722 static void flushdbCommand(redisClient
*c
);
723 static void flushallCommand(redisClient
*c
);
724 static void sortCommand(redisClient
*c
);
725 static void lremCommand(redisClient
*c
);
726 static void rpoplpushcommand(redisClient
*c
);
727 static void infoCommand(redisClient
*c
);
728 static void mgetCommand(redisClient
*c
);
729 static void monitorCommand(redisClient
*c
);
730 static void expireCommand(redisClient
*c
);
731 static void expireatCommand(redisClient
*c
);
732 static void getsetCommand(redisClient
*c
);
733 static void ttlCommand(redisClient
*c
);
734 static void slaveofCommand(redisClient
*c
);
735 static void debugCommand(redisClient
*c
);
736 static void msetCommand(redisClient
*c
);
737 static void msetnxCommand(redisClient
*c
);
738 static void zaddCommand(redisClient
*c
);
739 static void zincrbyCommand(redisClient
*c
);
740 static void zrangeCommand(redisClient
*c
);
741 static void zrangebyscoreCommand(redisClient
*c
);
742 static void zcountCommand(redisClient
*c
);
743 static void zrevrangeCommand(redisClient
*c
);
744 static void zcardCommand(redisClient
*c
);
745 static void zremCommand(redisClient
*c
);
746 static void zscoreCommand(redisClient
*c
);
747 static void zremrangebyscoreCommand(redisClient
*c
);
748 static void multiCommand(redisClient
*c
);
749 static void execCommand(redisClient
*c
);
750 static void discardCommand(redisClient
*c
);
751 static void blpopCommand(redisClient
*c
);
752 static void brpopCommand(redisClient
*c
);
753 static void appendCommand(redisClient
*c
);
754 static void substrCommand(redisClient
*c
);
755 static void zrankCommand(redisClient
*c
);
756 static void zrevrankCommand(redisClient
*c
);
757 static void hsetCommand(redisClient
*c
);
758 static void hsetnxCommand(redisClient
*c
);
759 static void hgetCommand(redisClient
*c
);
760 static void hmsetCommand(redisClient
*c
);
761 static void hmgetCommand(redisClient
*c
);
762 static void hdelCommand(redisClient
*c
);
763 static void hlenCommand(redisClient
*c
);
764 static void zremrangebyrankCommand(redisClient
*c
);
765 static void zunionstoreCommand(redisClient
*c
);
766 static void zinterstoreCommand(redisClient
*c
);
767 static void hkeysCommand(redisClient
*c
);
768 static void hvalsCommand(redisClient
*c
);
769 static void hgetallCommand(redisClient
*c
);
770 static void hexistsCommand(redisClient
*c
);
771 static void configCommand(redisClient
*c
);
772 static void hincrbyCommand(redisClient
*c
);
773 static void subscribeCommand(redisClient
*c
);
774 static void unsubscribeCommand(redisClient
*c
);
775 static void psubscribeCommand(redisClient
*c
);
776 static void punsubscribeCommand(redisClient
*c
);
777 static void publishCommand(redisClient
*c
);
778 static void watchCommand(redisClient
*c
);
779 static void unwatchCommand(redisClient
*c
);
781 /*================================= Globals ================================= */
784 static struct redisServer server
; /* server global state */
785 static struct redisCommand
*commandTable
;
786 static struct redisCommand readonlyCommandTable
[] = {
787 {"get",getCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
788 {"set",setCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
789 {"setnx",setnxCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
790 {"setex",setexCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
791 {"append",appendCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
792 {"substr",substrCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
793 {"del",delCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
794 {"exists",existsCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
795 {"incr",incrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
796 {"decr",decrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
797 {"mget",mgetCommand
,-2,REDIS_CMD_INLINE
,NULL
,1,-1,1},
798 {"rpush",rpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
799 {"lpush",lpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
800 {"rpop",rpopCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
801 {"lpop",lpopCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
802 {"brpop",brpopCommand
,-3,REDIS_CMD_INLINE
,NULL
,1,1,1},
803 {"blpop",blpopCommand
,-3,REDIS_CMD_INLINE
,NULL
,1,1,1},
804 {"llen",llenCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
805 {"lindex",lindexCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
806 {"lset",lsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
807 {"lrange",lrangeCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
808 {"ltrim",ltrimCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
809 {"lrem",lremCommand
,4,REDIS_CMD_BULK
,NULL
,1,1,1},
810 {"rpoplpush",rpoplpushcommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,2,1},
811 {"sadd",saddCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
812 {"srem",sremCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
813 {"smove",smoveCommand
,4,REDIS_CMD_BULK
,NULL
,1,2,1},
814 {"sismember",sismemberCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
815 {"scard",scardCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
816 {"spop",spopCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
817 {"srandmember",srandmemberCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
818 {"sinter",sinterCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,-1,1},
819 {"sinterstore",sinterstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,2,-1,1},
820 {"sunion",sunionCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,-1,1},
821 {"sunionstore",sunionstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,2,-1,1},
822 {"sdiff",sdiffCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,-1,1},
823 {"sdiffstore",sdiffstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,2,-1,1},
824 {"smembers",sinterCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
825 {"zadd",zaddCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
826 {"zincrby",zincrbyCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
827 {"zrem",zremCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
828 {"zremrangebyscore",zremrangebyscoreCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
829 {"zremrangebyrank",zremrangebyrankCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
830 {"zunionstore",zunionstoreCommand
,-4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,zunionInterBlockClientOnSwappedKeys
,0,0,0},
831 {"zinterstore",zinterstoreCommand
,-4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,zunionInterBlockClientOnSwappedKeys
,0,0,0},
832 {"zrange",zrangeCommand
,-4,REDIS_CMD_INLINE
,NULL
,1,1,1},
833 {"zrangebyscore",zrangebyscoreCommand
,-4,REDIS_CMD_INLINE
,NULL
,1,1,1},
834 {"zcount",zcountCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
835 {"zrevrange",zrevrangeCommand
,-4,REDIS_CMD_INLINE
,NULL
,1,1,1},
836 {"zcard",zcardCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
837 {"zscore",zscoreCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
838 {"zrank",zrankCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
839 {"zrevrank",zrevrankCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
840 {"hset",hsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
841 {"hsetnx",hsetnxCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
842 {"hget",hgetCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
843 {"hmset",hmsetCommand
,-4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
844 {"hmget",hmgetCommand
,-3,REDIS_CMD_BULK
,NULL
,1,1,1},
845 {"hincrby",hincrbyCommand
,4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
846 {"hdel",hdelCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
847 {"hlen",hlenCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
848 {"hkeys",hkeysCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
849 {"hvals",hvalsCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
850 {"hgetall",hgetallCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
851 {"hexists",hexistsCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
852 {"incrby",incrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
853 {"decrby",decrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
854 {"getset",getsetCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
855 {"mset",msetCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,-1,2},
856 {"msetnx",msetnxCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,-1,2},
857 {"randomkey",randomkeyCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
858 {"select",selectCommand
,2,REDIS_CMD_INLINE
,NULL
,0,0,0},
859 {"move",moveCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
860 {"rename",renameCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
861 {"renamenx",renamenxCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
862 {"expire",expireCommand
,3,REDIS_CMD_INLINE
,NULL
,0,0,0},
863 {"expireat",expireatCommand
,3,REDIS_CMD_INLINE
,NULL
,0,0,0},
864 {"keys",keysCommand
,2,REDIS_CMD_INLINE
,NULL
,0,0,0},
865 {"dbsize",dbsizeCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
866 {"auth",authCommand
,2,REDIS_CMD_INLINE
,NULL
,0,0,0},
867 {"ping",pingCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
868 {"echo",echoCommand
,2,REDIS_CMD_BULK
,NULL
,0,0,0},
869 {"save",saveCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
870 {"bgsave",bgsaveCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
871 {"bgrewriteaof",bgrewriteaofCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
872 {"shutdown",shutdownCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
873 {"lastsave",lastsaveCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
874 {"type",typeCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
875 {"multi",multiCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
876 {"exec",execCommand
,1,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,execBlockClientOnSwappedKeys
,0,0,0},
877 {"discard",discardCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
878 {"sync",syncCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
879 {"flushdb",flushdbCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
880 {"flushall",flushallCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
881 {"sort",sortCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
882 {"info",infoCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
883 {"monitor",monitorCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
884 {"ttl",ttlCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
885 {"slaveof",slaveofCommand
,3,REDIS_CMD_INLINE
,NULL
,0,0,0},
886 {"debug",debugCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
887 {"config",configCommand
,-2,REDIS_CMD_BULK
,NULL
,0,0,0},
888 {"subscribe",subscribeCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
889 {"unsubscribe",unsubscribeCommand
,-1,REDIS_CMD_INLINE
,NULL
,0,0,0},
890 {"psubscribe",psubscribeCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
891 {"punsubscribe",punsubscribeCommand
,-1,REDIS_CMD_INLINE
,NULL
,0,0,0},
892 {"publish",publishCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_FORCE_REPLICATION
,NULL
,0,0,0},
893 {"watch",watchCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
894 {"unwatch",unwatchCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0}
897 /*============================ Utility functions ============================ */
899 /* Glob-style pattern matching. */
900 static int stringmatchlen(const char *pattern
, int patternLen
,
901 const char *string
, int stringLen
, int nocase
)
906 while (pattern
[1] == '*') {
911 return 1; /* match */
913 if (stringmatchlen(pattern
+1, patternLen
-1,
914 string
, stringLen
, nocase
))
915 return 1; /* match */
919 return 0; /* no match */
923 return 0; /* no match */
933 not = pattern
[0] == '^';
940 if (pattern
[0] == '\\') {
943 if (pattern
[0] == string
[0])
945 } else if (pattern
[0] == ']') {
947 } else if (patternLen
== 0) {
951 } else if (pattern
[1] == '-' && patternLen
>= 3) {
952 int start
= pattern
[0];
953 int end
= pattern
[2];
961 start
= tolower(start
);
967 if (c
>= start
&& c
<= end
)
971 if (pattern
[0] == string
[0])
974 if (tolower((int)pattern
[0]) == tolower((int)string
[0]))
984 return 0; /* no match */
990 if (patternLen
>= 2) {
997 if (pattern
[0] != string
[0])
998 return 0; /* no match */
1000 if (tolower((int)pattern
[0]) != tolower((int)string
[0]))
1001 return 0; /* no match */
1009 if (stringLen
== 0) {
1010 while(*pattern
== '*') {
1017 if (patternLen
== 0 && stringLen
== 0)
/* Glob-style match of two NUL-terminated strings.
 *
 * Convenience wrapper around stringmatchlen(): both `pattern` and `string`
 * are measured with strlen() and passed on together with `nocase`.
 * The value returned by stringmatchlen() is returned unchanged. */
1022 static int stringmatch(const char *pattern
, const char *string
, int nocase
) {
1023 return stringmatchlen(pattern
,strlen(pattern
),string
,strlen(string
),nocase
);
1026 /* Convert a string representing an amount of memory into the number of
1027 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
1030 * On parsing error, if *err is not NULL, it's set to 1, otherwise it's
1032 static long long memtoll(const char *p
, int *err
) {
1035 long mul
; /* unit multiplier */
1037 unsigned int digits
;
1040 /* Search the first non digit character. */
1043 while(*u
&& isdigit(*u
)) u
++;
1044 if (*u
== '\0' || !strcasecmp(u
,"b")) {
1046 } else if (!strcasecmp(u
,"k")) {
1048 } else if (!strcasecmp(u
,"kb")) {
1050 } else if (!strcasecmp(u
,"m")) {
1052 } else if (!strcasecmp(u
,"mb")) {
1054 } else if (!strcasecmp(u
,"g")) {
1055 mul
= 1000L*1000*1000;
1056 } else if (!strcasecmp(u
,"gb")) {
1057 mul
= 1024L*1024*1024;
1063 if (digits
>= sizeof(buf
)) {
1067 memcpy(buf
,p
,digits
);
1069 val
= strtoll(buf
,NULL
,10);
1073 /* Convert a long long into a string. Returns the number of
1074 * characters needed to represent the number, that can be shorter if passed
1075 * buffer length is not enough to store the whole number. */
1076 static int ll2string(char *s
, size_t len
, long long value
) {
1078 unsigned long long v
;
1081 if (len
== 0) return 0;
1082 v
= (value
< 0) ? -value
: value
;
1083 p
= buf
+31; /* point to the last character */
1088 if (value
< 0) *p
-- = '-';
1091 if (l
+1 > len
) l
= len
-1; /* Make sure it fits, including the nul term */
1097 static void redisLog(int level
, const char *fmt
, ...) {
1101 fp
= (server
.logfile
== NULL
) ? stdout
: fopen(server
.logfile
,"a");
1105 if (level
>= server
.verbosity
) {
1111 strftime(buf
,64,"%d %b %H:%M:%S",localtime(&now
));
1112 fprintf(fp
,"[%d] %s %c ",(int)getpid(),buf
,c
[level
]);
1113 vfprintf(fp
, fmt
, ap
);
1119 if (server
.logfile
) fclose(fp
);
1122 /*====================== Hash table type implementation ==================== */
1124 /* This is a hash table type that uses the SDS dynamic strings library as
1125 * keys and redis objects as values (objects can hold SDS strings,
1128 static void dictVanillaFree(void *privdata
, void *val
)
1130 DICT_NOTUSED(privdata
);
1134 static void dictListDestructor(void *privdata
, void *val
)
1136 DICT_NOTUSED(privdata
);
1137 listRelease((list
*)val
);
1140 static int dictSdsKeyCompare(void *privdata
, const void *key1
,
1144 DICT_NOTUSED(privdata
);
1146 l1
= sdslen((sds
)key1
);
1147 l2
= sdslen((sds
)key2
);
1148 if (l1
!= l2
) return 0;
1149 return memcmp(key1
, key2
, l1
) == 0;
1152 static void dictRedisObjectDestructor(void *privdata
, void *val
)
1154 DICT_NOTUSED(privdata
);
1156 if (val
== NULL
) return; /* Values of swapped out keys as set to NULL */
1160 static void dictSdsDestructor(void *privdata
, void *val
)
1162 DICT_NOTUSED(privdata
);
1167 static int dictObjKeyCompare(void *privdata
, const void *key1
,
1170 const robj
*o1
= key1
, *o2
= key2
;
1171 return dictSdsKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
/* Hash callback for dictionaries whose keys are robj pointers (it is the
 * hash function of keylistDictType).
 *
 * `key` is read as a const robj; its ptr field is treated as an sds string
 * and sdslen() bytes of it are fed to dictGenHashFunction(). */
1174 static unsigned int dictObjHash(const void *key
) {
1175 const robj
*o
= key
;
1176 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
/* Hash callback for dictionaries whose keys are sds strings (it is the hash
 * function of dbDictType and keyptrDictType).
 *
 * `key` itself is the string: sdslen() bytes starting at `key` are hashed
 * with dictGenHashFunction(). */
1179 static unsigned int dictSdsHash(const void *key
) {
1180 return dictGenHashFunction((unsigned char*)key
, sdslen((char*)key
));
1183 static int dictEncObjKeyCompare(void *privdata
, const void *key1
,
1186 robj
*o1
= (robj
*) key1
, *o2
= (robj
*) key2
;
1189 if (o1
->encoding
== REDIS_ENCODING_INT
&&
1190 o2
->encoding
== REDIS_ENCODING_INT
)
1191 return o1
->ptr
== o2
->ptr
;
1193 o1
= getDecodedObject(o1
);
1194 o2
= getDecodedObject(o2
);
1195 cmp
= dictSdsKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
1201 static unsigned int dictEncObjHash(const void *key
) {
1202 robj
*o
= (robj
*) key
;
1204 if (o
->encoding
== REDIS_ENCODING_RAW
) {
1205 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1207 if (o
->encoding
== REDIS_ENCODING_INT
) {
1211 len
= ll2string(buf
,32,(long)o
->ptr
);
1212 return dictGenHashFunction((unsigned char*)buf
, len
);
1216 o
= getDecodedObject(o
);
1217 hash
= dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1225 static dictType setDictType
= {
1226 dictEncObjHash
, /* hash function */
1229 dictEncObjKeyCompare
, /* key compare */
1230 dictRedisObjectDestructor
, /* key destructor */
1231 NULL
/* val destructor */
1234 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1235 static dictType zsetDictType
= {
1236 dictEncObjHash
, /* hash function */
1239 dictEncObjKeyCompare
, /* key compare */
1240 dictRedisObjectDestructor
, /* key destructor */
1241 dictVanillaFree
/* val destructor of malloc(sizeof(double)) */
1244 /* Db->dict, keys are sds strings, vals are Redis objects. */
1245 static dictType dbDictType
= {
1246 dictSdsHash
, /* hash function */
1249 dictSdsKeyCompare
, /* key compare */
1250 dictSdsDestructor
, /* key destructor */
1251 dictRedisObjectDestructor
/* val destructor */
1255 static dictType keyptrDictType
= {
1256 dictSdsHash
, /* hash function */
1259 dictSdsKeyCompare
, /* key compare */
1260 dictSdsDestructor
, /* key destructor */
1261 NULL
/* val destructor */
1264 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1265 static dictType hashDictType
= {
1266 dictEncObjHash
, /* hash function */
1269 dictEncObjKeyCompare
, /* key compare */
1270 dictRedisObjectDestructor
, /* key destructor */
1271 dictRedisObjectDestructor
/* val destructor */
1274 /* Keylist hash table type has unencoded redis objects as keys and
1275 * lists as values. It's used for blocking operations (BLPOP) and to
1276 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1277 static dictType keylistDictType
= {
1278 dictObjHash
, /* hash function */
1281 dictObjKeyCompare
, /* key compare */
1282 dictRedisObjectDestructor
, /* key destructor */
1283 dictListDestructor
/* val destructor */
1286 static void version();
1288 /* ========================= Random utility functions ======================= */
1290 /* Redis generally does not try to recover from out of memory conditions
1291 * when allocating objects or strings, it is not clear if it will be possible
1292 * to report this condition to the client since the networking layer itself
1293 * is based on heap allocation for send buffers, so we simply abort.
1294 * At least the code will be simpler to read... */
1295 static void oom(const char *msg
) {
1296 redisLog(REDIS_WARNING
, "%s: Out of memory\n",msg
);
1301 /* ====================== Redis server networking stuff ===================== */
1302 static void closeTimedoutClients(void) {
1305 time_t now
= time(NULL
);
1308 listRewind(server
.clients
,&li
);
1309 while ((ln
= listNext(&li
)) != NULL
) {
1310 c
= listNodeValue(ln
);
1311 if (server
.maxidletime
&&
1312 !(c
->flags
& REDIS_SLAVE
) && /* no timeout for slaves */
1313 !(c
->flags
& REDIS_MASTER
) && /* no timeout for masters */
1314 dictSize(c
->pubsub_channels
) == 0 && /* no timeout for pubsub */
1315 listLength(c
->pubsub_patterns
) == 0 &&
1316 (now
- c
->lastinteraction
> server
.maxidletime
))
1318 redisLog(REDIS_VERBOSE
,"Closing idle client");
1320 } else if (c
->flags
& REDIS_BLOCKED
) {
1321 if (c
->blockingto
!= 0 && c
->blockingto
< now
) {
1322 addReply(c
,shared
.nullmultibulk
);
1323 unblockClientWaitingData(c
);
/* Tell whether `dict` is sparse enough to be worth shrinking.
 *
 * Returns non-zero only when all of the following hold:
 *   - the table has slots and holds at least one entry,
 *   - it has more than DICT_HT_INITIAL_SIZE slots,
 *   - its fill percentage (used*100/size) is below REDIS_HT_MINFILL.
 * Otherwise returns 0. */
1329 static int htNeedsResize(dict
*dict
) {
1330 long long size
, used
;
1332 size
= dictSlots(dict
);
1333 used
= dictSize(dict
);
/* `size` is checked for zero first, so the division below is never
 * evaluated with a zero divisor. */
1334 return (size
&& used
&& size
> DICT_HT_INITIAL_SIZE
&&
1335 (used
*100/size
< REDIS_HT_MINFILL
));
1338 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1339 * we resize the hash table to save memory */
1340 static void tryResizeHashTables(void) {
1343 for (j
= 0; j
< server
.dbnum
; j
++) {
1344 if (htNeedsResize(server
.db
[j
].dict
))
1345 dictResize(server
.db
[j
].dict
);
1346 if (htNeedsResize(server
.db
[j
].expires
))
1347 dictResize(server
.db
[j
].expires
);
1351 /* Our hash table implementation performs rehashing incrementally while
1352 * we write/read from the hash table. Still if the server is idle, the hash
1353 * table will use two tables for a long time. So we try to use 1 millisecond
1354 * of CPU time at every serverCron() loop in order to rehash some key. */
1355 static void incrementallyRehash(void) {
1358 for (j
= 0; j
< server
.dbnum
; j
++) {
1359 if (dictIsRehashing(server
.db
[j
].dict
)) {
1360 dictRehashMilliseconds(server
.db
[j
].dict
,1);
1361 break; /* already used our millisecond for this loop... */
1366 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1367 void backgroundSaveDoneHandler(int statloc
) {
1368 int exitcode
= WEXITSTATUS(statloc
);
1369 int bysignal
= WIFSIGNALED(statloc
);
1371 if (!bysignal
&& exitcode
== 0) {
1372 redisLog(REDIS_NOTICE
,
1373 "Background saving terminated with success");
1375 server
.lastsave
= time(NULL
);
1376 } else if (!bysignal
&& exitcode
!= 0) {
1377 redisLog(REDIS_WARNING
, "Background saving error");
1379 redisLog(REDIS_WARNING
,
1380 "Background saving terminated by signal %d", WTERMSIG(statloc
));
1381 rdbRemoveTempFile(server
.bgsavechildpid
);
1383 server
.bgsavechildpid
= -1;
1384 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1385 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1386 updateSlavesWaitingBgsave(exitcode
== 0 ? REDIS_OK
: REDIS_ERR
);
1389 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1391 void backgroundRewriteDoneHandler(int statloc
) {
1392 int exitcode
= WEXITSTATUS(statloc
);
1393 int bysignal
= WIFSIGNALED(statloc
);
1395 if (!bysignal
&& exitcode
== 0) {
1399 redisLog(REDIS_NOTICE
,
1400 "Background append only file rewriting terminated with success");
1401 /* Now it's time to flush the differences accumulated by the parent */
1402 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) server
.bgrewritechildpid
);
1403 fd
= open(tmpfile
,O_WRONLY
|O_APPEND
);
1405 redisLog(REDIS_WARNING
, "Not able to open the temp append only file produced by the child: %s", strerror(errno
));
1408 /* Flush our data... */
1409 if (write(fd
,server
.bgrewritebuf
,sdslen(server
.bgrewritebuf
)) !=
1410 (signed) sdslen(server
.bgrewritebuf
)) {
1411 redisLog(REDIS_WARNING
, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno
));
1415 redisLog(REDIS_NOTICE
,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server
.bgrewritebuf
));
1416 /* Now our work is to rename the temp file into the stable file. And
1417 * switch the file descriptor used by the server for append only. */
1418 if (rename(tmpfile
,server
.appendfilename
) == -1) {
1419 redisLog(REDIS_WARNING
,"Can't rename the temp append only file into the stable one: %s", strerror(errno
));
1423 /* Mission completed... almost */
1424 redisLog(REDIS_NOTICE
,"Append only file successfully rewritten.");
1425 if (server
.appendfd
!= -1) {
1426 /* If append only is actually enabled... */
1427 close(server
.appendfd
);
1428 server
.appendfd
= fd
;
1429 if (server
.appendfsync
!= APPENDFSYNC_NO
) aof_fsync(fd
);
1430 server
.appendseldb
= -1; /* Make sure it will issue SELECT */
1431 redisLog(REDIS_NOTICE
,"The new append only file was selected for future appends.");
1433 /* If append only is disabled we just generate a dump in this
1434 * format. Why not? */
1437 } else if (!bysignal
&& exitcode
!= 0) {
1438 redisLog(REDIS_WARNING
, "Background append only file rewriting error");
1440 redisLog(REDIS_WARNING
,
1441 "Background append only file rewriting terminated by signal %d",
1445 sdsfree(server
.bgrewritebuf
);
1446 server
.bgrewritebuf
= sdsempty();
1447 aofRemoveTempFile(server
.bgrewritechildpid
);
1448 server
.bgrewritechildpid
= -1;
1451 /* This function is called once a background process of some kind terminates,
1452 * as we want to avoid resizing the hash tables when there is a child in order
1453 * to play well with copy-on-write (otherwise when a resize happens lots of
1454 * memory pages are copied). The goal of this function is to update the ability
1455 * for dict.c to resize the hash tables according to whether or not we have
1456 * running children. */
1457 static void updateDictResizePolicy(void) {
1458 if (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1)
1461 dictDisableResize();
1464 static int serverCron(struct aeEventLoop
*eventLoop
, long long id
, void *clientData
) {
1465 int j
, loops
= server
.cronloops
++;
1466 REDIS_NOTUSED(eventLoop
);
1468 REDIS_NOTUSED(clientData
);
1470 /* We take a cached value of the unix time in the global state because
1471 * with virtual memory and aging there is the need to store the current time
1472 * in objects at every object access, and accuracy is not needed.
1473 * To access a global var is faster than calling time(NULL) */
1474 server
.unixtime
= time(NULL
);
1475 /* We have just 21 bits per object for LRU information.
1476 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1478 * When we need to select what object to swap, we compute the minimum
1479 * time distance between the current lruclock and the object last access
1480 * lruclock info. Even if clocks will wrap on overflow, there is
1481 * the interesting property that we are sure that at least
1482 * ABS(A-B) minutes passed between current time and timestamp B.
1484 * This is not precise but we don't need at all precision, but just
1485 * something statistically reasonable.
1487 server
.lruclock
= (time(NULL
)/60)&((1<<21)-1);
1489 /* We received a SIGTERM, shutting down here in a safe way, as it is
1490 * not ok doing so inside the signal handler. */
1491 if (server
.shutdown_asap
) {
1492 if (prepareForShutdown() == REDIS_OK
) exit(0);
1493 redisLog(REDIS_WARNING
,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1496 /* Show some info about non-empty databases */
1497 for (j
= 0; j
< server
.dbnum
; j
++) {
1498 long long size
, used
, vkeys
;
1500 size
= dictSlots(server
.db
[j
].dict
);
1501 used
= dictSize(server
.db
[j
].dict
);
1502 vkeys
= dictSize(server
.db
[j
].expires
);
1503 if (!(loops
% 50) && (used
|| vkeys
)) {
1504 redisLog(REDIS_VERBOSE
,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j
,used
,vkeys
,size
);
1505 /* dictPrintStats(server.dict); */
1509 /* We don't want to resize the hash tables while a background saving
1510 * is in progress: the saving child is created using fork() that is
1511 * implemented with a copy-on-write semantic in most modern systems, so
1512 * if we resize the HT while there is the saving child at work actually
1513 * a lot of memory movements in the parent will cause a lot of pages
1515 if (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1) {
1516 if (!(loops
% 10)) tryResizeHashTables();
1517 if (server
.activerehashing
) incrementallyRehash();
1520 /* Show information about connected clients */
1521 if (!(loops
% 50)) {
1522 redisLog(REDIS_VERBOSE
,"%d clients connected (%d slaves), %zu bytes in use",
1523 listLength(server
.clients
)-listLength(server
.slaves
),
1524 listLength(server
.slaves
),
1525 zmalloc_used_memory());
1528 /* Close connections of timedout clients */
1529 if ((server
.maxidletime
&& !(loops
% 100)) || server
.blpop_blocked_clients
)
1530 closeTimedoutClients();
1532 /* Check if a background saving or AOF rewrite in progress terminated */
1533 if (server
.bgsavechildpid
!= -1 || server
.bgrewritechildpid
!= -1) {
1537 if ((pid
= wait3(&statloc
,WNOHANG
,NULL
)) != 0) {
1538 if (pid
== server
.bgsavechildpid
) {
1539 backgroundSaveDoneHandler(statloc
);
1541 backgroundRewriteDoneHandler(statloc
);
1543 updateDictResizePolicy();
1546 /* If there is not a background saving in progress check if
1547 * we have to save now */
1548 time_t now
= time(NULL
);
1549 for (j
= 0; j
< server
.saveparamslen
; j
++) {
1550 struct saveparam
*sp
= server
.saveparams
+j
;
1552 if (server
.dirty
>= sp
->changes
&&
1553 now
-server
.lastsave
> sp
->seconds
) {
1554 redisLog(REDIS_NOTICE
,"%d changes in %d seconds. Saving...",
1555 sp
->changes
, sp
->seconds
);
1556 rdbSaveBackground(server
.dbfilename
);
1562 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1563 * will use few CPU cycles if there are few expiring keys, otherwise
1564 * it will get more aggressive to avoid that too much memory is used by
1565 * keys that can be removed from the keyspace. */
1566 for (j
= 0; j
< server
.dbnum
; j
++) {
1568 redisDb
*db
= server
.db
+j
;
1570 /* Continue to expire if at the end of the cycle more than 25%
1571 * of the keys were expired. */
1573 long num
= dictSize(db
->expires
);
1574 time_t now
= time(NULL
);
1577 if (num
> REDIS_EXPIRELOOKUPS_PER_CRON
)
1578 num
= REDIS_EXPIRELOOKUPS_PER_CRON
;
1583 if ((de
= dictGetRandomKey(db
->expires
)) == NULL
) break;
1584 t
= (time_t) dictGetEntryVal(de
);
1586 sds key
= dictGetEntryKey(de
);
1587 robj
*keyobj
= createStringObject(key
,sdslen(key
));
1589 dbDelete(db
,keyobj
);
1590 decrRefCount(keyobj
);
1592 server
.stat_expiredkeys
++;
1595 } while (expired
> REDIS_EXPIRELOOKUPS_PER_CRON
/4);
1598 /* Swap a few keys on disk if we are over the memory limit and VM
1599 * is enabled. Try to free objects from the free list first. */
1600 if (vmCanSwapOut()) {
1601 while (server
.vm_enabled
&& zmalloc_used_memory() >
1602 server
.vm_max_memory
)
1606 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
1607 retval
= (server
.vm_max_threads
== 0) ?
1608 vmSwapOneObjectBlocking() :
1609 vmSwapOneObjectThreaded();
1610 if (retval
== REDIS_ERR
&& !(loops
% 300) &&
1611 zmalloc_used_memory() >
1612 (server
.vm_max_memory
+server
.vm_max_memory
/10))
1614 redisLog(REDIS_WARNING
,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1616 /* Note that when using threaded I/O we free just one object,
1617 * because anyway when the I/O thread in charge to swap this
1618 * object out will finish, the handler of completed jobs
1619 * will try to swap more objects if we are still out of memory. */
1620 if (retval
== REDIS_ERR
|| server
.vm_max_threads
> 0) break;
1624 /* Check if we should connect to a MASTER */
1625 if (server
.replstate
== REDIS_REPL_CONNECT
&& !(loops
% 10)) {
1626 redisLog(REDIS_NOTICE
,"Connecting to MASTER...");
1627 if (syncWithMaster() == REDIS_OK
) {
1628 redisLog(REDIS_NOTICE
,"MASTER <-> SLAVE sync succeeded");
1629 if (server
.appendonly
) rewriteAppendOnlyFileBackground();
1635 /* This function gets called every time Redis is entering the
1636 * main loop of the event driven library, that is, before sleeping
1637 * for ready file descriptors. */
1638 static void beforeSleep(struct aeEventLoop
*eventLoop
) {
1639 REDIS_NOTUSED(eventLoop
);
1641 /* Awake clients that got all the swapped keys they requested */
1642 if (server
.vm_enabled
&& listLength(server
.io_ready_clients
)) {
1646 listRewind(server
.io_ready_clients
,&li
);
1647 while((ln
= listNext(&li
))) {
1648 redisClient
*c
= ln
->value
;
1649 struct redisCommand
*cmd
;
1651 /* Resume the client. */
1652 listDelNode(server
.io_ready_clients
,ln
);
1653 c
->flags
&= (~REDIS_IO_WAIT
);
1654 server
.vm_blocked_clients
--;
1655 aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
1656 readQueryFromClient
, c
);
1657 cmd
= lookupCommand(c
->argv
[0]->ptr
);
1658 assert(cmd
!= NULL
);
1661 /* There may be more data to process in the input buffer. */
1662 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0)
1663 processInputBuffer(c
);
1666 /* Write the AOF buffer on disk */
1667 flushAppendOnlyFile();
1670 static void createSharedObjects(void) {
1673 shared
.crlf
= createObject(REDIS_STRING
,sdsnew("\r\n"));
1674 shared
.ok
= createObject(REDIS_STRING
,sdsnew("+OK\r\n"));
1675 shared
.err
= createObject(REDIS_STRING
,sdsnew("-ERR\r\n"));
1676 shared
.emptybulk
= createObject(REDIS_STRING
,sdsnew("$0\r\n\r\n"));
1677 shared
.czero
= createObject(REDIS_STRING
,sdsnew(":0\r\n"));
1678 shared
.cone
= createObject(REDIS_STRING
,sdsnew(":1\r\n"));
1679 shared
.nullbulk
= createObject(REDIS_STRING
,sdsnew("$-1\r\n"));
1680 shared
.nullmultibulk
= createObject(REDIS_STRING
,sdsnew("*-1\r\n"));
1681 shared
.emptymultibulk
= createObject(REDIS_STRING
,sdsnew("*0\r\n"));
1682 shared
.pong
= createObject(REDIS_STRING
,sdsnew("+PONG\r\n"));
1683 shared
.queued
= createObject(REDIS_STRING
,sdsnew("+QUEUED\r\n"));
1684 shared
.wrongtypeerr
= createObject(REDIS_STRING
,sdsnew(
1685 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1686 shared
.nokeyerr
= createObject(REDIS_STRING
,sdsnew(
1687 "-ERR no such key\r\n"));
1688 shared
.syntaxerr
= createObject(REDIS_STRING
,sdsnew(
1689 "-ERR syntax error\r\n"));
1690 shared
.sameobjecterr
= createObject(REDIS_STRING
,sdsnew(
1691 "-ERR source and destination objects are the same\r\n"));
1692 shared
.outofrangeerr
= createObject(REDIS_STRING
,sdsnew(
1693 "-ERR index out of range\r\n"));
1694 shared
.space
= createObject(REDIS_STRING
,sdsnew(" "));
1695 shared
.colon
= createObject(REDIS_STRING
,sdsnew(":"));
1696 shared
.plus
= createObject(REDIS_STRING
,sdsnew("+"));
1697 shared
.select0
= createStringObject("select 0\r\n",10);
1698 shared
.select1
= createStringObject("select 1\r\n",10);
1699 shared
.select2
= createStringObject("select 2\r\n",10);
1700 shared
.select3
= createStringObject("select 3\r\n",10);
1701 shared
.select4
= createStringObject("select 4\r\n",10);
1702 shared
.select5
= createStringObject("select 5\r\n",10);
1703 shared
.select6
= createStringObject("select 6\r\n",10);
1704 shared
.select7
= createStringObject("select 7\r\n",10);
1705 shared
.select8
= createStringObject("select 8\r\n",10);
1706 shared
.select9
= createStringObject("select 9\r\n",10);
1707 shared
.messagebulk
= createStringObject("$7\r\nmessage\r\n",13);
1708 shared
.pmessagebulk
= createStringObject("$8\r\npmessage\r\n",14);
1709 shared
.subscribebulk
= createStringObject("$9\r\nsubscribe\r\n",15);
1710 shared
.unsubscribebulk
= createStringObject("$11\r\nunsubscribe\r\n",18);
1711 shared
.psubscribebulk
= createStringObject("$10\r\npsubscribe\r\n",17);
1712 shared
.punsubscribebulk
= createStringObject("$12\r\npunsubscribe\r\n",19);
1713 shared
.mbulk3
= createStringObject("*3\r\n",4);
1714 shared
.mbulk4
= createStringObject("*4\r\n",4);
1715 for (j
= 0; j
< REDIS_SHARED_INTEGERS
; j
++) {
1716 shared
.integers
[j
] = createObject(REDIS_STRING
,(void*)(long)j
);
1717 shared
.integers
[j
]->encoding
= REDIS_ENCODING_INT
;
/* Append one save point to the server configuration.
 *
 * Grows server.saveparams by one struct saveparam with zrealloc(), stores
 * `seconds` and `changes` in the new last element and increments
 * server.saveparamslen.
 *
 * serverCron() walks this array and starts rdbSaveBackground() when
 * server.dirty >= changes and more than `seconds` seconds have passed since
 * server.lastsave. */
1721 static void appendServerSaveParams(time_t seconds
, int changes
) {
1722 server
.saveparams
= zrealloc(server
.saveparams
,sizeof(struct saveparam
)*(server
.saveparamslen
+1));
1723 server
.saveparams
[server
.saveparamslen
].seconds
= seconds
;
1724 server
.saveparams
[server
.saveparamslen
].changes
= changes
;
1725 server
.saveparamslen
++;
/* Discard every configured save point.
 *
 * Frees the server.saveparams array and resets the pointer to NULL and the
 * length to 0, so that a following appendServerSaveParams() starts from an
 * empty list. */
1728 static void resetServerSaveParams() {
1729 zfree(server
.saveparams
);
1730 server
.saveparams
= NULL
;
1731 server
.saveparamslen
= 0;
/* initServerConfig: fill the global `server` structure with its built-in
 * defaults: database count, port, verbosity, idle timeout, persistence
 * settings, VM settings and the zipmap/ziplist/intset thresholds. It then
 * clears the save rules and installs the three default ones, resets the
 * replication fields and computes the R_PosInf / R_NegInf / R_Nan doubles.
 * String defaults (pidfile, dbfilename, appendfilename, vm_swap_file) are
 * allocated with zstrdup(). */
1734 static void initServerConfig() {
1735 server
.dbnum
= REDIS_DEFAULT_DBNUM
;
1736 server
.port
= REDIS_SERVERPORT
;
1737 server
.verbosity
= REDIS_VERBOSE
;
1738 server
.maxidletime
= REDIS_MAXIDLETIME
;
1739 server
.saveparams
= NULL
;
1740 server
.logfile
= NULL
; /* NULL = log on standard output */
1741 server
.bindaddr
= NULL
;
1742 server
.glueoutputbuf
= 1;
1743 server
.daemonize
= 0;
1744 server
.appendonly
= 0;
1745 server
.appendfsync
= APPENDFSYNC_EVERYSEC
;
1746 server
.no_appendfsync_on_rewrite
= 0;
1747 server
.lastfsync
= time(NULL
);
1748 server
.appendfd
= -1;
1749 server
.appendseldb
= -1; /* Make sure the first time will not match */
1750 server
.pidfile
= zstrdup("/var/run/redis.pid");
1751 server
.dbfilename
= zstrdup("dump.rdb");
1752 server
.appendfilename
= zstrdup("appendonly.aof");
1753 server
.requirepass
= NULL
;
1754 server
.rdbcompression
= 1;
1755 server
.activerehashing
= 1;
1756 server
.maxclients
= 0;
1757 server
.blpop_blocked_clients
= 0;
1758 server
.maxmemory
= 0;
1759 server
.vm_enabled
= 0;
1760 server
.vm_swap_file
= zstrdup("/tmp/redis-%p.vm");
1761 server
.vm_page_size
= 256; /* 256 bytes per page */
1762 server
.vm_pages
= 1024*1024*100; /* 104 millions of pages */
1763 server
.vm_max_memory
= 1024LL*1024*1024*1; /* 1 GB of RAM */
1764 server
.vm_max_threads
= 4;
1765 server
.vm_blocked_clients
= 0;
1766 server
.hash_max_zipmap_entries
= REDIS_HASH_MAX_ZIPMAP_ENTRIES
;
1767 server
.hash_max_zipmap_value
= REDIS_HASH_MAX_ZIPMAP_VALUE
;
1768 server
.list_max_ziplist_entries
= REDIS_LIST_MAX_ZIPLIST_ENTRIES
;
1769 server
.list_max_ziplist_value
= REDIS_LIST_MAX_ZIPLIST_VALUE
;
1770 server
.set_max_intset_entries
= REDIS_SET_MAX_INTSET_ENTRIES
;
1771 server
.shutdown_asap
= 0;
/* Start from an empty rule list, then install the default save points. */
1773 resetServerSaveParams();
1775 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1776 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1777 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1778 /* Replication related */
1780 server
.masterauth
= NULL
;
1781 server
.masterhost
= NULL
;
1782 server
.masterport
= 6379;
1783 server
.master
= NULL
;
1784 server
.replstate
= REDIS_REPL_NONE
;
1786 /* Double constants initialization */
/* NOTE(review): these divisions only yield +inf, -inf and NaN if R_Zero is
 * 0.0; its initialisation is not visible in this span -- confirm. */
1788 R_PosInf
= 1.0/R_Zero
;
1789 R_NegInf
= -1.0/R_Zero
;
1790 R_Nan
= R_Zero
/R_Zero
;
/* initServer: bring up the runtime state of the server after the
 * configuration has been loaded. It ignores SIGHUP/SIGPIPE, installs the
 * SIGSEGV handler, opens /dev/null, creates the client/slave/monitor lists,
 * the shared objects, the event loop, the listening TCP socket and one set
 * of dictionaries per database, initialises the Pub/Sub structures and the
 * statistics, registers the serverCron timer and the accept handler, opens
 * the append only file when enabled and initialises VM when enabled. */
1793 static void initServer() {
1796 signal(SIGHUP
, SIG_IGN
);
1797 signal(SIGPIPE
, SIG_IGN
);
1798 setupSigSegvAction();
1800 server
.devnull
= fopen("/dev/null","w");
1801 if (server
.devnull
== NULL
) {
/* fopen() reports its failure through errno; server.neterr is the network
 * error buffer and holds nothing meaningful at this point. */
1802 redisLog(REDIS_WARNING
, "Can't open /dev/null: %s", strerror(errno
));
1805 server
.clients
= listCreate();
1806 server
.slaves
= listCreate();
1807 server
.monitors
= listCreate();
1808 server
.objfreelist
= listCreate();
1809 createSharedObjects();
1810 server
.el
= aeCreateEventLoop();
1811 server
.db
= zmalloc(sizeof(redisDb
)*server
.dbnum
);
1812 server
.fd
= anetTcpServer(server
.neterr
, server
.port
, server
.bindaddr
);
1813 if (server
.fd
== -1) {
1814 redisLog(REDIS_WARNING
, "Opening TCP port: %s", server
.neterr
);
/* Per-database dictionaries; io_keys is only created when VM is enabled. */
1817 for (j
= 0; j
< server
.dbnum
; j
++) {
1818 server
.db
[j
].dict
= dictCreate(&dbDictType
,NULL
);
1819 server
.db
[j
].expires
= dictCreate(&keyptrDictType
,NULL
);
1820 server
.db
[j
].blocking_keys
= dictCreate(&keylistDictType
,NULL
);
1821 server
.db
[j
].watched_keys
= dictCreate(&keylistDictType
,NULL
);
1822 if (server
.vm_enabled
)
1823 server
.db
[j
].io_keys
= dictCreate(&keylistDictType
,NULL
);
1824 server
.db
[j
].id
= j
;
1826 server
.pubsub_channels
= dictCreate(&keylistDictType
,NULL
);
1827 server
.pubsub_patterns
= listCreate();
1828 listSetFreeMethod(server
.pubsub_patterns
,freePubsubPattern
);
1829 listSetMatchMethod(server
.pubsub_patterns
,listMatchPubsubPattern
);
1830 server
.cronloops
= 0;
1831 server
.bgsavechildpid
= -1;
1832 server
.bgrewritechildpid
= -1;
1833 server
.bgrewritebuf
= sdsempty();
1834 server
.aofbuf
= sdsempty();
1835 server
.lastsave
= time(NULL
);
1837 server
.stat_numcommands
= 0;
1838 server
.stat_numconnections
= 0;
1839 server
.stat_expiredkeys
= 0;
1840 server
.stat_starttime
= time(NULL
);
1841 server
.unixtime
= time(NULL
);
1842 aeCreateTimeEvent(server
.el
, 1, serverCron
, NULL
, NULL
);
1843 if (aeCreateFileEvent(server
.el
, server
.fd
, AE_READABLE
,
1844 acceptHandler
, NULL
) == AE_ERR
) oom("creating file event");
1846 if (server
.appendonly
) {
1847 server
.appendfd
= open(server
.appendfilename
,O_WRONLY
|O_APPEND
|O_CREAT
,0644);
1848 if (server
.appendfd
== -1) {
1849 redisLog(REDIS_WARNING
, "Can't open the append-only file: %s",
1855 if (server
.vm_enabled
) vmInit();
1858 /* Empty the whole database */
1859 static long long emptyDb() {
1861 long long removed
= 0;
1863 for (j
= 0; j
< server
.dbnum
; j
++) {
1864 removed
+= dictSize(server
.db
[j
].dict
);
1865 dictEmpty(server
.db
[j
].dict
);
1866 dictEmpty(server
.db
[j
].expires
);
/* Translate a configuration boolean into an integer.
 *
 * s - NUL terminated argument taken from the configuration line.
 *
 * Returns 1 for "yes", 0 for "no" (both matched case-insensitively) and -1
 * for anything else. The explicit -1 is what the configuration loader
 * compares against to reject an invalid argument; without it the function
 * would fall off the end and the value seen by the caller is undefined. */
static int yesnotoi(char *s) {
    if (!strcasecmp(s,"yes")) return 1;
    else if (!strcasecmp(s,"no")) return 0;
    else return -1;
}
/* loadServerConfig: read the configuration from `filename` ("-" selects
 * standard input) one line at a time. Each line is trimmed, comment and
 * blank lines are skipped, the rest is split on single spaces and the
 * first word, lowercased, selects the directive to apply to the global
 * `server` structure. An invalid value or an unknown directive jumps to
 * the `loaderr` path, which prints the line number, the offending line and
 * the error text on stderr. "include" loads another file recursively. */
1877 /* I agree, this is a very rudimentary way to load a configuration...
1878 will improve later if the config gets more complex */
1879 static void loadServerConfig(char *filename
) {
1881 char buf
[REDIS_CONFIGLINE_MAX
+1], *err
= NULL
;
1885 if (filename
[0] == '-' && filename
[1] == '\0')
1888 if ((fp
= fopen(filename
,"r")) == NULL
) {
1889 redisLog(REDIS_WARNING
, "Fatal error, can't open config file '%s'", filename
);
1894 while(fgets(buf
,REDIS_CONFIGLINE_MAX
+1,fp
) != NULL
) {
1900 line
= sdstrim(line
," \t\r\n");
1902 /* Skip comments and blank lines*/
1903 if (line
[0] == '#' || line
[0] == '\0') {
1908 /* Split into arguments */
1909 argv
= sdssplitlen(line
,sdslen(line
)," ",1,&argc
);
1910 sdstolower(argv
[0]);
1912 /* Execute config directives */
1913 if (!strcasecmp(argv
[0],"timeout") && argc
== 2) {
1914 server
.maxidletime
= atoi(argv
[1]);
1915 if (server
.maxidletime
< 0) {
1916 err
= "Invalid timeout value"; goto loaderr
;
1918 } else if (!strcasecmp(argv
[0],"port") && argc
== 2) {
1919 server
.port
= atoi(argv
[1]);
1920 if (server
.port
< 1 || server
.port
> 65535) {
1921 err
= "Invalid port"; goto loaderr
;
1923 } else if (!strcasecmp(argv
[0],"bind") && argc
== 2) {
1924 server
.bindaddr
= zstrdup(argv
[1]);
1925 } else if (!strcasecmp(argv
[0],"save") && argc
== 3) {
1926 int seconds
= atoi(argv
[1]);
1927 int changes
= atoi(argv
[2]);
1928 if (seconds
< 1 || changes
< 0) {
1929 err
= "Invalid save parameters"; goto loaderr
;
1931 appendServerSaveParams(seconds
,changes
);
1932 } else if (!strcasecmp(argv
[0],"dir") && argc
== 2) {
1933 if (chdir(argv
[1]) == -1) {
1934 redisLog(REDIS_WARNING
,"Can't chdir to '%s': %s",
1935 argv
[1], strerror(errno
));
1938 } else if (!strcasecmp(argv
[0],"loglevel") && argc
== 2) {
1939 if (!strcasecmp(argv
[1],"debug")) server
.verbosity
= REDIS_DEBUG
;
1940 else if (!strcasecmp(argv
[1],"verbose")) server
.verbosity
= REDIS_VERBOSE
;
1941 else if (!strcasecmp(argv
[1],"notice")) server
.verbosity
= REDIS_NOTICE
;
1942 else if (!strcasecmp(argv
[1],"warning")) server
.verbosity
= REDIS_WARNING
;
/* The message lists every level accepted above, "verbose" included. */
1944 err
= "Invalid log level. Must be one of debug, verbose, notice, warning";
1947 } else if (!strcasecmp(argv
[0],"logfile") && argc
== 2) {
1950 server
.logfile
= zstrdup(argv
[1]);
1951 if (!strcasecmp(server
.logfile
,"stdout")) {
1952 zfree(server
.logfile
);
1953 server
.logfile
= NULL
;
1955 if (server
.logfile
) {
1956 /* Test if we are able to open the file. The server will not
1957 * be able to abort just for this problem later... */
1958 logfp
= fopen(server
.logfile
,"a");
1959 if (logfp
== NULL
) {
1960 err
= sdscatprintf(sdsempty(),
1961 "Can't open the log file: %s", strerror(errno
));
1966 } else if (!strcasecmp(argv
[0],"databases") && argc
== 2) {
1967 server
.dbnum
= atoi(argv
[1]);
1968 if (server
.dbnum
< 1) {
1969 err
= "Invalid number of databases"; goto loaderr
;
1971 } else if (!strcasecmp(argv
[0],"include") && argc
== 2) {
1972 loadServerConfig(argv
[1]);
1973 } else if (!strcasecmp(argv
[0],"maxclients") && argc
== 2) {
1974 server
.maxclients
= atoi(argv
[1]);
1975 } else if (!strcasecmp(argv
[0],"maxmemory") && argc
== 2) {
1976 server
.maxmemory
= memtoll(argv
[1],NULL
);
1977 } else if (!strcasecmp(argv
[0],"slaveof") && argc
== 3) {
1978 server
.masterhost
= sdsnew(argv
[1]);
1979 server
.masterport
= atoi(argv
[2]);
1980 server
.replstate
= REDIS_REPL_CONNECT
;
1981 } else if (!strcasecmp(argv
[0],"masterauth") && argc
== 2) {
1982 server
.masterauth
= zstrdup(argv
[1]);
1983 } else if (!strcasecmp(argv
[0],"glueoutputbuf") && argc
== 2) {
1984 if ((server
.glueoutputbuf
= yesnotoi(argv
[1])) == -1) {
1985 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1987 } else if (!strcasecmp(argv
[0],"rdbcompression") && argc
== 2) {
1988 if ((server
.rdbcompression
= yesnotoi(argv
[1])) == -1) {
1989 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1991 } else if (!strcasecmp(argv
[0],"activerehashing") && argc
== 2) {
1992 if ((server
.activerehashing
= yesnotoi(argv
[1])) == -1) {
1993 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1995 } else if (!strcasecmp(argv
[0],"daemonize") && argc
== 2) {
1996 if ((server
.daemonize
= yesnotoi(argv
[1])) == -1) {
1997 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1999 } else if (!strcasecmp(argv
[0],"appendonly") && argc
== 2) {
2000 if ((server
.appendonly
= yesnotoi(argv
[1])) == -1) {
2001 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
2003 } else if (!strcasecmp(argv
[0],"appendfilename") && argc
== 2) {
2004 zfree(server
.appendfilename
);
2005 server
.appendfilename
= zstrdup(argv
[1]);
2006 } else if (!strcasecmp(argv
[0],"no-appendfsync-on-rewrite")
2008 if ((server
.no_appendfsync_on_rewrite
= yesnotoi(argv
[1])) == -1) {
2009 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
2011 } else if (!strcasecmp(argv
[0],"appendfsync") && argc
== 2) {
2012 if (!strcasecmp(argv
[1],"no")) {
2013 server
.appendfsync
= APPENDFSYNC_NO
;
2014 } else if (!strcasecmp(argv
[1],"always")) {
2015 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
2016 } else if (!strcasecmp(argv
[1],"everysec")) {
2017 server
.appendfsync
= APPENDFSYNC_EVERYSEC
;
2019 err
= "argument must be 'no', 'always' or 'everysec'";
2022 } else if (!strcasecmp(argv
[0],"requirepass") && argc
== 2) {
2023 server
.requirepass
= zstrdup(argv
[1]);
2024 } else if (!strcasecmp(argv
[0],"pidfile") && argc
== 2) {
2025 zfree(server
.pidfile
);
2026 server
.pidfile
= zstrdup(argv
[1]);
2027 } else if (!strcasecmp(argv
[0],"dbfilename") && argc
== 2) {
2028 zfree(server
.dbfilename
);
2029 server
.dbfilename
= zstrdup(argv
[1]);
2030 } else if (!strcasecmp(argv
[0],"vm-enabled") && argc
== 2) {
2031 if ((server
.vm_enabled
= yesnotoi(argv
[1])) == -1) {
2032 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
2034 } else if (!strcasecmp(argv
[0],"vm-swap-file") && argc
== 2) {
2035 zfree(server
.vm_swap_file
);
2036 server
.vm_swap_file
= zstrdup(argv
[1]);
2037 } else if (!strcasecmp(argv
[0],"vm-max-memory") && argc
== 2) {
2038 server
.vm_max_memory
= memtoll(argv
[1],NULL
);
2039 } else if (!strcasecmp(argv
[0],"vm-page-size") && argc
== 2) {
2040 server
.vm_page_size
= memtoll(argv
[1], NULL
);
2041 } else if (!strcasecmp(argv
[0],"vm-pages") && argc
== 2) {
2042 server
.vm_pages
= memtoll(argv
[1], NULL
);
2043 } else if (!strcasecmp(argv
[0],"vm-max-threads") && argc
== 2) {
2044 server
.vm_max_threads
= strtoll(argv
[1], NULL
, 10);
2045 } else if (!strcasecmp(argv
[0],"hash-max-zipmap-entries") && argc
== 2){
2046 server
.hash_max_zipmap_entries
= memtoll(argv
[1], NULL
);
2047 } else if (!strcasecmp(argv
[0],"hash-max-zipmap-value") && argc
== 2){
2048 server
.hash_max_zipmap_value
= memtoll(argv
[1], NULL
);
2049 } else if (!strcasecmp(argv
[0],"list-max-ziplist-entries") && argc
== 2){
2050 server
.list_max_ziplist_entries
= memtoll(argv
[1], NULL
);
2051 } else if (!strcasecmp(argv
[0],"list-max-ziplist-value") && argc
== 2){
2052 server
.list_max_ziplist_value
= memtoll(argv
[1], NULL
);
2053 } else if (!strcasecmp(argv
[0],"set-max-intset-entries") && argc
== 2){
2054 server
.set_max_intset_entries
= memtoll(argv
[1], NULL
);
2056 err
= "Bad directive or wrong number of arguments"; goto loaderr
;
2058 for (j
= 0; j
< argc
; j
++)
2063 if (fp
!= stdin
) fclose(fp
);
2067 fprintf(stderr
, "\n*** FATAL CONFIG FILE ERROR ***\n");
2068 fprintf(stderr
, "Reading the configuration file, at line %d\n", linenum
);
2069 fprintf(stderr
, ">>> '%s'\n", line
);
2070 fprintf(stderr
, "%s\n", err
);
/* freeClientArgv: drop the client's references to its parsed arguments by
 * calling decrRefCount() on every element of c->argv (c->argc entries) and
 * of c->mbargv (c->mbargc entries). The arrays themselves are not released
 * by the statements visible here.
 * NOTE(review): the line numbering skips values after the second loop, so
 * any reset of the two counters is not visible in this span -- confirm. */
2074 static void freeClientArgv(redisClient
*c
) {
2077 for (j
= 0; j
< c
->argc
; j
++)
2078 decrRefCount(c
->argv
[j
]);
2079 for (j
= 0; j
< c
->mbargc
; j
++)
2080 decrRefCount(c
->mbargv
[j
]);
/* freeClient: tear down client `c` and detach it from every server-side
 * structure that references it. In order, the visible code: frees the
 * query buffer, unblocks the client if it is flagged REDIS_BLOCKED,
 * releases the watched-keys list, unsubscribes from all Pub/Sub channels
 * and patterns, removes the read/write file events, releases the reply
 * list, removes the client from server.clients, from
 * server.io_ready_clients and from the swapped-keys wait lists, performs
 * the slave/monitor and master bookkeeping and frees the MULTI state. */
2085 static void freeClient(redisClient
*c
) {
2088 /* Note that if the client we are freeing is blocked into a blocking
2089 * call, we have to set querybuf to NULL *before* to call
2090 * unblockClientWaitingData() to avoid processInputBuffer() will get
2091 * called. Also it is important to remove the file events after
2092 * this, because this call adds the READABLE event. */
2093 sdsfree(c
->querybuf
);
2095 if (c
->flags
& REDIS_BLOCKED
)
2096 unblockClientWaitingData(c
);
2098 /* UNWATCH all the keys */
2100 listRelease(c
->watched_keys
);
2101 /* Unsubscribe from all the pubsub channels */
2102 pubsubUnsubscribeAllChannels(c
,0);
2103 pubsubUnsubscribeAllPatterns(c
,0);
2104 dictRelease(c
->pubsub_channels
);
2105 listRelease(c
->pubsub_patterns
);
2106 /* Obvious cleanup */
2107 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
2108 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
2109 listRelease(c
->reply
);
2112 /* Remove from the list of clients */
2113 ln
= listSearchKey(server
.clients
,c
);
2114 redisAssert(ln
!= NULL
);
2115 listDelNode(server
.clients
,ln
);
2116 /* Remove from the list of clients that are now ready to be restarted
2117 * after waiting for swapped keys */
2118 if (c
->flags
& REDIS_IO_WAIT
&& listLength(c
->io_keys
) == 0) {
2119 ln
= listSearchKey(server
.io_ready_clients
,c
);
2121 listDelNode(server
.io_ready_clients
,ln
);
2122 server
.vm_blocked_clients
--;
2125 /* Remove from the list of clients waiting for swapped keys */
2126 while (server
.vm_enabled
&& listLength(c
->io_keys
)) {
2127 ln
= listFirst(c
->io_keys
);
2128 dontWaitForSwappedKey(c
,ln
->value
);
2130 listRelease(c
->io_keys
);
2131 /* Master/slave cleanup */
2132 if (c
->flags
& REDIS_SLAVE
) {
2133 if (c
->replstate
== REDIS_REPL_SEND_BULK
&& c
->repldbfd
!= -1)
/* A client flagged REDIS_MONITOR is looked up in the monitors list, any
 * other slave in the slaves list. */
2135 list
*l
= (c
->flags
& REDIS_MONITOR
) ? server
.monitors
: server
.slaves
;
2136 ln
= listSearchKey(l
,c
);
2137 redisAssert(ln
!= NULL
);
/* Losing the master link puts replication back into the CONNECT state. */
2140 if (c
->flags
& REDIS_MASTER
) {
2141 server
.master
= NULL
;
2142 server
.replstate
= REDIS_REPL_CONNECT
;
2144 /* Release memory */
2147 freeClientMultiState(c
);
/* glueReplyBuffersIfNeeded: merge reply objects queued on c->reply into a
 * single string object. Object payloads are copied into a stack buffer of
 * GLUEREPLY_UP_TO bytes for as long as the running total fits, the copied
 * nodes are removed from the list, and a new object holding the copied
 * bytes is inserted at the head of the list. Nothing is changed when no
 * byte could be copied (copylen == 0). */
2151 #define GLUEREPLY_UP_TO (1024)
2152 static void glueReplyBuffersIfNeeded(redisClient
*c
) {
2154 char buf
[GLUEREPLY_UP_TO
];
2159 listRewind(c
->reply
,&li
);
2160 while((ln
= listNext(&li
))) {
2164 objlen
= sdslen(o
->ptr
);
/* Only take the object if it still fits in the stack buffer. */
2165 if (copylen
+ objlen
<= GLUEREPLY_UP_TO
) {
2166 memcpy(buf
+copylen
,o
->ptr
,objlen
);
2168 listDelNode(c
->reply
,ln
);
2170 if (copylen
== 0) return;
2174 /* Now the output buffer is empty, add the new single element */
2175 o
= createObject(REDIS_STRING
,sdsnewlen(buf
,copylen
));
2176 listAddNodeHead(c
->reply
,o
);
/* sendReplyToClient: file-event handler that writes the pending reply
 * objects of the client passed in `privdata` to `fd`.
 *
 * When output gluing is disabled, the reply list is longer than
 * REDIS_WRITEV_THRESHOLD and the client is not a master, the work is handed
 * to sendReplyToClientWritev(). Otherwise objects are written one at a time
 * from the head of c->reply, with c->sentlen tracking how much of the head
 * object has already been sent. Nothing is written to a client flagged
 * REDIS_MASTER: the bytes are only accounted as sent. The loop stops on a
 * short or failed write or once more than REDIS_MAX_WRITE_PER_EVENT bytes
 * were written. When the list is empty the writable event is removed. */
2179 static void sendReplyToClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2180 redisClient
*c
= privdata
;
2181 int nwritten
= 0, totwritten
= 0, objlen
;
2184 REDIS_NOTUSED(mask
);
2186 /* Use writev() if we have enough buffers to send */
2187 if (!server
.glueoutputbuf
&&
2188 listLength(c
->reply
) > REDIS_WRITEV_THRESHOLD
&&
2189 !(c
->flags
& REDIS_MASTER
))
2191 sendReplyToClientWritev(el
, fd
, privdata
, mask
);
2195 while(listLength(c
->reply
)) {
2196 if (server
.glueoutputbuf
&& listLength(c
->reply
) > 1)
2197 glueReplyBuffersIfNeeded(c
);
2199 o
= listNodeValue(listFirst(c
->reply
));
2200 objlen
= sdslen(o
->ptr
);
2203 listDelNode(c
->reply
,listFirst(c
->reply
));
2207 if (c
->flags
& REDIS_MASTER
) {
2208 /* Don't reply to a master */
2209 nwritten
= objlen
- c
->sentlen
;
2211 nwritten
= write(fd
, ((char*)o
->ptr
)+c
->sentlen
, objlen
- c
->sentlen
);
2212 if (nwritten
<= 0) break;
2214 c
->sentlen
+= nwritten
;
2215 totwritten
+= nwritten
;
2216 /* If we fully sent the object on head go to the next one */
2217 if (c
->sentlen
== objlen
) {
2218 listDelNode(c
->reply
,listFirst(c
->reply
));
2221 /* Note that we avoid to send more than REDIS_MAX_WRITE_PER_EVENT
2222 * bytes, in a single threaded server it's a good idea to serve
2223 * other clients as well, even if a very large request comes from
2224 * super fast link that is always able to accept data (in real world
2225 * scenario think about 'KEYS *' against the loopback interface) */
2226 if (totwritten
> REDIS_MAX_WRITE_PER_EVENT
) break;
2228 if (nwritten
== -1) {
2229 if (errno
== EAGAIN
) {
2232 redisLog(REDIS_VERBOSE
,
2233 "Error writing to client: %s", strerror(errno
));
/* Any successful write counts as client activity. */
2238 if (totwritten
> 0) c
->lastinteraction
= time(NULL
);
2239 if (listLength(c
->reply
) == 0) {
2241 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
/* sendReplyToClientWritev: variant of the reply writer that sends several
 * reply objects with a single writev() call.
 *
 * For each round it walks c->reply from the head and fills up to
 * REDIS_WRITEV_IOVEC_COUNT iovec entries. The first entry starts at
 * c->sentlen inside the head object; every following entry covers a whole
 * object. After writev() the fully written objects are removed from the
 * list and the remainder is added to c->sentlen. A writev() error other
 * than EAGAIN is logged. When the list is empty the writable event is
 * removed. */
2245 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
)
2247 redisClient
*c
= privdata
;
2248 int nwritten
= 0, totwritten
= 0, objlen
, willwrite
;
2250 struct iovec iov
[REDIS_WRITEV_IOVEC_COUNT
];
2251 int offset
, ion
= 0;
2253 REDIS_NOTUSED(mask
);
2256 while (listLength(c
->reply
)) {
2257 offset
= c
->sentlen
;
2261 /* fill-in the iov[] array */
2262 for(node
= listFirst(c
->reply
); node
; node
= listNextNode(node
)) {
2263 o
= listNodeValue(node
);
2264 objlen
= sdslen(o
->ptr
);
2266 if (totwritten
+ objlen
- offset
> REDIS_MAX_WRITE_PER_EVENT
)
2269 if(ion
== REDIS_WRITEV_IOVEC_COUNT
)
2270 break; /* no more iovecs */
2272 iov
[ion
].iov_base
= ((char*)o
->ptr
) + offset
;
2273 iov
[ion
].iov_len
= objlen
- offset
;
/* NOTE(review): willwrite is declared without an initialiser; any reset
 * before this accumulation is not visible in this span -- confirm. */
2274 willwrite
+= objlen
- offset
;
2275 offset
= 0; /* just for the first item */
2282 /* write all collected blocks at once */
2283 if((nwritten
= writev(fd
, iov
, ion
)) < 0) {
2284 if (errno
!= EAGAIN
) {
2285 redisLog(REDIS_VERBOSE
,
2286 "Error writing to client: %s", strerror(errno
));
2293 totwritten
+= nwritten
;
2294 offset
= c
->sentlen
;
2296 /* remove written robjs from c->reply */
2297 while (nwritten
&& listLength(c
->reply
)) {
2298 o
= listNodeValue(listFirst(c
->reply
));
2299 objlen
= sdslen(o
->ptr
);
2301 if(nwritten
>= objlen
- offset
) {
2302 listDelNode(c
->reply
, listFirst(c
->reply
));
2303 nwritten
-= objlen
- offset
;
2307 c
->sentlen
+= nwritten
;
2315 c
->lastinteraction
= time(NULL
);
2317 if (listLength(c
->reply
) == 0) {
2319 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
/* qsortRedisCommands: comparison callback over two `struct redisCommand`
 * elements, passed as untyped pointers; it compares their `name` fields.
 * It is handed to the sort and search calls on the command table.
 * NOTE(review): the line naming the comparison function is not visible in
 * this span, so case sensitivity cannot be stated here -- confirm. */
2323 static int qsortRedisCommands(const void *r1
, const void *r2
) {
2325 ((struct redisCommand
*)r1
)->name
,
2326 ((struct redisCommand
*)r2
)->name
);
2329 static void sortCommandTable() {
2330 /* Copy and sort the read-only version of the command table */
2331 commandTable
= (struct redisCommand
*)malloc(sizeof(readonlyCommandTable
));
2332 memcpy(commandTable
,readonlyCommandTable
,sizeof(readonlyCommandTable
));
2334 sizeof(readonlyCommandTable
)/sizeof(struct redisCommand
),
2335 sizeof(struct redisCommand
),qsortRedisCommands
);
2338 static struct redisCommand
*lookupCommand(char *name
) {
2339 struct redisCommand tmp
= {name
,NULL
,0,0,NULL
,0,0,0};
2343 sizeof(readonlyCommandTable
)/sizeof(struct redisCommand
),
2344 sizeof(struct redisCommand
),
2345 qsortRedisCommands
);
2348 /* resetClient prepares the client `c` to process the next command.
 * NOTE(review): only the signature is visible in this span; the statements
 * that perform the reset are not shown here. */
2349 static void resetClient(redisClient
*c
) {
2355 /* Call() is the core of Redis execution of a command.
 *
 * server.dirty is sampled and then replaced by the difference from the
 * earlier sample, so `dirty` ends up as the number of changes made in
 * between (the command invocation itself falls on a line not visible in
 * this span). With that delta the command is propagated: to the append
 * only file when AOF is enabled and something changed; to the slaves when
 * something changed or the command has REDIS_CMD_FORCE_REPLICATION set
 * and at least one slave is attached; and to every monitor whenever any
 * is attached. Finally the executed-commands counter is incremented. */
2356 static void call(redisClient
*c
, struct redisCommand
*cmd
) {
2359 dirty
= server
.dirty
;
2361 dirty
= server
.dirty
-dirty
;
2363 if (server
.appendonly
&& dirty
)
2364 feedAppendOnlyFile(cmd
,c
->db
->id
,c
->argv
,c
->argc
);
2365 if ((dirty
|| cmd
->flags
& REDIS_CMD_FORCE_REPLICATION
) &&
2366 listLength(server
.slaves
))
2367 replicationFeedSlaves(server
.slaves
,c
->db
->id
,c
->argv
,c
->argc
);
2368 if (listLength(server
.monitors
))
2369 replicationFeedMonitors(server
.monitors
,c
->db
->id
,c
->argv
,c
->argc
);
2370 server
.stat_numcommands
++;
2373 /* If this function gets called we already read a whole
2374 * command, arguments are in the client argv/argc fields.
2375 * processCommand() executes the command or prepares the
2376 * server for a bulk read from the client.
2378 * If 1 is returned the client is still alive and valid and
2379 * other operations can be performed by the caller. Otherwise
2380 * if 0 is returned the client was destroyed (i.e. after QUIT).
 *
 * Order of the visible checks: memory is freed when maxmemory is set, the
 * multi-bulk protocol state machine runs, QUIT is special-cased, the
 * command is looked up and its arity validated, the trailing bulk argument
 * is read for REDIS_CMD_BULK commands, then authentication, the maxmemory
 * DENYOOM rule and the Pub/Sub context rule are enforced, and finally the
 * command is either queued (inside MULTI) or executed. */
2381 static int processCommand(redisClient
*c
) {
2382 struct redisCommand
*cmd
;
2384 /* Free some memory if needed (maxmemory setting) */
2385 if (server
.maxmemory
) freeMemoryIfNeeded();
2387 /* Handle the multi bulk command type. This is an alternative protocol
2388 * supported by Redis in order to receive commands that are composed of
2389 * multiple binary-safe "bulk" arguments. The latency of processing is
2390 * a bit higher but this allows things like multi-sets, so if this
2391 * protocol is used only for MSET and similar commands this is a big win. */
2392 if (c
->multibulk
== 0 && c
->argc
== 1 && ((char*)(c
->argv
[0]->ptr
))[0] == '*') {
2393 c
->multibulk
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2394 if (c
->multibulk
<= 0) {
2398 decrRefCount(c
->argv
[c
->argc
-1]);
2402 } else if (c
->multibulk
) {
2403 if (c
->bulklen
== -1) {
2404 if (((char*)c
->argv
[0]->ptr
)[0] != '$') {
2405 addReplySds(c
,sdsnew("-ERR multi bulk protocol error\r\n"));
2409 int bulklen
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2410 decrRefCount(c
->argv
[0]);
2411 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2413 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2418 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2422 c
->mbargv
= zrealloc(c
->mbargv
,(sizeof(robj
*))*(c
->mbargc
+1));
2423 c
->mbargv
[c
->mbargc
] = c
->argv
[0];
2427 if (c
->multibulk
== 0) {
2431 /* Here we need to swap the multi-bulk argc/argv with the
2432 * normal argc/argv of the client structure. */
2434 c
->argv
= c
->mbargv
;
2435 c
->mbargv
= auxargv
;
2438 c
->argc
= c
->mbargc
;
2439 c
->mbargc
= auxargc
;
2441 /* We need to set bulklen to something different than -1
2442 * in order for the code below to process the command without
2443 * to try to read the last argument of a bulk command as
2444 * a special argument. */
2446 /* continue below and process the command */
2453 /* -- end of multi bulk commands processing -- */
2455 /* The QUIT command is handled as a special case. Normal command
2456 * procs are unable to close the client connection safely */
2457 if (!strcasecmp(c
->argv
[0]->ptr
,"quit")) {
2462 /* Now lookup the command and check ASAP about trivial error conditions
2463 * such wrong arity, bad command name and so forth. */
2464 cmd
= lookupCommand(c
->argv
[0]->ptr
);
2467 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2468 (char*)c
->argv
[0]->ptr
));
/* A positive arity must match argc exactly; a negative arity -N means
 * "at least N arguments". */
2471 } else if ((cmd
->arity
> 0 && cmd
->arity
!= c
->argc
) ||
2472 (c
->argc
< -cmd
->arity
)) {
2474 sdscatprintf(sdsempty(),
2475 "-ERR wrong number of arguments for '%s' command\r\n",
2479 } else if (cmd
->flags
& REDIS_CMD_BULK
&& c
->bulklen
== -1) {
2480 /* This is a bulk command, we have to read the last argument yet. */
2481 int bulklen
= atoi(c
->argv
[c
->argc
-1]->ptr
);
2483 decrRefCount(c
->argv
[c
->argc
-1]);
2484 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2486 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2491 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2492 /* It is possible that the bulk read is already in the
2493 * buffer. Check this condition and handle it accordingly.
2494 * This is just a fast path, alternative to call processInputBuffer().
2495 * It's a good idea since the code is small and this condition
2496 * happens most of the times. */
2497 if ((signed)sdslen(c
->querybuf
) >= c
->bulklen
) {
2498 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2500 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2502 /* Otherwise return... there is to read the last argument
2503 * from the socket. */
2507 /* Let's try to encode the bulk object to save space. */
2508 if (cmd
->flags
& REDIS_CMD_BULK
)
2509 c
->argv
[c
->argc
-1] = tryObjectEncoding(c
->argv
[c
->argc
-1]);
2511 /* Check if the user is authenticated */
2512 if (server
.requirepass
&& !c
->authenticated
&& cmd
->proc
!= authCommand
) {
2513 addReplySds(c
,sdsnew("-ERR operation not permitted\r\n"));
2518 /* Handle the maxmemory directive */
2519 if (server
.maxmemory
&& (cmd
->flags
& REDIS_CMD_DENYOOM
) &&
2520 zmalloc_used_memory() > server
.maxmemory
)
2522 addReplySds(c
,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2527 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2528 if ((dictSize(c
->pubsub_channels
) > 0 || listLength(c
->pubsub_patterns
) > 0)
2530 cmd
->proc
!= subscribeCommand
&& cmd
->proc
!= unsubscribeCommand
&&
2531 cmd
->proc
!= psubscribeCommand
&& cmd
->proc
!= punsubscribeCommand
) {
2532 addReplySds(c
,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2537 /* Exec the command */
/* Inside MULTI everything except EXEC, DISCARD, MULTI and WATCH is queued
 * and answered with the shared "queued" reply. */
2538 if (c
->flags
& REDIS_MULTI
&&
2539 cmd
->proc
!= execCommand
&& cmd
->proc
!= discardCommand
&&
2540 cmd
->proc
!= multiCommand
&& cmd
->proc
!= watchCommand
)
2542 queueMultiCommand(c
,cmd
);
2543 addReply(c
,shared
.queued
);
2545 if (server
.vm_enabled
&& server
.vm_max_threads
> 0 &&
2546 blockClientOnSwappedKeys(c
,cmd
)) return 1;
2550 /* Prepare the client for the next command */
/* replicationFeedSlaves: send the command argv[0..argc-1], executed against
 * database `dictid`, to every client in the `slaves` list.
 *
 * The command is re-encoded as a multi-bulk request: one "*<argc>\r\n"
 * object followed, for each argument, by a "$<len>\r\n" object, the
 * argument itself and the shared CRLF object. The output vector lives in a
 * stack array when argc <= REDIS_STATIC_ARGS and is heap allocated
 * otherwise. Slaves still in REDIS_REPL_WAIT_BGSAVE_START are skipped. A
 * slave whose selected database differs from `dictid` first receives a
 * SELECT command (a shared object for databases 0-9, a freshly created one
 * otherwise). */
2555 static void replicationFeedSlaves(list
*slaves
, int dictid
, robj
**argv
, int argc
) {
2560 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2561 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2562 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2563 robj
*static_outv
[REDIS_STATIC_ARGS
*3+1];
2566 if (argc
<= REDIS_STATIC_ARGS
) {
2569 outv
= zmalloc(sizeof(robj
*)*(argc
*3+1));
2572 lenobj
= createObject(REDIS_STRING
,
2573 sdscatprintf(sdsempty(), "*%d\r\n", argc
));
/* The count starts at zero so that the increment/decrement pair further
 * down frees the object when no slave took a reference to it. */
2574 lenobj
->refcount
= 0;
2575 outv
[outc
++] = lenobj
;
2576 for (j
= 0; j
< argc
; j
++) {
2577 lenobj
= createObject(REDIS_STRING
,
2578 sdscatprintf(sdsempty(),"$%lu\r\n",
2579 (unsigned long) stringObjectLen(argv
[j
])));
2580 lenobj
->refcount
= 0;
2581 outv
[outc
++] = lenobj
;
2582 outv
[outc
++] = argv
[j
];
2583 outv
[outc
++] = shared
.crlf
;
2586 /* Increment all the refcounts at start and decrement at end in order to
2587 * be sure to free objects if there is no slave in a replication state
2588 * able to be feed with commands */
2589 for (j
= 0; j
< outc
; j
++) incrRefCount(outv
[j
]);
2590 listRewind(slaves
,&li
);
2591 while((ln
= listNext(&li
))) {
2592 redisClient
*slave
= ln
->value
;
2594 /* Don't feed slaves that are still waiting for BGSAVE to start */
2595 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) continue;
2597 /* Feed all the other slaves, MONITORs and so on */
2598 if (slave
->slaveseldb
!= dictid
) {
2602 case 0: selectcmd
= shared
.select0
; break;
2603 case 1: selectcmd
= shared
.select1
; break;
2604 case 2: selectcmd
= shared
.select2
; break;
2605 case 3: selectcmd
= shared
.select3
; break;
2606 case 4: selectcmd
= shared
.select4
; break;
2607 case 5: selectcmd
= shared
.select5
; break;
2608 case 6: selectcmd
= shared
.select6
; break;
2609 case 7: selectcmd
= shared
.select7
; break;
2610 case 8: selectcmd
= shared
.select8
; break;
2611 case 9: selectcmd
= shared
.select9
; break;
2613 selectcmd
= createObject(REDIS_STRING
,
2614 sdscatprintf(sdsempty(),"select %d\r\n",dictid
));
2615 selectcmd
->refcount
= 0;
2618 addReply(slave
,selectcmd
);
2619 slave
->slaveseldb
= dictid
;
2621 for (j
= 0; j
< outc
; j
++) addReply(slave
,outv
[j
]);
2623 for (j
= 0; j
< outc
; j
++) decrRefCount(outv
[j
]);
/* Only the heap-allocated vector needs to be released. */
2624 if (outv
!= static_outv
) zfree(outv
);
/* sdscatrepr: append to `s` a double-quoted, escaped representation of the
 * `len` bytes starting at `p`, and return the resulting sds string.
 *
 * The visible branches write a backslash followed by the byte itself, the
 * two-character escapes \n \r \t \a \b, the byte unchanged, or a \xHH hex
 * escape.
 *
 * Fix: the two-character escapes were appended with a length of 1, so only
 * the backslash reached the output and the escape letter was lost. They
 * are now appended with their full length of 2. */
2627 static sds
sdscatrepr(sds s
, char *p
, size_t len
) {
2628 s
= sdscatlen(s
,"\"",1);
2633 s
= sdscatprintf(s
,"\\%c",*p
);
2635 case '\n': s
= sdscatlen(s
,"\\n",2); break;
2636 case '\r': s
= sdscatlen(s
,"\\r",2); break;
2637 case '\t': s
= sdscatlen(s
,"\\t",2); break;
2638 case '\a': s
= sdscatlen(s
,"\\a",2); break;
2639 case '\b': s
= sdscatlen(s
,"\\b",2); break;
2642 s
= sdscatprintf(s
,"%c",*p
);
2644 s
= sdscatprintf(s
,"\\x%02x",(unsigned char)*p
);
2649 return sdscatlen(s
,"\"",1);
/* replicationFeedMonitors: send a human readable trace of the command
 * argv[0..argc-1] to every client in the `monitors` list.
 *
 * The trace is a status reply: "+", the current time as seconds and
 * microseconds, "(db N) " when `dictid` is not 0, then the arguments.
 * Integer-encoded arguments are printed as numbers, the others go through
 * sdscatrepr(). One object is built and shared by all monitors.
 *
 * Fix: the microseconds field is now printed with "%06ld". With "%ld" a
 * value such as 5000 us was rendered as ".5000", which reads as half a
 * second instead of five milliseconds. */
2652 static void replicationFeedMonitors(list
*monitors
, int dictid
, robj
**argv
, int argc
) {
2656 sds cmdrepr
= sdsnew("+");
2660 gettimeofday(&tv
,NULL
);
2661 cmdrepr
= sdscatprintf(cmdrepr
,"%ld.%06ld ",(long)tv
.tv_sec
,(long)tv
.tv_usec
);
2662 if (dictid
!= 0) cmdrepr
= sdscatprintf(cmdrepr
,"(db %d) ", dictid
);
2664 for (j
= 0; j
< argc
; j
++) {
2665 if (argv
[j
]->encoding
== REDIS_ENCODING_INT
) {
2666 cmdrepr
= sdscatprintf(cmdrepr
, "%ld", (long)argv
[j
]->ptr
);
2668 cmdrepr
= sdscatrepr(cmdrepr
,(char*)argv
[j
]->ptr
,
2669 sdslen(argv
[j
]->ptr
));
2672 cmdrepr
= sdscatlen(cmdrepr
," ",1);
2674 cmdrepr
= sdscatlen(cmdrepr
,"\r\n",2);
2675 cmdobj
= createObject(REDIS_STRING
,cmdrepr
);
2677 listRewind(monitors
,&li
);
2678 while((ln
= listNext(&li
))) {
2679 redisClient
*monitor
= ln
->value
;
2680 addReply(monitor
,cmdobj
);
/* Drop the creator's own reference to the shared object. */
2682 decrRefCount(cmdobj
);
/* processInputBuffer: parse whatever is pending in c->querybuf.
 *
 * When no bulk read is in progress (c->bulklen == -1) the first line is cut
 * out of the buffer, its terminator removed, and the line is split on
 * spaces into c->argv; the command is then handed to processCommand(). A
 * buffer that reaches REDIS_REQUEST_MAX_SIZE with no newline is logged as a
 * protocol error. When a bulk read is in progress, the function waits
 * until c->bulklen bytes are available, stores them minus the trailing
 * CRLF as the last argument and runs the command. After each command the
 * remaining buffer is parsed again.
 *
 * Fix: the optional "\r" before the newline is now only inspected when the
 * newline is not the first byte of the line. Previously `*(p-1)` was read,
 * and possibly overwritten, one byte before the start of the string. */
2685 static void processInputBuffer(redisClient
*c
) {
2687 /* Before to process the input buffer, make sure the client is not
2688 * waiting for a blocking operation such as BLPOP. Note that the first
2689 * iteration the client is never blocked, otherwise the processInputBuffer
2690 * would not be called at all, but after the execution of the first commands
2691 * in the input buffer the client may be blocked, and the "goto again"
2692 * will try to reiterate. The following line will make it return asap. */
2693 if (c
->flags
& REDIS_BLOCKED
|| c
->flags
& REDIS_IO_WAIT
) return;
2694 if (c
->bulklen
== -1) {
2695 /* Read the first line of the query */
2696 char *p
= strchr(c
->querybuf
,'\n');
2703 query
= c
->querybuf
;
2704 c
->querybuf
= sdsempty();
2705 querylen
= 1+(p
-(query
));
2706 if (sdslen(query
) > querylen
) {
2707 /* leave data after the first line of the query in the buffer */
2708 c
->querybuf
= sdscatlen(c
->querybuf
,query
+querylen
,sdslen(query
)-querylen
);
2710 *p
= '\0'; /* remove "\n" */
2711 if (p
> query
&& *(p
-1) == '\r') *(p
-1) = '\0'; /* and "\r" if any */
2712 sdsupdatelen(query
);
2714 /* Now we can split the query in arguments */
2715 argv
= sdssplitlen(query
,sdslen(query
)," ",1,&argc
);
2718 if (c
->argv
) zfree(c
->argv
);
2719 c
->argv
= zmalloc(sizeof(robj
*)*argc
);
2721 for (j
= 0; j
< argc
; j
++) {
2722 if (sdslen(argv
[j
])) {
2723 c
->argv
[c
->argc
] = createObject(REDIS_STRING
,argv
[j
]);
2731 /* Execute the command. If the client is still valid
2732 * after processCommand() return and there is something
2733 * on the query buffer try to process the next command. */
2734 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2736 /* Nothing to process, argc == 0. Just process the query
2737 * buffer if it's not empty or return to the caller */
2738 if (sdslen(c
->querybuf
)) goto again
;
2741 } else if (sdslen(c
->querybuf
) >= REDIS_REQUEST_MAX_SIZE
) {
2742 redisLog(REDIS_VERBOSE
, "Client protocol error");
2747 /* Bulk read handling. Note that if we are at this point
2748 the client already sent a command terminated with a newline,
2749 we are reading the bulk data that is actually the last
2750 argument of the command. */
2751 int qbl
= sdslen(c
->querybuf
);
2753 if (c
->bulklen
<= qbl
) {
2754 /* Copy everything but the final CRLF as final argument */
2755 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2757 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2758 /* Process the command. If the client is still valid after
2759 * the processing and there is more data in the buffer
2760 * try to parse it. */
2761 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
/* readQueryFromClient: file-event handler that reads client input.
 *
 * Up to REDIS_IOBUF_LEN bytes are read from `fd` into a stack buffer.
 * EAGAIN is treated separately from the other read errors, which are
 * logged at verbose level; a read of 0 bytes is logged as the client
 * having closed the connection. Data that was read is appended to
 * c->querybuf, c->lastinteraction is refreshed and processInputBuffer()
 * is called to parse it. `privdata` is the redisClient; `mask` is unused. */
2767 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2768 redisClient
*c
= (redisClient
*) privdata
;
2769 char buf
[REDIS_IOBUF_LEN
];
2772 REDIS_NOTUSED(mask
);
2774 nread
= read(fd
, buf
, REDIS_IOBUF_LEN
);
2776 if (errno
== EAGAIN
) {
2779 redisLog(REDIS_VERBOSE
, "Reading from client: %s",strerror(errno
));
2783 } else if (nread
== 0) {
2784 redisLog(REDIS_VERBOSE
, "Client closed connection");
2789 c
->querybuf
= sdscatlen(c
->querybuf
, buf
, nread
);
2790 c
->lastinteraction
= time(NULL
);
2794 processInputBuffer(c
);
2797 static int selectDb(redisClient
*c
, int id
) {
2798 if (id
< 0 || id
>= server
.dbnum
)
2800 c
->db
= &server
.db
[id
];
2804 static void *dupClientReplyValue(void *o
) {
2805 incrRefCount((robj
*)o
);
/* Match callback for lists of string objects: the two values match exactly
 * when equalStringObjects() says so, and its result is returned unchanged. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2813 static redisClient
*createClient(int fd
) {
2814 redisClient
*c
= zmalloc(sizeof(*c
));
2816 anetNonBlock(NULL
,fd
);
2817 anetTcpNoDelay(NULL
,fd
);
2818 if (!c
) return NULL
;
2821 c
->querybuf
= sdsempty();
2830 c
->lastinteraction
= time(NULL
);
2831 c
->authenticated
= 0;
2832 c
->replstate
= REDIS_REPL_NONE
;
2833 c
->reply
= listCreate();
2834 listSetFreeMethod(c
->reply
,decrRefCount
);
2835 listSetDupMethod(c
->reply
,dupClientReplyValue
);
2836 c
->blocking_keys
= NULL
;
2837 c
->blocking_keys_num
= 0;
2838 c
->io_keys
= listCreate();
2839 c
->watched_keys
= listCreate();
2840 listSetFreeMethod(c
->io_keys
,decrRefCount
);
2841 c
->pubsub_channels
= dictCreate(&setDictType
,NULL
);
2842 c
->pubsub_patterns
= listCreate();
2843 listSetFreeMethod(c
->pubsub_patterns
,decrRefCount
);
2844 listSetMatchMethod(c
->pubsub_patterns
,listMatchObjects
);
2845 if (aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
2846 readQueryFromClient
, c
) == AE_ERR
) {
2850 listAddNodeTail(server
.clients
,c
);
2851 initClientMultiState(c
);
2855 static void addReply(redisClient
*c
, robj
*obj
) {
2856 if (listLength(c
->reply
) == 0 &&
2857 (c
->replstate
== REDIS_REPL_NONE
||
2858 c
->replstate
== REDIS_REPL_ONLINE
) &&
2859 aeCreateFileEvent(server
.el
, c
->fd
, AE_WRITABLE
,
2860 sendReplyToClient
, c
) == AE_ERR
) return;
2862 if (server
.vm_enabled
&& obj
->storage
!= REDIS_VM_MEMORY
) {
2863 obj
= dupStringObject(obj
);
2864 obj
->refcount
= 0; /* getDecodedObject() will increment the refcount */
2866 listAddNodeTail(c
->reply
,getDecodedObject(obj
));
2869 static void addReplySds(redisClient
*c
, sds s
) {
2870 robj
*o
= createObject(REDIS_STRING
,s
);
2875 static void addReplyDouble(redisClient
*c
, double d
) {
2878 snprintf(buf
,sizeof(buf
),"%.17g",d
);
2879 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2880 (unsigned long) strlen(buf
),buf
));
2883 static void addReplyLongLong(redisClient
*c
, long long ll
) {
2888 addReply(c
,shared
.czero
);
2890 } else if (ll
== 1) {
2891 addReply(c
,shared
.cone
);
2895 len
= ll2string(buf
+1,sizeof(buf
)-1,ll
);
2898 addReplySds(c
,sdsnewlen(buf
,len
+3));
2901 static void addReplyUlong(redisClient
*c
, unsigned long ul
) {
2906 addReply(c
,shared
.czero
);
2908 } else if (ul
== 1) {
2909 addReply(c
,shared
.cone
);
2912 len
= snprintf(buf
,sizeof(buf
),":%lu\r\n",ul
);
2913 addReplySds(c
,sdsnewlen(buf
,len
));
2916 static void addReplyBulkLen(redisClient
*c
, robj
*obj
) {
2920 if (obj
->encoding
== REDIS_ENCODING_RAW
) {
2921 len
= sdslen(obj
->ptr
);
2923 long n
= (long)obj
->ptr
;
2925 /* Compute how many bytes will take this integer as a radix 10 string */
2931 while((n
= n
/10) != 0) {
2936 intlen
= ll2string(buf
+1,sizeof(buf
)-1,(long long)len
);
2937 buf
[intlen
+1] = '\r';
2938 buf
[intlen
+2] = '\n';
2939 addReplySds(c
,sdsnewlen(buf
,intlen
+3));
2942 static void addReplyBulk(redisClient
*c
, robj
*obj
) {
2943 addReplyBulkLen(c
,obj
);
2945 addReply(c
,shared
.crlf
);
2948 static void addReplyBulkSds(redisClient
*c
, sds s
) {
2949 robj
*o
= createStringObject(s
, sdslen(s
));
2954 /* In the CONFIG command we need to add vanilla C strings as bulk replies */
2955 static void addReplyBulkCString(redisClient
*c
, char *s
) {
2957 addReply(c
,shared
.nullbulk
);
2959 robj
*o
= createStringObject(s
,strlen(s
));
2965 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2970 REDIS_NOTUSED(mask
);
2971 REDIS_NOTUSED(privdata
);
2973 cfd
= anetAccept(server
.neterr
, fd
, cip
, &cport
);
2974 if (cfd
== AE_ERR
) {
2975 redisLog(REDIS_VERBOSE
,"Accepting client connection: %s", server
.neterr
);
2978 redisLog(REDIS_VERBOSE
,"Accepted %s:%d", cip
, cport
);
2979 if ((c
= createClient(cfd
)) == NULL
) {
2980 redisLog(REDIS_WARNING
,"Error allocating resoures for the client");
2981 close(cfd
); /* May be already closed, just ignore errors */
2984 /* If the maxclients directive is set and this client exceeds the limit,
2985 * close the connection. Note that we create the client first instead of
2986 * checking for this condition beforehand, since now the socket is already
2987 * set in nonblocking mode and we can send an error for free using the Kernel I/O */
2988 if (server
.maxclients
&& listLength(server
.clients
) > server
.maxclients
) {
2989 char *err
= "-ERR max number of clients reached\r\n";
2991 /* That's a best effort error message, don't check write errors */
2992 if (write(c
->fd
,err
,strlen(err
)) == -1) {
2993 /* Nothing to do, Just to avoid the warning... */
2998 server
.stat_numconnections
++;
3001 /* ======================= Redis objects implementation ===================== */
3003 static robj
*createObject(int type
, void *ptr
) {
3006 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
3007 if (listLength(server
.objfreelist
)) {
3008 listNode
*head
= listFirst(server
.objfreelist
);
3009 o
= listNodeValue(head
);
3010 listDelNode(server
.objfreelist
,head
);
3011 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
3013 if (server
.vm_enabled
)
3014 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
3015 o
= zmalloc(sizeof(*o
));
3018 o
->encoding
= REDIS_ENCODING_RAW
;
3021 if (server
.vm_enabled
) {
3022 /* Note that this code may run in the context of an I/O thread
3023 * and accessing server.lruclock in theory is an error
3024 * (no locks). But in practice this is safe, and even if we read
3025 * garbage Redis will not fail. */
3026 o
->lru
= server
.lruclock
;
3027 o
->storage
= REDIS_VM_MEMORY
;
3032 static robj
*createStringObject(char *ptr
, size_t len
) {
3033 return createObject(REDIS_STRING
,sdsnewlen(ptr
,len
));
3036 static robj
*createStringObjectFromLongLong(long long value
) {
3038 if (value
>= 0 && value
< REDIS_SHARED_INTEGERS
) {
3039 incrRefCount(shared
.integers
[value
]);
3040 o
= shared
.integers
[value
];
3042 if (value
>= LONG_MIN
&& value
<= LONG_MAX
) {
3043 o
= createObject(REDIS_STRING
, NULL
);
3044 o
->encoding
= REDIS_ENCODING_INT
;
3045 o
->ptr
= (void*)((long)value
);
3047 o
= createObject(REDIS_STRING
,sdsfromlonglong(value
));
3053 static robj
*dupStringObject(robj
*o
) {
3054 assert(o
->encoding
== REDIS_ENCODING_RAW
);
3055 return createStringObject(o
->ptr
,sdslen(o
->ptr
));
3058 static robj
*createListObject(void) {
3059 list
*l
= listCreate();
3060 robj
*o
= createObject(REDIS_LIST
,l
);
3061 listSetFreeMethod(l
,decrRefCount
);
3062 o
->encoding
= REDIS_ENCODING_LIST
;
3066 static robj
*createZiplistObject(void) {
3067 unsigned char *zl
= ziplistNew();
3068 robj
*o
= createObject(REDIS_LIST
,zl
);
3069 o
->encoding
= REDIS_ENCODING_ZIPLIST
;
3073 static robj
*createSetObject(void) {
3074 dict
*d
= dictCreate(&setDictType
,NULL
);
3075 robj
*o
= createObject(REDIS_SET
,d
);
3076 o
->encoding
= REDIS_ENCODING_HT
;
3080 static robj
*createIntsetObject(void) {
3081 intset
*is
= intsetNew();
3082 robj
*o
= createObject(REDIS_SET
,is
);
3083 o
->encoding
= REDIS_ENCODING_INTSET
;
3087 static robj
*createHashObject(void) {
3088 /* All the Hashes start as zipmaps. Will be automatically converted
3089 * into hash tables if there are enough elements or big elements
3091 unsigned char *zm
= zipmapNew();
3092 robj
*o
= createObject(REDIS_HASH
,zm
);
3093 o
->encoding
= REDIS_ENCODING_ZIPMAP
;
3097 static robj
*createZsetObject(void) {
3098 zset
*zs
= zmalloc(sizeof(*zs
));
3100 zs
->dict
= dictCreate(&zsetDictType
,NULL
);
3101 zs
->zsl
= zslCreate();
3102 return createObject(REDIS_ZSET
,zs
);
3105 static void freeStringObject(robj
*o
) {
3106 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3111 static void freeListObject(robj
*o
) {
3112 switch (o
->encoding
) {
3113 case REDIS_ENCODING_LIST
:
3114 listRelease((list
*) o
->ptr
);
3116 case REDIS_ENCODING_ZIPLIST
:
3120 redisPanic("Unknown list encoding type");
3124 static void freeSetObject(robj
*o
) {
3125 switch (o
->encoding
) {
3126 case REDIS_ENCODING_HT
:
3127 dictRelease((dict
*) o
->ptr
);
3129 case REDIS_ENCODING_INTSET
:
3133 redisPanic("Unknown set encoding type");
3137 static void freeZsetObject(robj
*o
) {
3140 dictRelease(zs
->dict
);
3145 static void freeHashObject(robj
*o
) {
3146 switch (o
->encoding
) {
3147 case REDIS_ENCODING_HT
:
3148 dictRelease((dict
*) o
->ptr
);
3150 case REDIS_ENCODING_ZIPMAP
:
3154 redisPanic("Unknown hash encoding type");
3159 static void incrRefCount(robj
*o
) {
3163 static void decrRefCount(void *obj
) {
3166 /* Object is a swapped out value, or in the process of being loaded. */
3167 if (server
.vm_enabled
&&
3168 (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
))
3170 vmpointer
*vp
= obj
;
3171 if (o
->storage
== REDIS_VM_LOADING
) vmCancelThreadedIOJob(o
);
3172 vmMarkPagesFree(vp
->page
,vp
->usedpages
);
3173 server
.vm_stats_swapped_objects
--;
3178 if (o
->refcount
<= 0) redisPanic("decrRefCount against refcount <= 0");
3179 /* Object is in memory, or in the process of being swapped out.
3181 * If the object is being swapped out, abort the operation on
3182 * decrRefCount even if the refcount does not drop to 0: the object
3183 * is referenced at least two times, as value of the key AND as
3184 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3185 * done but the relevant key was removed in the meantime, the
3186 * complete jobs handler will not find the key about the job and the
3187 * assert will fail. */
3188 if (server
.vm_enabled
&& o
->storage
== REDIS_VM_SWAPPING
)
3189 vmCancelThreadedIOJob(o
);
3190 if (--(o
->refcount
) == 0) {
3192 case REDIS_STRING
: freeStringObject(o
); break;
3193 case REDIS_LIST
: freeListObject(o
); break;
3194 case REDIS_SET
: freeSetObject(o
); break;
3195 case REDIS_ZSET
: freeZsetObject(o
); break;
3196 case REDIS_HASH
: freeHashObject(o
); break;
3197 default: redisPanic("Unknown object type"); break;
3199 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
3200 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
3201 !listAddNodeHead(server
.objfreelist
,o
))
3203 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
3207 static int checkType(redisClient
*c
, robj
*o
, int type
) {
3208 if (o
->type
!= type
) {
3209 addReply(c
,shared
.wrongtypeerr
);
3215 /* Check if the nul-terminated string 's' can be represented by a long
3216 * (that is, is a number that fits into long without any other space or
3217 * character before or after the digits).
3219 * If so, the function returns REDIS_OK and *longval is set to the value
3220 * of the number. Otherwise REDIS_ERR is returned */
3221 static int isStringRepresentableAsLong(sds s
, long *longval
) {
3222 char buf
[32], *endptr
;
3226 value
= strtol(s
, &endptr
, 10);
3227 if (endptr
[0] != '\0') return REDIS_ERR
;
3228 slen
= ll2string(buf
,32,value
);
3230 /* If the number converted back into a string is not identical
3231 * then it's not possible to encode the string as integer */
3232 if (sdslen(s
) != (unsigned)slen
|| memcmp(buf
,s
,slen
)) return REDIS_ERR
;
3233 if (longval
) *longval
= value
;
3237 /* Try to encode a string object in order to save space */
3238 static robj
*tryObjectEncoding(robj
*o
) {
3242 if (o
->encoding
!= REDIS_ENCODING_RAW
)
3243 return o
; /* Already encoded */
3245 /* It's not safe to encode shared objects: shared objects can be shared
3246 * everywhere in the "object space" of Redis. Encoded objects can only
3247 * appear as "values" (and not, for instance, as keys) */
3248 if (o
->refcount
> 1) return o
;
3250 /* Currently we try to encode only strings */
3251 redisAssert(o
->type
== REDIS_STRING
);
3253 /* Check if we can represent this string as a long integer */
3254 if (isStringRepresentableAsLong(s
,&value
) == REDIS_ERR
) return o
;
3256 /* Ok, this object can be encoded */
3257 if (value
>= 0 && value
< REDIS_SHARED_INTEGERS
) {
3259 incrRefCount(shared
.integers
[value
]);
3260 return shared
.integers
[value
];
3262 o
->encoding
= REDIS_ENCODING_INT
;
3264 o
->ptr
= (void*) value
;
3269 /* Get a decoded version of an encoded object (returned as a new object).
3270 * If the object is already raw-encoded just increment the ref count. */
3271 static robj
*getDecodedObject(robj
*o
) {
3274 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3278 if (o
->type
== REDIS_STRING
&& o
->encoding
== REDIS_ENCODING_INT
) {
3281 ll2string(buf
,32,(long)o
->ptr
);
3282 dec
= createStringObject(buf
,strlen(buf
));
3285 redisPanic("Unknown encoding type");
3289 /* Compare two string objects via strcmp() or alike.
3290 * Note that the objects may be integer-encoded. In such a case we
3291 * use ll2string() to get a string representation of the numbers on the stack
3292 * and compare the strings, it's much faster than calling getDecodedObject().
3294 * Important note: if objects are not integer encoded, but binary-safe strings,
3295 * sdscmp() from sds.c will apply memcmp() so this function can be considered
3297 static int compareStringObjects(robj
*a
, robj
*b
) {
3298 redisAssert(a
->type
== REDIS_STRING
&& b
->type
== REDIS_STRING
);
3299 char bufa
[128], bufb
[128], *astr
, *bstr
;
3302 if (a
== b
) return 0;
3303 if (a
->encoding
!= REDIS_ENCODING_RAW
) {
3304 ll2string(bufa
,sizeof(bufa
),(long) a
->ptr
);
3310 if (b
->encoding
!= REDIS_ENCODING_RAW
) {
3311 ll2string(bufb
,sizeof(bufb
),(long) b
->ptr
);
3317 return bothsds
? sdscmp(astr
,bstr
) : strcmp(astr
,bstr
);
3320 /* Equal string objects return 1 if the two objects are the same from the
3321 * point of view of a string comparison, otherwise 0 is returned. Note that
3322 * this function is faster then checking for (compareStringObject(a,b) == 0)
3323 * because it can perform some more optimization. */
3324 static int equalStringObjects(robj
*a
, robj
*b
) {
3325 if (a
->encoding
!= REDIS_ENCODING_RAW
&& b
->encoding
!= REDIS_ENCODING_RAW
){
3326 return a
->ptr
== b
->ptr
;
3328 return compareStringObjects(a
,b
) == 0;
3332 static size_t stringObjectLen(robj
*o
) {
3333 redisAssert(o
->type
== REDIS_STRING
);
3334 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3335 return sdslen(o
->ptr
);
3339 return ll2string(buf
,32,(long)o
->ptr
);
3343 static int getDoubleFromObject(robj
*o
, double *target
) {
3350 redisAssert(o
->type
== REDIS_STRING
);
3351 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3352 value
= strtod(o
->ptr
, &eptr
);
3353 if (eptr
[0] != '\0') return REDIS_ERR
;
3354 } else if (o
->encoding
== REDIS_ENCODING_INT
) {
3355 value
= (long)o
->ptr
;
3357 redisPanic("Unknown string encoding");
3365 static int getDoubleFromObjectOrReply(redisClient
*c
, robj
*o
, double *target
, const char *msg
) {
3367 if (getDoubleFromObject(o
, &value
) != REDIS_OK
) {
3369 addReplySds(c
, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg
));
3371 addReplySds(c
, sdsnew("-ERR value is not a double\r\n"));
3380 static int getLongLongFromObject(robj
*o
, long long *target
) {
3387 redisAssert(o
->type
== REDIS_STRING
);
3388 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3389 value
= strtoll(o
->ptr
, &eptr
, 10);
3390 if (eptr
[0] != '\0') return REDIS_ERR
;
3391 } else if (o
->encoding
== REDIS_ENCODING_INT
) {
3392 value
= (long)o
->ptr
;
3394 redisPanic("Unknown string encoding");
3398 if (target
) *target
= value
;
3402 static int getLongLongFromObjectOrReply(redisClient
*c
, robj
*o
, long long *target
, const char *msg
) {
3404 if (getLongLongFromObject(o
, &value
) != REDIS_OK
) {
3406 addReplySds(c
, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg
));
3408 addReplySds(c
, sdsnew("-ERR value is not an integer\r\n"));
3417 static int getLongFromObjectOrReply(redisClient
*c
, robj
*o
, long *target
, const char *msg
) {
3420 if (getLongLongFromObjectOrReply(c
, o
, &value
, msg
) != REDIS_OK
) return REDIS_ERR
;
3421 if (value
< LONG_MIN
|| value
> LONG_MAX
) {
3423 addReplySds(c
, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg
));
3425 addReplySds(c
, sdsnew("-ERR value is out of range\r\n"));
3434 /* =========================== Keyspace access API ========================== */
3436 static robj
*lookupKey(redisDb
*db
, robj
*key
) {
3437 dictEntry
*de
= dictFind(db
->dict
,key
->ptr
);
3439 robj
*val
= dictGetEntryVal(de
);
3441 if (server
.vm_enabled
) {
3442 if (val
->storage
== REDIS_VM_MEMORY
||
3443 val
->storage
== REDIS_VM_SWAPPING
)
3445 /* If we were swapping the object out, cancel the operation */
3446 if (val
->storage
== REDIS_VM_SWAPPING
)
3447 vmCancelThreadedIOJob(val
);
3448 /* Update the access time for the aging algorithm. */
3449 val
->lru
= server
.lruclock
;
3451 int notify
= (val
->storage
== REDIS_VM_LOADING
);
3453 /* Our value was swapped on disk. Bring it at home. */
3454 redisAssert(val
->type
== REDIS_VMPOINTER
);
3455 val
= vmLoadObject(val
);
3456 dictGetEntryVal(de
) = val
;
3458 /* Clients blocked by the VM subsystem may be waiting for
3460 if (notify
) handleClientsBlockedOnSwappedKey(db
,key
);
3469 static robj
*lookupKeyRead(redisDb
*db
, robj
*key
) {
3470 expireIfNeeded(db
,key
);
3471 return lookupKey(db
,key
);
3474 static robj
*lookupKeyWrite(redisDb
*db
, robj
*key
) {
3475 deleteIfVolatile(db
,key
);
3476 touchWatchedKey(db
,key
);
3477 return lookupKey(db
,key
);
3480 static robj
*lookupKeyReadOrReply(redisClient
*c
, robj
*key
, robj
*reply
) {
3481 robj
*o
= lookupKeyRead(c
->db
, key
);
3482 if (!o
) addReply(c
,reply
);
3486 static robj
*lookupKeyWriteOrReply(redisClient
*c
, robj
*key
, robj
*reply
) {
3487 robj
*o
= lookupKeyWrite(c
->db
, key
);
3488 if (!o
) addReply(c
,reply
);
3492 /* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3493 * otherwise REDIS_OK is returned, and the caller should increment the
3494 * refcount of 'val'. */
3495 static int dbAdd(redisDb
*db
, robj
*key
, robj
*val
) {
3496 /* Perform a lookup before adding the key, as we need to copy the
3498 if (dictFind(db
->dict
, key
->ptr
) != NULL
) {
3501 sds copy
= sdsdup(key
->ptr
);
3502 dictAdd(db
->dict
, copy
, val
);
3507 /* If the key does not exist, this is just like dbAdd(). Otherwise
3508 * the value associated to the key is replaced with the new one.
3510 * On update (key already existed) 0 is returned. Otherwise 1. */
3511 static int dbReplace(redisDb
*db
, robj
*key
, robj
*val
) {
3512 if (dictFind(db
->dict
,key
->ptr
) == NULL
) {
3513 sds copy
= sdsdup(key
->ptr
);
3514 dictAdd(db
->dict
, copy
, val
);
3517 dictReplace(db
->dict
, key
->ptr
, val
);
3522 static int dbExists(redisDb
*db
, robj
*key
) {
3523 return dictFind(db
->dict
,key
->ptr
) != NULL
;
3526 /* Return a random key, in form of a Redis object.
3527 * If there are no keys, NULL is returned.
3529 * The function makes sure to return keys not already expired. */
3530 static robj
*dbRandomKey(redisDb
*db
) {
3531 struct dictEntry
*de
;
3537 de
= dictGetRandomKey(db
->dict
);
3538 if (de
== NULL
) return NULL
;
3540 key
= dictGetEntryKey(de
);
3541 keyobj
= createStringObject(key
,sdslen(key
));
3542 if (dictFind(db
->expires
,key
)) {
3543 if (expireIfNeeded(db
,keyobj
)) {
3544 decrRefCount(keyobj
);
3545 continue; /* search for another key. This expired. */
3552 /* Delete a key, value, and associated expiration entry if any, from the DB */
3553 static int dbDelete(redisDb
*db
, robj
*key
) {
3556 if (dictSize(db
->expires
)) dictDelete(db
->expires
,key
->ptr
);
3557 retval
= dictDelete(db
->dict
,key
->ptr
);
3559 return retval
== DICT_OK
;
3562 /*============================ RDB saving/loading =========================== */
3564 static int rdbSaveType(FILE *fp
, unsigned char type
) {
3565 if (fwrite(&type
,1,1,fp
) == 0) return -1;
3569 static int rdbSaveTime(FILE *fp
, time_t t
) {
3570 int32_t t32
= (int32_t) t
;
3571 if (fwrite(&t32
,4,1,fp
) == 0) return -1;
3575 /* check rdbLoadLen() comments for more info */
3576 static int rdbSaveLen(FILE *fp
, uint32_t len
) {
3577 unsigned char buf
[2];
3580 /* Save a 6 bit len */
3581 buf
[0] = (len
&0xFF)|(REDIS_RDB_6BITLEN
<<6);
3582 if (fwrite(buf
,1,1,fp
) == 0) return -1;
3583 } else if (len
< (1<<14)) {
3584 /* Save a 14 bit len */
3585 buf
[0] = ((len
>>8)&0xFF)|(REDIS_RDB_14BITLEN
<<6);
3587 if (fwrite(buf
,2,1,fp
) == 0) return -1;
3589 /* Save a 32 bit len */
3590 buf
[0] = (REDIS_RDB_32BITLEN
<<6);
3591 if (fwrite(buf
,1,1,fp
) == 0) return -1;
3593 if (fwrite(&len
,4,1,fp
) == 0) return -1;
3598 /* Encode 'value' as an integer if possible (if the integer fits the
3599 * supported range). If the function successfully encoded the integer
3600 * then the (up to 5 bytes) encoded representation is written in the
3601 * string pointed by 'enc' and the length is returned. Otherwise
3603 static int rdbEncodeInteger(long long value
, unsigned char *enc
) {
3604 /* Finally check if it fits in our ranges */
3605 if (value
>= -(1<<7) && value
<= (1<<7)-1) {
3606 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT8
;
3607 enc
[1] = value
&0xFF;
3609 } else if (value
>= -(1<<15) && value
<= (1<<15)-1) {
3610 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT16
;
3611 enc
[1] = value
&0xFF;
3612 enc
[2] = (value
>>8)&0xFF;
3614 } else if (value
>= -((long long)1<<31) && value
<= ((long long)1<<31)-1) {
3615 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT32
;
3616 enc
[1] = value
&0xFF;
3617 enc
[2] = (value
>>8)&0xFF;
3618 enc
[3] = (value
>>16)&0xFF;
3619 enc
[4] = (value
>>24)&0xFF;
3626 /* String objects in the form "2391" "-100" without any space and with a
3627 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3628 * encoded as integers to save space */
3629 static int rdbTryIntegerEncoding(char *s
, size_t len
, unsigned char *enc
) {
3631 char *endptr
, buf
[32];
3633 /* Check if it's possible to encode this value as a number */
3634 value
= strtoll(s
, &endptr
, 10);
3635 if (endptr
[0] != '\0') return 0;
3636 ll2string(buf
,32,value
);
3638 /* If the number converted back into a string is not identical
3639 * then it's not possible to encode the string as integer */
3640 if (strlen(buf
) != len
|| memcmp(buf
,s
,len
)) return 0;
3642 return rdbEncodeInteger(value
,enc
);
3645 static int rdbSaveLzfStringObject(FILE *fp
, unsigned char *s
, size_t len
) {
3646 size_t comprlen
, outlen
;
3650 /* We require at least four bytes compression for this to be worth it */
3651 if (len
<= 4) return 0;
3653 if ((out
= zmalloc(outlen
+1)) == NULL
) return 0;
3654 comprlen
= lzf_compress(s
, len
, out
, outlen
);
3655 if (comprlen
== 0) {
3659 /* Data compressed! Let's save it on disk */
3660 byte
= (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_LZF
;
3661 if (fwrite(&byte
,1,1,fp
) == 0) goto writeerr
;
3662 if (rdbSaveLen(fp
,comprlen
) == -1) goto writeerr
;
3663 if (rdbSaveLen(fp
,len
) == -1) goto writeerr
;
3664 if (fwrite(out
,comprlen
,1,fp
) == 0) goto writeerr
;
3673 /* Save a string object as [len][data] on disk. If the object is a string
3674 * representation of an integer value we try to save it in a special form */
3675 static int rdbSaveRawString(FILE *fp
, unsigned char *s
, size_t len
) {
3678 /* Try integer encoding */
3680 unsigned char buf
[5];
3681 if ((enclen
= rdbTryIntegerEncoding((char*)s
,len
,buf
)) > 0) {
3682 if (fwrite(buf
,enclen
,1,fp
) == 0) return -1;
3687 /* Try LZF compression - under 20 bytes it's unable to compress even
3688 * aaaaaaaaaaaaaaaaaa so skip it */
3689 if (server
.rdbcompression
&& len
> 20) {
3692 retval
= rdbSaveLzfStringObject(fp
,s
,len
);
3693 if (retval
== -1) return -1;
3694 if (retval
> 0) return 0;
3695 /* retval == 0 means data can't be compressed, save the old way */
3698 /* Store verbatim */
3699 if (rdbSaveLen(fp
,len
) == -1) return -1;
3700 if (len
&& fwrite(s
,len
,1,fp
) == 0) return -1;
3704 /* Save a long long value as either an encoded string or a string. */
3705 static int rdbSaveLongLongAsStringObject(FILE *fp
, long long value
) {
3706 unsigned char buf
[32];
3707 int enclen
= rdbEncodeInteger(value
,buf
);
3709 if (fwrite(buf
,enclen
,1,fp
) == 0) return -1;
3711 /* Encode as string */
3712 enclen
= ll2string((char*)buf
,32,value
);
3713 redisAssert(enclen
< 32);
3714 if (rdbSaveLen(fp
,enclen
) == -1) return -1;
3715 if (fwrite(buf
,enclen
,1,fp
) == 0) return -1;
3720 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3721 static int rdbSaveStringObject(FILE *fp
, robj
*obj
) {
3722 /* Avoid to decode the object, then encode it again, if the
3723 * object is alrady integer encoded. */
3724 if (obj
->encoding
== REDIS_ENCODING_INT
) {
3725 return rdbSaveLongLongAsStringObject(fp
,(long)obj
->ptr
);
3727 redisAssert(obj
->encoding
== REDIS_ENCODING_RAW
);
3728 return rdbSaveRawString(fp
,obj
->ptr
,sdslen(obj
->ptr
));
3732 /* Save a double value. Doubles are saved as strings prefixed by an unsigned
3733 * 8 bit integer specifying the length of the representation.
3734 * This 8 bit integer has special values in order to specify the following
3740 static int rdbSaveDoubleValue(FILE *fp
, double val
) {
3741 unsigned char buf
[128];
3747 } else if (!isfinite(val
)) {
3749 buf
[0] = (val
< 0) ? 255 : 254;
3751 #if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
3752 /* Check if the float is in a safe range to be casted into a
3753 * long long. We are assuming that long long is 64 bit here.
3754 * Also we are assuming that there are no implementations around where
3755 * double has precision < 52 bit.
3757 * Under these assumptions we test if a double is inside an interval
3758 * where casting to long long is safe. Then using two castings we
3759 * make sure the decimal part is zero. If all this is true we use
3760 * integer printing function that is much faster. */
3761 double min
= -4503599627370495; /* -((2^52)-1) */
3762 double max
= 4503599627370496; /* 2^52 */
3763 if (val
> min
&& val
< max
&& val
== ((double)((long long)val
)))
3764 ll2string((char*)buf
+1,sizeof(buf
),(long long)val
);
3767 snprintf((char*)buf
+1,sizeof(buf
)-1,"%.17g",val
);
3768 buf
[0] = strlen((char*)buf
+1);
3771 if (fwrite(buf
,len
,1,fp
) == 0) return -1;
3775 /* Save a Redis object. */
3776 static int rdbSaveObject(FILE *fp
, robj
*o
) {
3777 if (o
->type
== REDIS_STRING
) {
3778 /* Save a string value */
3779 if (rdbSaveStringObject(fp
,o
) == -1) return -1;
3780 } else if (o
->type
== REDIS_LIST
) {
3781 /* Save a list value */
3782 if (o
->encoding
== REDIS_ENCODING_ZIPLIST
) {
3784 unsigned char *vstr
;
3788 if (rdbSaveLen(fp
,ziplistLen(o
->ptr
)) == -1) return -1;
3789 p
= ziplistIndex(o
->ptr
,0);
3790 while(ziplistGet(p
,&vstr
,&vlen
,&vlong
)) {
3792 if (rdbSaveRawString(fp
,vstr
,vlen
) == -1)
3795 if (rdbSaveLongLongAsStringObject(fp
,vlong
) == -1)
3798 p
= ziplistNext(o
->ptr
,p
);
3800 } else if (o
->encoding
== REDIS_ENCODING_LIST
) {
3801 list
*list
= o
->ptr
;
3805 if (rdbSaveLen(fp
,listLength(list
)) == -1) return -1;
3806 listRewind(list
,&li
);
3807 while((ln
= listNext(&li
))) {
3808 robj
*eleobj
= listNodeValue(ln
);
3809 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3812 redisPanic("Unknown list encoding");
3814 } else if (o
->type
== REDIS_SET
) {
3815 /* Save a set value */
3816 if (o
->encoding
== REDIS_ENCODING_HT
) {
3818 dictIterator
*di
= dictGetIterator(set
);
3821 if (rdbSaveLen(fp
,dictSize(set
)) == -1) return -1;
3822 while((de
= dictNext(di
)) != NULL
) {
3823 robj
*eleobj
= dictGetEntryKey(de
);
3824 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3826 dictReleaseIterator(di
);
3827 } else if (o
->encoding
== REDIS_ENCODING_INTSET
) {
3828 intset
*is
= o
->ptr
;
3832 if (rdbSaveLen(fp
,intsetLen(is
)) == -1) return -1;
3833 while(intsetGet(is
,i
++,&llval
)) {
3834 if (rdbSaveLongLongAsStringObject(fp
,llval
) == -1) return -1;
3837 redisPanic("Unknown set encoding");
3839 } else if (o
->type
== REDIS_ZSET
) {
3840 /* Save a sorted set value */
3842 dictIterator
*di
= dictGetIterator(zs
->dict
);
3845 if (rdbSaveLen(fp
,dictSize(zs
->dict
)) == -1) return -1;
3846 while((de
= dictNext(di
)) != NULL
) {
3847 robj
*eleobj
= dictGetEntryKey(de
);
3848 double *score
= dictGetEntryVal(de
);
3850 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3851 if (rdbSaveDoubleValue(fp
,*score
) == -1) return -1;
3853 dictReleaseIterator(di
);
3854 } else if (o
->type
== REDIS_HASH
) {
3855 /* Save a hash value */
3856 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
3857 unsigned char *p
= zipmapRewind(o
->ptr
);
3858 unsigned int count
= zipmapLen(o
->ptr
);
3859 unsigned char *key
, *val
;
3860 unsigned int klen
, vlen
;
3862 if (rdbSaveLen(fp
,count
) == -1) return -1;
3863 while((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) != NULL
) {
3864 if (rdbSaveRawString(fp
,key
,klen
) == -1) return -1;
3865 if (rdbSaveRawString(fp
,val
,vlen
) == -1) return -1;
3868 dictIterator
*di
= dictGetIterator(o
->ptr
);
3871 if (rdbSaveLen(fp
,dictSize((dict
*)o
->ptr
)) == -1) return -1;
3872 while((de
= dictNext(di
)) != NULL
) {
3873 robj
*key
= dictGetEntryKey(de
);
3874 robj
*val
= dictGetEntryVal(de
);
3876 if (rdbSaveStringObject(fp
,key
) == -1) return -1;
3877 if (rdbSaveStringObject(fp
,val
) == -1) return -1;
3879 dictReleaseIterator(di
);
3882 redisPanic("Unknown object type");
3887 /* Return the length the object will have on disk if saved with
3888 * the rdbSaveObject() function. Currently we use a trick to get
3889 * this length with very little changes to the code. In the future
3890 * we could switch to a faster solution. */
3891 static off_t
rdbSavedObjectLen(robj
*o
, FILE *fp
) {
3892 if (fp
== NULL
) fp
= server
.devnull
;
3894 assert(rdbSaveObject(fp
,o
) != 1);
3898 /* Return the number of pages required to save this object in the swap file */
3899 static off_t
rdbSavedObjectPages(robj
*o
, FILE *fp
) {
3900 off_t bytes
= rdbSavedObjectLen(o
,fp
);
3902 return (bytes
+(server
.vm_page_size
-1))/server
.vm_page_size
;
3905 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3906 static int rdbSave(char *filename
) {
3907 dictIterator
*di
= NULL
;
3912 time_t now
= time(NULL
);
3914 /* Wait for I/O threads to terminate, just in case this is a
3915 * foreground-saving, to avoid seeking the swap file descriptor at the
3917 if (server
.vm_enabled
)
3918 waitEmptyIOJobsQueue();
3920 snprintf(tmpfile
,256,"temp-%d.rdb", (int) getpid());
3921 fp
= fopen(tmpfile
,"w");
3923 redisLog(REDIS_WARNING
, "Failed saving the DB: %s", strerror(errno
));
3926 if (fwrite("REDIS0001",9,1,fp
) == 0) goto werr
;
3927 for (j
= 0; j
< server
.dbnum
; j
++) {
3928 redisDb
*db
= server
.db
+j
;
3930 if (dictSize(d
) == 0) continue;
3931 di
= dictGetIterator(d
);
3937 /* Write the SELECT DB opcode */
3938 if (rdbSaveType(fp
,REDIS_SELECTDB
) == -1) goto werr
;
3939 if (rdbSaveLen(fp
,j
) == -1) goto werr
;
3941 /* Iterate this DB writing every entry */
3942 while((de
= dictNext(di
)) != NULL
) {
3943 sds keystr
= dictGetEntryKey(de
);
3944 robj key
, *o
= dictGetEntryVal(de
);
3947 initStaticStringObject(key
,keystr
);
3948 expiretime
= getExpire(db
,&key
);
3950 /* Save the expire time */
3951 if (expiretime
!= -1) {
3952 /* If this key is already expired skip it */
3953 if (expiretime
< now
) continue;
3954 if (rdbSaveType(fp
,REDIS_EXPIRETIME
) == -1) goto werr
;
3955 if (rdbSaveTime(fp
,expiretime
) == -1) goto werr
;
3957 /* Save the key and associated value. This requires special
3958 * handling if the value is swapped out. */
3959 if (!server
.vm_enabled
|| o
->storage
== REDIS_VM_MEMORY
||
3960 o
->storage
== REDIS_VM_SWAPPING
) {
3961 /* Save type, key, value */
3962 if (rdbSaveType(fp
,o
->type
) == -1) goto werr
;
3963 if (rdbSaveStringObject(fp
,&key
) == -1) goto werr
;
3964 if (rdbSaveObject(fp
,o
) == -1) goto werr
;
3966 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3968 /* Get a preview of the object in memory */
3969 po
= vmPreviewObject(o
);
3970 /* Save type, key, value */
3971 if (rdbSaveType(fp
,po
->type
) == -1) goto werr
;
3972 if (rdbSaveStringObject(fp
,&key
) == -1) goto werr
;
3973 if (rdbSaveObject(fp
,po
) == -1) goto werr
;
3974 /* Remove the loaded object from memory */
3978 dictReleaseIterator(di
);
3981 if (rdbSaveType(fp
,REDIS_EOF
) == -1) goto werr
;
3983 /* Make sure data will not remain on the OS's output buffers */
3988 /* Use RENAME to make sure the DB file is changed atomically only
3989 * if the generated DB file is ok. */
3990 if (rename(tmpfile
,filename
) == -1) {
3991 redisLog(REDIS_WARNING
,"Error moving temp DB file on the final destination: %s", strerror(errno
));
3995 redisLog(REDIS_NOTICE
,"DB saved on disk");
3997 server
.lastsave
= time(NULL
);
4003 redisLog(REDIS_WARNING
,"Write error saving DB on disk: %s", strerror(errno
));
4004 if (di
) dictReleaseIterator(di
);
4008 static int rdbSaveBackground(char *filename
) {
4011 if (server
.bgsavechildpid
!= -1) return REDIS_ERR
;
4012 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
4013 if ((childpid
= fork()) == 0) {
4015 if (server
.vm_enabled
) vmReopenSwapFile();
4017 if (rdbSave(filename
) == REDIS_OK
) {
4024 if (childpid
== -1) {
4025 redisLog(REDIS_WARNING
,"Can't save in background: fork: %s",
4029 redisLog(REDIS_NOTICE
,"Background saving started by pid %d",childpid
);
4030 server
.bgsavechildpid
= childpid
;
4031 updateDictResizePolicy();
4034 return REDIS_OK
; /* unreached */
4037 static void rdbRemoveTempFile(pid_t childpid
) {
4040 snprintf(tmpfile
,256,"temp-%d.rdb", (int) childpid
);
4044 static int rdbLoadType(FILE *fp
) {
4046 if (fread(&type
,1,1,fp
) == 0) return -1;
4050 static time_t rdbLoadTime(FILE *fp
) {
4052 if (fread(&t32
,4,1,fp
) == 0) return -1;
4053 return (time_t) t32
;
4056 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4057 * of this file for a description of how these are stored on disk.
4059 * isencoded is set to 1 if the length read is not actually a length but
4060 * an "encoding type", check the above comments for more info */
4061 static uint32_t rdbLoadLen(FILE *fp
, int *isencoded
) {
4062 unsigned char buf
[2];
4066 if (isencoded
) *isencoded
= 0;
4067 if (fread(buf
,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
4068 type
= (buf
[0]&0xC0)>>6;
4069 if (type
== REDIS_RDB_6BITLEN
) {
4070 /* Read a 6 bit len */
4072 } else if (type
== REDIS_RDB_ENCVAL
) {
4073 /* Read a 6 bit len encoding type */
4074 if (isencoded
) *isencoded
= 1;
4076 } else if (type
== REDIS_RDB_14BITLEN
) {
4077 /* Read a 14 bit len */
4078 if (fread(buf
+1,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
4079 return ((buf
[0]&0x3F)<<8)|buf
[1];
4081 /* Read a 32 bit len */
4082 if (fread(&len
,4,1,fp
) == 0) return REDIS_RDB_LENERR
;
4087 /* Load an integer-encoded object from file 'fp', with the specified
4088 * encoding type 'enctype'. If encode is true the function may return
4089 * an integer-encoded object as reply, otherwise the returned object
4090 * will always be encoded as a raw string. */
4091 static robj
*rdbLoadIntegerObject(FILE *fp
, int enctype
, int encode
) {
4092 unsigned char enc
[4];
4095 if (enctype
== REDIS_RDB_ENC_INT8
) {
4096 if (fread(enc
,1,1,fp
) == 0) return NULL
;
4097 val
= (signed char)enc
[0];
4098 } else if (enctype
== REDIS_RDB_ENC_INT16
) {
4100 if (fread(enc
,2,1,fp
) == 0) return NULL
;
4101 v
= enc
[0]|(enc
[1]<<8);
4103 } else if (enctype
== REDIS_RDB_ENC_INT32
) {
4105 if (fread(enc
,4,1,fp
) == 0) return NULL
;
4106 v
= enc
[0]|(enc
[1]<<8)|(enc
[2]<<16)|(enc
[3]<<24);
4109 val
= 0; /* anti-warning */
4110 redisPanic("Unknown RDB integer encoding type");
4113 return createStringObjectFromLongLong(val
);
4115 return createObject(REDIS_STRING
,sdsfromlonglong(val
));
4118 static robj
*rdbLoadLzfStringObject(FILE*fp
) {
4119 unsigned int len
, clen
;
4120 unsigned char *c
= NULL
;
4123 if ((clen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
4124 if ((len
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
4125 if ((c
= zmalloc(clen
)) == NULL
) goto err
;
4126 if ((val
= sdsnewlen(NULL
,len
)) == NULL
) goto err
;
4127 if (fread(c
,clen
,1,fp
) == 0) goto err
;
4128 if (lzf_decompress(c
,clen
,val
,len
) == 0) goto err
;
4130 return createObject(REDIS_STRING
,val
);
4137 static robj
*rdbGenericLoadStringObject(FILE*fp
, int encode
) {
4142 len
= rdbLoadLen(fp
,&isencoded
);
4145 case REDIS_RDB_ENC_INT8
:
4146 case REDIS_RDB_ENC_INT16
:
4147 case REDIS_RDB_ENC_INT32
:
4148 return rdbLoadIntegerObject(fp
,len
,encode
);
4149 case REDIS_RDB_ENC_LZF
:
4150 return rdbLoadLzfStringObject(fp
);
4152 redisPanic("Unknown RDB encoding type");
4156 if (len
== REDIS_RDB_LENERR
) return NULL
;
4157 val
= sdsnewlen(NULL
,len
);
4158 if (len
&& fread(val
,len
,1,fp
) == 0) {
4162 return createObject(REDIS_STRING
,val
);
4165 static robj
*rdbLoadStringObject(FILE *fp
) {
4166 return rdbGenericLoadStringObject(fp
,0);
4169 static robj
*rdbLoadEncodedStringObject(FILE *fp
) {
4170 return rdbGenericLoadStringObject(fp
,1);
4173 /* For information about double serialization check rdbSaveDoubleValue() */
4174 static int rdbLoadDoubleValue(FILE *fp
, double *val
) {
4178 if (fread(&len
,1,1,fp
) == 0) return -1;
4180 case 255: *val
= R_NegInf
; return 0;
4181 case 254: *val
= R_PosInf
; return 0;
4182 case 253: *val
= R_Nan
; return 0;
4184 if (fread(buf
,len
,1,fp
) == 0) return -1;
4186 sscanf(buf
, "%lg", val
);
4191 /* Load a Redis object of the specified type from the specified file.
4192 * On success a newly allocated object is returned, otherwise NULL. */
4193 static robj
*rdbLoadObject(int type
, FILE *fp
) {
4194 robj
*o
, *ele
, *dec
;
4197 redisLog(REDIS_DEBUG
,"LOADING OBJECT %d (at %d)\n",type
,ftell(fp
));
4198 if (type
== REDIS_STRING
) {
4199 /* Read string value */
4200 if ((o
= rdbLoadEncodedStringObject(fp
)) == NULL
) return NULL
;
4201 o
= tryObjectEncoding(o
);
4202 } else if (type
== REDIS_LIST
) {
4203 /* Read list value */
4204 if ((len
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
4206 /* Use a real list when there are too many entries */
4207 if (len
> server
.list_max_ziplist_entries
) {
4208 o
= createListObject();
4210 o
= createZiplistObject();
4213 /* Load every single element of the list */
4215 if ((ele
= rdbLoadEncodedStringObject(fp
)) == NULL
) return NULL
;
4217 /* If we are using a ziplist and the value is too big, convert
4218 * the object to a real list. */
4219 if (o
->encoding
== REDIS_ENCODING_ZIPLIST
&&
4220 ele
->encoding
== REDIS_ENCODING_RAW
&&
4221 sdslen(ele
->ptr
) > server
.list_max_ziplist_value
)
4222 listTypeConvert(o
,REDIS_ENCODING_LIST
);
4224 if (o
->encoding
== REDIS_ENCODING_ZIPLIST
) {
4225 dec
= getDecodedObject(ele
);
4226 o
->ptr
= ziplistPush(o
->ptr
,dec
->ptr
,sdslen(dec
->ptr
),REDIS_TAIL
);
4230 ele
= tryObjectEncoding(ele
);
4231 listAddNodeTail(o
->ptr
,ele
);
4234 } else if (type
== REDIS_SET
) {
4235 /* Read list/set value */
4236 if ((len
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
4237 o
= createSetObject();
4238 /* It's faster to expand the dict to the right size asap in order
4239 * to avoid rehashing */
4240 if (len
> DICT_HT_INITIAL_SIZE
)
4241 dictExpand(o
->ptr
,len
);
4242 /* Load every single element of the list/set */
4244 if ((ele
= rdbLoadEncodedStringObject(fp
)) == NULL
) return NULL
;
4245 ele
= tryObjectEncoding(ele
);
4246 dictAdd((dict
*)o
->ptr
,ele
,NULL
);
4248 } else if (type
== REDIS_ZSET
) {
4249 /* Read sorted set value */
4253 if ((zsetlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
4254 o
= createZsetObject();
4256 /* Load every single element of the sorted set */
4259 double *score
= zmalloc(sizeof(double));
4261 if ((ele
= rdbLoadEncodedStringObject(fp
)) == NULL
) return NULL
;
4262 ele
= tryObjectEncoding(ele
);
4263 if (rdbLoadDoubleValue(fp
,score
) == -1) return NULL
;
4264 dictAdd(zs
->dict
,ele
,score
);
4265 zslInsert(zs
->zsl
,*score
,ele
);
4266 incrRefCount(ele
); /* added to skiplist */
4268 } else if (type
== REDIS_HASH
) {
4271 if ((hashlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
4272 o
= createHashObject();
4273 /* Too many entries? Use a hash table. */
4274 if (hashlen
> server
.hash_max_zipmap_entries
)
4275 convertToRealHash(o
);
4276 /* Load every key/value, then set it into the zipmap or hash
4277 * table, as needed. */
4281 if ((key
= rdbLoadEncodedStringObject(fp
)) == NULL
) return NULL
;
4282 if ((val
= rdbLoadEncodedStringObject(fp
)) == NULL
) return NULL
;
4283 /* If we are using a zipmap and a key or value is too big,
4284 * the object is converted to real hash table encoding. */
4285 if (o
->encoding
!= REDIS_ENCODING_HT
&&
4286 (sdslen(key
->ptr
) > server
.hash_max_zipmap_value
||
4287 sdslen(val
->ptr
) > server
.hash_max_zipmap_value
))
4289 convertToRealHash(o
);
4292 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
4293 unsigned char *zm
= o
->ptr
;
4295 zm
= zipmapSet(zm
,key
->ptr
,sdslen(key
->ptr
),
4296 val
->ptr
,sdslen(val
->ptr
),NULL
);
4301 key
= tryObjectEncoding(key
);
4302 val
= tryObjectEncoding(val
);
4303 dictAdd((dict
*)o
->ptr
,key
,val
);
4307 redisPanic("Unknown object type");
4312 static int rdbLoad(char *filename
) {
4315 int type
, retval
, rdbver
;
4316 int swap_all_values
= 0;
4317 redisDb
*db
= server
.db
+0;
4319 time_t expiretime
, now
= time(NULL
);
4321 fp
= fopen(filename
,"r");
4322 if (!fp
) return REDIS_ERR
;
4323 if (fread(buf
,9,1,fp
) == 0) goto eoferr
;
4325 if (memcmp(buf
,"REDIS",5) != 0) {
4327 redisLog(REDIS_WARNING
,"Wrong signature trying to load DB from file");
4330 rdbver
= atoi(buf
+5);
4333 redisLog(REDIS_WARNING
,"Can't handle RDB format version %d",rdbver
);
4342 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
4343 if (type
== REDIS_EXPIRETIME
) {
4344 if ((expiretime
= rdbLoadTime(fp
)) == -1) goto eoferr
;
4345 /* We read the time so we need to read the object type again */
4346 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
4348 if (type
== REDIS_EOF
) break;
4349 /* Handle SELECT DB opcode as a special case */
4350 if (type
== REDIS_SELECTDB
) {
4351 if ((dbid
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
)
4353 if (dbid
>= (unsigned)server
.dbnum
) {
4354 redisLog(REDIS_WARNING
,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server
.dbnum
);
4357 db
= server
.db
+dbid
;
4361 if ((key
= rdbLoadStringObject(fp
)) == NULL
) goto eoferr
;
4363 if ((val
= rdbLoadObject(type
,fp
)) == NULL
) goto eoferr
;
4364 /* Check if the key already expired */
4365 if (expiretime
!= -1 && expiretime
< now
) {
4370 /* Add the new object in the hash table */
4371 retval
= dbAdd(db
,key
,val
);
4372 if (retval
== REDIS_ERR
) {
4373 redisLog(REDIS_WARNING
,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key
->ptr
);
4376 /* Set the expire time if needed */
4377 if (expiretime
!= -1) setExpire(db
,key
,expiretime
);
4379 /* Handle swapping while loading big datasets when VM is on */
4381 /* If we detect that we are hopeless about fitting something in memory
4382 * we just swap every new key on disk. Directly...
4383 * Note that it's important to check for this condition before resorting
4384 * to random sampling, otherwise we may try to swap already
4386 if (swap_all_values
) {
4387 dictEntry
*de
= dictFind(db
->dict
,key
->ptr
);
4389 /* de may be NULL since the key already expired */
4392 val
= dictGetEntryVal(de
);
4394 if (val
->refcount
== 1 &&
4395 (vp
= vmSwapObjectBlocking(val
)) != NULL
)
4396 dictGetEntryVal(de
) = vp
;
4403 /* Flush data on disk once 32 MB of additional RAM are used... */
4405 if ((zmalloc_used_memory() - server
.vm_max_memory
) > 1024*1024*32)
4408 /* If we still have some hope of having some value fitting memory
4409 * then we try random sampling. */
4410 if (!swap_all_values
&& server
.vm_enabled
&& force_swapout
) {
4411 while (zmalloc_used_memory() > server
.vm_max_memory
) {
4412 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
4414 if (zmalloc_used_memory() > server
.vm_max_memory
)
4415 swap_all_values
= 1; /* We are already using too much mem */
4421 eoferr
: /* unexpected end of file is handled here with a fatal exit */
4422 redisLog(REDIS_WARNING
,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4424 return REDIS_ERR
; /* Just to avoid warning */
4427 /*================================== Shutdown =============================== */
4428 static int prepareForShutdown() {
4429 redisLog(REDIS_WARNING
,"User requested shutdown, saving DB...");
4430 /* Kill the saving child if there is a background saving in progress.
4431 We want to avoid race conditions, for instance our saving child may
4432 overwrite the synchronous saving done by SHUTDOWN. */
4433 if (server
.bgsavechildpid
!= -1) {
4434 redisLog(REDIS_WARNING
,"There is a live saving child. Killing it!");
4435 kill(server
.bgsavechildpid
,SIGKILL
);
4436 rdbRemoveTempFile(server
.bgsavechildpid
);
4438 if (server
.appendonly
) {
4439 /* Append only file: fsync() the AOF and exit */
4440 aof_fsync(server
.appendfd
);
4441 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
4443 /* Snapshotting. Perform a SYNC SAVE and exit */
4444 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
4445 if (server
.daemonize
)
4446 unlink(server
.pidfile
);
4447 redisLog(REDIS_WARNING
,"%zu bytes used at exit",zmalloc_used_memory());
4449 /* Ooops.. error saving! The best we can do is to continue
4450 * operating. Note that if there was a background saving process,
4451 * in the next cron() Redis will be notified that the background
4452 * saving aborted, handling special stuff like slaves pending for
4453 * synchronization... */
4454 redisLog(REDIS_WARNING
,"Error trying to save the DB, can't exit");
4458 redisLog(REDIS_WARNING
,"Server exit now, bye bye...");
4462 /*================================== Commands =============================== */
4464 static void authCommand(redisClient
*c
) {
4465 if (!server
.requirepass
|| !strcmp(c
->argv
[1]->ptr
, server
.requirepass
)) {
4466 c
->authenticated
= 1;
4467 addReply(c
,shared
.ok
);
4469 c
->authenticated
= 0;
4470 addReplySds(c
,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4474 static void pingCommand(redisClient
*c
) {
4475 addReply(c
,shared
.pong
);
4478 static void echoCommand(redisClient
*c
) {
4479 addReplyBulk(c
,c
->argv
[1]);
4482 /*=================================== Strings =============================== */
4484 static void setGenericCommand(redisClient
*c
, int nx
, robj
*key
, robj
*val
, robj
*expire
) {
4486 long seconds
= 0; /* initialized to avoid a harmless warning */
4489 if (getLongFromObjectOrReply(c
, expire
, &seconds
, NULL
) != REDIS_OK
)
4492 addReplySds(c
,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4497 touchWatchedKey(c
->db
,key
);
4498 if (nx
) deleteIfVolatile(c
->db
,key
);
4499 retval
= dbAdd(c
->db
,key
,val
);
4500 if (retval
== REDIS_ERR
) {
4502 dbReplace(c
->db
,key
,val
);
4505 addReply(c
,shared
.czero
);
4512 removeExpire(c
->db
,key
);
4513 if (expire
) setExpire(c
->db
,key
,time(NULL
)+seconds
);
4514 addReply(c
, nx
? shared
.cone
: shared
.ok
);
4517 static void setCommand(redisClient
*c
) {
4518 setGenericCommand(c
,0,c
->argv
[1],c
->argv
[2],NULL
);
4521 static void setnxCommand(redisClient
*c
) {
4522 setGenericCommand(c
,1,c
->argv
[1],c
->argv
[2],NULL
);
4525 static void setexCommand(redisClient
*c
) {
4526 setGenericCommand(c
,0,c
->argv
[1],c
->argv
[3],c
->argv
[2]);
4529 static int getGenericCommand(redisClient
*c
) {
4532 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
)
4535 if (o
->type
!= REDIS_STRING
) {
4536 addReply(c
,shared
.wrongtypeerr
);
4544 static void getCommand(redisClient
*c
) {
4545 getGenericCommand(c
);
4548 static void getsetCommand(redisClient
*c
) {
4549 if (getGenericCommand(c
) == REDIS_ERR
) return;
4550 dbReplace(c
->db
,c
->argv
[1],c
->argv
[2]);
4551 incrRefCount(c
->argv
[2]);
4553 removeExpire(c
->db
,c
->argv
[1]);
4556 static void mgetCommand(redisClient
*c
) {
4559 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-1));
4560 for (j
= 1; j
< c
->argc
; j
++) {
4561 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[j
]);
4563 addReply(c
,shared
.nullbulk
);
4565 if (o
->type
!= REDIS_STRING
) {
4566 addReply(c
,shared
.nullbulk
);
4574 static void msetGenericCommand(redisClient
*c
, int nx
) {
4575 int j
, busykeys
= 0;
4577 if ((c
->argc
% 2) == 0) {
4578 addReplySds(c
,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4581 /* Handle the NX flag. The MSETNX semantic is to return zero and set
4582 * nothing at all if at least one key already exists. */
4584 for (j
= 1; j
< c
->argc
; j
+= 2) {
4585 if (lookupKeyWrite(c
->db
,c
->argv
[j
]) != NULL
) {
4591 addReply(c
, shared
.czero
);
4595 for (j
= 1; j
< c
->argc
; j
+= 2) {
4596 c
->argv
[j
+1] = tryObjectEncoding(c
->argv
[j
+1]);
4597 dbReplace(c
->db
,c
->argv
[j
],c
->argv
[j
+1]);
4598 incrRefCount(c
->argv
[j
+1]);
4599 removeExpire(c
->db
,c
->argv
[j
]);
4601 server
.dirty
+= (c
->argc
-1)/2;
4602 addReply(c
, nx
? shared
.cone
: shared
.ok
);
4605 static void msetCommand(redisClient
*c
) {
4606 msetGenericCommand(c
,0);
4609 static void msetnxCommand(redisClient
*c
) {
4610 msetGenericCommand(c
,1);
4613 static void incrDecrCommand(redisClient
*c
, long long incr
) {
4617 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4618 if (o
!= NULL
&& checkType(c
,o
,REDIS_STRING
)) return;
4619 if (getLongLongFromObjectOrReply(c
,o
,&value
,NULL
) != REDIS_OK
) return;
4622 o
= createStringObjectFromLongLong(value
);
4623 dbReplace(c
->db
,c
->argv
[1],o
);
4625 addReply(c
,shared
.colon
);
4627 addReply(c
,shared
.crlf
);
4630 static void incrCommand(redisClient
*c
) {
4631 incrDecrCommand(c
,1);
4634 static void decrCommand(redisClient
*c
) {
4635 incrDecrCommand(c
,-1);
4638 static void incrbyCommand(redisClient
*c
) {
4641 if (getLongLongFromObjectOrReply(c
, c
->argv
[2], &incr
, NULL
) != REDIS_OK
) return;
4642 incrDecrCommand(c
,incr
);
4645 static void decrbyCommand(redisClient
*c
) {
4648 if (getLongLongFromObjectOrReply(c
, c
->argv
[2], &incr
, NULL
) != REDIS_OK
) return;
4649 incrDecrCommand(c
,-incr
);
4652 static void appendCommand(redisClient
*c
) {
4657 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4659 /* Create the key */
4660 retval
= dbAdd(c
->db
,c
->argv
[1],c
->argv
[2]);
4661 incrRefCount(c
->argv
[2]);
4662 totlen
= stringObjectLen(c
->argv
[2]);
4664 if (o
->type
!= REDIS_STRING
) {
4665 addReply(c
,shared
.wrongtypeerr
);
4668 /* If the object is specially encoded or shared we have to make
4670 if (o
->refcount
!= 1 || o
->encoding
!= REDIS_ENCODING_RAW
) {
4671 robj
*decoded
= getDecodedObject(o
);
4673 o
= createStringObject(decoded
->ptr
, sdslen(decoded
->ptr
));
4674 decrRefCount(decoded
);
4675 dbReplace(c
->db
,c
->argv
[1],o
);
4678 if (c
->argv
[2]->encoding
== REDIS_ENCODING_RAW
) {
4679 o
->ptr
= sdscatlen(o
->ptr
,
4680 c
->argv
[2]->ptr
, sdslen(c
->argv
[2]->ptr
));
4682 o
->ptr
= sdscatprintf(o
->ptr
, "%ld",
4683 (unsigned long) c
->argv
[2]->ptr
);
4685 totlen
= sdslen(o
->ptr
);
4688 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen
));
4691 static void substrCommand(redisClient
*c
) {
4693 long start
= atoi(c
->argv
[2]->ptr
);
4694 long end
= atoi(c
->argv
[3]->ptr
);
4695 size_t rangelen
, strlen
;
4698 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4699 checkType(c
,o
,REDIS_STRING
)) return;
4701 o
= getDecodedObject(o
);
4702 strlen
= sdslen(o
->ptr
);
4704 /* convert negative indexes */
4705 if (start
< 0) start
= strlen
+start
;
4706 if (end
< 0) end
= strlen
+end
;
4707 if (start
< 0) start
= 0;
4708 if (end
< 0) end
= 0;
4710 /* indexes sanity checks */
4711 if (start
> end
|| (size_t)start
>= strlen
) {
4712 /* Out of range start or start > end result in null reply */
4713 addReply(c
,shared
.nullbulk
);
4717 if ((size_t)end
>= strlen
) end
= strlen
-1;
4718 rangelen
= (end
-start
)+1;
4720 /* Return the result */
4721 addReplySds(c
,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen
));
4722 range
= sdsnewlen((char*)o
->ptr
+start
,rangelen
);
4723 addReplySds(c
,range
);
4724 addReply(c
,shared
.crlf
);
4728 /* ========================= Type agnostic commands ========================= */
4730 static void delCommand(redisClient
*c
) {
4733 for (j
= 1; j
< c
->argc
; j
++) {
4734 if (dbDelete(c
->db
,c
->argv
[j
])) {
4735 touchWatchedKey(c
->db
,c
->argv
[j
]);
4740 addReplyLongLong(c
,deleted
);
4743 static void existsCommand(redisClient
*c
) {
4744 expireIfNeeded(c
->db
,c
->argv
[1]);
4745 if (dbExists(c
->db
,c
->argv
[1])) {
4746 addReply(c
, shared
.cone
);
4748 addReply(c
, shared
.czero
);
4752 static void selectCommand(redisClient
*c
) {
4753 int id
= atoi(c
->argv
[1]->ptr
);
4755 if (selectDb(c
,id
) == REDIS_ERR
) {
4756 addReplySds(c
,sdsnew("-ERR invalid DB index\r\n"));
4758 addReply(c
,shared
.ok
);
4762 static void randomkeyCommand(redisClient
*c
) {
4765 if ((key
= dbRandomKey(c
->db
)) == NULL
) {
4766 addReply(c
,shared
.nullbulk
);
4770 addReplyBulk(c
,key
);
4774 static void keysCommand(redisClient
*c
) {
4777 sds pattern
= c
->argv
[1]->ptr
;
4778 int plen
= sdslen(pattern
);
4779 unsigned long numkeys
= 0;
4780 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
4782 di
= dictGetIterator(c
->db
->dict
);
4784 decrRefCount(lenobj
);
4785 while((de
= dictNext(di
)) != NULL
) {
4786 sds key
= dictGetEntryKey(de
);
4789 if ((pattern
[0] == '*' && pattern
[1] == '\0') ||
4790 stringmatchlen(pattern
,plen
,key
,sdslen(key
),0)) {
4791 keyobj
= createStringObject(key
,sdslen(key
));
4792 if (expireIfNeeded(c
->db
,keyobj
) == 0) {
4793 addReplyBulk(c
,keyobj
);
4796 decrRefCount(keyobj
);
4799 dictReleaseIterator(di
);
4800 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",numkeys
);
4803 static void dbsizeCommand(redisClient
*c
) {
4805 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c
->db
->dict
)));
4808 static void lastsaveCommand(redisClient
*c
) {
4810 sdscatprintf(sdsempty(),":%lu\r\n",server
.lastsave
));
4813 static void typeCommand(redisClient
*c
) {
4817 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4822 case REDIS_STRING
: type
= "+string"; break;
4823 case REDIS_LIST
: type
= "+list"; break;
4824 case REDIS_SET
: type
= "+set"; break;
4825 case REDIS_ZSET
: type
= "+zset"; break;
4826 case REDIS_HASH
: type
= "+hash"; break;
4827 default: type
= "+unknown"; break;
4830 addReplySds(c
,sdsnew(type
));
4831 addReply(c
,shared
.crlf
);
4834 static void saveCommand(redisClient
*c
) {
4835 if (server
.bgsavechildpid
!= -1) {
4836 addReplySds(c
,sdsnew("-ERR background save in progress\r\n"));
4839 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
4840 addReply(c
,shared
.ok
);
4842 addReply(c
,shared
.err
);
4846 static void bgsaveCommand(redisClient
*c
) {
4847 if (server
.bgsavechildpid
!= -1) {
4848 addReplySds(c
,sdsnew("-ERR background save already in progress\r\n"));
4851 if (rdbSaveBackground(server
.dbfilename
) == REDIS_OK
) {
4852 char *status
= "+Background saving started\r\n";
4853 addReplySds(c
,sdsnew(status
));
4855 addReply(c
,shared
.err
);
4859 static void shutdownCommand(redisClient
*c
) {
4860 if (prepareForShutdown() == REDIS_OK
)
4862 addReplySds(c
, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4865 static void renameGenericCommand(redisClient
*c
, int nx
) {
4868 /* To use the same key as src and dst is probably an error */
4869 if (sdscmp(c
->argv
[1]->ptr
,c
->argv
[2]->ptr
) == 0) {
4870 addReply(c
,shared
.sameobjecterr
);
4874 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nokeyerr
)) == NULL
)
4878 deleteIfVolatile(c
->db
,c
->argv
[2]);
4879 if (dbAdd(c
->db
,c
->argv
[2],o
) == REDIS_ERR
) {
4882 addReply(c
,shared
.czero
);
4885 dbReplace(c
->db
,c
->argv
[2],o
);
4887 dbDelete(c
->db
,c
->argv
[1]);
4888 touchWatchedKey(c
->db
,c
->argv
[2]);
4890 addReply(c
,nx
? shared
.cone
: shared
.ok
);
4893 static void renameCommand(redisClient
*c
) {
4894 renameGenericCommand(c
,0);
4897 static void renamenxCommand(redisClient
*c
) {
4898 renameGenericCommand(c
,1);
4901 static void moveCommand(redisClient
*c
) {
4906 /* Obtain source and target DB pointers */
4909 if (selectDb(c
,atoi(c
->argv
[2]->ptr
)) == REDIS_ERR
) {
4910 addReply(c
,shared
.outofrangeerr
);
4914 selectDb(c
,srcid
); /* Back to the source DB */
4916 /* If the user is moving using as target the same
4917 * DB as the source DB it is probably an error. */
4919 addReply(c
,shared
.sameobjecterr
);
4923 /* Check if the element exists and get a reference */
4924 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4926 addReply(c
,shared
.czero
);
4930 /* Try to add the element to the target DB */
4931 deleteIfVolatile(dst
,c
->argv
[1]);
4932 if (dbAdd(dst
,c
->argv
[1],o
) == REDIS_ERR
) {
4933 addReply(c
,shared
.czero
);
4938 /* OK! key moved, free the entry in the source DB */
4939 dbDelete(src
,c
->argv
[1]);
4941 addReply(c
,shared
.cone
);
4944 /* =================================== Lists ================================ */
4947 /* Check the argument length to see if it requires us to convert the ziplist
4948 * to a real list. Only check raw-encoded objects because integer encoded
4949 * objects are never too long. */
4950 static void listTypeTryConversion(robj
*subject
, robj
*value
) {
4951 if (subject
->encoding
!= REDIS_ENCODING_ZIPLIST
) return;
4952 if (value
->encoding
== REDIS_ENCODING_RAW
&&
4953 sdslen(value
->ptr
) > server
.list_max_ziplist_value
)
4954 listTypeConvert(subject
,REDIS_ENCODING_LIST
);
4957 static void listTypePush(robj
*subject
, robj
*value
, int where
) {
4958 /* Check if we need to convert the ziplist */
4959 listTypeTryConversion(subject
,value
);
4960 if (subject
->encoding
== REDIS_ENCODING_ZIPLIST
&&
4961 ziplistLen(subject
->ptr
) > server
.list_max_ziplist_entries
)
4962 listTypeConvert(subject
,REDIS_ENCODING_LIST
);
4964 if (subject
->encoding
== REDIS_ENCODING_ZIPLIST
) {
4965 int pos
= (where
== REDIS_HEAD
) ? ZIPLIST_HEAD
: ZIPLIST_TAIL
;
4966 value
= getDecodedObject(value
);
4967 subject
->ptr
= ziplistPush(subject
->ptr
,value
->ptr
,sdslen(value
->ptr
),pos
);
4968 decrRefCount(value
);
4969 } else if (subject
->encoding
== REDIS_ENCODING_LIST
) {
4970 if (where
== REDIS_HEAD
) {
4971 listAddNodeHead(subject
->ptr
,value
);
4973 listAddNodeTail(subject
->ptr
,value
);
4975 incrRefCount(value
);
4977 redisPanic("Unknown list encoding");
4981 static robj
*listTypePop(robj
*subject
, int where
) {
4983 if (subject
->encoding
== REDIS_ENCODING_ZIPLIST
) {
4985 unsigned char *vstr
;
4988 int pos
= (where
== REDIS_HEAD
) ? 0 : -1;
4989 p
= ziplistIndex(subject
->ptr
,pos
);
4990 if (ziplistGet(p
,&vstr
,&vlen
,&vlong
)) {
4992 value
= createStringObject((char*)vstr
,vlen
);
4994 value
= createStringObjectFromLongLong(vlong
);
4996 /* We only need to delete an element when it exists */
4997 subject
->ptr
= ziplistDelete(subject
->ptr
,&p
);
4999 } else if (subject
->encoding
== REDIS_ENCODING_LIST
) {
5000 list
*list
= subject
->ptr
;
5002 if (where
== REDIS_HEAD
) {
5003 ln
= listFirst(list
);
5005 ln
= listLast(list
);
5008 value
= listNodeValue(ln
);
5009 incrRefCount(value
);
5010 listDelNode(list
,ln
);
5013 redisPanic("Unknown list encoding");
5018 static unsigned long listTypeLength(robj
*subject
) {
5019 if (subject
->encoding
== REDIS_ENCODING_ZIPLIST
) {
5020 return ziplistLen(subject
->ptr
);
5021 } else if (subject
->encoding
== REDIS_ENCODING_LIST
) {
5022 return listLength((list
*)subject
->ptr
);
5024 redisPanic("Unknown list encoding");
5028 /* Structure to hold list iteration abstraction. */
5031 unsigned char encoding
;
5032 unsigned char direction
; /* Iteration direction */
5037 /* Structure for an entry while iterating over a list. */
5039 listTypeIterator
*li
;
5040 unsigned char *zi
; /* Entry in ziplist */
5041 listNode
*ln
; /* Entry in linked list */
5044 /* Initialize an iterator at the specified index. */
5045 static listTypeIterator
*listTypeInitIterator(robj
*subject
, int index
, unsigned char direction
) {
5046 listTypeIterator
*li
= zmalloc(sizeof(listTypeIterator
));
5047 li
->subject
= subject
;
5048 li
->encoding
= subject
->encoding
;
5049 li
->direction
= direction
;
5050 if (li
->encoding
== REDIS_ENCODING_ZIPLIST
) {
5051 li
->zi
= ziplistIndex(subject
->ptr
,index
);
5052 } else if (li
->encoding
== REDIS_ENCODING_LIST
) {
5053 li
->ln
= listIndex(subject
->ptr
,index
);
5055 redisPanic("Unknown list encoding");
5060 /* Clean up the iterator. */
5061 static void listTypeReleaseIterator(listTypeIterator
*li
) {
5065 /* Stores pointer to the current entry in the provided entry structure
5066 * and advances the position of the iterator. Returns 1 when the current
5067 * entry is in fact an entry, 0 otherwise. */
5068 static int listTypeNext(listTypeIterator
*li
, listTypeEntry
*entry
) {
5069 /* Protect from converting when iterating */
5070 redisAssert(li
->subject
->encoding
== li
->encoding
);
5073 if (li
->encoding
== REDIS_ENCODING_ZIPLIST
) {
5075 if (entry
->zi
!= NULL
) {
5076 if (li
->direction
== REDIS_TAIL
)
5077 li
->zi
= ziplistNext(li
->subject
->ptr
,li
->zi
);
5079 li
->zi
= ziplistPrev(li
->subject
->ptr
,li
->zi
);
5082 } else if (li
->encoding
== REDIS_ENCODING_LIST
) {
5084 if (entry
->ln
!= NULL
) {
5085 if (li
->direction
== REDIS_TAIL
)
5086 li
->ln
= li
->ln
->next
;
5088 li
->ln
= li
->ln
->prev
;
5092 redisPanic("Unknown list encoding");
5097 /* Return entry or NULL at the current position of the iterator. */
5098 static robj
*listTypeGet(listTypeEntry
*entry
) {
5099 listTypeIterator
*li
= entry
->li
;
5101 if (li
->encoding
== REDIS_ENCODING_ZIPLIST
) {
5102 unsigned char *vstr
;
5105 redisAssert(entry
->zi
!= NULL
);
5106 if (ziplistGet(entry
->zi
,&vstr
,&vlen
,&vlong
)) {
5108 value
= createStringObject((char*)vstr
,vlen
);
5110 value
= createStringObjectFromLongLong(vlong
);
5113 } else if (li
->encoding
== REDIS_ENCODING_LIST
) {
5114 redisAssert(entry
->ln
!= NULL
);
5115 value
= listNodeValue(entry
->ln
);
5116 incrRefCount(value
);
5118 redisPanic("Unknown list encoding");
5123 /* Compare the given object with the entry at the current position. */
5124 static int listTypeEqual(listTypeEntry
*entry
, robj
*o
) {
5125 listTypeIterator
*li
= entry
->li
;
5126 if (li
->encoding
== REDIS_ENCODING_ZIPLIST
) {
5127 redisAssert(o
->encoding
== REDIS_ENCODING_RAW
);
5128 return ziplistCompare(entry
->zi
,o
->ptr
,sdslen(o
->ptr
));
5129 } else if (li
->encoding
== REDIS_ENCODING_LIST
) {
5130 return equalStringObjects(o
,listNodeValue(entry
->ln
));
5132 redisPanic("Unknown list encoding");
5136 /* Delete the element pointed to. */
5137 static void listTypeDelete(listTypeEntry
*entry
) {
5138 listTypeIterator
*li
= entry
->li
;
5139 if (li
->encoding
== REDIS_ENCODING_ZIPLIST
) {
5140 unsigned char *p
= entry
->zi
;
5141 li
->subject
->ptr
= ziplistDelete(li
->subject
->ptr
,&p
);
5143 /* Update position of the iterator depending on the direction */
5144 if (li
->direction
== REDIS_TAIL
)
5147 li
->zi
= ziplistPrev(li
->subject
->ptr
,p
);
5148 } else if (entry
->li
->encoding
== REDIS_ENCODING_LIST
) {
5150 if (li
->direction
== REDIS_TAIL
)
5151 next
= entry
->ln
->next
;
5153 next
= entry
->ln
->prev
;
5154 listDelNode(li
->subject
->ptr
,entry
->ln
);
5157 redisPanic("Unknown list encoding");
5161 static void listTypeConvert(robj
*subject
, int enc
) {
5162 listTypeIterator
*li
;
5163 listTypeEntry entry
;
5164 redisAssert(subject
->type
== REDIS_LIST
);
5166 if (enc
== REDIS_ENCODING_LIST
) {
5167 list
*l
= listCreate();
5168 listSetFreeMethod(l
,decrRefCount
);
5170 /* listTypeGet returns a robj with incremented refcount */
5171 li
= listTypeInitIterator(subject
,0,REDIS_TAIL
);
5172 while (listTypeNext(li
,&entry
)) listAddNodeTail(l
,listTypeGet(&entry
));
5173 listTypeReleaseIterator(li
);
5175 subject
->encoding
= REDIS_ENCODING_LIST
;
5176 zfree(subject
->ptr
);
5179 redisPanic("Unsupported list conversion");
5183 static void pushGenericCommand(redisClient
*c
, int where
) {
5184 robj
*lobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5186 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
5187 addReply(c
,shared
.cone
);
5190 lobj
= createZiplistObject();
5191 dbAdd(c
->db
,c
->argv
[1],lobj
);
5193 if (lobj
->type
!= REDIS_LIST
) {
5194 addReply(c
,shared
.wrongtypeerr
);
5197 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
5198 addReply(c
,shared
.cone
);
5202 listTypePush(lobj
,c
->argv
[2],where
);
5203 addReplyLongLong(c
,listTypeLength(lobj
));
5207 static void lpushCommand(redisClient
*c
) {
5208 pushGenericCommand(c
,REDIS_HEAD
);
5211 static void rpushCommand(redisClient
*c
) {
5212 pushGenericCommand(c
,REDIS_TAIL
);
5215 static void llenCommand(redisClient
*c
) {
5216 robj
*o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
);
5217 if (o
== NULL
|| checkType(c
,o
,REDIS_LIST
)) return;
5218 addReplyUlong(c
,listTypeLength(o
));
5221 static void lindexCommand(redisClient
*c
) {
5222 robj
*o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
);
5223 if (o
== NULL
|| checkType(c
,o
,REDIS_LIST
)) return;
5224 int index
= atoi(c
->argv
[2]->ptr
);
5227 if (o
->encoding
== REDIS_ENCODING_ZIPLIST
) {
5229 unsigned char *vstr
;
5232 p
= ziplistIndex(o
->ptr
,index
);
5233 if (ziplistGet(p
,&vstr
,&vlen
,&vlong
)) {
5235 value
= createStringObject((char*)vstr
,vlen
);
5237 value
= createStringObjectFromLongLong(vlong
);
5239 addReplyBulk(c
,value
);
5240 decrRefCount(value
);
5242 addReply(c
,shared
.nullbulk
);
5244 } else if (o
->encoding
== REDIS_ENCODING_LIST
) {
5245 listNode
*ln
= listIndex(o
->ptr
,index
);
5247 value
= listNodeValue(ln
);
5248 addReplyBulk(c
,value
);
5250 addReply(c
,shared
.nullbulk
);
5253 redisPanic("Unknown list encoding");
5257 static void lsetCommand(redisClient
*c
) {
5258 robj
*o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nokeyerr
);
5259 if (o
== NULL
|| checkType(c
,o
,REDIS_LIST
)) return;
5260 int index
= atoi(c
->argv
[2]->ptr
);
5261 robj
*value
= c
->argv
[3];
5263 listTypeTryConversion(o
,value
);
5264 if (o
->encoding
== REDIS_ENCODING_ZIPLIST
) {
5265 unsigned char *p
, *zl
= o
->ptr
;
5266 p
= ziplistIndex(zl
,index
);
5268 addReply(c
,shared
.outofrangeerr
);
5270 o
->ptr
= ziplistDelete(o
->ptr
,&p
);
5271 value
= getDecodedObject(value
);
5272 o
->ptr
= ziplistInsert(o
->ptr
,p
,value
->ptr
,sdslen(value
->ptr
));
5273 decrRefCount(value
);
5274 addReply(c
,shared
.ok
);
5277 } else if (o
->encoding
== REDIS_ENCODING_LIST
) {
5278 listNode
*ln
= listIndex(o
->ptr
,index
);
5280 addReply(c
,shared
.outofrangeerr
);
5282 decrRefCount((robj
*)listNodeValue(ln
));
5283 listNodeValue(ln
) = value
;
5284 incrRefCount(value
);
5285 addReply(c
,shared
.ok
);
5289 redisPanic("Unknown list encoding");
5293 static void popGenericCommand(redisClient
*c
, int where
) {
5294 robj
*o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nullbulk
);
5295 if (o
== NULL
|| checkType(c
,o
,REDIS_LIST
)) return;
5297 robj
*value
= listTypePop(o
,where
);
5298 if (value
== NULL
) {
5299 addReply(c
,shared
.nullbulk
);
5301 addReplyBulk(c
,value
);
5302 decrRefCount(value
);
5303 if (listTypeLength(o
) == 0) dbDelete(c
->db
,c
->argv
[1]);
5308 static void lpopCommand(redisClient
*c
) {
5309 popGenericCommand(c
,REDIS_HEAD
);
5312 static void rpopCommand(redisClient
*c
) {
5313 popGenericCommand(c
,REDIS_TAIL
);
5316 static void lrangeCommand(redisClient
*c
) {
5318 int start
= atoi(c
->argv
[2]->ptr
);
5319 int end
= atoi(c
->argv
[3]->ptr
);
5322 listTypeEntry entry
;
5324 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.emptymultibulk
)) == NULL
5325 || checkType(c
,o
,REDIS_LIST
)) return;
5326 llen
= listTypeLength(o
);
5328 /* convert negative indexes */
5329 if (start
< 0) start
= llen
+start
;
5330 if (end
< 0) end
= llen
+end
;
5331 if (start
< 0) start
= 0;
5332 if (end
< 0) end
= 0;
5334 /* indexes sanity checks */
5335 if (start
> end
|| start
>= llen
) {
5336 /* Out of range start or start > end result in empty list */
5337 addReply(c
,shared
.emptymultibulk
);
5340 if (end
>= llen
) end
= llen
-1;
5341 rangelen
= (end
-start
)+1;
5343 /* Return the result in form of a multi-bulk reply */
5344 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",rangelen
));
5345 listTypeIterator
*li
= listTypeInitIterator(o
,start
,REDIS_TAIL
);
5346 for (j
= 0; j
< rangelen
; j
++) {
5347 redisAssert(listTypeNext(li
,&entry
));
5348 value
= listTypeGet(&entry
);
5349 addReplyBulk(c
,value
);
5350 decrRefCount(value
);
5352 listTypeReleaseIterator(li
);
5355 static void ltrimCommand(redisClient
*c
) {
5357 int start
= atoi(c
->argv
[2]->ptr
);
5358 int end
= atoi(c
->argv
[3]->ptr
);
5360 int j
, ltrim
, rtrim
;
5364 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.ok
)) == NULL
||
5365 checkType(c
,o
,REDIS_LIST
)) return;
5366 llen
= listTypeLength(o
);
5368 /* convert negative indexes */
5369 if (start
< 0) start
= llen
+start
;
5370 if (end
< 0) end
= llen
+end
;
5371 if (start
< 0) start
= 0;
5372 if (end
< 0) end
= 0;
5374 /* indexes sanity checks */
5375 if (start
> end
|| start
>= llen
) {
5376 /* Out of range start or start > end result in empty list */
5380 if (end
>= llen
) end
= llen
-1;
5385 /* Remove list elements to perform the trim */
5386 if (o
->encoding
== REDIS_ENCODING_ZIPLIST
) {
5387 o
->ptr
= ziplistDeleteRange(o
->ptr
,0,ltrim
);
5388 o
->ptr
= ziplistDeleteRange(o
->ptr
,-rtrim
,rtrim
);
5389 } else if (o
->encoding
== REDIS_ENCODING_LIST
) {
5391 for (j
= 0; j
< ltrim
; j
++) {
5392 ln
= listFirst(list
);
5393 listDelNode(list
,ln
);
5395 for (j
= 0; j
< rtrim
; j
++) {
5396 ln
= listLast(list
);
5397 listDelNode(list
,ln
);
5400 redisPanic("Unknown list encoding");
5402 if (listTypeLength(o
) == 0) dbDelete(c
->db
,c
->argv
[1]);
5404 addReply(c
,shared
.ok
);
5407 static void lremCommand(redisClient
*c
) {
5408 robj
*subject
, *obj
= c
->argv
[3];
5409 int toremove
= atoi(c
->argv
[2]->ptr
);
5411 listTypeEntry entry
;
5413 subject
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
);
5414 if (subject
== NULL
|| checkType(c
,subject
,REDIS_LIST
)) return;
5416 /* Make sure obj is raw when we're dealing with a ziplist */
5417 if (subject
->encoding
== REDIS_ENCODING_ZIPLIST
)
5418 obj
= getDecodedObject(obj
);
5420 listTypeIterator
*li
;
5422 toremove
= -toremove
;
5423 li
= listTypeInitIterator(subject
,-1,REDIS_HEAD
);
5425 li
= listTypeInitIterator(subject
,0,REDIS_TAIL
);
5428 while (listTypeNext(li
,&entry
)) {
5429 if (listTypeEqual(&entry
,obj
)) {
5430 listTypeDelete(&entry
);
5433 if (toremove
&& removed
== toremove
) break;
5436 listTypeReleaseIterator(li
);
5438 /* Clean up raw encoded object */
5439 if (subject
->encoding
== REDIS_ENCODING_ZIPLIST
)
5442 if (listTypeLength(subject
) == 0) dbDelete(c
->db
,c
->argv
[1]);
5443 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",removed
));
5446 /* This is the semantic of this command:
5447 * RPOPLPUSH srclist dstlist:
5448 * IF LLEN(srclist) > 0
5449 * element = RPOP srclist
5450 * LPUSH dstlist element
5457 * The idea is to be able to get an element from a list in a reliable way
5458 * since the element is not just returned but pushed against another list
5459 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5461 static void rpoplpushcommand(redisClient
*c
) {
5463 if ((sobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
5464 checkType(c
,sobj
,REDIS_LIST
)) return;
5466 if (listTypeLength(sobj
) == 0) {
5467 addReply(c
,shared
.nullbulk
);
5469 robj
*dobj
= lookupKeyWrite(c
->db
,c
->argv
[2]);
5470 if (dobj
&& checkType(c
,dobj
,REDIS_LIST
)) return;
5471 value
= listTypePop(sobj
,REDIS_TAIL
);
5473 /* Add the element to the target list (unless it's directly
5474 * passed to some BLPOP-ing client) */
5475 if (!handleClientsWaitingListPush(c
,c
->argv
[2],value
)) {
5476 /* Create the list if the key does not exist */
5478 dobj
= createZiplistObject();
5479 dbAdd(c
->db
,c
->argv
[2],dobj
);
5481 listTypePush(dobj
,value
,REDIS_HEAD
);
5484 /* Send the element to the client as reply as well */
5485 addReplyBulk(c
,value
);
5487 /* listTypePop returns an object with its refcount incremented */
5488 decrRefCount(value
);
5490 /* Delete the source list when it is empty */
5491 if (listTypeLength(sobj
) == 0) dbDelete(c
->db
,c
->argv
[1]);
5496 /* ==================================== Sets ================================ */
5498 /* Factory method to return a set that *can* hold "value". When the object has
5499 * an integer-encodable value, an intset will be returned. Otherwise a regular
5501 static robj
*setTypeCreate(robj
*value
) {
5502 if (getLongLongFromObject(value
,NULL
) == REDIS_OK
)
5503 return createIntsetObject();
5504 return createSetObject();
5507 static int setTypeAdd(robj
*subject
, robj
*value
) {
5509 if (subject
->encoding
== REDIS_ENCODING_HT
) {
5510 if (dictAdd(subject
->ptr
,value
,NULL
) == DICT_OK
) {
5511 incrRefCount(value
);
5514 } else if (subject
->encoding
== REDIS_ENCODING_INTSET
) {
5515 if (getLongLongFromObject(value
,&llval
) == REDIS_OK
) {
5516 uint8_t success
= 0;
5517 subject
->ptr
= intsetAdd(subject
->ptr
,llval
,&success
);
5519 /* Convert to regular set when the intset contains
5520 * too many entries. */
5521 if (intsetLen(subject
->ptr
) > server
.set_max_intset_entries
)
5522 setTypeConvert(subject
,REDIS_ENCODING_HT
);
5526 /* Failed to get integer from object, convert to regular set. */
5527 setTypeConvert(subject
,REDIS_ENCODING_HT
);
5529 /* The set *was* an intset and this value is not integer
5530 * encodable, so dictAdd should always work. */
5531 redisAssert(dictAdd(subject
->ptr
,value
,NULL
) == DICT_OK
);
5532 incrRefCount(value
);
5536 redisPanic("Unknown set encoding");
5541 static int setTypeRemove(robj
*subject
, robj
*value
) {
5543 if (subject
->encoding
== REDIS_ENCODING_HT
) {
5544 if (dictDelete(subject
->ptr
,value
) == DICT_OK
) {
5545 if (htNeedsResize(subject
->ptr
)) dictResize(subject
->ptr
);
5548 } else if (subject
->encoding
== REDIS_ENCODING_INTSET
) {
5549 if (getLongLongFromObject(value
,&llval
) == REDIS_OK
) {
5551 subject
->ptr
= intsetRemove(subject
->ptr
,llval
,&success
);
5552 if (success
) return 1;
5555 redisPanic("Unknown set encoding");
5560 static int setTypeIsMember(robj
*subject
, robj
*value
) {
5562 if (subject
->encoding
== REDIS_ENCODING_HT
) {
5563 return dictFind((dict
*)subject
->ptr
,value
) != NULL
;
5564 } else if (subject
->encoding
== REDIS_ENCODING_INTSET
) {
5565 if (getLongLongFromObject(value
,&llval
) == REDIS_OK
) {
5566 return intsetFind((intset
*)subject
->ptr
,llval
);
5569 redisPanic("Unknown set encoding");
5574 /* Structure to hold set iteration abstraction. */
5578 int ii
; /* intset iterator */
5582 static setIterator
*setTypeInitIterator(robj
*subject
) {
5583 setIterator
*si
= zmalloc(sizeof(setIterator
));
5584 si
->subject
= subject
;
5585 si
->encoding
= subject
->encoding
;
5586 if (si
->encoding
== REDIS_ENCODING_HT
) {
5587 si
->di
= dictGetIterator(subject
->ptr
);
5588 } else if (si
->encoding
== REDIS_ENCODING_INTSET
) {
5591 redisPanic("Unknown set encoding");
5596 static void setTypeReleaseIterator(setIterator
*si
) {
5597 if (si
->encoding
== REDIS_ENCODING_HT
)
5598 dictReleaseIterator(si
->di
);
5602 /* Move to the next entry in the set. Returns the object at the current
5603 * position, or NULL when the end is reached. This object will have its
5604 * refcount incremented, so the caller needs to take care of this. */
5605 static robj
*setTypeNext(setIterator
*si
) {
5607 if (si
->encoding
== REDIS_ENCODING_HT
) {
5608 dictEntry
*de
= dictNext(si
->di
);
5610 ret
= dictGetEntryKey(de
);
5613 } else if (si
->encoding
== REDIS_ENCODING_INTSET
) {
5615 if (intsetGet(si
->subject
->ptr
,si
->ii
++,&llval
))
5616 ret
= createStringObjectFromLongLong(llval
);
5622 /* Return random element from set. The returned object will always have
5623 * an incremented refcount. */
5624 robj
*setTypeRandomElement(robj
*subject
) {
5626 if (subject
->encoding
== REDIS_ENCODING_HT
) {
5627 dictEntry
*de
= dictGetRandomKey(subject
->ptr
);
5628 ret
= dictGetEntryKey(de
);
5630 } else if (subject
->encoding
== REDIS_ENCODING_INTSET
) {
5631 long long llval
= intsetRandom(subject
->ptr
);
5632 ret
= createStringObjectFromLongLong(llval
);
5634 redisPanic("Unknown set encoding");
5639 static unsigned long setTypeSize(robj
*subject
) {
5640 if (subject
->encoding
== REDIS_ENCODING_HT
) {
5641 return dictSize((dict
*)subject
->ptr
);
5642 } else if (subject
->encoding
== REDIS_ENCODING_INTSET
) {
5643 return intsetLen((intset
*)subject
->ptr
);
5645 redisPanic("Unknown set encoding");
5649 static void setTypeConvert(robj
*subject
, int enc
) {
5652 redisAssert(subject
->type
== REDIS_SET
);
5654 if (enc
== REDIS_ENCODING_HT
) {
5655 dict
*d
= dictCreate(&setDictType
,NULL
);
5657 /* setTypeNext returns a robj with incremented refcount */
5658 si
= setTypeInitIterator(subject
);
5659 while ((element
= setTypeNext(si
)) != NULL
)
5660 redisAssert(dictAdd(d
,element
,NULL
) == DICT_OK
);
5661 setTypeReleaseIterator(si
);
5663 subject
->encoding
= REDIS_ENCODING_HT
;
5664 zfree(subject
->ptr
);
5667 redisPanic("Unsupported set conversion");
5671 static void saddCommand(redisClient
*c
) {
5674 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5676 set
= setTypeCreate(c
->argv
[2]);
5677 dbAdd(c
->db
,c
->argv
[1],set
);
5679 if (set
->type
!= REDIS_SET
) {
5680 addReply(c
,shared
.wrongtypeerr
);
5684 if (setTypeAdd(set
,c
->argv
[2])) {
5686 addReply(c
,shared
.cone
);
5688 addReply(c
,shared
.czero
);
5692 static void sremCommand(redisClient
*c
) {
5695 if ((set
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5696 checkType(c
,set
,REDIS_SET
)) return;
5698 if (setTypeRemove(set
,c
->argv
[2])) {
5699 if (setTypeSize(set
) == 0) dbDelete(c
->db
,c
->argv
[1]);
5701 addReply(c
,shared
.cone
);
5703 addReply(c
,shared
.czero
);
5707 static void smoveCommand(redisClient
*c
) {
5708 robj
*srcset
, *dstset
, *ele
;
5709 srcset
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5710 dstset
= lookupKeyWrite(c
->db
,c
->argv
[2]);
5713 /* If the source key does not exist return 0 */
5714 if (srcset
== NULL
) {
5715 addReply(c
,shared
.czero
);
5719 /* If the source key has the wrong type, or the destination key
5720 * is set and has the wrong type, return with an error. */
5721 if (checkType(c
,srcset
,REDIS_SET
) ||
5722 (dstset
&& checkType(c
,dstset
,REDIS_SET
))) return;
5724 /* If srcset and dstset are equal, SMOVE is a no-op */
5725 if (srcset
== dstset
) {
5726 addReply(c
,shared
.cone
);
5730 /* If the element cannot be removed from the src set, return 0. */
5731 if (!setTypeRemove(srcset
,ele
)) {
5732 addReply(c
,shared
.czero
);
5736 /* Remove the src set from the database when empty */
5737 if (setTypeSize(srcset
) == 0) dbDelete(c
->db
,c
->argv
[1]);
5740 /* Create the destination set when it doesn't exist */
5742 dstset
= setTypeCreate(ele
);
5743 dbAdd(c
->db
,c
->argv
[2],dstset
);
5746 /* An extra key has changed when ele was successfully added to dstset */
5747 if (setTypeAdd(dstset
,ele
)) server
.dirty
++;
5748 addReply(c
,shared
.cone
);
5751 static void sismemberCommand(redisClient
*c
) {
5754 if ((set
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5755 checkType(c
,set
,REDIS_SET
)) return;
5757 if (setTypeIsMember(set
,c
->argv
[2]))
5758 addReply(c
,shared
.cone
);
5760 addReply(c
,shared
.czero
);
5763 static void scardCommand(redisClient
*c
) {
5766 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5767 checkType(c
,o
,REDIS_SET
)) return;
5769 addReplyUlong(c
,setTypeSize(o
));
5772 static void spopCommand(redisClient
*c
) {
5775 if ((set
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
5776 checkType(c
,set
,REDIS_SET
)) return;
5778 ele
= setTypeRandomElement(set
);
5780 addReply(c
,shared
.nullbulk
);
5782 setTypeRemove(set
,ele
);
5783 addReplyBulk(c
,ele
);
5785 if (setTypeSize(set
) == 0) dbDelete(c
->db
,c
->argv
[1]);
5790 static void srandmemberCommand(redisClient
*c
) {
5793 if ((set
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
5794 checkType(c
,set
,REDIS_SET
)) return;
5796 ele
= setTypeRandomElement(set
);
5798 addReply(c
,shared
.nullbulk
);
5800 addReplyBulk(c
,ele
);
5805 static int qsortCompareSetsByCardinality(const void *s1
, const void *s2
) {
5806 return setTypeSize(*(robj
**)s1
)-setTypeSize(*(robj
**)s2
);
5809 static void sinterGenericCommand(redisClient
*c
, robj
**setkeys
, unsigned long setnum
, robj
*dstkey
) {
5810 robj
**sets
= zmalloc(sizeof(robj
*)*setnum
);
5812 robj
*ele
, *lenobj
= NULL
, *dstset
= NULL
;
5813 unsigned long j
, cardinality
= 0;
5815 for (j
= 0; j
< setnum
; j
++) {
5816 robj
*setobj
= dstkey
?
5817 lookupKeyWrite(c
->db
,setkeys
[j
]) :
5818 lookupKeyRead(c
->db
,setkeys
[j
]);
5822 if (dbDelete(c
->db
,dstkey
))
5824 addReply(c
,shared
.czero
);
5826 addReply(c
,shared
.emptymultibulk
);
5830 if (checkType(c
,setobj
,REDIS_SET
)) {
5836 /* Sort sets from the smallest to largest, this will improve our
5837 * algorithm's performance */
5838 qsort(sets
,setnum
,sizeof(robj
*),qsortCompareSetsByCardinality
);
5840 /* The first thing we should output is the total number of elements...
5841 * since this is a multi-bulk write, but at this stage we don't know
5842 * the intersection set size, so we use a trick, append an empty object
5843 * to the output list and save the pointer to later modify it with the
5846 lenobj
= createObject(REDIS_STRING
,NULL
);
5848 decrRefCount(lenobj
);
5850 /* If we have a target key where to store the resulting set
5851 * create this key with an empty set inside */
5852 dstset
= createIntsetObject();
5855 /* Iterate all the elements of the first (smallest) set, and test
5856 * the element against all the other sets, if at least one set does
5857 * not include the element it is discarded */
5858 si
= setTypeInitIterator(sets
[0]);
5859 while((ele
= setTypeNext(si
)) != NULL
) {
5860 for (j
= 1; j
< setnum
; j
++)
5861 if (!setTypeIsMember(sets
[j
],ele
)) break;
5863 /* Only take action when all sets contain the member */
5866 addReplyBulk(c
,ele
);
5869 setTypeAdd(dstset
,ele
);
5874 setTypeReleaseIterator(si
);
5877 /* Store the resulting set into the target, if the intersection
5878 * is not an empty set. */
5879 dbDelete(c
->db
,dstkey
);
5880 if (setTypeSize(dstset
) > 0) {
5881 dbAdd(c
->db
,dstkey
,dstset
);
5882 addReplyLongLong(c
,setTypeSize(dstset
));
5884 decrRefCount(dstset
);
5885 addReply(c
,shared
.czero
);
5889 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",cardinality
);
5894 static void sinterCommand(redisClient
*c
) {
5895 sinterGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
);
5898 static void sinterstoreCommand(redisClient
*c
) {
5899 sinterGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1]);
5902 #define REDIS_OP_UNION 0
5903 #define REDIS_OP_DIFF 1
5904 #define REDIS_OP_INTER 2
5906 static void sunionDiffGenericCommand(redisClient
*c
, robj
**setkeys
, int setnum
, robj
*dstkey
, int op
) {
5907 robj
**sets
= zmalloc(sizeof(robj
*)*setnum
);
5909 robj
*ele
, *dstset
= NULL
;
5910 int j
, cardinality
= 0;
5912 for (j
= 0; j
< setnum
; j
++) {
5913 robj
*setobj
= dstkey
?
5914 lookupKeyWrite(c
->db
,setkeys
[j
]) :
5915 lookupKeyRead(c
->db
,setkeys
[j
]);
5920 if (checkType(c
,setobj
,REDIS_SET
)) {
5927 /* We need a temp set object to store our union. If the dstkey
5928 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5929 * this set object will be the resulting object to set into the target key*/
5930 dstset
= createIntsetObject();
5932 /* Iterate all the elements of all the sets, add every element a single
5933 * time to the result set */
5934 for (j
= 0; j
< setnum
; j
++) {
5935 if (op
== REDIS_OP_DIFF
&& j
== 0 && !sets
[j
]) break; /* result set is empty */
5936 if (!sets
[j
]) continue; /* non existing keys are like empty sets */
5938 si
= setTypeInitIterator(sets
[j
]);
5939 while((ele
= setTypeNext(si
)) != NULL
) {
5940 if (op
== REDIS_OP_UNION
|| j
== 0) {
5941 if (setTypeAdd(dstset
,ele
)) {
5944 } else if (op
== REDIS_OP_DIFF
) {
5945 if (setTypeRemove(dstset
,ele
)) {
5951 setTypeReleaseIterator(si
);
5953 /* Exit when result set is empty. */
5954 if (op
== REDIS_OP_DIFF
&& cardinality
== 0) break;
5957 /* Output the content of the resulting set, if not in STORE mode */
5959 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",cardinality
));
5960 si
= setTypeInitIterator(dstset
);
5961 while((ele
= setTypeNext(si
)) != NULL
) {
5962 addReplyBulk(c
,ele
);
5965 setTypeReleaseIterator(si
);
5966 decrRefCount(dstset
);
5968 /* If we have a target key where to store the resulting set
5969 * create this key with the result set inside */
5970 dbDelete(c
->db
,dstkey
);
5971 if (setTypeSize(dstset
) > 0) {
5972 dbAdd(c
->db
,dstkey
,dstset
);
5973 addReplyLongLong(c
,setTypeSize(dstset
));
5975 decrRefCount(dstset
);
5976 addReply(c
,shared
.czero
);
5983 static void sunionCommand(redisClient
*c
) {
5984 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_UNION
);
5987 static void sunionstoreCommand(redisClient
*c
) {
5988 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_UNION
);
5991 static void sdiffCommand(redisClient
*c
) {
5992 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_DIFF
);
5995 static void sdiffstoreCommand(redisClient
*c
) {
5996 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_DIFF
);
5999 /* ==================================== ZSets =============================== */
6001 /* ZSETs are ordered sets using two data structures to hold the same elements
6002 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
6005 * The elements are added to a hash table mapping Redis objects to scores.
6006 * At the same time the elements are added to a skip list mapping scores
6007 * to Redis objects (so objects are sorted by scores in this "view"). */
6009 /* This skiplist implementation is almost a C translation of the original
6010 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
6011 * Alternative to Balanced Trees", modified in three ways:
6012 * a) this implementation allows for repeated values.
6013 * b) the comparison is not just by key (our 'score') but by satellite data.
6014 * c) there is a back pointer, so it's a doubly linked list with the back
6015 * pointers being only at "level 1". This allows to traverse the list
6016 * from tail to head, useful for ZREVRANGE. */
6018 static zskiplistNode
*zslCreateNode(int level
, double score
, robj
*obj
) {
6019 zskiplistNode
*zn
= zmalloc(sizeof(*zn
));
6021 zn
->forward
= zmalloc(sizeof(zskiplistNode
*) * level
);
6023 zn
->span
= zmalloc(sizeof(unsigned int) * (level
- 1));
6031 static zskiplist
*zslCreate(void) {
6035 zsl
= zmalloc(sizeof(*zsl
));
6038 zsl
->header
= zslCreateNode(ZSKIPLIST_MAXLEVEL
,0,NULL
);
6039 for (j
= 0; j
< ZSKIPLIST_MAXLEVEL
; j
++) {
6040 zsl
->header
->forward
[j
] = NULL
;
6042 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
6043 if (j
< ZSKIPLIST_MAXLEVEL
-1)
6044 zsl
->header
->span
[j
] = 0;
6046 zsl
->header
->backward
= NULL
;
6051 static void zslFreeNode(zskiplistNode
*node
) {
6052 decrRefCount(node
->obj
);
6053 zfree(node
->forward
);
6058 static void zslFree(zskiplist
*zsl
) {
6059 zskiplistNode
*node
= zsl
->header
->forward
[0], *next
;
6061 zfree(zsl
->header
->forward
);
6062 zfree(zsl
->header
->span
);
6065 next
= node
->forward
[0];
6072 static int zslRandomLevel(void) {
6074 while ((random()&0xFFFF) < (ZSKIPLIST_P
* 0xFFFF))
6076 return (level
<ZSKIPLIST_MAXLEVEL
) ? level
: ZSKIPLIST_MAXLEVEL
;
6079 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
) {
6080 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
6081 unsigned int rank
[ZSKIPLIST_MAXLEVEL
];
6085 for (i
= zsl
->level
-1; i
>= 0; i
--) {
6086 /* store rank that is crossed to reach the insert position */
6087 rank
[i
] = i
== (zsl
->level
-1) ? 0 : rank
[i
+1];
6089 while (x
->forward
[i
] &&
6090 (x
->forward
[i
]->score
< score
||
6091 (x
->forward
[i
]->score
== score
&&
6092 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0))) {
6093 rank
[i
] += i
> 0 ? x
->span
[i
-1] : 1;
6098 /* we assume the key is not already inside, since we allow duplicated
6099 * scores, and the re-insertion of score and redis object should never
6100 * happen since the caller of zslInsert() should test in the hash table
6101 * if the element is already inside or not. */
6102 level
= zslRandomLevel();
6103 if (level
> zsl
->level
) {
6104 for (i
= zsl
->level
; i
< level
; i
++) {
6106 update
[i
] = zsl
->header
;
6107 update
[i
]->span
[i
-1] = zsl
->length
;
6111 x
= zslCreateNode(level
,score
,obj
);
6112 for (i
= 0; i
< level
; i
++) {
6113 x
->forward
[i
] = update
[i
]->forward
[i
];
6114 update
[i
]->forward
[i
] = x
;
6116 /* update span covered by update[i] as x is inserted here */
6118 x
->span
[i
-1] = update
[i
]->span
[i
-1] - (rank
[0] - rank
[i
]);
6119 update
[i
]->span
[i
-1] = (rank
[0] - rank
[i
]) + 1;
6123 /* increment span for untouched levels */
6124 for (i
= level
; i
< zsl
->level
; i
++) {
6125 update
[i
]->span
[i
-1]++;
6128 x
->backward
= (update
[0] == zsl
->header
) ? NULL
: update
[0];
6130 x
->forward
[0]->backward
= x
;
6136 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6137 void zslDeleteNode(zskiplist
*zsl
, zskiplistNode
*x
, zskiplistNode
**update
) {
6139 for (i
= 0; i
< zsl
->level
; i
++) {
6140 if (update
[i
]->forward
[i
] == x
) {
6142 update
[i
]->span
[i
-1] += x
->span
[i
-1] - 1;
6144 update
[i
]->forward
[i
] = x
->forward
[i
];
6146 /* invariant: i > 0, because update[0]->forward[0]
6147 * is always equal to x */
6148 update
[i
]->span
[i
-1] -= 1;
6151 if (x
->forward
[0]) {
6152 x
->forward
[0]->backward
= x
->backward
;
6154 zsl
->tail
= x
->backward
;
6156 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
6161 /* Delete an element with matching score/object from the skiplist. */
6162 static int zslDelete(zskiplist
*zsl
, double score
, robj
*obj
) {
6163 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
6167 for (i
= zsl
->level
-1; i
>= 0; i
--) {
6168 while (x
->forward
[i
] &&
6169 (x
->forward
[i
]->score
< score
||
6170 (x
->forward
[i
]->score
== score
&&
6171 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0)))
6175 /* We may have multiple elements with the same score, what we need
6176 * is to find the element with both the right score and object. */
6178 if (x
&& score
== x
->score
&& equalStringObjects(x
->obj
,obj
)) {
6179 zslDeleteNode(zsl
, x
, update
);
6183 return 0; /* not found */
6185 return 0; /* not found */
6188 /* Delete all the elements with score between min and max from the skiplist.
6189 * Min and max are inclusive, so a score >= min && score <= max is deleted.
6190 * Note that this function takes the reference to the hash table view of the
6191 * sorted set, in order to remove the elements from the hash table too. */
6192 static unsigned long zslDeleteRangeByScore(zskiplist
*zsl
, double min
, double max
, dict
*dict
) {
6193 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
6194 unsigned long removed
= 0;
6198 for (i
= zsl
->level
-1; i
>= 0; i
--) {
6199 while (x
->forward
[i
] && x
->forward
[i
]->score
< min
)
6203 /* We may have multiple elements with the same score, what we need
6204 * is to find the element with both the right score and object. */
6206 while (x
&& x
->score
<= max
) {
6207 zskiplistNode
*next
= x
->forward
[0];
6208 zslDeleteNode(zsl
, x
, update
);
6209 dictDelete(dict
,x
->obj
);
6214 return removed
; /* not found */
6217 /* Delete all the elements with rank between start and end from the skiplist.
6218 * Start and end are inclusive. Note that start and end need to be 1-based */
6219 static unsigned long zslDeleteRangeByRank(zskiplist
*zsl
, unsigned int start
, unsigned int end
, dict
*dict
) {
6220 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
6221 unsigned long traversed
= 0, removed
= 0;
6225 for (i
= zsl
->level
-1; i
>= 0; i
--) {
6226 while (x
->forward
[i
] && (traversed
+ (i
> 0 ? x
->span
[i
-1] : 1)) < start
) {
6227 traversed
+= i
> 0 ? x
->span
[i
-1] : 1;
6235 while (x
&& traversed
<= end
) {
6236 zskiplistNode
*next
= x
->forward
[0];
6237 zslDeleteNode(zsl
, x
, update
);
6238 dictDelete(dict
,x
->obj
);
6247 /* Find the first node having a score equal or greater than the specified one.
6248 * Returns NULL if there is no match. */
6249 static zskiplistNode
*zslFirstWithScore(zskiplist
*zsl
, double score
) {
6254 for (i
= zsl
->level
-1; i
>= 0; i
--) {
6255 while (x
->forward
[i
] && x
->forward
[i
]->score
< score
)
6258 /* We may have multiple elements with the same score, what we need
6259 * is to find the element with both the right score and object. */
6260 return x
->forward
[0];
6263 /* Find the rank for an element by both score and key.
6264 * Returns 0 when the element cannot be found, rank otherwise.
6265 * Note that the rank is 1-based due to the span of zsl->header to the
6267 static unsigned long zslistTypeGetRank(zskiplist
*zsl
, double score
, robj
*o
) {
6269 unsigned long rank
= 0;
6273 for (i
= zsl
->level
-1; i
>= 0; i
--) {
6274 while (x
->forward
[i
] &&
6275 (x
->forward
[i
]->score
< score
||
6276 (x
->forward
[i
]->score
== score
&&
6277 compareStringObjects(x
->forward
[i
]->obj
,o
) <= 0))) {
6278 rank
+= i
> 0 ? x
->span
[i
-1] : 1;
6282 /* x might be equal to zsl->header, so test if obj is non-NULL */
6283 if (x
->obj
&& equalStringObjects(x
->obj
,o
)) {
6290 /* Finds an element by its rank. The rank argument needs to be 1-based. */
6291 zskiplistNode
* zslistTypeGetElementByRank(zskiplist
*zsl
, unsigned long rank
) {
6293 unsigned long traversed
= 0;
6297 for (i
= zsl
->level
-1; i
>= 0; i
--) {
6298 while (x
->forward
[i
] && (traversed
+ (i
>0 ? x
->span
[i
-1] : 1)) <= rank
)
6300 traversed
+= i
> 0 ? x
->span
[i
-1] : 1;
6303 if (traversed
== rank
) {
6310 /* The actual Z-commands implementations */
6312 /* This generic command implements both ZADD and ZINCRBY.
6313 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
6314 * the increment if the operation is a ZINCRBY (doincrement == 1). */
6315 static void zaddGenericCommand(redisClient
*c
, robj
*key
, robj
*ele
, double scoreval
, int doincrement
) {
6320 if (isnan(scoreval
)) {
6321 addReplySds(c
,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6325 zsetobj
= lookupKeyWrite(c
->db
,key
);
6326 if (zsetobj
== NULL
) {
6327 zsetobj
= createZsetObject();
6328 dbAdd(c
->db
,key
,zsetobj
);
6330 if (zsetobj
->type
!= REDIS_ZSET
) {
6331 addReply(c
,shared
.wrongtypeerr
);
6337 /* Ok now since we implement both ZADD and ZINCRBY here the code
6338 * needs to handle the two different conditions. It's all about setting
6339 * '*score', that is, the new score to set, to the right value. */
6340 score
= zmalloc(sizeof(double));
6344 /* Read the old score. If the element was not present starts from 0 */
6345 de
= dictFind(zs
->dict
,ele
);
6347 double *oldscore
= dictGetEntryVal(de
);
6348 *score
= *oldscore
+ scoreval
;
6352 if (isnan(*score
)) {
6354 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6356 /* Note that we don't need to check if the zset may be empty and
6357 * should be removed here, as we can only obtain Nan as score if
6358 * there was already an element in the sorted set. */
6365 /* What follows is a simple remove and re-insert operation that is common
6366 * to both ZADD and ZINCRBY... */
6367 if (dictAdd(zs
->dict
,ele
,score
) == DICT_OK
) {
6368 /* case 1: New element */
6369 incrRefCount(ele
); /* added to hash */
6370 zslInsert(zs
->zsl
,*score
,ele
);
6371 incrRefCount(ele
); /* added to skiplist */
6374 addReplyDouble(c
,*score
);
6376 addReply(c
,shared
.cone
);
6381 /* case 2: Score update operation */
6382 de
= dictFind(zs
->dict
,ele
);
6383 redisAssert(de
!= NULL
);
6384 oldscore
= dictGetEntryVal(de
);
6385 if (*score
!= *oldscore
) {
6388 /* Remove and insert the element in the skip list with new score */
6389 deleted
= zslDelete(zs
->zsl
,*oldscore
,ele
);
6390 redisAssert(deleted
!= 0);
6391 zslInsert(zs
->zsl
,*score
,ele
);
6393 /* Update the score in the hash table */
6394 dictReplace(zs
->dict
,ele
,score
);
6400 addReplyDouble(c
,*score
);
6402 addReply(c
,shared
.czero
);
6406 static void zaddCommand(redisClient
*c
) {
6409 if (getDoubleFromObjectOrReply(c
, c
->argv
[2], &scoreval
, NULL
) != REDIS_OK
) return;
6410 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,0);
6413 static void zincrbyCommand(redisClient
*c
) {
6416 if (getDoubleFromObjectOrReply(c
, c
->argv
[2], &scoreval
, NULL
) != REDIS_OK
) return;
6417 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,1);
6420 static void zremCommand(redisClient
*c
) {
6427 if ((zsetobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6428 checkType(c
,zsetobj
,REDIS_ZSET
)) return;
6431 de
= dictFind(zs
->dict
,c
->argv
[2]);
6433 addReply(c
,shared
.czero
);
6436 /* Delete from the skiplist */
6437 oldscore
= dictGetEntryVal(de
);
6438 deleted
= zslDelete(zs
->zsl
,*oldscore
,c
->argv
[2]);
6439 redisAssert(deleted
!= 0);
6441 /* Delete from the hash table */
6442 dictDelete(zs
->dict
,c
->argv
[2]);
6443 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
6444 if (dictSize(zs
->dict
) == 0) dbDelete(c
->db
,c
->argv
[1]);
6446 addReply(c
,shared
.cone
);
6449 static void zremrangebyscoreCommand(redisClient
*c
) {
6456 if ((getDoubleFromObjectOrReply(c
, c
->argv
[2], &min
, NULL
) != REDIS_OK
) ||
6457 (getDoubleFromObjectOrReply(c
, c
->argv
[3], &max
, NULL
) != REDIS_OK
)) return;
6459 if ((zsetobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6460 checkType(c
,zsetobj
,REDIS_ZSET
)) return;
6463 deleted
= zslDeleteRangeByScore(zs
->zsl
,min
,max
,zs
->dict
);
6464 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
6465 if (dictSize(zs
->dict
) == 0) dbDelete(c
->db
,c
->argv
[1]);
6466 server
.dirty
+= deleted
;
6467 addReplyLongLong(c
,deleted
);
6470 static void zremrangebyrankCommand(redisClient
*c
) {
6478 if ((getLongFromObjectOrReply(c
, c
->argv
[2], &start
, NULL
) != REDIS_OK
) ||
6479 (getLongFromObjectOrReply(c
, c
->argv
[3], &end
, NULL
) != REDIS_OK
)) return;
6481 if ((zsetobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6482 checkType(c
,zsetobj
,REDIS_ZSET
)) return;
6484 llen
= zs
->zsl
->length
;
6486 /* convert negative indexes */
6487 if (start
< 0) start
= llen
+start
;
6488 if (end
< 0) end
= llen
+end
;
6489 if (start
< 0) start
= 0;
6490 if (end
< 0) end
= 0;
6492 /* indexes sanity checks */
6493 if (start
> end
|| start
>= llen
) {
6494 addReply(c
,shared
.czero
);
6497 if (end
>= llen
) end
= llen
-1;
6499 /* increment start and end because zsl*Rank functions
6500 * use 1-based rank */
6501 deleted
= zslDeleteRangeByRank(zs
->zsl
,start
+1,end
+1,zs
->dict
);
6502 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
6503 if (dictSize(zs
->dict
) == 0) dbDelete(c
->db
,c
->argv
[1]);
6504 server
.dirty
+= deleted
;
6505 addReplyLongLong(c
, deleted
);
6513 static int qsortCompareZsetopsrcByCardinality(const void *s1
, const void *s2
) {
6514 zsetopsrc
*d1
= (void*) s1
, *d2
= (void*) s2
;
6515 unsigned long size1
, size2
;
6516 size1
= d1
->dict
? dictSize(d1
->dict
) : 0;
6517 size2
= d2
->dict
? dictSize(d2
->dict
) : 0;
6518 return size1
- size2
;
6521 #define REDIS_AGGR_SUM 1
6522 #define REDIS_AGGR_MIN 2
6523 #define REDIS_AGGR_MAX 3
6524 #define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
6526 inline static void zunionInterAggregate(double *target
, double val
, int aggregate
) {
6527 if (aggregate
== REDIS_AGGR_SUM
) {
6528 *target
= *target
+ val
;
6529 } else if (aggregate
== REDIS_AGGR_MIN
) {
6530 *target
= val
< *target
? val
: *target
;
6531 } else if (aggregate
== REDIS_AGGR_MAX
) {
6532 *target
= val
> *target
? val
: *target
;
6535 redisPanic("Unknown ZUNION/INTER aggregate type");
6539 static void zunionInterGenericCommand(redisClient
*c
, robj
*dstkey
, int op
) {
6541 int aggregate
= REDIS_AGGR_SUM
;
6548 /* expect setnum input keys to be given */
6549 setnum
= atoi(c
->argv
[2]->ptr
);
6551 addReplySds(c
,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6555 /* test if the expected number of keys would overflow */
6556 if (3+setnum
> c
->argc
) {
6557 addReply(c
,shared
.syntaxerr
);
6561 /* read keys to be used for input */
6562 src
= zmalloc(sizeof(zsetopsrc
) * setnum
);
6563 for (i
= 0, j
= 3; i
< setnum
; i
++, j
++) {
6564 robj
*obj
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
6568 if (obj
->type
== REDIS_ZSET
) {
6569 src
[i
].dict
= ((zset
*)obj
->ptr
)->dict
;
6570 } else if (obj
->type
== REDIS_SET
) {
6571 src
[i
].dict
= (obj
->ptr
);
6574 addReply(c
,shared
.wrongtypeerr
);
6579 /* default all weights to 1 */
6580 src
[i
].weight
= 1.0;
6583 /* parse optional extra arguments */
6585 int remaining
= c
->argc
- j
;
6588 if (remaining
>= (setnum
+ 1) && !strcasecmp(c
->argv
[j
]->ptr
,"weights")) {
6590 for (i
= 0; i
< setnum
; i
++, j
++, remaining
--) {
6591 if (getDoubleFromObjectOrReply(c
, c
->argv
[j
], &src
[i
].weight
, NULL
) != REDIS_OK
)
6594 } else if (remaining
>= 2 && !strcasecmp(c
->argv
[j
]->ptr
,"aggregate")) {
6596 if (!strcasecmp(c
->argv
[j
]->ptr
,"sum")) {
6597 aggregate
= REDIS_AGGR_SUM
;
6598 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"min")) {
6599 aggregate
= REDIS_AGGR_MIN
;
6600 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"max")) {
6601 aggregate
= REDIS_AGGR_MAX
;
6604 addReply(c
,shared
.syntaxerr
);
6610 addReply(c
,shared
.syntaxerr
);
6616 /* sort sets from the smallest to largest, this will improve our
6617 * algorithm's performance */
6618 qsort(src
,setnum
,sizeof(zsetopsrc
),qsortCompareZsetopsrcByCardinality
);
6620 dstobj
= createZsetObject();
6621 dstzset
= dstobj
->ptr
;
6623 if (op
== REDIS_OP_INTER
) {
6624 /* skip going over all entries if the smallest zset is NULL or empty */
6625 if (src
[0].dict
&& dictSize(src
[0].dict
) > 0) {
6626 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6627 * from small to large, all src[i > 0].dict are non-empty too */
6628 di
= dictGetIterator(src
[0].dict
);
6629 while((de
= dictNext(di
)) != NULL
) {
6630 double *score
= zmalloc(sizeof(double)), value
;
6631 *score
= src
[0].weight
* zunionInterDictValue(de
);
6633 for (j
= 1; j
< setnum
; j
++) {
6634 dictEntry
*other
= dictFind(src
[j
].dict
,dictGetEntryKey(de
));
6636 value
= src
[j
].weight
* zunionInterDictValue(other
);
6637 zunionInterAggregate(score
, value
, aggregate
);
6643 /* skip entry when not present in every source dict */
6647 robj
*o
= dictGetEntryKey(de
);
6648 dictAdd(dstzset
->dict
,o
,score
);
6649 incrRefCount(o
); /* added to dictionary */
6650 zslInsert(dstzset
->zsl
,*score
,o
);
6651 incrRefCount(o
); /* added to skiplist */
6654 dictReleaseIterator(di
);
6656 } else if (op
== REDIS_OP_UNION
) {
6657 for (i
= 0; i
< setnum
; i
++) {
6658 if (!src
[i
].dict
) continue;
6660 di
= dictGetIterator(src
[i
].dict
);
6661 while((de
= dictNext(di
)) != NULL
) {
6662 /* skip key when already processed */
6663 if (dictFind(dstzset
->dict
,dictGetEntryKey(de
)) != NULL
) continue;
6665 double *score
= zmalloc(sizeof(double)), value
;
6666 *score
= src
[i
].weight
* zunionInterDictValue(de
);
6668 /* because the zsets are sorted by size, its only possible
6669 * for sets at larger indices to hold this entry */
6670 for (j
= (i
+1); j
< setnum
; j
++) {
6671 dictEntry
*other
= dictFind(src
[j
].dict
,dictGetEntryKey(de
));
6673 value
= src
[j
].weight
* zunionInterDictValue(other
);
6674 zunionInterAggregate(score
, value
, aggregate
);
6678 robj
*o
= dictGetEntryKey(de
);
6679 dictAdd(dstzset
->dict
,o
,score
);
6680 incrRefCount(o
); /* added to dictionary */
6681 zslInsert(dstzset
->zsl
,*score
,o
);
6682 incrRefCount(o
); /* added to skiplist */
6684 dictReleaseIterator(di
);
6687 /* unknown operator */
6688 redisAssert(op
== REDIS_OP_INTER
|| op
== REDIS_OP_UNION
);
6691 dbDelete(c
->db
,dstkey
);
6692 if (dstzset
->zsl
->length
) {
6693 dbAdd(c
->db
,dstkey
,dstobj
);
6694 addReplyLongLong(c
, dstzset
->zsl
->length
);
6697 decrRefCount(dstobj
);
6698 addReply(c
, shared
.czero
);
6703 static void zunionstoreCommand(redisClient
*c
) {
6704 zunionInterGenericCommand(c
,c
->argv
[1], REDIS_OP_UNION
);
6707 static void zinterstoreCommand(redisClient
*c
) {
6708 zunionInterGenericCommand(c
,c
->argv
[1], REDIS_OP_INTER
);
6711 static void zrangeGenericCommand(redisClient
*c
, int reverse
) {
6723 if ((getLongFromObjectOrReply(c
, c
->argv
[2], &start
, NULL
) != REDIS_OK
) ||
6724 (getLongFromObjectOrReply(c
, c
->argv
[3], &end
, NULL
) != REDIS_OK
)) return;
6726 if (c
->argc
== 5 && !strcasecmp(c
->argv
[4]->ptr
,"withscores")) {
6728 } else if (c
->argc
>= 5) {
6729 addReply(c
,shared
.syntaxerr
);
6733 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.emptymultibulk
)) == NULL
6734 || checkType(c
,o
,REDIS_ZSET
)) return;
6739 /* convert negative indexes */
6740 if (start
< 0) start
= llen
+start
;
6741 if (end
< 0) end
= llen
+end
;
6742 if (start
< 0) start
= 0;
6743 if (end
< 0) end
= 0;
6745 /* indexes sanity checks */
6746 if (start
> end
|| start
>= llen
) {
6747 /* Out of range start or start > end result in empty list */
6748 addReply(c
,shared
.emptymultibulk
);
6751 if (end
>= llen
) end
= llen
-1;
6752 rangelen
= (end
-start
)+1;
6754 /* check if starting point is trivial, before searching
6755 * the element in log(N) time */
6757 ln
= start
== 0 ? zsl
->tail
: zslistTypeGetElementByRank(zsl
, llen
-start
);
6760 zsl
->header
->forward
[0] : zslistTypeGetElementByRank(zsl
, start
+1);
6763 /* Return the result in form of a multi-bulk reply */
6764 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",
6765 withscores
? (rangelen
*2) : rangelen
));
6766 for (j
= 0; j
< rangelen
; j
++) {
6768 addReplyBulk(c
,ele
);
6770 addReplyDouble(c
,ln
->score
);
6771 ln
= reverse
? ln
->backward
: ln
->forward
[0];
6775 static void zrangeCommand(redisClient
*c
) {
6776 zrangeGenericCommand(c
,0);
6779 static void zrevrangeCommand(redisClient
*c
) {
6780 zrangeGenericCommand(c
,1);
6783 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6784 * If justcount is non-zero, just the count is returned. */
6785 static void genericZrangebyscoreCommand(redisClient
*c
, int justcount
) {
6788 int minex
= 0, maxex
= 0; /* are min or max exclusive? */
6789 int offset
= 0, limit
= -1;
6793 /* Parse the min-max interval. If one of the values is prefixed
6794 * by the "(" character, it's considered "open". For instance
6795 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6796 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6797 if (((char*)c
->argv
[2]->ptr
)[0] == '(') {
6798 min
= strtod((char*)c
->argv
[2]->ptr
+1,NULL
);
6801 min
= strtod(c
->argv
[2]->ptr
,NULL
);
6803 if (((char*)c
->argv
[3]->ptr
)[0] == '(') {
6804 max
= strtod((char*)c
->argv
[3]->ptr
+1,NULL
);
6807 max
= strtod(c
->argv
[3]->ptr
,NULL
);
6810 /* Parse "WITHSCORES": note that if the command was called with
6811 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6812 * enter the following paths to parse WITHSCORES and LIMIT. */
6813 if (c
->argc
== 5 || c
->argc
== 8) {
6814 if (strcasecmp(c
->argv
[c
->argc
-1]->ptr
,"withscores") == 0)
6819 if (c
->argc
!= (4 + withscores
) && c
->argc
!= (7 + withscores
))
6823 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6828 if (c
->argc
== (7 + withscores
) && strcasecmp(c
->argv
[4]->ptr
,"limit")) {
6829 addReply(c
,shared
.syntaxerr
);
6831 } else if (c
->argc
== (7 + withscores
)) {
6832 offset
= atoi(c
->argv
[5]->ptr
);
6833 limit
= atoi(c
->argv
[6]->ptr
);
6834 if (offset
< 0) offset
= 0;
6837 /* Ok, lookup the key and get the range */
6838 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
6840 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
6842 if (o
->type
!= REDIS_ZSET
) {
6843 addReply(c
,shared
.wrongtypeerr
);
6845 zset
*zsetobj
= o
->ptr
;
6846 zskiplist
*zsl
= zsetobj
->zsl
;
6848 robj
*ele
, *lenobj
= NULL
;
6849 unsigned long rangelen
= 0;
6851 /* Get the first node with the score >= min, or with
6852 * score > min if 'minex' is true. */
6853 ln
= zslFirstWithScore(zsl
,min
);
6854 while (minex
&& ln
&& ln
->score
== min
) ln
= ln
->forward
[0];
6857 /* No element matching the speciifed interval */
6858 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
6862 /* We don't know in advance how many matching elements there
6863 * are in the list, so we push this object that will represent
6864 * the multi-bulk length in the output buffer, and will "fix"
6867 lenobj
= createObject(REDIS_STRING
,NULL
);
6869 decrRefCount(lenobj
);
6872 while(ln
&& (maxex
? (ln
->score
< max
) : (ln
->score
<= max
))) {
6875 ln
= ln
->forward
[0];
6878 if (limit
== 0) break;
6881 addReplyBulk(c
,ele
);
6883 addReplyDouble(c
,ln
->score
);
6885 ln
= ln
->forward
[0];
6887 if (limit
> 0) limit
--;
6890 addReplyLongLong(c
,(long)rangelen
);
6892 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",
6893 withscores
? (rangelen
*2) : rangelen
);
6899 static void zrangebyscoreCommand(redisClient
*c
) {
6900 genericZrangebyscoreCommand(c
,0);
6903 static void zcountCommand(redisClient
*c
) {
6904 genericZrangebyscoreCommand(c
,1);
6907 static void zcardCommand(redisClient
*c
) {
6911 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6912 checkType(c
,o
,REDIS_ZSET
)) return;
6915 addReplyUlong(c
,zs
->zsl
->length
);
6918 static void zscoreCommand(redisClient
*c
) {
6923 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
6924 checkType(c
,o
,REDIS_ZSET
)) return;
6927 de
= dictFind(zs
->dict
,c
->argv
[2]);
6929 addReply(c
,shared
.nullbulk
);
6931 double *score
= dictGetEntryVal(de
);
6933 addReplyDouble(c
,*score
);
6937 static void zrankGenericCommand(redisClient
*c
, int reverse
) {
6945 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
6946 checkType(c
,o
,REDIS_ZSET
)) return;
6950 de
= dictFind(zs
->dict
,c
->argv
[2]);
6952 addReply(c
,shared
.nullbulk
);
6956 score
= dictGetEntryVal(de
);
6957 rank
= zslistTypeGetRank(zsl
, *score
, c
->argv
[2]);
6960 addReplyLongLong(c
, zsl
->length
- rank
);
6962 addReplyLongLong(c
, rank
-1);
6965 addReply(c
,shared
.nullbulk
);
6969 static void zrankCommand(redisClient
*c
) {
6970 zrankGenericCommand(c
, 0);
6973 static void zrevrankCommand(redisClient
*c
) {
6974 zrankGenericCommand(c
, 1);
6977 /* ========================= Hashes utility functions ======================= */
6978 #define REDIS_HASH_KEY 1
6979 #define REDIS_HASH_VALUE 2
6981 /* Check the length of a number of objects to see if we need to convert a
6982 * zipmap to a real hash. Note that we only check string encoded objects
6983 * as their string length can be queried in constant time. */
6984 static void hashTypeTryConversion(robj
*subject
, robj
**argv
, int start
, int end
) {
6986 if (subject
->encoding
!= REDIS_ENCODING_ZIPMAP
) return;
6988 for (i
= start
; i
<= end
; i
++) {
6989 if (argv
[i
]->encoding
== REDIS_ENCODING_RAW
&&
6990 sdslen(argv
[i
]->ptr
) > server
.hash_max_zipmap_value
)
6992 convertToRealHash(subject
);
6998 /* Encode given objects in-place when the hash uses a dict. */
6999 static void hashTypeTryObjectEncoding(robj
*subject
, robj
**o1
, robj
**o2
) {
7000 if (subject
->encoding
== REDIS_ENCODING_HT
) {
7001 if (o1
) *o1
= tryObjectEncoding(*o1
);
7002 if (o2
) *o2
= tryObjectEncoding(*o2
);
7006 /* Get the value from a hash identified by key. Returns either a string
7007 * object or NULL if the value cannot be found. The refcount of the object
7008 * is always increased by 1 when the value was found. */
7009 static robj
*hashTypeGet(robj
*o
, robj
*key
) {
7011 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
7014 key
= getDecodedObject(key
);
7015 if (zipmapGet(o
->ptr
,key
->ptr
,sdslen(key
->ptr
),&v
,&vlen
)) {
7016 value
= createStringObject((char*)v
,vlen
);
7020 dictEntry
*de
= dictFind(o
->ptr
,key
);
7022 value
= dictGetEntryVal(de
);
7023 incrRefCount(value
);
7029 /* Test if the key exists in the given hash. Returns 1 if the key
7030 * exists and 0 when it doesn't. */
7031 static int hashTypeExists(robj
*o
, robj
*key
) {
7032 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
7033 key
= getDecodedObject(key
);
7034 if (zipmapExists(o
->ptr
,key
->ptr
,sdslen(key
->ptr
))) {
7040 if (dictFind(o
->ptr
,key
) != NULL
) {
7047 /* Add an element, discard the old if the key already exists.
7048 * Return 0 on insert and 1 on update. */
7049 static int hashTypeSet(robj
*o
, robj
*key
, robj
*value
) {
7051 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
7052 key
= getDecodedObject(key
);
7053 value
= getDecodedObject(value
);
7054 o
->ptr
= zipmapSet(o
->ptr
,
7055 key
->ptr
,sdslen(key
->ptr
),
7056 value
->ptr
,sdslen(value
->ptr
), &update
);
7058 decrRefCount(value
);
7060 /* Check if the zipmap needs to be upgraded to a real hash table */
7061 if (zipmapLen(o
->ptr
) > server
.hash_max_zipmap_entries
)
7062 convertToRealHash(o
);
7064 if (dictReplace(o
->ptr
,key
,value
)) {
7071 incrRefCount(value
);
7076 /* Delete an element from a hash.
7077 * Return 1 on deleted and 0 on not found. */
7078 static int hashTypeDelete(robj
*o
, robj
*key
) {
7080 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
7081 key
= getDecodedObject(key
);
7082 o
->ptr
= zipmapDel(o
->ptr
,key
->ptr
,sdslen(key
->ptr
), &deleted
);
7085 deleted
= dictDelete((dict
*)o
->ptr
,key
) == DICT_OK
;
7086 /* Always check if the dictionary needs a resize after a delete. */
7087 if (deleted
&& htNeedsResize(o
->ptr
)) dictResize(o
->ptr
);
7092 /* Return the number of elements in a hash. */
7093 static unsigned long hashTypeLength(robj
*o
) {
7094 return (o
->encoding
== REDIS_ENCODING_ZIPMAP
) ?
7095 zipmapLen((unsigned char*)o
->ptr
) : dictSize((dict
*)o
->ptr
);
7098 /* Structure to hold hash iteration abstration. Note that iteration over
7099 * hashes involves both fields and values. Because it is possible that
7100 * not both are required, store pointers in the iterator to avoid
7101 * unnecessary memory allocation for fields/values. */
7105 unsigned char *zk
, *zv
;
7106 unsigned int zklen
, zvlen
;
7112 static hashTypeIterator
*hashTypeInitIterator(robj
*subject
) {
7113 hashTypeIterator
*hi
= zmalloc(sizeof(hashTypeIterator
));
7114 hi
->encoding
= subject
->encoding
;
7115 if (hi
->encoding
== REDIS_ENCODING_ZIPMAP
) {
7116 hi
->zi
= zipmapRewind(subject
->ptr
);
7117 } else if (hi
->encoding
== REDIS_ENCODING_HT
) {
7118 hi
->di
= dictGetIterator(subject
->ptr
);
7125 static void hashTypeReleaseIterator(hashTypeIterator
*hi
) {
7126 if (hi
->encoding
== REDIS_ENCODING_HT
) {
7127 dictReleaseIterator(hi
->di
);
7132 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
7133 * could be found and REDIS_ERR when the iterator reaches the end. */
7134 static int hashTypeNext(hashTypeIterator
*hi
) {
7135 if (hi
->encoding
== REDIS_ENCODING_ZIPMAP
) {
7136 if ((hi
->zi
= zipmapNext(hi
->zi
, &hi
->zk
, &hi
->zklen
,
7137 &hi
->zv
, &hi
->zvlen
)) == NULL
) return REDIS_ERR
;
7139 if ((hi
->de
= dictNext(hi
->di
)) == NULL
) return REDIS_ERR
;
7144 /* Get key or value object at current iteration position.
7145 * This increases the refcount of the field object by 1. */
7146 static robj
*hashTypeCurrent(hashTypeIterator
*hi
, int what
) {
7148 if (hi
->encoding
== REDIS_ENCODING_ZIPMAP
) {
7149 if (what
& REDIS_HASH_KEY
) {
7150 o
= createStringObject((char*)hi
->zk
,hi
->zklen
);
7152 o
= createStringObject((char*)hi
->zv
,hi
->zvlen
);
7155 if (what
& REDIS_HASH_KEY
) {
7156 o
= dictGetEntryKey(hi
->de
);
7158 o
= dictGetEntryVal(hi
->de
);
7165 static robj
*hashTypeLookupWriteOrCreate(redisClient
*c
, robj
*key
) {
7166 robj
*o
= lookupKeyWrite(c
->db
,key
);
7168 o
= createHashObject();
7171 if (o
->type
!= REDIS_HASH
) {
7172 addReply(c
,shared
.wrongtypeerr
);
7179 /* ============================= Hash commands ============================== */
7180 static void hsetCommand(redisClient
*c
) {
7184 if ((o
= hashTypeLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
7185 hashTypeTryConversion(o
,c
->argv
,2,3);
7186 hashTypeTryObjectEncoding(o
,&c
->argv
[2], &c
->argv
[3]);
7187 update
= hashTypeSet(o
,c
->argv
[2],c
->argv
[3]);
7188 addReply(c
, update
? shared
.czero
: shared
.cone
);
7192 static void hsetnxCommand(redisClient
*c
) {
7194 if ((o
= hashTypeLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
7195 hashTypeTryConversion(o
,c
->argv
,2,3);
7197 if (hashTypeExists(o
, c
->argv
[2])) {
7198 addReply(c
, shared
.czero
);
7200 hashTypeTryObjectEncoding(o
,&c
->argv
[2], &c
->argv
[3]);
7201 hashTypeSet(o
,c
->argv
[2],c
->argv
[3]);
7202 addReply(c
, shared
.cone
);
7207 static void hmsetCommand(redisClient
*c
) {
7211 if ((c
->argc
% 2) == 1) {
7212 addReplySds(c
,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7216 if ((o
= hashTypeLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
7217 hashTypeTryConversion(o
,c
->argv
,2,c
->argc
-1);
7218 for (i
= 2; i
< c
->argc
; i
+= 2) {
7219 hashTypeTryObjectEncoding(o
,&c
->argv
[i
], &c
->argv
[i
+1]);
7220 hashTypeSet(o
,c
->argv
[i
],c
->argv
[i
+1]);
7222 addReply(c
, shared
.ok
);
7226 static void hincrbyCommand(redisClient
*c
) {
7227 long long value
, incr
;
7228 robj
*o
, *current
, *new;
7230 if (getLongLongFromObjectOrReply(c
,c
->argv
[3],&incr
,NULL
) != REDIS_OK
) return;
7231 if ((o
= hashTypeLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
7232 if ((current
= hashTypeGet(o
,c
->argv
[2])) != NULL
) {
7233 if (getLongLongFromObjectOrReply(c
,current
,&value
,
7234 "hash value is not an integer") != REDIS_OK
) {
7235 decrRefCount(current
);
7238 decrRefCount(current
);
7244 new = createStringObjectFromLongLong(value
);
7245 hashTypeTryObjectEncoding(o
,&c
->argv
[2],NULL
);
7246 hashTypeSet(o
,c
->argv
[2],new);
7248 addReplyLongLong(c
,value
);
7252 static void hgetCommand(redisClient
*c
) {
7254 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
7255 checkType(c
,o
,REDIS_HASH
)) return;
7257 if ((value
= hashTypeGet(o
,c
->argv
[2])) != NULL
) {
7258 addReplyBulk(c
,value
);
7259 decrRefCount(value
);
7261 addReply(c
,shared
.nullbulk
);
7265 static void hmgetCommand(redisClient
*c
) {
7268 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
7269 if (o
!= NULL
&& o
->type
!= REDIS_HASH
) {
7270 addReply(c
,shared
.wrongtypeerr
);
7273 /* Note the check for o != NULL happens inside the loop. This is
7274 * done because objects that cannot be found are considered to be
7275 * an empty hash. The reply should then be a series of NULLs. */
7276 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-2));
7277 for (i
= 2; i
< c
->argc
; i
++) {
7278 if (o
!= NULL
&& (value
= hashTypeGet(o
,c
->argv
[i
])) != NULL
) {
7279 addReplyBulk(c
,value
);
7280 decrRefCount(value
);
7282 addReply(c
,shared
.nullbulk
);
7287 static void hdelCommand(redisClient
*c
) {
7289 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
7290 checkType(c
,o
,REDIS_HASH
)) return;
7292 if (hashTypeDelete(o
,c
->argv
[2])) {
7293 if (hashTypeLength(o
) == 0) dbDelete(c
->db
,c
->argv
[1]);
7294 addReply(c
,shared
.cone
);
7297 addReply(c
,shared
.czero
);
7301 static void hlenCommand(redisClient
*c
) {
7303 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
7304 checkType(c
,o
,REDIS_HASH
)) return;
7306 addReplyUlong(c
,hashTypeLength(o
));
7309 static void genericHgetallCommand(redisClient
*c
, int flags
) {
7310 robj
*o
, *lenobj
, *obj
;
7311 unsigned long count
= 0;
7312 hashTypeIterator
*hi
;
7314 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.emptymultibulk
)) == NULL
7315 || checkType(c
,o
,REDIS_HASH
)) return;
7317 lenobj
= createObject(REDIS_STRING
,NULL
);
7319 decrRefCount(lenobj
);
7321 hi
= hashTypeInitIterator(o
);
7322 while (hashTypeNext(hi
) != REDIS_ERR
) {
7323 if (flags
& REDIS_HASH_KEY
) {
7324 obj
= hashTypeCurrent(hi
,REDIS_HASH_KEY
);
7325 addReplyBulk(c
,obj
);
7329 if (flags
& REDIS_HASH_VALUE
) {
7330 obj
= hashTypeCurrent(hi
,REDIS_HASH_VALUE
);
7331 addReplyBulk(c
,obj
);
7336 hashTypeReleaseIterator(hi
);
7338 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",count
);
7341 static void hkeysCommand(redisClient
*c
) {
7342 genericHgetallCommand(c
,REDIS_HASH_KEY
);
7345 static void hvalsCommand(redisClient
*c
) {
7346 genericHgetallCommand(c
,REDIS_HASH_VALUE
);
7349 static void hgetallCommand(redisClient
*c
) {
7350 genericHgetallCommand(c
,REDIS_HASH_KEY
|REDIS_HASH_VALUE
);
7353 static void hexistsCommand(redisClient
*c
) {
7355 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
7356 checkType(c
,o
,REDIS_HASH
)) return;
7358 addReply(c
, hashTypeExists(o
,c
->argv
[2]) ? shared
.cone
: shared
.czero
);
7361 static void convertToRealHash(robj
*o
) {
7362 unsigned char *key
, *val
, *p
, *zm
= o
->ptr
;
7363 unsigned int klen
, vlen
;
7364 dict
*dict
= dictCreate(&hashDictType
,NULL
);
7366 assert(o
->type
== REDIS_HASH
&& o
->encoding
!= REDIS_ENCODING_HT
);
7367 p
= zipmapRewind(zm
);
7368 while((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) != NULL
) {
7369 robj
*keyobj
, *valobj
;
7371 keyobj
= createStringObject((char*)key
,klen
);
7372 valobj
= createStringObject((char*)val
,vlen
);
7373 keyobj
= tryObjectEncoding(keyobj
);
7374 valobj
= tryObjectEncoding(valobj
);
7375 dictAdd(dict
,keyobj
,valobj
);
7377 o
->encoding
= REDIS_ENCODING_HT
;
7382 /* ========================= Non type-specific commands ==================== */
7384 static void flushdbCommand(redisClient
*c
) {
7385 server
.dirty
+= dictSize(c
->db
->dict
);
7386 touchWatchedKeysOnFlush(c
->db
->id
);
7387 dictEmpty(c
->db
->dict
);
7388 dictEmpty(c
->db
->expires
);
7389 addReply(c
,shared
.ok
);
7392 static void flushallCommand(redisClient
*c
) {
7393 touchWatchedKeysOnFlush(-1);
7394 server
.dirty
+= emptyDb();
7395 addReply(c
,shared
.ok
);
7396 if (server
.bgsavechildpid
!= -1) {
7397 kill(server
.bgsavechildpid
,SIGKILL
);
7398 rdbRemoveTempFile(server
.bgsavechildpid
);
7400 rdbSave(server
.dbfilename
);
7404 static redisSortOperation
*createSortOperation(int type
, robj
*pattern
) {
7405 redisSortOperation
*so
= zmalloc(sizeof(*so
));
7407 so
->pattern
= pattern
;
7411 /* Return the value associated to the key with a name obtained
7412 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7413 * The returned object will always have its refcount increased by 1
7414 * when it is non-NULL. */
7415 static robj
*lookupKeyByPattern(redisDb
*db
, robj
*pattern
, robj
*subst
) {
7418 robj keyobj
, fieldobj
, *o
;
7419 int prefixlen
, sublen
, postfixlen
, fieldlen
;
7420 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7424 char buf
[REDIS_SORTKEY_MAX
+1];
7425 } keyname
, fieldname
;
7427 /* If the pattern is "#" return the substitution object itself in order
7428 * to implement the "SORT ... GET #" feature. */
7429 spat
= pattern
->ptr
;
7430 if (spat
[0] == '#' && spat
[1] == '\0') {
7431 incrRefCount(subst
);
7435 /* The substitution object may be specially encoded. If so we create
7436 * a decoded object on the fly. Otherwise getDecodedObject will just
7437 * increment the ref count, that we'll decrement later. */
7438 subst
= getDecodedObject(subst
);
7441 if (sdslen(spat
)+sdslen(ssub
)-1 > REDIS_SORTKEY_MAX
) return NULL
;
7442 p
= strchr(spat
,'*');
7444 decrRefCount(subst
);
7448 /* Find out if we're dealing with a hash dereference. */
7449 if ((f
= strstr(p
+1, "->")) != NULL
) {
7450 fieldlen
= sdslen(spat
)-(f
-spat
);
7451 /* this also copies \0 character */
7452 memcpy(fieldname
.buf
,f
+2,fieldlen
-1);
7453 fieldname
.len
= fieldlen
-2;
7459 sublen
= sdslen(ssub
);
7460 postfixlen
= sdslen(spat
)-(prefixlen
+1)-fieldlen
;
7461 memcpy(keyname
.buf
,spat
,prefixlen
);
7462 memcpy(keyname
.buf
+prefixlen
,ssub
,sublen
);
7463 memcpy(keyname
.buf
+prefixlen
+sublen
,p
+1,postfixlen
);
7464 keyname
.buf
[prefixlen
+sublen
+postfixlen
] = '\0';
7465 keyname
.len
= prefixlen
+sublen
+postfixlen
;
7466 decrRefCount(subst
);
7468 /* Lookup substituted key */
7469 initStaticStringObject(keyobj
,((char*)&keyname
)+(sizeof(long)*2));
7470 o
= lookupKeyRead(db
,&keyobj
);
7471 if (o
== NULL
) return NULL
;
7474 if (o
->type
!= REDIS_HASH
|| fieldname
.len
< 1) return NULL
;
7476 /* Retrieve value from hash by the field name. This operation
7477 * already increases the refcount of the returned object. */
7478 initStaticStringObject(fieldobj
,((char*)&fieldname
)+(sizeof(long)*2));
7479 o
= hashTypeGet(o
, &fieldobj
);
7481 if (o
->type
!= REDIS_STRING
) return NULL
;
7483 /* Every object that this function returns needs to have its refcount
7484 * increased. sortCommand decreases it again. */
7491 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7492 * the additional parameter is not standard but a BSD-specific we have to
7493 * pass sorting parameters via the global 'server' structure */
7494 static int sortCompare(const void *s1
, const void *s2
) {
7495 const redisSortObject
*so1
= s1
, *so2
= s2
;
7498 if (!server
.sort_alpha
) {
7499 /* Numeric sorting. Here it's trivial as we precomputed scores */
7500 if (so1
->u
.score
> so2
->u
.score
) {
7502 } else if (so1
->u
.score
< so2
->u
.score
) {
7508 /* Alphanumeric sorting */
7509 if (server
.sort_bypattern
) {
7510 if (!so1
->u
.cmpobj
|| !so2
->u
.cmpobj
) {
7511 /* At least one compare object is NULL */
7512 if (so1
->u
.cmpobj
== so2
->u
.cmpobj
)
7514 else if (so1
->u
.cmpobj
== NULL
)
7519 /* We have both the objects, use strcoll */
7520 cmp
= strcoll(so1
->u
.cmpobj
->ptr
,so2
->u
.cmpobj
->ptr
);
7523 /* Compare elements directly. */
7524 cmp
= compareStringObjects(so1
->obj
,so2
->obj
);
7527 return server
.sort_desc
? -cmp
: cmp
;
7530 /* The SORT command is the most complex command in Redis. Warning: this code
7531 * is optimized for speed and a bit less for readability */
7532 static void sortCommand(redisClient
*c
) {
7534 unsigned int outputlen
= 0;
7535 int desc
= 0, alpha
= 0;
7536 int limit_start
= 0, limit_count
= -1, start
, end
;
7537 int j
, dontsort
= 0, vectorlen
;
7538 int getop
= 0; /* GET operation counter */
7539 robj
*sortval
, *sortby
= NULL
, *storekey
= NULL
;
7540 redisSortObject
*vector
; /* Resulting vector to sort */
7542 /* Lookup the key to sort. It must be of the right types */
7543 sortval
= lookupKeyRead(c
->db
,c
->argv
[1]);
7544 if (sortval
== NULL
) {
7545 addReply(c
,shared
.emptymultibulk
);
7548 if (sortval
->type
!= REDIS_SET
&& sortval
->type
!= REDIS_LIST
&&
7549 sortval
->type
!= REDIS_ZSET
)
7551 addReply(c
,shared
.wrongtypeerr
);
7555 /* Create a list of operations to perform for every sorted element.
7556 * Operations can be GET/DEL/INCR/DECR */
7557 operations
= listCreate();
7558 listSetFreeMethod(operations
,zfree
);
7561 /* Now we need to protect sortval incrementing its count, in the future
7562 * SORT may have options able to overwrite/delete keys during the sorting
7563 * and the sorted key itself may get destroied */
7564 incrRefCount(sortval
);
7566 /* The SORT command has an SQL-alike syntax, parse it */
7567 while(j
< c
->argc
) {
7568 int leftargs
= c
->argc
-j
-1;
7569 if (!strcasecmp(c
->argv
[j
]->ptr
,"asc")) {
7571 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"desc")) {
7573 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"alpha")) {
7575 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"limit") && leftargs
>= 2) {
7576 limit_start
= atoi(c
->argv
[j
+1]->ptr
);
7577 limit_count
= atoi(c
->argv
[j
+2]->ptr
);
7579 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"store") && leftargs
>= 1) {
7580 storekey
= c
->argv
[j
+1];
7582 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"by") && leftargs
>= 1) {
7583 sortby
= c
->argv
[j
+1];
7584 /* If the BY pattern does not contain '*', i.e. it is constant,
7585 * we don't need to sort nor to lookup the weight keys. */
7586 if (strchr(c
->argv
[j
+1]->ptr
,'*') == NULL
) dontsort
= 1;
7588 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"get") && leftargs
>= 1) {
7589 listAddNodeTail(operations
,createSortOperation(
7590 REDIS_SORT_GET
,c
->argv
[j
+1]));
7594 decrRefCount(sortval
);
7595 listRelease(operations
);
7596 addReply(c
,shared
.syntaxerr
);
7602 /* Load the sorting vector with all the objects to sort */
7603 switch(sortval
->type
) {
7604 case REDIS_LIST
: vectorlen
= listTypeLength(sortval
); break;
7605 case REDIS_SET
: vectorlen
= dictSize((dict
*)sortval
->ptr
); break;
7606 case REDIS_ZSET
: vectorlen
= dictSize(((zset
*)sortval
->ptr
)->dict
); break;
7607 default: vectorlen
= 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7609 vector
= zmalloc(sizeof(redisSortObject
)*vectorlen
);
7612 if (sortval
->type
== REDIS_LIST
) {
7613 listTypeIterator
*li
= listTypeInitIterator(sortval
,0,REDIS_TAIL
);
7614 listTypeEntry entry
;
7615 while(listTypeNext(li
,&entry
)) {
7616 vector
[j
].obj
= listTypeGet(&entry
);
7617 vector
[j
].u
.score
= 0;
7618 vector
[j
].u
.cmpobj
= NULL
;
7621 listTypeReleaseIterator(li
);
7627 if (sortval
->type
== REDIS_SET
) {
7630 zset
*zs
= sortval
->ptr
;
7634 di
= dictGetIterator(set
);
7635 while((setele
= dictNext(di
)) != NULL
) {
7636 vector
[j
].obj
= dictGetEntryKey(setele
);
7637 vector
[j
].u
.score
= 0;
7638 vector
[j
].u
.cmpobj
= NULL
;
7641 dictReleaseIterator(di
);
7643 redisAssert(j
== vectorlen
);
7645 /* Now it's time to load the right scores in the sorting vector */
7646 if (dontsort
== 0) {
7647 for (j
= 0; j
< vectorlen
; j
++) {
7650 /* lookup value to sort by */
7651 byval
= lookupKeyByPattern(c
->db
,sortby
,vector
[j
].obj
);
7652 if (!byval
) continue;
7654 /* use object itself to sort by */
7655 byval
= vector
[j
].obj
;
7659 if (sortby
) vector
[j
].u
.cmpobj
= getDecodedObject(byval
);
7661 if (byval
->encoding
== REDIS_ENCODING_RAW
) {
7662 vector
[j
].u
.score
= strtod(byval
->ptr
,NULL
);
7663 } else if (byval
->encoding
== REDIS_ENCODING_INT
) {
7664 /* Don't need to decode the object if it's
7665 * integer-encoded (the only encoding supported) so
7666 * far. We can just cast it */
7667 vector
[j
].u
.score
= (long)byval
->ptr
;
7669 redisAssert(1 != 1);
7673 /* when the object was retrieved using lookupKeyByPattern,
7674 * its refcount needs to be decreased. */
7676 decrRefCount(byval
);
7681 /* We are ready to sort the vector... perform a bit of sanity check
7682 * on the LIMIT option too. We'll use a partial version of quicksort. */
7683 start
= (limit_start
< 0) ? 0 : limit_start
;
7684 end
= (limit_count
< 0) ? vectorlen
-1 : start
+limit_count
-1;
7685 if (start
>= vectorlen
) {
7686 start
= vectorlen
-1;
7689 if (end
>= vectorlen
) end
= vectorlen
-1;
7691 if (dontsort
== 0) {
7692 server
.sort_desc
= desc
;
7693 server
.sort_alpha
= alpha
;
7694 server
.sort_bypattern
= sortby
? 1 : 0;
7695 if (sortby
&& (start
!= 0 || end
!= vectorlen
-1))
7696 pqsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
, start
,end
);
7698 qsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
);
7701 /* Send command output to the output buffer, performing the specified
7702 * GET/DEL/INCR/DECR operations if any. */
7703 outputlen
= getop
? getop
*(end
-start
+1) : end
-start
+1;
7704 if (storekey
== NULL
) {
7705 /* STORE option not specified, sent the sorting result to client */
7706 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",outputlen
));
7707 for (j
= start
; j
<= end
; j
++) {
7711 if (!getop
) addReplyBulk(c
,vector
[j
].obj
);
7712 listRewind(operations
,&li
);
7713 while((ln
= listNext(&li
))) {
7714 redisSortOperation
*sop
= ln
->value
;
7715 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
7718 if (sop
->type
== REDIS_SORT_GET
) {
7720 addReply(c
,shared
.nullbulk
);
7722 addReplyBulk(c
,val
);
7726 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
7731 robj
*sobj
= createZiplistObject();
7733 /* STORE option specified, set the sorting result as a List object */
7734 for (j
= start
; j
<= end
; j
++) {
7739 listTypePush(sobj
,vector
[j
].obj
,REDIS_TAIL
);
7741 listRewind(operations
,&li
);
7742 while((ln
= listNext(&li
))) {
7743 redisSortOperation
*sop
= ln
->value
;
7744 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
7747 if (sop
->type
== REDIS_SORT_GET
) {
7748 if (!val
) val
= createStringObject("",0);
7750 /* listTypePush does an incrRefCount, so we should take care
7751 * care of the incremented refcount caused by either
7752 * lookupKeyByPattern or createStringObject("",0) */
7753 listTypePush(sobj
,val
,REDIS_TAIL
);
7757 redisAssert(sop
->type
== REDIS_SORT_GET
);
7762 dbReplace(c
->db
,storekey
,sobj
);
7763 /* Note: we add 1 because the DB is dirty anyway since even if the
7764 * SORT result is empty a new key is set and maybe the old content
7766 server
.dirty
+= 1+outputlen
;
7767 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",outputlen
));
7771 if (sortval
->type
== REDIS_LIST
)
7772 for (j
= 0; j
< vectorlen
; j
++)
7773 decrRefCount(vector
[j
].obj
);
7774 decrRefCount(sortval
);
7775 listRelease(operations
);
7776 for (j
= 0; j
< vectorlen
; j
++) {
7777 if (alpha
&& vector
[j
].u
.cmpobj
)
7778 decrRefCount(vector
[j
].u
.cmpobj
);
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and 'n' the number of bytes to describe.
 * Values below 1024 are printed as an integer followed by 'B'; larger
 * values are scaled by powers of 1024 and printed with two decimals and a
 * K, M, G or T suffix.
 *
 * The function always writes a NUL terminated string into 's'. The longest
 * possible output is 12 characters plus the terminator, so the buffer must
 * provide at least 13 bytes (no size is passed in, hence sprintf). */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* One terabyte or more. Without this branch the buffer would be
         * left untouched and the caller would print whatever it contained
         * before the call. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
7804 /* Create the string returned by the INFO command. This is decoupled
7805 * by the INFO command itself as we need to report the same information
7806 * on memory corruption problems. */
7807 static sds
genRedisInfoString(void) {
7809 time_t uptime
= time(NULL
)-server
.stat_starttime
;
7813 bytesToHuman(hmem
,zmalloc_used_memory());
7814 info
= sdscatprintf(sdsempty(),
7815 "redis_version:%s\r\n"
7816 "redis_git_sha1:%s\r\n"
7817 "redis_git_dirty:%d\r\n"
7819 "multiplexing_api:%s\r\n"
7820 "process_id:%ld\r\n"
7821 "uptime_in_seconds:%ld\r\n"
7822 "uptime_in_days:%ld\r\n"
7823 "connected_clients:%d\r\n"
7824 "connected_slaves:%d\r\n"
7825 "blocked_clients:%d\r\n"
7826 "used_memory:%zu\r\n"
7827 "used_memory_human:%s\r\n"
7828 "changes_since_last_save:%lld\r\n"
7829 "bgsave_in_progress:%d\r\n"
7830 "last_save_time:%ld\r\n"
7831 "bgrewriteaof_in_progress:%d\r\n"
7832 "total_connections_received:%lld\r\n"
7833 "total_commands_processed:%lld\r\n"
7834 "expired_keys:%lld\r\n"
7835 "hash_max_zipmap_entries:%zu\r\n"
7836 "hash_max_zipmap_value:%zu\r\n"
7837 "pubsub_channels:%ld\r\n"
7838 "pubsub_patterns:%u\r\n"
7843 strtol(REDIS_GIT_DIRTY
,NULL
,10) > 0,
7844 (sizeof(long) == 8) ? "64" : "32",
7849 listLength(server
.clients
)-listLength(server
.slaves
),
7850 listLength(server
.slaves
),
7851 server
.blpop_blocked_clients
,
7852 zmalloc_used_memory(),
7855 server
.bgsavechildpid
!= -1,
7857 server
.bgrewritechildpid
!= -1,
7858 server
.stat_numconnections
,
7859 server
.stat_numcommands
,
7860 server
.stat_expiredkeys
,
7861 server
.hash_max_zipmap_entries
,
7862 server
.hash_max_zipmap_value
,
7863 dictSize(server
.pubsub_channels
),
7864 listLength(server
.pubsub_patterns
),
7865 server
.vm_enabled
!= 0,
7866 server
.masterhost
== NULL
? "master" : "slave"
7868 if (server
.masterhost
) {
7869 info
= sdscatprintf(info
,
7870 "master_host:%s\r\n"
7871 "master_port:%d\r\n"
7872 "master_link_status:%s\r\n"
7873 "master_last_io_seconds_ago:%d\r\n"
7876 (server
.replstate
== REDIS_REPL_CONNECTED
) ?
7878 server
.master
? ((int)(time(NULL
)-server
.master
->lastinteraction
)) : -1
7881 if (server
.vm_enabled
) {
7883 info
= sdscatprintf(info
,
7884 "vm_conf_max_memory:%llu\r\n"
7885 "vm_conf_page_size:%llu\r\n"
7886 "vm_conf_pages:%llu\r\n"
7887 "vm_stats_used_pages:%llu\r\n"
7888 "vm_stats_swapped_objects:%llu\r\n"
7889 "vm_stats_swappin_count:%llu\r\n"
7890 "vm_stats_swappout_count:%llu\r\n"
7891 "vm_stats_io_newjobs_len:%lu\r\n"
7892 "vm_stats_io_processing_len:%lu\r\n"
7893 "vm_stats_io_processed_len:%lu\r\n"
7894 "vm_stats_io_active_threads:%lu\r\n"
7895 "vm_stats_blocked_clients:%lu\r\n"
7896 ,(unsigned long long) server
.vm_max_memory
,
7897 (unsigned long long) server
.vm_page_size
,
7898 (unsigned long long) server
.vm_pages
,
7899 (unsigned long long) server
.vm_stats_used_pages
,
7900 (unsigned long long) server
.vm_stats_swapped_objects
,
7901 (unsigned long long) server
.vm_stats_swapins
,
7902 (unsigned long long) server
.vm_stats_swapouts
,
7903 (unsigned long) listLength(server
.io_newjobs
),
7904 (unsigned long) listLength(server
.io_processing
),
7905 (unsigned long) listLength(server
.io_processed
),
7906 (unsigned long) server
.io_active_threads
,
7907 (unsigned long) server
.vm_blocked_clients
7911 for (j
= 0; j
< server
.dbnum
; j
++) {
7912 long long keys
, vkeys
;
7914 keys
= dictSize(server
.db
[j
].dict
);
7915 vkeys
= dictSize(server
.db
[j
].expires
);
7916 if (keys
|| vkeys
) {
7917 info
= sdscatprintf(info
, "db%d:keys=%lld,expires=%lld\r\n",
7924 static void infoCommand(redisClient
*c
) {
7925 sds info
= genRedisInfoString();
7926 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",
7927 (unsigned long)sdslen(info
)));
7928 addReplySds(c
,info
);
7929 addReply(c
,shared
.crlf
);
7932 static void monitorCommand(redisClient
*c
) {
7933 /* ignore MONITOR if aleady slave or in monitor mode */
7934 if (c
->flags
& REDIS_SLAVE
) return;
7936 c
->flags
|= (REDIS_SLAVE
|REDIS_MONITOR
);
7938 listAddNodeTail(server
.monitors
,c
);
7939 addReply(c
,shared
.ok
);
7942 /* ================================= Expire ================================= */
7943 static int removeExpire(redisDb
*db
, robj
*key
) {
7944 if (dictDelete(db
->expires
,key
->ptr
) == DICT_OK
) {
7951 static int setExpire(redisDb
*db
, robj
*key
, time_t when
) {
7952 sds copy
= sdsdup(key
->ptr
);
7953 if (dictAdd(db
->expires
,copy
,(void*)when
) == DICT_ERR
) {
7961 /* Return the expire time of the specified key, or -1 if no expire
7962 * is associated with this key (i.e. the key is non volatile) */
7963 static time_t getExpire(redisDb
*db
, robj
*key
) {
7966 /* No expire? return ASAP */
7967 if (dictSize(db
->expires
) == 0 ||
7968 (de
= dictFind(db
->expires
,key
->ptr
)) == NULL
) return -1;
7970 return (time_t) dictGetEntryVal(de
);
7973 static int expireIfNeeded(redisDb
*db
, robj
*key
) {
7977 /* No expire? return ASAP */
7978 if (dictSize(db
->expires
) == 0 ||
7979 (de
= dictFind(db
->expires
,key
->ptr
)) == NULL
) return 0;
7981 /* Lookup the expire */
7982 when
= (time_t) dictGetEntryVal(de
);
7983 if (time(NULL
) <= when
) return 0;
7985 /* Delete the key */
7987 server
.stat_expiredkeys
++;
7991 static int deleteIfVolatile(redisDb
*db
, robj
*key
) {
7994 /* No expire? return ASAP */
7995 if (dictSize(db
->expires
) == 0 ||
7996 (de
= dictFind(db
->expires
,key
->ptr
)) == NULL
) return 0;
7998 /* Delete the key */
8000 server
.stat_expiredkeys
++;
8001 dictDelete(db
->expires
,key
->ptr
);
8002 return dictDelete(db
->dict
,key
->ptr
) == DICT_OK
;
8005 static void expireGenericCommand(redisClient
*c
, robj
*key
, robj
*param
, long offset
) {
8009 if (getLongFromObjectOrReply(c
, param
, &seconds
, NULL
) != REDIS_OK
) return;
8013 de
= dictFind(c
->db
->dict
,key
->ptr
);
8015 addReply(c
,shared
.czero
);
8019 if (dbDelete(c
->db
,key
)) server
.dirty
++;
8020 addReply(c
, shared
.cone
);
8023 time_t when
= time(NULL
)+seconds
;
8024 if (setExpire(c
->db
,key
,when
)) {
8025 addReply(c
,shared
.cone
);
8028 addReply(c
,shared
.czero
);
8034 static void expireCommand(redisClient
*c
) {
8035 expireGenericCommand(c
,c
->argv
[1],c
->argv
[2],0);
8038 static void expireatCommand(redisClient
*c
) {
8039 expireGenericCommand(c
,c
->argv
[1],c
->argv
[2],time(NULL
));
8042 static void ttlCommand(redisClient
*c
) {
8046 expire
= getExpire(c
->db
,c
->argv
[1]);
8048 ttl
= (int) (expire
-time(NULL
));
8049 if (ttl
< 0) ttl
= -1;
8051 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",ttl
));
8054 /* ================================ MULTI/EXEC ============================== */
8056 /* Client state initialization for MULTI/EXEC */
8057 static void initClientMultiState(redisClient
*c
) {
8058 c
->mstate
.commands
= NULL
;
8059 c
->mstate
.count
= 0;
8062 /* Release all the resources associated with MULTI/EXEC state */
8063 static void freeClientMultiState(redisClient
*c
) {
8066 for (j
= 0; j
< c
->mstate
.count
; j
++) {
8068 multiCmd
*mc
= c
->mstate
.commands
+j
;
8070 for (i
= 0; i
< mc
->argc
; i
++)
8071 decrRefCount(mc
->argv
[i
]);
8074 zfree(c
->mstate
.commands
);
8077 /* Add a new command into the MULTI commands queue */
8078 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
) {
8082 c
->mstate
.commands
= zrealloc(c
->mstate
.commands
,
8083 sizeof(multiCmd
)*(c
->mstate
.count
+1));
8084 mc
= c
->mstate
.commands
+c
->mstate
.count
;
8087 mc
->argv
= zmalloc(sizeof(robj
*)*c
->argc
);
8088 memcpy(mc
->argv
,c
->argv
,sizeof(robj
*)*c
->argc
);
8089 for (j
= 0; j
< c
->argc
; j
++)
8090 incrRefCount(mc
->argv
[j
]);
8094 static void multiCommand(redisClient
*c
) {
8095 if (c
->flags
& REDIS_MULTI
) {
8096 addReplySds(c
,sdsnew("-ERR MULTI calls can not be nested\r\n"));
8099 c
->flags
|= REDIS_MULTI
;
8100 addReply(c
,shared
.ok
);
8103 static void discardCommand(redisClient
*c
) {
8104 if (!(c
->flags
& REDIS_MULTI
)) {
8105 addReplySds(c
,sdsnew("-ERR DISCARD without MULTI\r\n"));
8109 freeClientMultiState(c
);
8110 initClientMultiState(c
);
8111 c
->flags
&= (~REDIS_MULTI
);
8113 addReply(c
,shared
.ok
);
8116 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
8117 * implememntation for more information. */
8118 static void execCommandReplicateMulti(redisClient
*c
) {
8119 struct redisCommand
*cmd
;
8120 robj
*multistring
= createStringObject("MULTI",5);
8122 cmd
= lookupCommand("multi");
8123 if (server
.appendonly
)
8124 feedAppendOnlyFile(cmd
,c
->db
->id
,&multistring
,1);
8125 if (listLength(server
.slaves
))
8126 replicationFeedSlaves(server
.slaves
,c
->db
->id
,&multistring
,1);
8127 decrRefCount(multistring
);
8130 static void execCommand(redisClient
*c
) {
8135 if (!(c
->flags
& REDIS_MULTI
)) {
8136 addReplySds(c
,sdsnew("-ERR EXEC without MULTI\r\n"));
8140 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8141 * A failed EXEC will return a multi bulk nil object. */
8142 if (c
->flags
& REDIS_DIRTY_CAS
) {
8143 freeClientMultiState(c
);
8144 initClientMultiState(c
);
8145 c
->flags
&= ~(REDIS_MULTI
|REDIS_DIRTY_CAS
);
8147 addReply(c
,shared
.nullmultibulk
);
8151 /* Replicate a MULTI request now that we are sure the block is executed.
8152 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8153 * both the AOF and the replication link will have the same consistency
8154 * and atomicity guarantees. */
8155 execCommandReplicateMulti(c
);
8157 /* Exec all the queued commands */
8158 unwatchAllKeys(c
); /* Unwatch ASAP otherwise we'll waste CPU cycles */
8159 orig_argv
= c
->argv
;
8160 orig_argc
= c
->argc
;
8161 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->mstate
.count
));
8162 for (j
= 0; j
< c
->mstate
.count
; j
++) {
8163 c
->argc
= c
->mstate
.commands
[j
].argc
;
8164 c
->argv
= c
->mstate
.commands
[j
].argv
;
8165 call(c
,c
->mstate
.commands
[j
].cmd
);
8167 c
->argv
= orig_argv
;
8168 c
->argc
= orig_argc
;
8169 freeClientMultiState(c
);
8170 initClientMultiState(c
);
8171 c
->flags
&= ~(REDIS_MULTI
|REDIS_DIRTY_CAS
);
8172 /* Make sure the EXEC command is always replicated / AOF, since we
8173 * always send the MULTI command (we can't know beforehand if the
8174 * next operations will contain at least a modification to the DB). */
8178 /* =========================== Blocking Operations ========================= */
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
8209 /* Set a client in blocking mode for the specified key, with the specified
8211 static void blockForKeys(redisClient
*c
, robj
**keys
, int numkeys
, time_t timeout
) {
8216 c
->blocking_keys
= zmalloc(sizeof(robj
*)*numkeys
);
8217 c
->blocking_keys_num
= numkeys
;
8218 c
->blockingto
= timeout
;
8219 for (j
= 0; j
< numkeys
; j
++) {
8220 /* Add the key in the client structure, to map clients -> keys */
8221 c
->blocking_keys
[j
] = keys
[j
];
8222 incrRefCount(keys
[j
]);
8224 /* And in the other "side", to map keys -> clients */
8225 de
= dictFind(c
->db
->blocking_keys
,keys
[j
]);
8229 /* For every key we take a list of clients blocked for it */
8231 retval
= dictAdd(c
->db
->blocking_keys
,keys
[j
],l
);
8232 incrRefCount(keys
[j
]);
8233 assert(retval
== DICT_OK
);
8235 l
= dictGetEntryVal(de
);
8237 listAddNodeTail(l
,c
);
8239 /* Mark the client as a blocked client */
8240 c
->flags
|= REDIS_BLOCKED
;
8241 server
.blpop_blocked_clients
++;
8244 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
8245 static void unblockClientWaitingData(redisClient
*c
) {
8250 assert(c
->blocking_keys
!= NULL
);
8251 /* The client may wait for multiple keys, so unblock it for every key. */
8252 for (j
= 0; j
< c
->blocking_keys_num
; j
++) {
8253 /* Remove this client from the list of clients waiting for this key. */
8254 de
= dictFind(c
->db
->blocking_keys
,c
->blocking_keys
[j
]);
8256 l
= dictGetEntryVal(de
);
8257 listDelNode(l
,listSearchKey(l
,c
));
8258 /* If the list is empty we need to remove it to avoid wasting memory */
8259 if (listLength(l
) == 0)
8260 dictDelete(c
->db
->blocking_keys
,c
->blocking_keys
[j
]);
8261 decrRefCount(c
->blocking_keys
[j
]);
8263 /* Cleanup the client structure */
8264 zfree(c
->blocking_keys
);
8265 c
->blocking_keys
= NULL
;
8266 c
->flags
&= (~REDIS_BLOCKED
);
8267 server
.blpop_blocked_clients
--;
8268 /* We want to process data if there is some command waiting
8269 * in the input buffer. Note that this is safe even if
8270 * unblockClientWaitingData() gets called from freeClient() because
8271 * freeClient() will be smart enough to call this function
8272 * *after* c->querybuf was set to NULL. */
8273 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0) processInputBuffer(c
);
8276 /* This should be called from any function PUSHing into lists.
8277 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8278 * 'ele' is the element pushed.
8280 * If the function returns 0 there was no client waiting for a list push
8283 * If the function returns 1 there was a client waiting for a list push
8284 * against this key, the element was passed to this client thus it's not
8285 * needed to actually add it to the list and the caller should return asap. */
8286 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
) {
8287 struct dictEntry
*de
;
8288 redisClient
*receiver
;
8292 de
= dictFind(c
->db
->blocking_keys
,key
);
8293 if (de
== NULL
) return 0;
8294 l
= dictGetEntryVal(de
);
8297 receiver
= ln
->value
;
8299 addReplySds(receiver
,sdsnew("*2\r\n"));
8300 addReplyBulk(receiver
,key
);
8301 addReplyBulk(receiver
,ele
);
8302 unblockClientWaitingData(receiver
);
8306 /* Blocking RPOP/LPOP */
8307 static void blockingPopGenericCommand(redisClient
*c
, int where
) {
8312 for (j
= 1; j
< c
->argc
-1; j
++) {
8313 o
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
8315 if (o
->type
!= REDIS_LIST
) {
8316 addReply(c
,shared
.wrongtypeerr
);
8319 list
*list
= o
->ptr
;
8320 if (listLength(list
) != 0) {
8321 /* If the list contains elements fall back to the usual
8322 * non-blocking POP operation */
8323 robj
*argv
[2], **orig_argv
;
8326 /* We need to alter the command arguments before to call
8327 * popGenericCommand() as the command takes a single key. */
8328 orig_argv
= c
->argv
;
8329 orig_argc
= c
->argc
;
8330 argv
[1] = c
->argv
[j
];
8334 /* Also the return value is different, we need to output
8335 * the multi bulk reply header and the key name. The
8336 * "real" command will add the last element (the value)
8337 * for us. If this souds like an hack to you it's just
8338 * because it is... */
8339 addReplySds(c
,sdsnew("*2\r\n"));
8340 addReplyBulk(c
,argv
[1]);
8341 popGenericCommand(c
,where
);
8343 /* Fix the client structure with the original stuff */
8344 c
->argv
= orig_argv
;
8345 c
->argc
= orig_argc
;
8351 /* If the list is empty or the key does not exists we must block */
8352 timeout
= strtol(c
->argv
[c
->argc
-1]->ptr
,NULL
,10);
8353 if (timeout
> 0) timeout
+= time(NULL
);
8354 blockForKeys(c
,c
->argv
+1,c
->argc
-2,timeout
);
8357 static void blpopCommand(redisClient
*c
) {
8358 blockingPopGenericCommand(c
,REDIS_HEAD
);
8361 static void brpopCommand(redisClient
*c
) {
8362 blockingPopGenericCommand(c
,REDIS_TAIL
);
8365 /* =============================== Replication ============================= */
8367 static int syncWrite(int fd
, char *ptr
, ssize_t size
, int timeout
) {
8368 ssize_t nwritten
, ret
= size
;
8369 time_t start
= time(NULL
);
8373 if (aeWait(fd
,AE_WRITABLE
,1000) & AE_WRITABLE
) {
8374 nwritten
= write(fd
,ptr
,size
);
8375 if (nwritten
== -1) return -1;
8379 if ((time(NULL
)-start
) > timeout
) {
8387 static int syncRead(int fd
, char *ptr
, ssize_t size
, int timeout
) {
8388 ssize_t nread
, totread
= 0;
8389 time_t start
= time(NULL
);
8393 if (aeWait(fd
,AE_READABLE
,1000) & AE_READABLE
) {
8394 nread
= read(fd
,ptr
,size
);
8395 if (nread
== -1) return -1;
8400 if ((time(NULL
)-start
) > timeout
) {
8408 static int syncReadLine(int fd
, char *ptr
, ssize_t size
, int timeout
) {
8415 if (syncRead(fd
,&c
,1,timeout
) == -1) return -1;
8418 if (nread
&& *(ptr
-1) == '\r') *(ptr
-1) = '\0';
8429 static void syncCommand(redisClient
*c
) {
8430 /* ignore SYNC if aleady slave or in monitor mode */
8431 if (c
->flags
& REDIS_SLAVE
) return;
8433 /* SYNC can't be issued when the server has pending data to send to
8434 * the client about already issued commands. We need a fresh reply
8435 * buffer registering the differences between the BGSAVE and the current
8436 * dataset, so that we can copy to other slaves if needed. */
8437 if (listLength(c
->reply
) != 0) {
8438 addReplySds(c
,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8442 redisLog(REDIS_NOTICE
,"Slave ask for synchronization");
8443 /* Here we need to check if there is a background saving operation
8444 * in progress, or if it is required to start one */
8445 if (server
.bgsavechildpid
!= -1) {
8446 /* Ok a background save is in progress. Let's check if it is a good
8447 * one for replication, i.e. if there is another slave that is
8448 * registering differences since the server forked to save */
8453 listRewind(server
.slaves
,&li
);
8454 while((ln
= listNext(&li
))) {
8456 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) break;
8459 /* Perfect, the server is already registering differences for
8460 * another slave. Set the right state, and copy the buffer. */
8461 listRelease(c
->reply
);
8462 c
->reply
= listDup(slave
->reply
);
8463 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
8464 redisLog(REDIS_NOTICE
,"Waiting for end of BGSAVE for SYNC");
8466 /* No way, we need to wait for the next BGSAVE in order to
8467 * register differences */
8468 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
8469 redisLog(REDIS_NOTICE
,"Waiting for next BGSAVE for SYNC");
8472 /* Ok we don't have a BGSAVE in progress, let's start one */
8473 redisLog(REDIS_NOTICE
,"Starting BGSAVE for SYNC");
8474 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
8475 redisLog(REDIS_NOTICE
,"Replication failed, can't BGSAVE");
8476 addReplySds(c
,sdsnew("-ERR Unalbe to perform background save\r\n"));
8479 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
8482 c
->flags
|= REDIS_SLAVE
;
8484 listAddNodeTail(server
.slaves
,c
);
8488 static void sendBulkToSlave(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
8489 redisClient
*slave
= privdata
;
8491 REDIS_NOTUSED(mask
);
8492 char buf
[REDIS_IOBUF_LEN
];
8493 ssize_t nwritten
, buflen
;
8495 if (slave
->repldboff
== 0) {
8496 /* Write the bulk write count before to transfer the DB. In theory here
8497 * we don't know how much room there is in the output buffer of the
8498 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8499 * operations) will never be smaller than the few bytes we need. */
8502 bulkcount
= sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8504 if (write(fd
,bulkcount
,sdslen(bulkcount
)) != (signed)sdslen(bulkcount
))
8512 lseek(slave
->repldbfd
,slave
->repldboff
,SEEK_SET
);
8513 buflen
= read(slave
->repldbfd
,buf
,REDIS_IOBUF_LEN
);
8515 redisLog(REDIS_WARNING
,"Read error sending DB to slave: %s",
8516 (buflen
== 0) ? "premature EOF" : strerror(errno
));
8520 if ((nwritten
= write(fd
,buf
,buflen
)) == -1) {
8521 redisLog(REDIS_VERBOSE
,"Write error sending DB to slave: %s",
8526 slave
->repldboff
+= nwritten
;
8527 if (slave
->repldboff
== slave
->repldbsize
) {
8528 close(slave
->repldbfd
);
8529 slave
->repldbfd
= -1;
8530 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
8531 slave
->replstate
= REDIS_REPL_ONLINE
;
8532 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
,
8533 sendReplyToClient
, slave
) == AE_ERR
) {
8537 addReplySds(slave
,sdsempty());
8538 redisLog(REDIS_NOTICE
,"Synchronization with slave succeeded");
8542 /* This function is called at the end of every backgrond saving.
8543 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8544 * otherwise REDIS_ERR is passed to the function.
8546 * The goal of this function is to handle slaves waiting for a successful
8547 * background saving in order to perform non-blocking synchronization. */
8548 static void updateSlavesWaitingBgsave(int bgsaveerr
) {
8550 int startbgsave
= 0;
8553 listRewind(server
.slaves
,&li
);
8554 while((ln
= listNext(&li
))) {
8555 redisClient
*slave
= ln
->value
;
8557 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) {
8559 slave
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
8560 } else if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) {
8561 struct redis_stat buf
;
8563 if (bgsaveerr
!= REDIS_OK
) {
8565 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE child returned an error");
8568 if ((slave
->repldbfd
= open(server
.dbfilename
,O_RDONLY
)) == -1 ||
8569 redis_fstat(slave
->repldbfd
,&buf
) == -1) {
8571 redisLog(REDIS_WARNING
,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno
));
8574 slave
->repldboff
= 0;
8575 slave
->repldbsize
= buf
.st_size
;
8576 slave
->replstate
= REDIS_REPL_SEND_BULK
;
8577 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
8578 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
, sendBulkToSlave
, slave
) == AE_ERR
) {
8585 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
8588 listRewind(server
.slaves
,&li
);
8589 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE failed");
8590 while((ln
= listNext(&li
))) {
8591 redisClient
*slave
= ln
->value
;
8593 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
)
8600 static int syncWithMaster(void) {
8601 char buf
[1024], tmpfile
[256], authcmd
[1024];
8603 int fd
= anetTcpConnect(NULL
,server
.masterhost
,server
.masterport
);
8604 int dfd
, maxtries
= 5;
8607 redisLog(REDIS_WARNING
,"Unable to connect to MASTER: %s",
8612 /* AUTH with the master if required. */
8613 if(server
.masterauth
) {
8614 snprintf(authcmd
, 1024, "AUTH %s\r\n", server
.masterauth
);
8615 if (syncWrite(fd
, authcmd
, strlen(server
.masterauth
)+7, 5) == -1) {
8617 redisLog(REDIS_WARNING
,"Unable to AUTH to MASTER: %s",
8621 /* Read the AUTH result. */
8622 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
8624 redisLog(REDIS_WARNING
,"I/O error reading auth result from MASTER: %s",
8628 if (buf
[0] != '+') {
8630 redisLog(REDIS_WARNING
,"Cannot AUTH to MASTER, is the masterauth password correct?");
8635 /* Issue the SYNC command */
8636 if (syncWrite(fd
,"SYNC \r\n",7,5) == -1) {
8638 redisLog(REDIS_WARNING
,"I/O error writing to MASTER: %s",
8642 /* Read the bulk write count */
8643 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
8645 redisLog(REDIS_WARNING
,"I/O error reading bulk count from MASTER: %s",
8649 if (buf
[0] != '$') {
8651 redisLog(REDIS_WARNING
,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8654 dumpsize
= strtol(buf
+1,NULL
,10);
8655 redisLog(REDIS_NOTICE
,"Receiving %ld bytes data dump from MASTER",dumpsize
);
8656 /* Read the bulk write data on a temp file */
8658 snprintf(tmpfile
,256,
8659 "temp-%d.%ld.rdb",(int)time(NULL
),(long int)getpid());
8660 dfd
= open(tmpfile
,O_CREAT
|O_WRONLY
|O_EXCL
,0644);
8661 if (dfd
!= -1) break;
8666 redisLog(REDIS_WARNING
,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno
));
8670 int nread
, nwritten
;
8672 nread
= read(fd
,buf
,(dumpsize
< 1024)?dumpsize
:1024);
8674 redisLog(REDIS_WARNING
,"I/O error trying to sync with MASTER: %s",
8680 nwritten
= write(dfd
,buf
,nread
);
8681 if (nwritten
== -1) {
8682 redisLog(REDIS_WARNING
,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno
));
8690 if (rename(tmpfile
,server
.dbfilename
) == -1) {
8691 redisLog(REDIS_WARNING
,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno
));
8697 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
8698 redisLog(REDIS_WARNING
,"Failed trying to load the MASTER synchronization DB from disk");
8702 server
.master
= createClient(fd
);
8703 server
.master
->flags
|= REDIS_MASTER
;
8704 server
.master
->authenticated
= 1;
8705 server
.replstate
= REDIS_REPL_CONNECTED
;
8709 static void slaveofCommand(redisClient
*c
) {
8710 if (!strcasecmp(c
->argv
[1]->ptr
,"no") &&
8711 !strcasecmp(c
->argv
[2]->ptr
,"one")) {
8712 if (server
.masterhost
) {
8713 sdsfree(server
.masterhost
);
8714 server
.masterhost
= NULL
;
8715 if (server
.master
) freeClient(server
.master
);
8716 server
.replstate
= REDIS_REPL_NONE
;
8717 redisLog(REDIS_NOTICE
,"MASTER MODE enabled (user request)");
8720 sdsfree(server
.masterhost
);
8721 server
.masterhost
= sdsdup(c
->argv
[1]->ptr
);
8722 server
.masterport
= atoi(c
->argv
[2]->ptr
);
8723 if (server
.master
) freeClient(server
.master
);
8724 server
.replstate
= REDIS_REPL_CONNECT
;
8725 redisLog(REDIS_NOTICE
,"SLAVE OF %s:%d enabled (user request)",
8726 server
.masterhost
, server
.masterport
);
8728 addReply(c
,shared
.ok
);
8731 /* ============================ Maxmemory directive ======================== */
8733 /* Try to free one object from the pre-allocated objects free list.
8734 * This is useful under low mem conditions as by default we take 1 million
8735 * free objects allocated. On success REDIS_OK is returned, otherwise
8737 static int tryFreeOneObjectFromFreelist(void) {
8740 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
8741 if (listLength(server
.objfreelist
)) {
8742 listNode
*head
= listFirst(server
.objfreelist
);
8743 o
= listNodeValue(head
);
8744 listDelNode(server
.objfreelist
,head
);
8745 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
8749 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
8754 /* This function gets called when 'maxmemory' is set on the config file to limit
8755 * the max memory used by the server, and we are out of memory.
8756 * This function will try to, in order:
8758 * - Free objects from the free list
8759 * - Try to remove keys with an EXPIRE set
8761 * If it is not possible to free enough memory to reach used-memory < maxmemory
8762 * the server will start refusing commands that will enlarge even more the
8765 static void freeMemoryIfNeeded(void) {
8766 while (server
.maxmemory
&& zmalloc_used_memory() > server
.maxmemory
) {
8767 int j
, k
, freed
= 0;
8769 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
8770 for (j
= 0; j
< server
.dbnum
; j
++) {
8772 robj
*minkey
= NULL
;
8773 struct dictEntry
*de
;
8775 if (dictSize(server
.db
[j
].expires
)) {
8777 /* From a sample of three keys drop the one nearest to
8778 * the natural expire */
8779 for (k
= 0; k
< 3; k
++) {
8782 de
= dictGetRandomKey(server
.db
[j
].expires
);
8783 t
= (time_t) dictGetEntryVal(de
);
8784 if (minttl
== -1 || t
< minttl
) {
8785 minkey
= dictGetEntryKey(de
);
8789 dbDelete(server
.db
+j
,minkey
);
8792 if (!freed
) return; /* nothing to free... */
8796 /* ============================== Append Only file ========================== */
8798 /* Called when the user switches from "appendonly yes" to "appendonly no"
8799 * at runtime using the CONFIG command. */
8800 static void stopAppendOnly(void) {
8801 flushAppendOnlyFile();
8802 aof_fsync(server
.appendfd
);
8803 close(server
.appendfd
);
8805 server
.appendfd
= -1;
8806 server
.appendseldb
= -1;
8807 server
.appendonly
= 0;
8808 /* rewrite operation in progress? kill it, wait child exit */
8809 if (server
.bgsavechildpid
!= -1) {
8812 if (kill(server
.bgsavechildpid
,SIGKILL
) != -1)
8813 wait3(&statloc
,0,NULL
);
8814 /* reset the buffer accumulating changes while the child saves */
8815 sdsfree(server
.bgrewritebuf
);
8816 server
.bgrewritebuf
= sdsempty();
8817 server
.bgsavechildpid
= -1;
8821 /* Called when the user switches from "appendonly no" to "appendonly yes"
8822 * at runtime using the CONFIG command. */
8823 static int startAppendOnly(void) {
8824 server
.appendonly
= 1;
8825 server
.lastfsync
= time(NULL
);
8826 server
.appendfd
= open(server
.appendfilename
,O_WRONLY
|O_APPEND
|O_CREAT
,0644);
8827 if (server
.appendfd
== -1) {
8828 redisLog(REDIS_WARNING
,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno
));
8831 if (rewriteAppendOnlyFileBackground() == REDIS_ERR
) {
8832 server
.appendonly
= 0;
8833 close(server
.appendfd
);
8834 redisLog(REDIS_WARNING
,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno
));
8840 /* Write the append only file buffer on disk.
8842 * Since we are required to write the AOF before replying to the client,
8843 * and the only way the client socket can get a write is when entering
8844 * the event loop, we accumulate all the AOF writes in a memory
8845 * buffer and write it on disk using this function just before entering
8846 * the event loop again. */
8847 static void flushAppendOnlyFile(void) {
8851 if (sdslen(server
.aofbuf
) == 0) return;
8853 /* We want to perform a single write. This should be guaranteed atomic
8854 * at least if the filesystem we are writing is a real physical one.
8855 * While this will save us against the server being killed I don't think
8856 * there is much to do about the whole server stopping for power problems
8858 nwritten
= write(server
.appendfd
,server
.aofbuf
,sdslen(server
.aofbuf
));
8859 if (nwritten
!= (signed)sdslen(server
.aofbuf
)) {
8860 /* Ooops, we are in troubles. The best thing to do for now is
8861 * aborting instead of giving the illusion that everything is
8862 * working as expected. */
8863 if (nwritten
== -1) {
8864 redisLog(REDIS_WARNING
,"Exiting on error writing to the append-only file: %s",strerror(errno
));
8866 redisLog(REDIS_WARNING
,"Exiting on short write while writing to the append-only file: %s",strerror(errno
));
8870 sdsfree(server
.aofbuf
);
8871 server
.aofbuf
= sdsempty();
8873 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8874 * childs performing heavy I/O on disk. */
8875 if (server
.no_appendfsync_on_rewrite
&&
8876 (server
.bgrewritechildpid
!= -1 || server
.bgsavechildpid
!= -1))
8878 /* Fsync if needed */
8880 if (server
.appendfsync
== APPENDFSYNC_ALWAYS
||
8881 (server
.appendfsync
== APPENDFSYNC_EVERYSEC
&&
8882 now
-server
.lastfsync
> 1))
8884 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8885 * flushing metadata. */
8886 aof_fsync(server
.appendfd
); /* Let's try to get this data on the disk */
8887 server
.lastfsync
= now
;
8891 static sds
catAppendOnlyGenericCommand(sds buf
, int argc
, robj
**argv
) {
8893 buf
= sdscatprintf(buf
,"*%d\r\n",argc
);
8894 for (j
= 0; j
< argc
; j
++) {
8895 robj
*o
= getDecodedObject(argv
[j
]);
8896 buf
= sdscatprintf(buf
,"$%lu\r\n",(unsigned long)sdslen(o
->ptr
));
8897 buf
= sdscatlen(buf
,o
->ptr
,sdslen(o
->ptr
));
8898 buf
= sdscatlen(buf
,"\r\n",2);
8904 static sds
catAppendOnlyExpireAtCommand(sds buf
, robj
*key
, robj
*seconds
) {
8909 /* Make sure we can use strtol */
8910 seconds
= getDecodedObject(seconds
);
8911 when
= time(NULL
)+strtol(seconds
->ptr
,NULL
,10);
8912 decrRefCount(seconds
);
8914 argv
[0] = createStringObject("EXPIREAT",8);
8916 argv
[2] = createObject(REDIS_STRING
,
8917 sdscatprintf(sdsempty(),"%ld",when
));
8918 buf
= catAppendOnlyGenericCommand(buf
, argc
, argv
);
8919 decrRefCount(argv
[0]);
8920 decrRefCount(argv
[2]);
8924 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
8925 sds buf
= sdsempty();
8928 /* The DB this command was targeting is not the same as the last command
8929 * we appended. A SELECT command needs to be issued first. */
8930 if (dictid
!= server
.appendseldb
) {
8933 snprintf(seldb
,sizeof(seldb
),"%d",dictid
);
8934 buf
= sdscatprintf(buf
,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8935 (unsigned long)strlen(seldb
),seldb
);
8936 server
.appendseldb
= dictid
;
8939 if (cmd
->proc
== expireCommand
) {
8940 /* Translate EXPIRE into EXPIREAT */
8941 buf
= catAppendOnlyExpireAtCommand(buf
,argv
[1],argv
[2]);
8942 } else if (cmd
->proc
== setexCommand
) {
8943 /* Translate SETEX to SET and EXPIREAT */
8944 tmpargv
[0] = createStringObject("SET",3);
8945 tmpargv
[1] = argv
[1];
8946 tmpargv
[2] = argv
[3];
8947 buf
= catAppendOnlyGenericCommand(buf
,3,tmpargv
);
8948 decrRefCount(tmpargv
[0]);
8949 buf
= catAppendOnlyExpireAtCommand(buf
,argv
[1],argv
[2]);
8951 buf
= catAppendOnlyGenericCommand(buf
,argc
,argv
);
8954 /* Append to the AOF buffer. This will be flushed on disk just before
8955 * re-entering the event loop, so before the client will get a
8956 * positive reply about the operation performed. */
8957 server
.aofbuf
= sdscatlen(server
.aofbuf
,buf
,sdslen(buf
));
8959 /* If a background append only file rewriting is in progress we want to
8960 * accumulate the differences between the child DB and the current one
8961 * in a buffer, so that when the child process will do its work we
8962 * can append the differences to the new append only file. */
8963 if (server
.bgrewritechildpid
!= -1)
8964 server
.bgrewritebuf
= sdscatlen(server
.bgrewritebuf
,buf
,sdslen(buf
));
8969 /* In Redis commands are always executed in the context of a client, so in
8970 * order to load the append only file we need to create a fake client. */
8971 static struct redisClient
*createFakeClient(void) {
8972 struct redisClient
*c
= zmalloc(sizeof(*c
));
8976 c
->querybuf
= sdsempty();
8980 /* We set the fake client as a slave waiting for the synchronization
8981 * so that Redis will not try to send replies to this client. */
8982 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
8983 c
->reply
= listCreate();
8984 listSetFreeMethod(c
->reply
,decrRefCount
);
8985 listSetDupMethod(c
->reply
,dupClientReplyValue
);
8986 initClientMultiState(c
);
8990 static void freeFakeClient(struct redisClient
*c
) {
8991 sdsfree(c
->querybuf
);
8992 listRelease(c
->reply
);
8993 freeClientMultiState(c
);
8997 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8998 * error (the append only file is zero-length) REDIS_ERR is returned. On
8999 * fatal error an error message is logged and the program exists. */
9000 int loadAppendOnlyFile(char *filename
) {
9001 struct redisClient
*fakeClient
;
9002 FILE *fp
= fopen(filename
,"r");
9003 struct redis_stat sb
;
9004 int appendonly
= server
.appendonly
;
9006 if (redis_fstat(fileno(fp
),&sb
) != -1 && sb
.st_size
== 0)
9010 redisLog(REDIS_WARNING
,"Fatal error: can't open the append log file for reading: %s",strerror(errno
));
9014 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
9015 * to the same file we're about to read. */
9016 server
.appendonly
= 0;
9018 fakeClient
= createFakeClient();
9025 struct redisCommand
*cmd
;
9028 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) {
9034 if (buf
[0] != '*') goto fmterr
;
9036 argv
= zmalloc(sizeof(robj
*)*argc
);
9037 for (j
= 0; j
< argc
; j
++) {
9038 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) goto readerr
;
9039 if (buf
[0] != '$') goto fmterr
;
9040 len
= strtol(buf
+1,NULL
,10);
9041 argsds
= sdsnewlen(NULL
,len
);
9042 if (len
&& fread(argsds
,len
,1,fp
) == 0) goto fmterr
;
9043 argv
[j
] = createObject(REDIS_STRING
,argsds
);
9044 if (fread(buf
,2,1,fp
) == 0) goto fmterr
; /* discard CRLF */
9047 /* Command lookup */
9048 cmd
= lookupCommand(argv
[0]->ptr
);
9050 redisLog(REDIS_WARNING
,"Unknown command '%s' reading the append only file", argv
[0]->ptr
);
9053 /* Try object encoding */
9054 if (cmd
->flags
& REDIS_CMD_BULK
)
9055 argv
[argc
-1] = tryObjectEncoding(argv
[argc
-1]);
9056 /* Run the command in the context of a fake client */
9057 fakeClient
->argc
= argc
;
9058 fakeClient
->argv
= argv
;
9059 cmd
->proc(fakeClient
);
9060 /* Discard the reply objects list from the fake client */
9061 while(listLength(fakeClient
->reply
))
9062 listDelNode(fakeClient
->reply
,listFirst(fakeClient
->reply
));
9063 /* Clean up, ready for the next command */
9064 for (j
= 0; j
< argc
; j
++) decrRefCount(argv
[j
]);
9066 /* Handle swapping while loading big datasets when VM is on */
9068 if ((zmalloc_used_memory() - server
.vm_max_memory
) > 1024*1024*32)
9071 if (server
.vm_enabled
&& force_swapout
) {
9072 while (zmalloc_used_memory() > server
.vm_max_memory
) {
9073 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
9078 /* This point can only be reached when EOF is reached without errors.
9079 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
9080 if (fakeClient
->flags
& REDIS_MULTI
) goto readerr
;
9083 freeFakeClient(fakeClient
);
9084 server
.appendonly
= appendonly
;
9089 redisLog(REDIS_WARNING
,"Unexpected end of file reading the append only file");
9091 redisLog(REDIS_WARNING
,"Unrecoverable error reading the append only file: %s", strerror(errno
));
9095 redisLog(REDIS_WARNING
,"Bad file format reading the append only file");
/* Emit 's' (binary safe, 'len' bytes) on 'fp' using the bulk format:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * Returns 1 if everything was written, 0 as soon as one fwrite() fails. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen;

    /* Build the "$<count>\r\n" prefix. */
    header[0] = '$';
    hlen = 1+ll2string(header+1,sizeof(header)-1,len);
    header[hlen] = '\r';
    header[hlen+1] = '\n';
    hlen += 2;

    if (fwrite(header,hlen,1,fp) == 0) return 0;
    /* An empty payload is legal: skip the write but still terminate it. */
    if (len > 0) {
        if (fwrite(s,len,1,fp) == 0) return 0;
    }
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
/* Emit the double 'd' on 'fp' as a bulk item, $<count>\r\n<payload>\r\n,
 * where the payload is the value formatted with "%.17g".
 *
 * Returns 1 on success, 0 if one of the writes fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload already carries its trailing CRLF, which must not be
     * accounted in the advertised count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,plen,1,fp) == 0) return 0;
    return 1;
}
/* Emit the integer 'l' on 'fp' as a bulk item: $<count>\r\n<payload>\r\n
 *
 * Header and payload are assembled in a single buffer and written with one
 * fwrite() call. Returns 1 on success, 0 if the write fails. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, outlen;

    ndigits = ll2string(digits,32,l);
    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return (fwrite(out,outlen,1,fp) == 0) ? 0 : 1;
}
9135 /* Delegate writing an object to writing a bulk string or bulk long long. */
9136 static int fwriteBulkObject(FILE *fp
, robj
*obj
) {
9137 /* Avoid using getDecodedObject to help copy-on-write (we are often
9138 * in a child process when this function is called). */
9139 if (obj
->encoding
== REDIS_ENCODING_INT
) {
9140 return fwriteBulkLongLong(fp
,(long)obj
->ptr
);
9141 } else if (obj
->encoding
== REDIS_ENCODING_RAW
) {
9142 return fwriteBulkString(fp
,obj
->ptr
,sdslen(obj
->ptr
));
9144 redisPanic("Unknown string encoding");
9148 /* Write a sequence of commands able to fully rebuild the dataset into
9149 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9150 static int rewriteAppendOnlyFile(char *filename
) {
9151 dictIterator
*di
= NULL
;
9156 time_t now
= time(NULL
);
9158 /* Note that we have to use a different temp name here compared to the
9159 * one used by rewriteAppendOnlyFileBackground() function. */
9160 snprintf(tmpfile
,256,"temp-rewriteaof-%d.aof", (int) getpid());
9161 fp
= fopen(tmpfile
,"w");
9163 redisLog(REDIS_WARNING
, "Failed rewriting the append only file: %s", strerror(errno
));
9166 for (j
= 0; j
< server
.dbnum
; j
++) {
9167 char selectcmd
[] = "*2\r\n$6\r\nSELECT\r\n";
9168 redisDb
*db
= server
.db
+j
;
9170 if (dictSize(d
) == 0) continue;
9171 di
= dictGetIterator(d
);
9177 /* SELECT the new DB */
9178 if (fwrite(selectcmd
,sizeof(selectcmd
)-1,1,fp
) == 0) goto werr
;
9179 if (fwriteBulkLongLong(fp
,j
) == 0) goto werr
;
9181 /* Iterate this DB writing every entry */
9182 while((de
= dictNext(di
)) != NULL
) {
9183 sds keystr
= dictGetEntryKey(de
);
9188 keystr
= dictGetEntryKey(de
);
9189 o
= dictGetEntryVal(de
);
9190 initStaticStringObject(key
,keystr
);
9191 /* If the value for this key is swapped, load a preview in memory.
9192 * We use a "swapped" flag to remember if we need to free the
9193 * value object instead to just increment the ref count anyway
9194 * in order to avoid copy-on-write of pages if we are forked() */
9195 if (!server
.vm_enabled
|| o
->storage
== REDIS_VM_MEMORY
||
9196 o
->storage
== REDIS_VM_SWAPPING
) {
9199 o
= vmPreviewObject(o
);
9202 expiretime
= getExpire(db
,&key
);
9204 /* Save the key and associated value */
9205 if (o
->type
== REDIS_STRING
) {
9206 /* Emit a SET command */
9207 char cmd
[]="*3\r\n$3\r\nSET\r\n";
9208 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
9210 if (fwriteBulkObject(fp
,&key
) == 0) goto werr
;
9211 if (fwriteBulkObject(fp
,o
) == 0) goto werr
;
9212 } else if (o
->type
== REDIS_LIST
) {
9213 /* Emit the RPUSHes needed to rebuild the list */
9214 char cmd
[]="*3\r\n$5\r\nRPUSH\r\n";
9215 if (o
->encoding
== REDIS_ENCODING_ZIPLIST
) {
9216 unsigned char *zl
= o
->ptr
;
9217 unsigned char *p
= ziplistIndex(zl
,0);
9218 unsigned char *vstr
;
9222 while(ziplistGet(p
,&vstr
,&vlen
,&vlong
)) {
9223 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
9224 if (fwriteBulkObject(fp
,&key
) == 0) goto werr
;
9226 if (fwriteBulkString(fp
,(char*)vstr
,vlen
) == 0)
9229 if (fwriteBulkLongLong(fp
,vlong
) == 0)
9232 p
= ziplistNext(zl
,p
);
9234 } else if (o
->encoding
== REDIS_ENCODING_LIST
) {
9235 list
*list
= o
->ptr
;
9239 listRewind(list
,&li
);
9240 while((ln
= listNext(&li
))) {
9241 robj
*eleobj
= listNodeValue(ln
);
9243 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
9244 if (fwriteBulkObject(fp
,&key
) == 0) goto werr
;
9245 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
9248 redisPanic("Unknown list encoding");
9250 } else if (o
->type
== REDIS_SET
) {
9251 /* Emit the SADDs needed to rebuild the set */
9253 dictIterator
*di
= dictGetIterator(set
);
9256 while((de
= dictNext(di
)) != NULL
) {
9257 char cmd
[]="*3\r\n$4\r\nSADD\r\n";
9258 robj
*eleobj
= dictGetEntryKey(de
);
9260 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
9261 if (fwriteBulkObject(fp
,&key
) == 0) goto werr
;
9262 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
9264 dictReleaseIterator(di
);
9265 } else if (o
->type
== REDIS_ZSET
) {
9266 /* Emit the ZADDs needed to rebuild the sorted set */
9268 dictIterator
*di
= dictGetIterator(zs
->dict
);
9271 while((de
= dictNext(di
)) != NULL
) {
9272 char cmd
[]="*4\r\n$4\r\nZADD\r\n";
9273 robj
*eleobj
= dictGetEntryKey(de
);
9274 double *score
= dictGetEntryVal(de
);
9276 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
9277 if (fwriteBulkObject(fp
,&key
) == 0) goto werr
;
9278 if (fwriteBulkDouble(fp
,*score
) == 0) goto werr
;
9279 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
9281 dictReleaseIterator(di
);
9282 } else if (o
->type
== REDIS_HASH
) {
9283 char cmd
[]="*4\r\n$4\r\nHSET\r\n";
9285 /* Emit the HSETs needed to rebuild the hash */
9286 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
9287 unsigned char *p
= zipmapRewind(o
->ptr
);
9288 unsigned char *field
, *val
;
9289 unsigned int flen
, vlen
;
9291 while((p
= zipmapNext(p
,&field
,&flen
,&val
,&vlen
)) != NULL
) {
9292 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
9293 if (fwriteBulkObject(fp
,&key
) == 0) goto werr
;
9294 if (fwriteBulkString(fp
,(char*)field
,flen
) == -1)
9296 if (fwriteBulkString(fp
,(char*)val
,vlen
) == -1)
9300 dictIterator
*di
= dictGetIterator(o
->ptr
);
9303 while((de
= dictNext(di
)) != NULL
) {
9304 robj
*field
= dictGetEntryKey(de
);
9305 robj
*val
= dictGetEntryVal(de
);
9307 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
9308 if (fwriteBulkObject(fp
,&key
) == 0) goto werr
;
9309 if (fwriteBulkObject(fp
,field
) == -1) return -1;
9310 if (fwriteBulkObject(fp
,val
) == -1) return -1;
9312 dictReleaseIterator(di
);
9315 redisPanic("Unknown object type");
9317 /* Save the expire time */
9318 if (expiretime
!= -1) {
9319 char cmd
[]="*3\r\n$8\r\nEXPIREAT\r\n";
9320 /* If this key is already expired skip it */
9321 if (expiretime
< now
) continue;
9322 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
9323 if (fwriteBulkObject(fp
,&key
) == 0) goto werr
;
9324 if (fwriteBulkLongLong(fp
,expiretime
) == 0) goto werr
;
9326 if (swapped
) decrRefCount(o
);
9328 dictReleaseIterator(di
);
9331 /* Make sure data will not remain on the OS's output buffers */
9333 aof_fsync(fileno(fp
));
9336 /* Use RENAME to make sure the DB file is changed atomically only
9337 * if the generate DB file is ok. */
9338 if (rename(tmpfile
,filename
) == -1) {
9339 redisLog(REDIS_WARNING
,"Error moving temp append only file on the final destination: %s", strerror(errno
));
9343 redisLog(REDIS_NOTICE
,"SYNC append only file rewrite performed");
9349 redisLog(REDIS_WARNING
,"Write error writing append only file on disk: %s", strerror(errno
));
9350 if (di
) dictReleaseIterator(di
);
9354 /* This is how rewriting of the append only file in background works:
9356 * 1) The user calls BGREWRITEAOF
9357 * 2) Redis calls this function, that forks():
9358 * 2a) the child rewrite the append only file in a temp file.
9359 * 2b) the parent accumulates differences in server.bgrewritebuf.
9360 * 3) When the child finished '2a' exits.
9361 * 4) The parent will trap the exit code, if it's OK, will append the
9362 * data accumulated into server.bgrewritebuf into the temp file, and
9363 * finally will rename(2) the temp file in the actual file name.
9364 * Then the new file is reopened as the new append only file. Profit!
9366 static int rewriteAppendOnlyFileBackground(void) {
9369 if (server
.bgrewritechildpid
!= -1) return REDIS_ERR
;
9370 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
9371 if ((childpid
= fork()) == 0) {
9375 if (server
.vm_enabled
) vmReopenSwapFile();
9377 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9378 if (rewriteAppendOnlyFile(tmpfile
) == REDIS_OK
) {
9385 if (childpid
== -1) {
9386 redisLog(REDIS_WARNING
,
9387 "Can't rewrite append only file in background: fork: %s",
9391 redisLog(REDIS_NOTICE
,
9392 "Background append only file rewriting started by pid %d",childpid
);
9393 server
.bgrewritechildpid
= childpid
;
9394 updateDictResizePolicy();
9395 /* We set appendseldb to -1 in order to force the next call to the
9396 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9397 * accumulated by the parent into server.bgrewritebuf will start
9398 * with a SELECT statement and it will be safe to merge. */
9399 server
.appendseldb
= -1;
9402 return REDIS_OK
; /* unreached */
9405 static void bgrewriteaofCommand(redisClient
*c
) {
9406 if (server
.bgrewritechildpid
!= -1) {
9407 addReplySds(c
,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9410 if (rewriteAppendOnlyFileBackground() == REDIS_OK
) {
9411 char *status
= "+Background append only file rewriting started\r\n";
9412 addReplySds(c
,sdsnew(status
));
9414 addReply(c
,shared
.err
);
9418 static void aofRemoveTempFile(pid_t childpid
) {
9421 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) childpid
);
9425 /* Virtual Memory is composed mainly of two subsystems:
9426 * - Blocking Virtual Memory
9427 * - Threaded Virtual Memory I/O
9428 * The two parts are not fully decoupled, but functions are split among two
9429 * different sections of the source code (delimited by comments) in order to
9430 * make more clear what functionality is about the blocking VM and what about
9431 * the threaded (not blocking) VM.
9435 * Redis VM is a blocking VM (one that blocks reading swapped values from
9436 * disk into memory when a value swapped out is needed in memory) that is made
9437 * unblocking by trying to examine the command argument vector in order to
9438 * load in background values that will likely be needed in order to exec
9439 * the command. The command is executed only once all the relevant keys
9440 * are loaded into memory.
9442 * This basically is almost as simple as a blocking VM, but almost as parallel
9443 * as a fully non-blocking VM.
9446 /* =================== Virtual Memory - Blocking Side ====================== */
9448 /* Create a VM pointer object. This kind of objects are used in place of
9449 * values in the key -> value hash table, for swapped out objects. */
9450 static vmpointer
*createVmPointer(int vtype
) {
9451 vmpointer
*vp
= zmalloc(sizeof(vmpointer
));
9453 vp
->type
= REDIS_VMPOINTER
;
9454 vp
->storage
= REDIS_VM_SWAPPED
;
9459 static void vmInit(void) {
9465 if (server
.vm_max_threads
!= 0)
9466 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9468 redisLog(REDIS_NOTICE
,"Using '%s' as swap file",server
.vm_swap_file
);
9469 /* Try to open the old swap file, otherwise create it */
9470 if ((server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b")) == NULL
) {
9471 server
.vm_fp
= fopen(server
.vm_swap_file
,"w+b");
9473 if (server
.vm_fp
== NULL
) {
9474 redisLog(REDIS_WARNING
,
9475 "Can't open the swap file: %s. Exiting.",
9479 server
.vm_fd
= fileno(server
.vm_fp
);
9480 /* Lock the swap file for writing, this is useful in order to avoid
9481 * another instance to use the same swap file for a config error. */
9482 fl
.l_type
= F_WRLCK
;
9483 fl
.l_whence
= SEEK_SET
;
9484 fl
.l_start
= fl
.l_len
= 0;
9485 if (fcntl(server
.vm_fd
,F_SETLK
,&fl
) == -1) {
9486 redisLog(REDIS_WARNING
,
9487 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server
.vm_swap_file
, strerror(errno
));
9491 server
.vm_next_page
= 0;
9492 server
.vm_near_pages
= 0;
9493 server
.vm_stats_used_pages
= 0;
9494 server
.vm_stats_swapped_objects
= 0;
9495 server
.vm_stats_swapouts
= 0;
9496 server
.vm_stats_swapins
= 0;
9497 totsize
= server
.vm_pages
*server
.vm_page_size
;
9498 redisLog(REDIS_NOTICE
,"Allocating %lld bytes of swap file",totsize
);
9499 if (ftruncate(server
.vm_fd
,totsize
) == -1) {
9500 redisLog(REDIS_WARNING
,"Can't ftruncate swap file: %s. Exiting.",
9504 redisLog(REDIS_NOTICE
,"Swap file allocated with success");
9506 server
.vm_bitmap
= zmalloc((server
.vm_pages
+7)/8);
9507 redisLog(REDIS_VERBOSE
,"Allocated %lld bytes page table for %lld pages",
9508 (long long) (server
.vm_pages
+7)/8, server
.vm_pages
);
9509 memset(server
.vm_bitmap
,0,(server
.vm_pages
+7)/8);
9511 /* Initialize threaded I/O (used by Virtual Memory) */
9512 server
.io_newjobs
= listCreate();
9513 server
.io_processing
= listCreate();
9514 server
.io_processed
= listCreate();
9515 server
.io_ready_clients
= listCreate();
9516 pthread_mutex_init(&server
.io_mutex
,NULL
);
9517 pthread_mutex_init(&server
.obj_freelist_mutex
,NULL
);
9518 pthread_mutex_init(&server
.io_swapfile_mutex
,NULL
);
9519 server
.io_active_threads
= 0;
9520 if (pipe(pipefds
) == -1) {
9521 redisLog(REDIS_WARNING
,"Unable to intialized VM: pipe(2): %s. Exiting."
9525 server
.io_ready_pipe_read
= pipefds
[0];
9526 server
.io_ready_pipe_write
= pipefds
[1];
9527 redisAssert(anetNonBlock(NULL
,server
.io_ready_pipe_read
) != ANET_ERR
);
9528 /* LZF requires a lot of stack */
9529 pthread_attr_init(&server
.io_threads_attr
);
9530 pthread_attr_getstacksize(&server
.io_threads_attr
, &stacksize
);
9531 while (stacksize
< REDIS_THREAD_STACK_SIZE
) stacksize
*= 2;
9532 pthread_attr_setstacksize(&server
.io_threads_attr
, stacksize
);
9533 /* Listen for events in the threaded I/O pipe */
9534 if (aeCreateFileEvent(server
.el
, server
.io_ready_pipe_read
, AE_READABLE
,
9535 vmThreadedIOCompletedJob
, NULL
) == AE_ERR
)
9536 oom("creating file event");
9539 /* Mark the page as used */
9540 static void vmMarkPageUsed(off_t page
) {
9541 off_t byte
= page
/8;
9543 redisAssert(vmFreePage(page
) == 1);
9544 server
.vm_bitmap
[byte
] |= 1<<bit
;
9547 /* Mark N contiguous pages as used, with 'page' being the first. */
9548 static void vmMarkPagesUsed(off_t page
, off_t count
) {
9551 for (j
= 0; j
< count
; j
++)
9552 vmMarkPageUsed(page
+j
);
9553 server
.vm_stats_used_pages
+= count
;
9554 redisLog(REDIS_DEBUG
,"Mark USED pages: %lld pages at %lld\n",
9555 (long long)count
, (long long)page
);
9558 /* Mark the page as free */
9559 static void vmMarkPageFree(off_t page
) {
9560 off_t byte
= page
/8;
9562 redisAssert(vmFreePage(page
) == 0);
9563 server
.vm_bitmap
[byte
] &= ~(1<<bit
);
9566 /* Mark N contiguous pages as free, with 'page' being the first. */
9567 static void vmMarkPagesFree(off_t page
, off_t count
) {
9570 for (j
= 0; j
< count
; j
++)
9571 vmMarkPageFree(page
+j
);
9572 server
.vm_stats_used_pages
-= count
;
9573 redisLog(REDIS_DEBUG
,"Mark FREE pages: %lld pages at %lld\n",
9574 (long long)count
, (long long)page
);
9577 /* Test if the page is free */
9578 static int vmFreePage(off_t page
) {
9579 off_t byte
= page
/8;
9581 return (server
.vm_bitmap
[byte
] & (1<<bit
)) == 0;
9584 /* Find N contiguous free pages storing the first page of the cluster in *first.
9585 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9586 * REDIS_ERR is returned.
9588 * This function uses a simple algorithm: we try to allocate
9589 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9590 * again from the start of the swap file searching for free spaces.
9592 * If it looks pretty clear that there are no free pages near our offset
9593 * we try to find less populated places doing a forward jump of
9594 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9595 * without hurry, and then we jump again and so forth...
9597 * This function can be improved using a free list to avoid to guess
9598 * too much, since we could collect data about freed pages.
9600 * note: I implemented this function just after watching an episode of
9601 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9603 static int vmFindContiguousPages(off_t
*first
, off_t n
) {
9604 off_t base
, offset
= 0, since_jump
= 0, numfree
= 0;
9606 if (server
.vm_near_pages
== REDIS_VM_MAX_NEAR_PAGES
) {
9607 server
.vm_near_pages
= 0;
9608 server
.vm_next_page
= 0;
9610 server
.vm_near_pages
++; /* Yet another try for pages near to the old ones */
9611 base
= server
.vm_next_page
;
9613 while(offset
< server
.vm_pages
) {
9614 off_t
this = base
+offset
;
9616 /* If we overflow, restart from page zero */
9617 if (this >= server
.vm_pages
) {
9618 this -= server
.vm_pages
;
9620 /* Just overflowed, what we found on tail is no longer
9621 * interesting, as it's no longer contiguous. */
9625 if (vmFreePage(this)) {
9626 /* This is a free page */
9628 /* Already got N free pages? Return to the caller, with success */
9630 *first
= this-(n
-1);
9631 server
.vm_next_page
= this+1;
9632 redisLog(REDIS_DEBUG
, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n
, (long long) *first
);
9636 /* The current one is not a free page */
9640 /* Fast-forward if the current page is not free and we already
9641 * searched enough near this place. */
9643 if (!numfree
&& since_jump
>= REDIS_VM_MAX_RANDOM_JUMP
/4) {
9644 offset
+= random() % REDIS_VM_MAX_RANDOM_JUMP
;
9646 /* Note that even if we rewind after the jump, we don't need
9647 * to make sure numfree is set to zero as we only jump *if* it
9648 * is set to zero. */
9650 /* Otherwise just check the next page */
9657 /* Write the specified object at the specified page of the swap file */
9658 static int vmWriteObjectOnSwap(robj
*o
, off_t page
) {
9659 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
9660 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
9661 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
9662 redisLog(REDIS_WARNING
,
9663 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9667 rdbSaveObject(server
.vm_fp
,o
);
9668 fflush(server
.vm_fp
);
9669 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
9673 /* Transfers the 'val' object to disk. A 'vmpointer' object
9674 * containing all the information needed to load the
9675 * object back later is returned.
9677 * If we can't find enough contiguous empty pages to swap the object on disk
9678 * NULL is returned. */
9679 static vmpointer
*vmSwapObjectBlocking(robj
*val
) {
9680 off_t pages
= rdbSavedObjectPages(val
,NULL
);
9684 assert(val
->storage
== REDIS_VM_MEMORY
);
9685 assert(val
->refcount
== 1);
9686 if (vmFindContiguousPages(&page
,pages
) == REDIS_ERR
) return NULL
;
9687 if (vmWriteObjectOnSwap(val
,page
) == REDIS_ERR
) return NULL
;
9689 vp
= createVmPointer(val
->type
);
9691 vp
->usedpages
= pages
;
9692 decrRefCount(val
); /* Deallocate the object from memory. */
9693 vmMarkPagesUsed(page
,pages
);
9694 redisLog(REDIS_DEBUG
,"VM: object %p swapped out at %lld (%lld pages)",
9696 (unsigned long long) page
, (unsigned long long) pages
);
9697 server
.vm_stats_swapped_objects
++;
9698 server
.vm_stats_swapouts
++;
9702 static robj
*vmReadObjectFromSwap(off_t page
, int type
) {
9705 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
9706 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
9707 redisLog(REDIS_WARNING
,
9708 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9712 o
= rdbLoadObject(type
,server
.vm_fp
);
9714 redisLog(REDIS_WARNING
, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno
));
9717 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
9721 /* Load the specified object from swap to memory.
9722 * The newly allocated object is returned.
9724 * If preview is true the unserialized object is returned to the caller but
9725 * the pages are not marked as freed, nor the vp object is freed. */
9726 static robj
*vmGenericLoadObject(vmpointer
*vp
, int preview
) {
9729 redisAssert(vp
->type
== REDIS_VMPOINTER
&&
9730 (vp
->storage
== REDIS_VM_SWAPPED
|| vp
->storage
== REDIS_VM_LOADING
));
9731 val
= vmReadObjectFromSwap(vp
->page
,vp
->vtype
);
9733 redisLog(REDIS_DEBUG
, "VM: object %p loaded from disk", (void*)vp
);
9734 vmMarkPagesFree(vp
->page
,vp
->usedpages
);
9736 server
.vm_stats_swapped_objects
--;
9738 redisLog(REDIS_DEBUG
, "VM: object %p previewed from disk", (void*)vp
);
9740 server
.vm_stats_swapins
++;
9744 /* Plain object loading, from swap to memory.
9746 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9747 * The return value is the loaded object. */
9748 static robj
*vmLoadObject(robj
*o
) {
9749 /* If we are loading the object in background, stop it, we
9750 * need to load this object synchronously ASAP. */
9751 if (o
->storage
== REDIS_VM_LOADING
)
9752 vmCancelThreadedIOJob(o
);
9753 return vmGenericLoadObject((vmpointer
*)o
,0);
9756 /* Just load the value on disk, without to modify the key.
9757 * This is useful when we want to perform some operation on the value
9758 * without to really bring it from swap to memory, like while saving the
9759 * dataset or rewriting the append only log. */
9760 static robj
*vmPreviewObject(robj
*o
) {
9761 return vmGenericLoadObject((vmpointer
*)o
,1);
/* How good a candidate is this object for swapping?
 * The better candidate it is, the greater the returned value.
 *
 * Currently we try to perform a fast estimation of the object size in
 * memory, and combine it with aging information.
 *
 * Basically swappability = idle-time * log(estimated size)
 *
 * Bigger objects are preferred over smaller objects, but not
 * proportionally, this is why we use the logarithm. This algorithm is
 * just a first try and will probably be tuned later. */
9775 static double computeObjectSwappability(robj
*o
) {
9776 /* actual age can be >= minage, but not < minage. As we use wrapping
9777 * 21 bit clocks with minutes resolution for the LRU. */
9778 time_t minage
= abs(server
.lruclock
- o
->lru
);
9782 struct dictEntry
*de
;
9785 if (minage
<= 0) return 0;
9788 if (o
->encoding
!= REDIS_ENCODING_RAW
) {
9791 asize
= sdslen(o
->ptr
)+sizeof(*o
)+sizeof(long)*2;
9796 listNode
*ln
= listFirst(l
);
9798 asize
= sizeof(list
);
9800 robj
*ele
= ln
->value
;
9803 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
9804 (sizeof(*o
)+sdslen(ele
->ptr
)) : sizeof(*o
);
9805 asize
+= (sizeof(listNode
)+elesize
)*listLength(l
);
9810 z
= (o
->type
== REDIS_ZSET
);
9811 d
= z
? ((zset
*)o
->ptr
)->dict
: o
->ptr
;
9813 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
9814 if (z
) asize
+= sizeof(zset
)-sizeof(dict
);
9819 de
= dictGetRandomKey(d
);
9820 ele
= dictGetEntryKey(de
);
9821 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
9822 (sizeof(*o
)+sdslen(ele
->ptr
)) : sizeof(*o
);
9823 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
9824 if (z
) asize
+= sizeof(zskiplistNode
)*dictSize(d
);
9828 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
9829 unsigned char *p
= zipmapRewind((unsigned char*)o
->ptr
);
9830 unsigned int len
= zipmapLen((unsigned char*)o
->ptr
);
9831 unsigned int klen
, vlen
;
9832 unsigned char *key
, *val
;
9834 if ((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) == NULL
) {
9838 asize
= len
*(klen
+vlen
+3);
9839 } else if (o
->encoding
== REDIS_ENCODING_HT
) {
9841 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
9846 de
= dictGetRandomKey(d
);
9847 ele
= dictGetEntryKey(de
);
9848 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
9849 (sizeof(*o
)+sdslen(ele
->ptr
)) : sizeof(*o
);
9850 ele
= dictGetEntryVal(de
);
9851 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
9852 (sizeof(*o
)+sdslen(ele
->ptr
)) : sizeof(*o
);
9853 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
9858 return (double)minage
*log(1+asize
);
/* Try to swap an object that's a good candidate for swapping.
 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
 * to swap any object at all.
 *
 * If 'usethreads' is true, Redis will try to swap the object in background
 * using I/O threads. */
9867 static int vmSwapOneObject(int usethreads
) {
9869 struct dictEntry
*best
= NULL
;
9870 double best_swappability
= 0;
9871 redisDb
*best_db
= NULL
;
9875 for (j
= 0; j
< server
.dbnum
; j
++) {
9876 redisDb
*db
= server
.db
+j
;
9877 /* Why maxtries is set to 100?
9878 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9879 * are swappable objects */
9882 if (dictSize(db
->dict
) == 0) continue;
9883 for (i
= 0; i
< 5; i
++) {
9885 double swappability
;
9887 if (maxtries
) maxtries
--;
9888 de
= dictGetRandomKey(db
->dict
);
9889 val
= dictGetEntryVal(de
);
9890 /* Only swap objects that are currently in memory.
9892 * Also don't swap shared objects: not a good idea in general and
9893 * we need to ensure that the main thread does not touch the
9894 * object while the I/O thread is using it, but we can't
9895 * control other keys without adding additional mutex. */
9896 if (val
->storage
!= REDIS_VM_MEMORY
|| val
->refcount
!= 1) {
9897 if (maxtries
) i
--; /* don't count this try */
9900 swappability
= computeObjectSwappability(val
);
9901 if (!best
|| swappability
> best_swappability
) {
9903 best_swappability
= swappability
;
9908 if (best
== NULL
) return REDIS_ERR
;
9909 key
= dictGetEntryKey(best
);
9910 val
= dictGetEntryVal(best
);
9912 redisLog(REDIS_DEBUG
,"Key with best swappability: %s, %f",
9913 key
, best_swappability
);
9917 robj
*keyobj
= createStringObject(key
,sdslen(key
));
9918 vmSwapObjectThreaded(keyobj
,val
,best_db
);
9919 decrRefCount(keyobj
);
9924 if ((vp
= vmSwapObjectBlocking(val
)) != NULL
) {
9925 dictGetEntryVal(best
) = vp
;
/* Swap out one object without using the I/O threads.
 * Thin wrapper around vmSwapOneObject() with 'usethreads' set to 0; the
 * value returned by vmSwapOneObject() is returned unchanged.
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype, like the other zero-argument helpers in this file. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
/* Swap out one object, asking for the swap to be done in background.
 * Thin wrapper around vmSwapOneObject() with 'usethreads' set to 1; the
 * value returned by vmSwapOneObject() is returned unchanged.
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype, like the other zero-argument helpers in this file. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9941 /* Return true if it's safe to swap out objects in a given moment.
9942 * Basically we don't want to swap objects out while there is a BGSAVE
9943 * or a BGAEOREWRITE running in backgroud. */
9944 static int vmCanSwapOut(void) {
9945 return (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1);
9948 /* =================== Virtual Memory - Threaded I/O ======================= */
9950 static void freeIOJob(iojob
*j
) {
9951 if ((j
->type
== REDIS_IOJOB_PREPARE_SWAP
||
9952 j
->type
== REDIS_IOJOB_DO_SWAP
||
9953 j
->type
== REDIS_IOJOB_LOAD
) && j
->val
!= NULL
)
/* We fix the storage type, otherwise decrRefCount() will try to
 * kill the I/O thread job (that no longer exists). */
9957 if (j
->val
->storage
== REDIS_VM_SWAPPING
)
9958 j
->val
->storage
= REDIS_VM_MEMORY
;
9959 decrRefCount(j
->val
);
9961 decrRefCount(j
->key
);
/* Every time a thread finishes a job, it writes a byte into the write side
 * of a unix pipe in order to "awake" the main thread, and this function
 * is called. */
9968 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
,
9972 int retval
, processed
= 0, toprocess
= -1, trytoswap
= 1;
9974 REDIS_NOTUSED(mask
);
9975 REDIS_NOTUSED(privdata
);
9977 /* For every byte we read in the read side of the pipe, there is one
9978 * I/O job completed to process. */
9979 while((retval
= read(fd
,buf
,1)) == 1) {
9982 struct dictEntry
*de
;
9984 redisLog(REDIS_DEBUG
,"Processing I/O completed job");
9986 /* Get the processed element (the oldest one) */
9988 assert(listLength(server
.io_processed
) != 0);
9989 if (toprocess
== -1) {
9990 toprocess
= (listLength(server
.io_processed
)*REDIS_MAX_COMPLETED_JOBS_PROCESSED
)/100;
9991 if (toprocess
<= 0) toprocess
= 1;
9993 ln
= listFirst(server
.io_processed
);
9995 listDelNode(server
.io_processed
,ln
);
9997 /* If this job is marked as canceled, just ignore it */
10002 /* Post process it in the main thread, as there are things we
10003 * can do just here to avoid race conditions and/or invasive locks */
10004 redisLog(REDIS_DEBUG
,"COMPLETED Job type: %d, ID %p, key: %s", j
->type
, (void*)j
->id
, (unsigned char*)j
->key
->ptr
);
10005 de
= dictFind(j
->db
->dict
,j
->key
->ptr
);
10006 redisAssert(de
!= NULL
);
10007 if (j
->type
== REDIS_IOJOB_LOAD
) {
10009 vmpointer
*vp
= dictGetEntryVal(de
);
10011 /* Key loaded, bring it at home */
10012 vmMarkPagesFree(vp
->page
,vp
->usedpages
);
10013 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk (threaded)",
10014 (unsigned char*) j
->key
->ptr
);
10015 server
.vm_stats_swapped_objects
--;
10016 server
.vm_stats_swapins
++;
10017 dictGetEntryVal(de
) = j
->val
;
10018 incrRefCount(j
->val
);
10020 /* Handle clients waiting for this key to be loaded. */
10021 handleClientsBlockedOnSwappedKey(db
,j
->key
);
10024 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
10025 /* Now we know the amount of pages required to swap this object.
10026 * Let's find some space for it, and queue this task again
10027 * rebranded as REDIS_IOJOB_DO_SWAP. */
10028 if (!vmCanSwapOut() ||
10029 vmFindContiguousPages(&j
->page
,j
->pages
) == REDIS_ERR
)
10031 /* Ooops... no space or we can't swap as there is
10032 * a fork()ed Redis trying to save stuff on disk. */
10033 j
->val
->storage
= REDIS_VM_MEMORY
; /* undo operation */
10036 /* Note that we need to mark this pages as used now,
10037 * if the job will be canceled, we'll mark them as freed
10039 vmMarkPagesUsed(j
->page
,j
->pages
);
10040 j
->type
= REDIS_IOJOB_DO_SWAP
;
10043 unlockThreadedIO();
10045 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
10048 /* Key swapped. We can finally free some memory. */
10049 if (j
->val
->storage
!= REDIS_VM_SWAPPING
) {
10050 vmpointer
*vp
= (vmpointer
*) j
->id
;
10051 printf("storage: %d\n",vp
->storage
);
10052 printf("key->name: %s\n",(char*)j
->key
->ptr
);
10053 printf("val: %p\n",(void*)j
->val
);
10054 printf("val->type: %d\n",j
->val
->type
);
10055 printf("val->ptr: %s\n",(char*)j
->val
->ptr
);
10057 redisAssert(j
->val
->storage
== REDIS_VM_SWAPPING
);
10058 vp
= createVmPointer(j
->val
->type
);
10059 vp
->page
= j
->page
;
10060 vp
->usedpages
= j
->pages
;
10061 dictGetEntryVal(de
) = vp
;
10062 /* Fix the storage otherwise decrRefCount will attempt to
10063 * remove the associated I/O job */
10064 j
->val
->storage
= REDIS_VM_MEMORY
;
10065 decrRefCount(j
->val
);
10066 redisLog(REDIS_DEBUG
,
10067 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
10068 (unsigned char*) j
->key
->ptr
,
10069 (unsigned long long) j
->page
, (unsigned long long) j
->pages
);
10070 server
.vm_stats_swapped_objects
++;
10071 server
.vm_stats_swapouts
++;
10073 /* Put a few more swap requests in queue if we are still
10075 if (trytoswap
&& vmCanSwapOut() &&
10076 zmalloc_used_memory() > server
.vm_max_memory
)
10081 more
= listLength(server
.io_newjobs
) <
10082 (unsigned) server
.vm_max_threads
;
10083 unlockThreadedIO();
10084 /* Don't waste CPU time if swappable objects are rare. */
10085 if (vmSwapOneObjectThreaded() == REDIS_ERR
) {
10093 if (processed
== toprocess
) return;
10095 if (retval
< 0 && errno
!= EAGAIN
) {
10096 redisLog(REDIS_WARNING
,
10097 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
10102 static void lockThreadedIO(void) {
10103 pthread_mutex_lock(&server
.io_mutex
);
10106 static void unlockThreadedIO(void) {
10107 pthread_mutex_unlock(&server
.io_mutex
);
10110 /* Remove the specified object from the threaded I/O queue if still not
10111 * processed, otherwise make sure to flag it as canceled. */
10112 static void vmCancelThreadedIOJob(robj
*o
) {
10114 server
.io_newjobs
, /* 0 */
10115 server
.io_processing
, /* 1 */
10116 server
.io_processed
/* 2 */
10120 assert(o
->storage
== REDIS_VM_LOADING
|| o
->storage
== REDIS_VM_SWAPPING
);
10123 /* Search for a matching object in one of the queues */
10124 for (i
= 0; i
< 3; i
++) {
10128 listRewind(lists
[i
],&li
);
10129 while ((ln
= listNext(&li
)) != NULL
) {
10130 iojob
*job
= ln
->value
;
10132 if (job
->canceled
) continue; /* Skip this, already canceled. */
10133 if (job
->id
== o
) {
10134 redisLog(REDIS_DEBUG
,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
10135 (void*)job
, (char*)job
->key
->ptr
, job
->type
, i
);
/* Mark the pages as free since the swap didn't happen
 * or happened but is now discarded. */
10138 if (i
!= 1 && job
->type
== REDIS_IOJOB_DO_SWAP
)
10139 vmMarkPagesFree(job
->page
,job
->pages
);
10140 /* Cancel the job. It depends on the list the job is
10143 case 0: /* io_newjobs */
10144 /* If the job was yet not processed the best thing to do
10145 * is to remove it from the queue at all */
10147 listDelNode(lists
[i
],ln
);
10149 case 1: /* io_processing */
10150 /* Oh Shi- the thread is messing with the Job:
10152 * Probably it's accessing the object if this is a
10153 * PREPARE_SWAP or DO_SWAP job.
10154 * If it's a LOAD job it may be reading from disk and
10155 * if we don't wait for the job to terminate before to
10156 * cancel it, maybe in a few microseconds data can be
10157 * corrupted in this pages. So the short story is:
10159 * Better to wait for the job to move into the
10160 * next queue (processed)... */
10162 /* We try again and again until the job is completed. */
10163 unlockThreadedIO();
10164 /* But let's wait some time for the I/O thread
10165 * to finish with this job. After all this condition
10166 * should be very rare. */
10169 case 2: /* io_processed */
10170 /* The job was already processed, that's easy...
10171 * just mark it as canceled so that we'll ignore it
10172 * when processing completed jobs. */
/* Finally we have to adjust the storage type of the object
 * in order to "UNDO" the operation. */
10178 if (o
->storage
== REDIS_VM_LOADING
)
10179 o
->storage
= REDIS_VM_SWAPPED
;
10180 else if (o
->storage
== REDIS_VM_SWAPPING
)
10181 o
->storage
= REDIS_VM_MEMORY
;
10182 unlockThreadedIO();
10183 redisLog(REDIS_DEBUG
,"*** DONE");
10188 unlockThreadedIO();
10189 printf("Not found: %p\n", (void*)o
);
10190 redisAssert(1 != 1); /* We should never reach this */
10193 static void *IOThreadEntryPoint(void *arg
) {
10196 REDIS_NOTUSED(arg
);
10198 pthread_detach(pthread_self());
10200 /* Get a new job to process */
10202 if (listLength(server
.io_newjobs
) == 0) {
10203 /* No new jobs in queue, exit. */
10204 redisLog(REDIS_DEBUG
,"Thread %ld exiting, nothing to do",
10205 (long) pthread_self());
10206 server
.io_active_threads
--;
10207 unlockThreadedIO();
10210 ln
= listFirst(server
.io_newjobs
);
10212 listDelNode(server
.io_newjobs
,ln
);
10213 /* Add the job in the processing queue */
10214 j
->thread
= pthread_self();
10215 listAddNodeTail(server
.io_processing
,j
);
10216 ln
= listLast(server
.io_processing
); /* We use ln later to remove it */
10217 unlockThreadedIO();
10218 redisLog(REDIS_DEBUG
,"Thread %ld got a new job (type %d): %p about key '%s'",
10219 (long) pthread_self(), j
->type
, (void*)j
, (char*)j
->key
->ptr
);
10221 /* Process the Job */
10222 if (j
->type
== REDIS_IOJOB_LOAD
) {
10223 vmpointer
*vp
= (vmpointer
*)j
->id
;
10224 j
->val
= vmReadObjectFromSwap(j
->page
,vp
->vtype
);
10225 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
10226 FILE *fp
= fopen("/dev/null","w+");
10227 j
->pages
= rdbSavedObjectPages(j
->val
,fp
);
10229 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
10230 if (vmWriteObjectOnSwap(j
->val
,j
->page
) == REDIS_ERR
)
10234 /* Done: insert the job into the processed queue */
10235 redisLog(REDIS_DEBUG
,"Thread %ld completed the job: %p (key %s)",
10236 (long) pthread_self(), (void*)j
, (char*)j
->key
->ptr
);
10238 listDelNode(server
.io_processing
,ln
);
10239 listAddNodeTail(server
.io_processed
,j
);
10240 unlockThreadedIO();
10242 /* Signal the main thread there is new stuff to process */
10243 assert(write(server
.io_ready_pipe_write
,"x",1) == 1);
10245 return NULL
; /* never reached */
10248 static void spawnIOThread(void) {
10250 sigset_t mask
, omask
;
10253 sigemptyset(&mask
);
10254 sigaddset(&mask
,SIGCHLD
);
10255 sigaddset(&mask
,SIGHUP
);
10256 sigaddset(&mask
,SIGPIPE
);
10257 pthread_sigmask(SIG_SETMASK
, &mask
, &omask
);
10258 while ((err
= pthread_create(&thread
,&server
.io_threads_attr
,IOThreadEntryPoint
,NULL
)) != 0) {
10259 redisLog(REDIS_WARNING
,"Unable to spawn an I/O thread: %s",
10263 pthread_sigmask(SIG_SETMASK
, &omask
, NULL
);
10264 server
.io_active_threads
++;
10267 /* We need to wait for the last thread to exit before we are able to
10268 * fork() in order to BGSAVE or BGREWRITEAOF. */
10269 static void waitEmptyIOJobsQueue(void) {
10271 int io_processed_len
;
10274 if (listLength(server
.io_newjobs
) == 0 &&
10275 listLength(server
.io_processing
) == 0 &&
10276 server
.io_active_threads
== 0)
10278 unlockThreadedIO();
/* While waiting for the empty jobs queue condition we post-process some
 * finished jobs, as I/O threads may be hanging trying to write against
 * the io_ready_pipe_write FD but there are so many pending jobs that
 * it's blocking. */
10285 io_processed_len
= listLength(server
.io_processed
);
10286 unlockThreadedIO();
10287 if (io_processed_len
) {
10288 vmThreadedIOCompletedJob(NULL
,server
.io_ready_pipe_read
,NULL
,0);
10289 usleep(1000); /* 1 millisecond */
10291 usleep(10000); /* 10 milliseconds */
10296 static void vmReopenSwapFile(void) {
10297 /* Note: we don't close the old one as we are in the child process
10298 * and don't want to mess at all with the original file object. */
10299 server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b");
10300 if (server
.vm_fp
== NULL
) {
10301 redisLog(REDIS_WARNING
,"Can't re-open the VM swap file: %s. Exiting.",
10302 server
.vm_swap_file
);
10305 server
.vm_fd
= fileno(server
.vm_fp
);
/* This function must be called with threaded I/O locked */
10309 static void queueIOJob(iojob
*j
) {
10310 redisLog(REDIS_DEBUG
,"Queued IO Job %p type %d about key '%s'\n",
10311 (void*)j
, j
->type
, (char*)j
->key
->ptr
);
10312 listAddNodeTail(server
.io_newjobs
,j
);
10313 if (server
.io_active_threads
< server
.vm_max_threads
)
10317 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
) {
10320 j
= zmalloc(sizeof(*j
));
10321 j
->type
= REDIS_IOJOB_PREPARE_SWAP
;
10325 j
->id
= j
->val
= val
;
10328 j
->thread
= (pthread_t
) -1;
10329 val
->storage
= REDIS_VM_SWAPPING
;
10333 unlockThreadedIO();
10337 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
/* This function makes the client 'c' wait for the key 'key' to be loaded.
 * If there is not already a job loading the key, it is created.
 * The key is added to the io_keys list in the client structure, and also
 * in the hash table mapping swapped keys to waiting clients, that is,
 * the io_keys dict of the client's database (c->db->io_keys). */
10344 static int waitForSwappedKey(redisClient
*c
, robj
*key
) {
10345 struct dictEntry
*de
;
10349 /* If the key does not exist or is already in RAM we don't need to
10350 * block the client at all. */
10351 de
= dictFind(c
->db
->dict
,key
->ptr
);
10352 if (de
== NULL
) return 0;
10353 o
= dictGetEntryVal(de
);
10354 if (o
->storage
== REDIS_VM_MEMORY
) {
10356 } else if (o
->storage
== REDIS_VM_SWAPPING
) {
10357 /* We were swapping the key, undo it! */
10358 vmCancelThreadedIOJob(o
);
10362 /* OK: the key is either swapped, or being loaded just now. */
10364 /* Add the key to the list of keys this client is waiting for.
10365 * This maps clients to keys they are waiting for. */
10366 listAddNodeTail(c
->io_keys
,key
);
10369 /* Add the client to the swapped keys => clients waiting map. */
10370 de
= dictFind(c
->db
->io_keys
,key
);
10374 /* For every key we take a list of clients blocked for it */
10376 retval
= dictAdd(c
->db
->io_keys
,key
,l
);
10378 assert(retval
== DICT_OK
);
10380 l
= dictGetEntryVal(de
);
10382 listAddNodeTail(l
,c
);
10384 /* Are we already loading the key from disk? If not create a job */
10385 if (o
->storage
== REDIS_VM_SWAPPED
) {
10387 vmpointer
*vp
= (vmpointer
*)o
;
10389 o
->storage
= REDIS_VM_LOADING
;
10390 j
= zmalloc(sizeof(*j
));
10391 j
->type
= REDIS_IOJOB_LOAD
;
10396 j
->page
= vp
->page
;
10399 j
->thread
= (pthread_t
) -1;
10402 unlockThreadedIO();
10407 /* Preload keys for any command with first, last and step values for
10408 * the command keys prototype, as defined in the command table. */
10409 static void waitForMultipleSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
) {
10411 if (cmd
->vm_firstkey
== 0) return;
10412 last
= cmd
->vm_lastkey
;
10413 if (last
< 0) last
= argc
+last
;
10414 for (j
= cmd
->vm_firstkey
; j
<= last
; j
+= cmd
->vm_keystep
) {
10415 redisAssert(j
< argc
);
10416 waitForSwappedKey(c
,argv
[j
]);
10420 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
10421 * Note that the number of keys to preload is user-defined, so we need to
10422 * apply a sanity check against argc. */
10423 static void zunionInterBlockClientOnSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
) {
10425 REDIS_NOTUSED(cmd
);
10427 num
= atoi(argv
[2]->ptr
);
10428 if (num
> (argc
-3)) return;
10429 for (i
= 0; i
< num
; i
++) {
10430 waitForSwappedKey(c
,argv
[3+i
]);
10434 /* Preload keys needed to execute the entire MULTI/EXEC block.
10436 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10437 * and will block the client when any command requires a swapped out value. */
10438 static void execBlockClientOnSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
) {
10440 struct redisCommand
*mcmd
;
10442 REDIS_NOTUSED(cmd
);
10443 REDIS_NOTUSED(argc
);
10444 REDIS_NOTUSED(argv
);
10446 if (!(c
->flags
& REDIS_MULTI
)) return;
10447 for (i
= 0; i
< c
->mstate
.count
; i
++) {
10448 mcmd
= c
->mstate
.commands
[i
].cmd
;
10449 margc
= c
->mstate
.commands
[i
].argc
;
10450 margv
= c
->mstate
.commands
[i
].argv
;
10452 if (mcmd
->vm_preload_proc
!= NULL
) {
10453 mcmd
->vm_preload_proc(c
,mcmd
,margc
,margv
);
10455 waitForMultipleSwappedKeys(c
,mcmd
,margc
,margv
);
/* Is this client attempting to run a command against swapped keys?
 * If so, block it ASAP, load the keys in background, then resume it.
 *
 * The important idea about this function is that it can fail! If keys are
 * still swapped when the client is resumed, the key lookups will
 * just block loading keys from disk. In practical terms this should only
 * happen with SORT BY command or if there is a bug in this function.
 *
 * Return 1 if the client is marked as blocked, 0 if the client can
 * continue as the keys it is going to access appear to be in memory. */
10470 static int blockClientOnSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
) {
10471 if (cmd
->vm_preload_proc
!= NULL
) {
10472 cmd
->vm_preload_proc(c
,cmd
,c
->argc
,c
->argv
);
10474 waitForMultipleSwappedKeys(c
,cmd
,c
->argc
,c
->argv
);
10477 /* If the client was blocked for at least one key, mark it as blocked. */
10478 if (listLength(c
->io_keys
)) {
10479 c
->flags
|= REDIS_IO_WAIT
;
10480 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
10481 server
.vm_blocked_clients
++;
10488 /* Remove the 'key' from the list of blocked keys for a given client.
10490 * The function returns 1 when there are no longer blocking keys after
10491 * the current one was removed (and the client can be unblocked). */
10492 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
) {
10496 struct dictEntry
*de
;
10498 /* Remove the key from the list of keys this client is waiting for. */
10499 listRewind(c
->io_keys
,&li
);
10500 while ((ln
= listNext(&li
)) != NULL
) {
10501 if (equalStringObjects(ln
->value
,key
)) {
10502 listDelNode(c
->io_keys
,ln
);
10506 assert(ln
!= NULL
);
/* Remove the client from the key => waiting clients map. */
10509 de
= dictFind(c
->db
->io_keys
,key
);
10510 assert(de
!= NULL
);
10511 l
= dictGetEntryVal(de
);
10512 ln
= listSearchKey(l
,c
);
10513 assert(ln
!= NULL
);
10515 if (listLength(l
) == 0)
10516 dictDelete(c
->db
->io_keys
,key
);
10518 return listLength(c
->io_keys
) == 0;
/* Every time we know a key was loaded back in memory, we handle clients
 * waiting for this key if any. */
10523 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
) {
10524 struct dictEntry
*de
;
10529 de
= dictFind(db
->io_keys
,key
);
10532 l
= dictGetEntryVal(de
);
10533 len
= listLength(l
);
10534 /* Note: we can't use something like while(listLength(l)) as the list
10535 * can be freed by the calling function when we remove the last element. */
10538 redisClient
*c
= ln
->value
;
10540 if (dontWaitForSwappedKey(c
,key
)) {
10541 /* Put the client in the list of clients ready to go as we
10542 * loaded all the keys about it. */
10543 listAddNodeTail(server
.io_ready_clients
,c
);
10548 /* =========================== Remote Configuration ========================= */
10550 static void configSetCommand(redisClient
*c
) {
10551 robj
*o
= getDecodedObject(c
->argv
[3]);
10554 if (!strcasecmp(c
->argv
[2]->ptr
,"dbfilename")) {
10555 zfree(server
.dbfilename
);
10556 server
.dbfilename
= zstrdup(o
->ptr
);
10557 } else if (!strcasecmp(c
->argv
[2]->ptr
,"requirepass")) {
10558 zfree(server
.requirepass
);
10559 server
.requirepass
= zstrdup(o
->ptr
);
10560 } else if (!strcasecmp(c
->argv
[2]->ptr
,"masterauth")) {
10561 zfree(server
.masterauth
);
10562 server
.masterauth
= zstrdup(o
->ptr
);
10563 } else if (!strcasecmp(c
->argv
[2]->ptr
,"maxmemory")) {
10564 if (getLongLongFromObject(o
,&ll
) == REDIS_ERR
||
10565 ll
< 0) goto badfmt
;
10566 server
.maxmemory
= ll
;
10567 } else if (!strcasecmp(c
->argv
[2]->ptr
,"timeout")) {
10568 if (getLongLongFromObject(o
,&ll
) == REDIS_ERR
||
10569 ll
< 0 || ll
> LONG_MAX
) goto badfmt
;
10570 server
.maxidletime
= ll
;
10571 } else if (!strcasecmp(c
->argv
[2]->ptr
,"appendfsync")) {
10572 if (!strcasecmp(o
->ptr
,"no")) {
10573 server
.appendfsync
= APPENDFSYNC_NO
;
10574 } else if (!strcasecmp(o
->ptr
,"everysec")) {
10575 server
.appendfsync
= APPENDFSYNC_EVERYSEC
;
10576 } else if (!strcasecmp(o
->ptr
,"always")) {
10577 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
10581 } else if (!strcasecmp(c
->argv
[2]->ptr
,"no-appendfsync-on-rewrite")) {
10582 int yn
= yesnotoi(o
->ptr
);
10584 if (yn
== -1) goto badfmt
;
10585 server
.no_appendfsync_on_rewrite
= yn
;
10586 } else if (!strcasecmp(c
->argv
[2]->ptr
,"appendonly")) {
10587 int old
= server
.appendonly
;
10588 int new = yesnotoi(o
->ptr
);
10590 if (new == -1) goto badfmt
;
10595 if (startAppendOnly() == REDIS_ERR
) {
10596 addReplySds(c
,sdscatprintf(sdsempty(),
10597 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10603 } else if (!strcasecmp(c
->argv
[2]->ptr
,"save")) {
10605 sds
*v
= sdssplitlen(o
->ptr
,sdslen(o
->ptr
)," ",1,&vlen
);
10607 /* Perform sanity check before setting the new config:
10608 * - Even number of args
10609 * - Seconds >= 1, changes >= 0 */
10611 sdsfreesplitres(v
,vlen
);
10614 for (j
= 0; j
< vlen
; j
++) {
10618 val
= strtoll(v
[j
], &eptr
, 10);
10619 if (eptr
[0] != '\0' ||
10620 ((j
& 1) == 0 && val
< 1) ||
10621 ((j
& 1) == 1 && val
< 0)) {
10622 sdsfreesplitres(v
,vlen
);
10626 /* Finally set the new config */
10627 resetServerSaveParams();
10628 for (j
= 0; j
< vlen
; j
+= 2) {
10632 seconds
= strtoll(v
[j
],NULL
,10);
10633 changes
= strtoll(v
[j
+1],NULL
,10);
10634 appendServerSaveParams(seconds
, changes
);
10636 sdsfreesplitres(v
,vlen
);
10638 addReplySds(c
,sdscatprintf(sdsempty(),
10639 "-ERR not supported CONFIG parameter %s\r\n",
10640 (char*)c
->argv
[2]->ptr
));
10645 addReply(c
,shared
.ok
);
10648 badfmt
: /* Bad format errors */
10649 addReplySds(c
,sdscatprintf(sdsempty(),
10650 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10652 (char*)c
->argv
[2]->ptr
));
10656 static void configGetCommand(redisClient
*c
) {
10657 robj
*o
= getDecodedObject(c
->argv
[2]);
10658 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
10659 char *pattern
= o
->ptr
;
10662 addReply(c
,lenobj
);
10663 decrRefCount(lenobj
);
10665 if (stringmatch(pattern
,"dbfilename",0)) {
10666 addReplyBulkCString(c
,"dbfilename");
10667 addReplyBulkCString(c
,server
.dbfilename
);
10670 if (stringmatch(pattern
,"requirepass",0)) {
10671 addReplyBulkCString(c
,"requirepass");
10672 addReplyBulkCString(c
,server
.requirepass
);
10675 if (stringmatch(pattern
,"masterauth",0)) {
10676 addReplyBulkCString(c
,"masterauth");
10677 addReplyBulkCString(c
,server
.masterauth
);
10680 if (stringmatch(pattern
,"maxmemory",0)) {
10683 ll2string(buf
,128,server
.maxmemory
);
10684 addReplyBulkCString(c
,"maxmemory");
10685 addReplyBulkCString(c
,buf
);
10688 if (stringmatch(pattern
,"timeout",0)) {
10691 ll2string(buf
,128,server
.maxidletime
);
10692 addReplyBulkCString(c
,"timeout");
10693 addReplyBulkCString(c
,buf
);
10696 if (stringmatch(pattern
,"appendonly",0)) {
10697 addReplyBulkCString(c
,"appendonly");
10698 addReplyBulkCString(c
,server
.appendonly
? "yes" : "no");
10701 if (stringmatch(pattern
,"no-appendfsync-on-rewrite",0)) {
10702 addReplyBulkCString(c
,"no-appendfsync-on-rewrite");
10703 addReplyBulkCString(c
,server
.no_appendfsync_on_rewrite
? "yes" : "no");
10706 if (stringmatch(pattern
,"appendfsync",0)) {
10709 switch(server
.appendfsync
) {
10710 case APPENDFSYNC_NO
: policy
= "no"; break;
10711 case APPENDFSYNC_EVERYSEC
: policy
= "everysec"; break;
10712 case APPENDFSYNC_ALWAYS
: policy
= "always"; break;
10713 default: policy
= "unknown"; break; /* too harmless to panic */
10715 addReplyBulkCString(c
,"appendfsync");
10716 addReplyBulkCString(c
,policy
);
10719 if (stringmatch(pattern
,"save",0)) {
10720 sds buf
= sdsempty();
10723 for (j
= 0; j
< server
.saveparamslen
; j
++) {
10724 buf
= sdscatprintf(buf
,"%ld %d",
10725 server
.saveparams
[j
].seconds
,
10726 server
.saveparams
[j
].changes
);
10727 if (j
!= server
.saveparamslen
-1)
10728 buf
= sdscatlen(buf
," ",1);
10730 addReplyBulkCString(c
,"save");
10731 addReplyBulkCString(c
,buf
);
10736 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%d\r\n",matches
*2);
10739 static void configCommand(redisClient
*c
) {
10740 if (!strcasecmp(c
->argv
[1]->ptr
,"set")) {
10741 if (c
->argc
!= 4) goto badarity
;
10742 configSetCommand(c
);
10743 } else if (!strcasecmp(c
->argv
[1]->ptr
,"get")) {
10744 if (c
->argc
!= 3) goto badarity
;
10745 configGetCommand(c
);
10746 } else if (!strcasecmp(c
->argv
[1]->ptr
,"resetstat")) {
10747 if (c
->argc
!= 2) goto badarity
;
10748 server
.stat_numcommands
= 0;
10749 server
.stat_numconnections
= 0;
10750 server
.stat_expiredkeys
= 0;
10751 server
.stat_starttime
= time(NULL
);
10752 addReply(c
,shared
.ok
);
10754 addReplySds(c
,sdscatprintf(sdsempty(),
10755 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10760 addReplySds(c
,sdscatprintf(sdsempty(),
10761 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10762 (char*) c
->argv
[1]->ptr
));
10765 /* =========================== Pubsub implementation ======================== */
10767 static void freePubsubPattern(void *p
) {
10768 pubsubPattern
*pat
= p
;
10770 decrRefCount(pat
->pattern
);
10774 static int listMatchPubsubPattern(void *a
, void *b
) {
10775 pubsubPattern
*pa
= a
, *pb
= b
;
10777 return (pa
->client
== pb
->client
) &&
10778 (equalStringObjects(pa
->pattern
,pb
->pattern
));
10781 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10782 * 0 if the client was already subscribed to that channel. */
10783 static int pubsubSubscribeChannel(redisClient
*c
, robj
*channel
) {
10784 struct dictEntry
*de
;
10785 list
*clients
= NULL
;
10788 /* Add the channel to the client -> channels hash table */
10789 if (dictAdd(c
->pubsub_channels
,channel
,NULL
) == DICT_OK
) {
10791 incrRefCount(channel
);
10792 /* Add the client to the channel -> list of clients hash table */
10793 de
= dictFind(server
.pubsub_channels
,channel
);
10795 clients
= listCreate();
10796 dictAdd(server
.pubsub_channels
,channel
,clients
);
10797 incrRefCount(channel
);
10799 clients
= dictGetEntryVal(de
);
10801 listAddNodeTail(clients
,c
);
10803 /* Notify the client */
10804 addReply(c
,shared
.mbulk3
);
10805 addReply(c
,shared
.subscribebulk
);
10806 addReplyBulk(c
,channel
);
10807 addReplyLongLong(c
,dictSize(c
->pubsub_channels
)+listLength(c
->pubsub_patterns
));
10811 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10812 * 0 if the client was not subscribed to the specified channel. */
10813 static int pubsubUnsubscribeChannel(redisClient
*c
, robj
*channel
, int notify
) {
10814 struct dictEntry
*de
;
10819 /* Remove the channel from the client -> channels hash table */
10820 incrRefCount(channel
); /* channel may be just a pointer to the same object
10821 we have in the hash tables. Protect it... */
10822 if (dictDelete(c
->pubsub_channels
,channel
) == DICT_OK
) {
10824 /* Remove the client from the channel -> clients list hash table */
10825 de
= dictFind(server
.pubsub_channels
,channel
);
10826 assert(de
!= NULL
);
10827 clients
= dictGetEntryVal(de
);
10828 ln
= listSearchKey(clients
,c
);
10829 assert(ln
!= NULL
);
10830 listDelNode(clients
,ln
);
10831 if (listLength(clients
) == 0) {
10832 /* Free the list and associated hash entry at all if this was
10833 * the latest client, so that it will be possible to abuse
10834 * Redis PUBSUB creating millions of channels. */
10835 dictDelete(server
.pubsub_channels
,channel
);
10838 /* Notify the client */
10840 addReply(c
,shared
.mbulk3
);
10841 addReply(c
,shared
.unsubscribebulk
);
10842 addReplyBulk(c
,channel
);
10843 addReplyLongLong(c
,dictSize(c
->pubsub_channels
)+
10844 listLength(c
->pubsub_patterns
));
10847 decrRefCount(channel
); /* it is finally safe to release it */
10851 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10852 static int pubsubSubscribePattern(redisClient
*c
, robj
*pattern
) {
10855 if (listSearchKey(c
->pubsub_patterns
,pattern
) == NULL
) {
10857 pubsubPattern
*pat
;
10858 listAddNodeTail(c
->pubsub_patterns
,pattern
);
10859 incrRefCount(pattern
);
10860 pat
= zmalloc(sizeof(*pat
));
10861 pat
->pattern
= getDecodedObject(pattern
);
10863 listAddNodeTail(server
.pubsub_patterns
,pat
);
10865 /* Notify the client */
10866 addReply(c
,shared
.mbulk3
);
10867 addReply(c
,shared
.psubscribebulk
);
10868 addReplyBulk(c
,pattern
);
10869 addReplyLongLong(c
,dictSize(c
->pubsub_channels
)+listLength(c
->pubsub_patterns
));
10873 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10874 * 0 if the client was not subscribed to the specified channel. */
10875 static int pubsubUnsubscribePattern(redisClient
*c
, robj
*pattern
, int notify
) {
10880 incrRefCount(pattern
); /* Protect the object. May be the same we remove */
10881 if ((ln
= listSearchKey(c
->pubsub_patterns
,pattern
)) != NULL
) {
10883 listDelNode(c
->pubsub_patterns
,ln
);
10885 pat
.pattern
= pattern
;
10886 ln
= listSearchKey(server
.pubsub_patterns
,&pat
);
10887 listDelNode(server
.pubsub_patterns
,ln
);
10889 /* Notify the client */
10891 addReply(c
,shared
.mbulk3
);
10892 addReply(c
,shared
.punsubscribebulk
);
10893 addReplyBulk(c
,pattern
);
10894 addReplyLongLong(c
,dictSize(c
->pubsub_channels
)+
10895 listLength(c
->pubsub_patterns
));
10897 decrRefCount(pattern
);
10901 /* Unsubscribe from all the channels. Return the number of channels the
10902 * client was subscribed from. */
10903 static int pubsubUnsubscribeAllChannels(redisClient
*c
, int notify
) {
10904 dictIterator
*di
= dictGetIterator(c
->pubsub_channels
);
10908 while((de
= dictNext(di
)) != NULL
) {
10909 robj
*channel
= dictGetEntryKey(de
);
10911 count
+= pubsubUnsubscribeChannel(c
,channel
,notify
);
10913 dictReleaseIterator(di
);
10917 /* Unsubscribe from all the patterns. Return the number of patterns the
10918 * client was subscribed from. */
10919 static int pubsubUnsubscribeAllPatterns(redisClient
*c
, int notify
) {
10924 listRewind(c
->pubsub_patterns
,&li
);
10925 while ((ln
= listNext(&li
)) != NULL
) {
10926 robj
*pattern
= ln
->value
;
10928 count
+= pubsubUnsubscribePattern(c
,pattern
,notify
);
10933 /* Publish a message */
10934 static int pubsubPublishMessage(robj
*channel
, robj
*message
) {
10936 struct dictEntry
*de
;
10940 /* Send to clients listening for that channel */
10941 de
= dictFind(server
.pubsub_channels
,channel
);
10943 list
*list
= dictGetEntryVal(de
);
10947 listRewind(list
,&li
);
10948 while ((ln
= listNext(&li
)) != NULL
) {
10949 redisClient
*c
= ln
->value
;
10951 addReply(c
,shared
.mbulk3
);
10952 addReply(c
,shared
.messagebulk
);
10953 addReplyBulk(c
,channel
);
10954 addReplyBulk(c
,message
);
10958 /* Send to clients listening to matching channels */
10959 if (listLength(server
.pubsub_patterns
)) {
10960 listRewind(server
.pubsub_patterns
,&li
);
10961 channel
= getDecodedObject(channel
);
10962 while ((ln
= listNext(&li
)) != NULL
) {
10963 pubsubPattern
*pat
= ln
->value
;
10965 if (stringmatchlen((char*)pat
->pattern
->ptr
,
10966 sdslen(pat
->pattern
->ptr
),
10967 (char*)channel
->ptr
,
10968 sdslen(channel
->ptr
),0)) {
10969 addReply(pat
->client
,shared
.mbulk4
);
10970 addReply(pat
->client
,shared
.pmessagebulk
);
10971 addReplyBulk(pat
->client
,pat
->pattern
);
10972 addReplyBulk(pat
->client
,channel
);
10973 addReplyBulk(pat
->client
,message
);
10977 decrRefCount(channel
);
10982 static void subscribeCommand(redisClient
*c
) {
10985 for (j
= 1; j
< c
->argc
; j
++)
10986 pubsubSubscribeChannel(c
,c
->argv
[j
]);
10989 static void unsubscribeCommand(redisClient
*c
) {
10990 if (c
->argc
== 1) {
10991 pubsubUnsubscribeAllChannels(c
,1);
10996 for (j
= 1; j
< c
->argc
; j
++)
10997 pubsubUnsubscribeChannel(c
,c
->argv
[j
],1);
11001 static void psubscribeCommand(redisClient
*c
) {
11004 for (j
= 1; j
< c
->argc
; j
++)
11005 pubsubSubscribePattern(c
,c
->argv
[j
]);
11008 static void punsubscribeCommand(redisClient
*c
) {
11009 if (c
->argc
== 1) {
11010 pubsubUnsubscribeAllPatterns(c
,1);
11015 for (j
= 1; j
< c
->argc
; j
++)
11016 pubsubUnsubscribePattern(c
,c
->argv
[j
],1);
11020 static void publishCommand(redisClient
*c
) {
11021 int receivers
= pubsubPublishMessage(c
->argv
[1],c
->argv
[2]);
11022 addReplyLongLong(c
,receivers
);
11025 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
11027 * The implementation uses a per-DB hash table mapping keys to list of clients
11028 * WATCHing those keys, so that given a key that is going to be modified
11029 * we can mark all the associated clients as dirty.
11031 * Also every client contains a list of WATCHed keys so that's possible to
11032 * un-watch such keys when the client is freed or when UNWATCH is called. */
11034 /* In the client->watched_keys list we need to use watchedKey structures
11035 * as in order to identify a key in Redis we need both the key name and the
11037 typedef struct watchedKey
{
11042 /* Watch for the specified key */
11043 static void watchForKey(redisClient
*c
, robj
*key
) {
11044 list
*clients
= NULL
;
11049 /* Check if we are already watching for this key */
11050 listRewind(c
->watched_keys
,&li
);
11051 while((ln
= listNext(&li
))) {
11052 wk
= listNodeValue(ln
);
11053 if (wk
->db
== c
->db
&& equalStringObjects(key
,wk
->key
))
11054 return; /* Key already watched */
11056 /* This key is not already watched in this DB. Let's add it */
11057 clients
= dictFetchValue(c
->db
->watched_keys
,key
);
11059 clients
= listCreate();
11060 dictAdd(c
->db
->watched_keys
,key
,clients
);
11063 listAddNodeTail(clients
,c
);
11064 /* Add the new key to the lits of keys watched by this client */
11065 wk
= zmalloc(sizeof(*wk
));
11069 listAddNodeTail(c
->watched_keys
,wk
);
11072 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
11073 * flag is up to the caller. */
11074 static void unwatchAllKeys(redisClient
*c
) {
11078 if (listLength(c
->watched_keys
) == 0) return;
11079 listRewind(c
->watched_keys
,&li
);
11080 while((ln
= listNext(&li
))) {
11084 /* Lookup the watched key -> clients list and remove the client
11086 wk
= listNodeValue(ln
);
11087 clients
= dictFetchValue(wk
->db
->watched_keys
, wk
->key
);
11088 assert(clients
!= NULL
);
11089 listDelNode(clients
,listSearchKey(clients
,c
));
11090 /* Kill the entry at all if this was the only client */
11091 if (listLength(clients
) == 0)
11092 dictDelete(wk
->db
->watched_keys
, wk
->key
);
11093 /* Remove this watched key from the client->watched list */
11094 listDelNode(c
->watched_keys
,ln
);
11095 decrRefCount(wk
->key
);
11100 /* "Touch" a key, so that if this key is being WATCHed by some client the
11101 * next EXEC will fail. */
11102 static void touchWatchedKey(redisDb
*db
, robj
*key
) {
11107 if (dictSize(db
->watched_keys
) == 0) return;
11108 clients
= dictFetchValue(db
->watched_keys
, key
);
11109 if (!clients
) return;
11111 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
11112 /* Check if we are already watching for this key */
11113 listRewind(clients
,&li
);
11114 while((ln
= listNext(&li
))) {
11115 redisClient
*c
= listNodeValue(ln
);
11117 c
->flags
|= REDIS_DIRTY_CAS
;
11121 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
11122 * flush but will be deleted as effect of the flushing operation should
11123 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
11124 * a FLUSHALL operation (all the DBs flushed). */
11125 static void touchWatchedKeysOnFlush(int dbid
) {
11129 /* For every client, check all the waited keys */
11130 listRewind(server
.clients
,&li1
);
11131 while((ln
= listNext(&li1
))) {
11132 redisClient
*c
= listNodeValue(ln
);
11133 listRewind(c
->watched_keys
,&li2
);
11134 while((ln
= listNext(&li2
))) {
11135 watchedKey
*wk
= listNodeValue(ln
);
11137 /* For every watched key matching the specified DB, if the
11138 * key exists, mark the client as dirty, as the key will be
11140 if (dbid
== -1 || wk
->db
->id
== dbid
) {
11141 if (dictFind(wk
->db
->dict
, wk
->key
->ptr
) != NULL
)
11142 c
->flags
|= REDIS_DIRTY_CAS
;
11148 static void watchCommand(redisClient
*c
) {
11151 if (c
->flags
& REDIS_MULTI
) {
11152 addReplySds(c
,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11155 for (j
= 1; j
< c
->argc
; j
++)
11156 watchForKey(c
,c
->argv
[j
]);
11157 addReply(c
,shared
.ok
);
11160 static void unwatchCommand(redisClient
*c
) {
11162 c
->flags
&= (~REDIS_DIRTY_CAS
);
11163 addReply(c
,shared
.ok
);
11166 /* ================================= Debugging ============================== */
11168 /* Compute the sha1 of string at 's' with 'len' bytes long.
11169 * The SHA1 is then xored againt the string pointed by digest.
11170 * Since xor is commutative, this operation is used in order to
11171 * "add" digests relative to unordered elements.
11173 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11174 static void xorDigest(unsigned char *digest
, void *ptr
, size_t len
) {
11176 unsigned char hash
[20], *s
= ptr
;
11180 SHA1Update(&ctx
,s
,len
);
11181 SHA1Final(hash
,&ctx
);
11183 for (j
= 0; j
< 20; j
++)
11184 digest
[j
] ^= hash
[j
];
11187 static void xorObjectDigest(unsigned char *digest
, robj
*o
) {
11188 o
= getDecodedObject(o
);
11189 xorDigest(digest
,o
->ptr
,sdslen(o
->ptr
));
11193 /* This function instead of just computing the SHA1 and xoring it
11194 * against diget, also perform the digest of "digest" itself and
11195 * replace the old value with the new one.
11197 * So the final digest will be:
11199 * digest = SHA1(digest xor SHA1(data))
11201 * This function is used every time we want to preserve the order so
11202 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11204 * Also note that mixdigest("foo") followed by mixdigest("bar")
11205 * will lead to a different digest compared to "fo", "obar".
11207 static void mixDigest(unsigned char *digest
, void *ptr
, size_t len
) {
11211 xorDigest(digest
,s
,len
);
11213 SHA1Update(&ctx
,digest
,20);
11214 SHA1Final(digest
,&ctx
);
11217 static void mixObjectDigest(unsigned char *digest
, robj
*o
) {
11218 o
= getDecodedObject(o
);
11219 mixDigest(digest
,o
->ptr
,sdslen(o
->ptr
));
11223 /* Compute the dataset digest. Since keys, sets elements, hashes elements
11224 * are not ordered, we use a trick: every aggregate digest is the xor
11225 * of the digests of their elements. This way the order will not change
11226 * the result. For list instead we use a feedback entering the output digest
11227 * as input in order to ensure that a different ordered list will result in
11228 * a different digest. */
11229 static void computeDatasetDigest(unsigned char *final
) {
11230 unsigned char digest
[20];
11232 dictIterator
*di
= NULL
;
11237 memset(final
,0,20); /* Start with a clean result */
11239 for (j
= 0; j
< server
.dbnum
; j
++) {
11240 redisDb
*db
= server
.db
+j
;
11242 if (dictSize(db
->dict
) == 0) continue;
11243 di
= dictGetIterator(db
->dict
);
11245 /* hash the DB id, so the same dataset moved in a different
11246 * DB will lead to a different digest */
11248 mixDigest(final
,&aux
,sizeof(aux
));
11250 /* Iterate this DB writing every entry */
11251 while((de
= dictNext(di
)) != NULL
) {
11256 memset(digest
,0,20); /* This key-val digest */
11257 key
= dictGetEntryKey(de
);
11258 keyobj
= createStringObject(key
,sdslen(key
));
11260 mixDigest(digest
,key
,sdslen(key
));
11262 /* Make sure the key is loaded if VM is active */
11263 o
= lookupKeyRead(db
,keyobj
);
11265 aux
= htonl(o
->type
);
11266 mixDigest(digest
,&aux
,sizeof(aux
));
11267 expiretime
= getExpire(db
,keyobj
);
11269 /* Save the key and associated value */
11270 if (o
->type
== REDIS_STRING
) {
11271 mixObjectDigest(digest
,o
);
11272 } else if (o
->type
== REDIS_LIST
) {
11273 listTypeIterator
*li
= listTypeInitIterator(o
,0,REDIS_TAIL
);
11274 listTypeEntry entry
;
11275 while(listTypeNext(li
,&entry
)) {
11276 robj
*eleobj
= listTypeGet(&entry
);
11277 mixObjectDigest(digest
,eleobj
);
11278 decrRefCount(eleobj
);
11280 listTypeReleaseIterator(li
);
11281 } else if (o
->type
== REDIS_SET
) {
11282 dict
*set
= o
->ptr
;
11283 dictIterator
*di
= dictGetIterator(set
);
11286 while((de
= dictNext(di
)) != NULL
) {
11287 robj
*eleobj
= dictGetEntryKey(de
);
11289 xorObjectDigest(digest
,eleobj
);
11291 dictReleaseIterator(di
);
11292 } else if (o
->type
== REDIS_ZSET
) {
11294 dictIterator
*di
= dictGetIterator(zs
->dict
);
11297 while((de
= dictNext(di
)) != NULL
) {
11298 robj
*eleobj
= dictGetEntryKey(de
);
11299 double *score
= dictGetEntryVal(de
);
11300 unsigned char eledigest
[20];
11302 snprintf(buf
,sizeof(buf
),"%.17g",*score
);
11303 memset(eledigest
,0,20);
11304 mixObjectDigest(eledigest
,eleobj
);
11305 mixDigest(eledigest
,buf
,strlen(buf
));
11306 xorDigest(digest
,eledigest
,20);
11308 dictReleaseIterator(di
);
11309 } else if (o
->type
== REDIS_HASH
) {
11310 hashTypeIterator
*hi
;
11313 hi
= hashTypeInitIterator(o
);
11314 while (hashTypeNext(hi
) != REDIS_ERR
) {
11315 unsigned char eledigest
[20];
11317 memset(eledigest
,0,20);
11318 obj
= hashTypeCurrent(hi
,REDIS_HASH_KEY
);
11319 mixObjectDigest(eledigest
,obj
);
11321 obj
= hashTypeCurrent(hi
,REDIS_HASH_VALUE
);
11322 mixObjectDigest(eledigest
,obj
);
11324 xorDigest(digest
,eledigest
,20);
11326 hashTypeReleaseIterator(hi
);
11328 redisPanic("Unknown object type");
11330 /* If the key has an expire, add it to the mix */
11331 if (expiretime
!= -1) xorDigest(digest
,"!!expire!!",10);
11332 /* We can finally xor the key-val digest to the final digest */
11333 xorDigest(final
,digest
,20);
11334 decrRefCount(keyobj
);
11336 dictReleaseIterator(di
);
11340 static void debugCommand(redisClient
*c
) {
11341 if (!strcasecmp(c
->argv
[1]->ptr
,"segfault")) {
11342 *((char*)-1) = 'x';
11343 } else if (!strcasecmp(c
->argv
[1]->ptr
,"reload")) {
11344 if (rdbSave(server
.dbfilename
) != REDIS_OK
) {
11345 addReply(c
,shared
.err
);
11349 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
11350 addReply(c
,shared
.err
);
11353 redisLog(REDIS_WARNING
,"DB reloaded by DEBUG RELOAD");
11354 addReply(c
,shared
.ok
);
11355 } else if (!strcasecmp(c
->argv
[1]->ptr
,"loadaof")) {
11357 if (loadAppendOnlyFile(server
.appendfilename
) != REDIS_OK
) {
11358 addReply(c
,shared
.err
);
11361 redisLog(REDIS_WARNING
,"Append Only File loaded by DEBUG LOADAOF");
11362 addReply(c
,shared
.ok
);
11363 } else if (!strcasecmp(c
->argv
[1]->ptr
,"object") && c
->argc
== 3) {
11364 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]->ptr
);
11368 addReply(c
,shared
.nokeyerr
);
11371 val
= dictGetEntryVal(de
);
11372 if (!server
.vm_enabled
|| (val
->storage
== REDIS_VM_MEMORY
||
11373 val
->storage
== REDIS_VM_SWAPPING
)) {
11377 if (val
->encoding
< (sizeof(strencoding
)/sizeof(char*))) {
11378 strenc
= strencoding
[val
->encoding
];
11380 snprintf(buf
,64,"unknown encoding %d\n", val
->encoding
);
11383 addReplySds(c
,sdscatprintf(sdsempty(),
11384 "+Value at:%p refcount:%d "
11385 "encoding:%s serializedlength:%lld\r\n",
11386 (void*)val
, val
->refcount
,
11387 strenc
, (long long) rdbSavedObjectLen(val
,NULL
)));
11389 vmpointer
*vp
= (vmpointer
*) val
;
11390 addReplySds(c
,sdscatprintf(sdsempty(),
11391 "+Value swapped at: page %llu "
11392 "using %llu pages\r\n",
11393 (unsigned long long) vp
->page
,
11394 (unsigned long long) vp
->usedpages
));
11396 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapin") && c
->argc
== 3) {
11397 lookupKeyRead(c
->db
,c
->argv
[2]);
11398 addReply(c
,shared
.ok
);
11399 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapout") && c
->argc
== 3) {
11400 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]->ptr
);
11404 if (!server
.vm_enabled
) {
11405 addReplySds(c
,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11409 addReply(c
,shared
.nokeyerr
);
11412 val
= dictGetEntryVal(de
);
11414 if (val
->storage
!= REDIS_VM_MEMORY
) {
11415 addReplySds(c
,sdsnew("-ERR This key is not in memory\r\n"));
11416 } else if (val
->refcount
!= 1) {
11417 addReplySds(c
,sdsnew("-ERR Object is shared\r\n"));
11418 } else if ((vp
= vmSwapObjectBlocking(val
)) != NULL
) {
11419 dictGetEntryVal(de
) = vp
;
11420 addReply(c
,shared
.ok
);
11422 addReply(c
,shared
.err
);
11424 } else if (!strcasecmp(c
->argv
[1]->ptr
,"populate") && c
->argc
== 3) {
11429 if (getLongFromObjectOrReply(c
, c
->argv
[2], &keys
, NULL
) != REDIS_OK
)
11431 for (j
= 0; j
< keys
; j
++) {
11432 snprintf(buf
,sizeof(buf
),"key:%lu",j
);
11433 key
= createStringObject(buf
,strlen(buf
));
11434 if (lookupKeyRead(c
->db
,key
) != NULL
) {
11438 snprintf(buf
,sizeof(buf
),"value:%lu",j
);
11439 val
= createStringObject(buf
,strlen(buf
));
11440 dbAdd(c
->db
,key
,val
);
11443 addReply(c
,shared
.ok
);
11444 } else if (!strcasecmp(c
->argv
[1]->ptr
,"digest") && c
->argc
== 2) {
11445 unsigned char digest
[20];
11446 sds d
= sdsnew("+");
11449 computeDatasetDigest(digest
);
11450 for (j
= 0; j
< 20; j
++)
11451 d
= sdscatprintf(d
, "%02x",digest
[j
]);
11453 d
= sdscatlen(d
,"\r\n",2);
11456 addReplySds(c
,sdsnew(
11457 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
11461 static void _redisAssert(char *estr
, char *file
, int line
) {
11462 redisLog(REDIS_WARNING
,"=== ASSERTION FAILED ===");
11463 redisLog(REDIS_WARNING
,"==> %s:%d '%s' is not true",file
,line
,estr
);
11464 #ifdef HAVE_BACKTRACE
11465 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
11466 *((char*)-1) = 'x';
11470 static void _redisPanic(char *msg
, char *file
, int line
) {
11471 redisLog(REDIS_WARNING
,"!!! Software Failure. Press left mouse button to continue");
11472 redisLog(REDIS_WARNING
,"Guru Meditation: %s #%s:%d",msg
,file
,line
);
11473 #ifdef HAVE_BACKTRACE
11474 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
11475 *((char*)-1) = 'x';
11479 /* =================================== Main! ================================ */
/* Read the integer stored in /proc/sys/vm/overcommit_memory.
 *
 * Returns the parsed value, or -1 if the file cannot be opened, cannot be
 * read, or does not start with a number. Unlike atoi(), strtol() lets us
 * tell "0" apart from unparsable content, so garbage is never reported as
 * the value 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits at all */
    return (int) value;
}
11496 void linuxOvercommitMemoryWarning(void) {
11497 if (linuxOvercommitMemoryValue() == 0) {
11498 redisLog(REDIS_WARNING
,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
11501 #endif /* __linux__ */
11503 static void daemonize(void) {
11507 if (fork() != 0) exit(0); /* parent exits */
11508 setsid(); /* create a new session */
11510 /* Every output goes to /dev/null. If Redis is daemonized but
11511 * the 'logfile' is set to 'stdout' in the configuration file
11512 * it will not log at all. */
11513 if ((fd
= open("/dev/null", O_RDWR
, 0)) != -1) {
11514 dup2(fd
, STDIN_FILENO
);
11515 dup2(fd
, STDOUT_FILENO
);
11516 dup2(fd
, STDERR_FILENO
);
11517 if (fd
> STDERR_FILENO
) close(fd
);
11519 /* Try to write the pid file */
11520 fp
= fopen(server
.pidfile
,"w");
11522 fprintf(fp
,"%d\n",getpid());
11527 static void version() {
11528 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION
,
11529 REDIS_GIT_SHA1
, atoi(REDIS_GIT_DIRTY
) > 0);
/* Print the command line synopsis to stderr, then terminate the process
 * with status 1. */
static void usage(void) {
    fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
    fprintf(stderr," ./redis-server - (read config from stdin)\n");
    exit(1);
}
11539 int main(int argc
, char **argv
) {
11542 initServerConfig();
11543 sortCommandTable();
11545 if (strcmp(argv
[1], "-v") == 0 ||
11546 strcmp(argv
[1], "--version") == 0) version();
11547 if (strcmp(argv
[1], "--help") == 0) usage();
11548 resetServerSaveParams();
11549 loadServerConfig(argv
[1]);
11550 } else if ((argc
> 2)) {
11553 redisLog(REDIS_WARNING
,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11555 if (server
.daemonize
) daemonize();
11557 redisLog(REDIS_NOTICE
,"Server started, Redis version " REDIS_VERSION
);
11559 linuxOvercommitMemoryWarning();
11561 start
= time(NULL
);
11562 if (server
.appendonly
) {
11563 if (loadAppendOnlyFile(server
.appendfilename
) == REDIS_OK
)
11564 redisLog(REDIS_NOTICE
,"DB loaded from append only file: %ld seconds",time(NULL
)-start
);
11566 if (rdbLoad(server
.dbfilename
) == REDIS_OK
)
11567 redisLog(REDIS_NOTICE
,"DB loaded from disk: %ld seconds",time(NULL
)-start
);
11569 redisLog(REDIS_NOTICE
,"The server is now ready to accept connections on port %d", server
.port
);
11570 aeSetBeforeSleepProc(server
.el
,beforeSleep
);
11572 aeDeleteEventLoop(server
.el
);
11576 /* ============================= Backtrace support ========================= */
11578 #ifdef HAVE_BACKTRACE
11579 static char *findFuncName(void *pointer
, unsigned long *offset
);
/* Return, as a void pointer, a platform-specific field read from the
 * ucontext 'uc'; which field is chosen at compile time by the preprocessor
 * branches below (FreeBSD, dietlibc, Apple, x86 Linux, IA64 Linux).
 * NOTE(review): judging by the field names (mc_eip, __rip, __eip, REG_EIP,
 * sc_ip) this is presumably the saved instruction pointer -- confirm.
 * NOTE(review): the x86 Linux branch indexes gregs[] with REG_EIP for both
 * 32 and 64 bit builds -- verify that REG_EIP is defined on x86_64. */
11581 static void *getMcontextEip(ucontext_t
*uc
) {
11582 #if defined(__FreeBSD__)
11583 return (void*) uc
->uc_mcontext
.mc_eip
;
11584 #elif defined(__dietlibc__)
11585 return (void*) uc
->uc_mcontext
.eip
;
11586 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11588 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
11590 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
11592 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11593 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11594 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
11596 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
11598 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11599 return (void*) uc
->uc_mcontext
.gregs
[REG_EIP
]; /* Linux 32/64 bit */
11600 #elif defined(__ia64__) /* Linux IA64 */
11601 return (void*) uc
->uc_mcontext
.sc_ip
;
11607 static void segvHandler(int sig
, siginfo_t
*info
, void *secret
) {
11609 char **messages
= NULL
;
11610 int i
, trace_size
= 0;
11611 unsigned long offset
=0;
11612 ucontext_t
*uc
= (ucontext_t
*) secret
;
11614 REDIS_NOTUSED(info
);
11616 redisLog(REDIS_WARNING
,
11617 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION
, sig
);
11618 infostring
= genRedisInfoString();
11619 redisLog(REDIS_WARNING
, "%s",infostring
);
11620 /* It's not safe to sdsfree() the returned string under memory
11621 * corruption conditions. Let it leak as we are going to abort */
11623 trace_size
= backtrace(trace
, 100);
11624 /* overwrite sigaction with caller's address */
11625 if (getMcontextEip(uc
) != NULL
) {
11626 trace
[1] = getMcontextEip(uc
);
11628 messages
= backtrace_symbols(trace
, trace_size
);
11630 for (i
=1; i
<trace_size
; ++i
) {
11631 char *fn
= findFuncName(trace
[i
], &offset
), *p
;
11633 p
= strchr(messages
[i
],'+');
11634 if (!fn
|| (p
&& ((unsigned long)strtol(p
+1,NULL
,10)) < offset
)) {
11635 redisLog(REDIS_WARNING
,"%s", messages
[i
]);
11637 redisLog(REDIS_WARNING
,"%d redis-server %p %s + %d", i
, trace
[i
], fn
, (unsigned int)offset
);
11640 /* free(messages); Don't call free() with possibly corrupted memory. */
11644 static void sigtermHandler(int sig
) {
11645 REDIS_NOTUSED(sig
);
11647 redisLog(REDIS_WARNING
,"SIGTERM received, scheduling shutting down...");
11648 server
.shutdown_asap
= 1;
11651 static void setupSigSegvAction(void) {
11652 struct sigaction act
;
11654 sigemptyset (&act
.sa_mask
);
11655 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11656 * is used. Otherwise, sa_handler is used */
11657 act
.sa_flags
= SA_NODEFER
| SA_ONSTACK
| SA_RESETHAND
| SA_SIGINFO
;
11658 act
.sa_sigaction
= segvHandler
;
11659 sigaction (SIGSEGV
, &act
, NULL
);
11660 sigaction (SIGBUS
, &act
, NULL
);
11661 sigaction (SIGFPE
, &act
, NULL
);
11662 sigaction (SIGILL
, &act
, NULL
);
11663 sigaction (SIGBUS
, &act
, NULL
);
11665 act
.sa_flags
= SA_NODEFER
| SA_ONSTACK
| SA_RESETHAND
;
11666 act
.sa_handler
= sigtermHandler
;
11667 sigaction (SIGTERM
, &act
, NULL
);
11671 #include "staticsymbols.h"
11672 /* This function try to convert a pointer into a function name. It's used in
11673 * oreder to provide a backtrace under segmentation fault that's able to
11674 * display functions declared as static (otherwise the backtrace is useless). */
11675 static char *findFuncName(void *pointer
, unsigned long *offset
){
11677 unsigned long off
, minoff
= 0;
11679 /* Try to match against the Symbol with the smallest offset */
11680 for (i
=0; symsTable
[i
].pointer
; i
++) {
11681 unsigned long lp
= (unsigned long) pointer
;
11683 if (lp
!= (unsigned long)-1 && lp
>= symsTable
[i
].pointer
) {
11684 off
=lp
-symsTable
[i
].pointer
;
11685 if (ret
< 0 || off
< minoff
) {
11691 if (ret
== -1) return NULL
;
11693 return symsTable
[ret
].name
;
11695 #else /* HAVE_BACKTRACE */
11696 static void setupSigSegvAction(void) {
11698 #endif /* HAVE_BACKTRACE */