2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
30 #define REDIS_VERSION "1.3.10"
45 #endif /* HAVE_BACKTRACE */
53 #include <arpa/inet.h>
57 #include <sys/resource.h>
64 #include "solarisfixes.h"
68 #include "ae.h" /* Event driven programming library */
69 #include "sds.h" /* Dynamic safe strings */
70 #include "anet.h" /* Networking the easy way */
71 #include "dict.h" /* Hash tables */
72 #include "adlist.h" /* Linked lists */
73 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
74 #include "lzf.h" /* LZF compression library */
75 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
82 /* Static server configuration */
83 #define REDIS_SERVERPORT 6379 /* TCP port */
84 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
85 #define REDIS_IOBUF_LEN 1024
86 #define REDIS_LOADBUF_LEN 1024
87 #define REDIS_STATIC_ARGS 8
88 #define REDIS_DEFAULT_DBNUM 16
89 #define REDIS_CONFIGLINE_MAX 1024
90 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
91 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
92 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
93 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
94 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
96 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
97 #define REDIS_WRITEV_THRESHOLD 3
98 /* Max number of iovecs used for each writev call */
99 #define REDIS_WRITEV_IOVEC_COUNT 256
101 /* Hash table parameters */
102 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105 #define REDIS_CMD_BULK 1 /* Bulk write command */
106 #define REDIS_CMD_INLINE 2 /* Inline command */
107 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
108 this flag will return an error when the 'maxmemory' option is set in the
109 config file and the server is using more than maxmemory bytes of memory.
110 In short these commands are denied on low memory conditions. */
111 #define REDIS_CMD_DENYOOM 4
112 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115 #define REDIS_STRING 0
121 /* Objects encoding. Some kinds of objects like Strings and Hashes can be
122 * internally represented in multiple ways. The 'encoding' field of the object
123 * is set to one of these values for this object. */
124 #define REDIS_ENCODING_RAW 0 /* Raw representation */
125 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
126 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
127 #define REDIS_ENCODING_HT 3 /* Encoded as a hash table */
129 static char* strencoding
[] = {
130 "raw", "int", "zipmap", "hashtable"
133 /* Object types only used for dumping to disk */
134 #define REDIS_EXPIRETIME 253
135 #define REDIS_SELECTDB 254
136 #define REDIS_EOF 255
138 /* Defines related to the dump file format. To store 32 bits lengths for short
139 * keys requires a lot of space, so we check the most significant 2 bits of
140 * the first byte to interpret the length:
142 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
143 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
144 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
145 * 11|000000 this means: specially encoded object will follow. The six bits
146 * number specify the kind of object that follows.
147 * See the REDIS_RDB_ENC_* defines.
149 * Lengths up to 63 are stored using a single byte, most DB keys, and many
150 * values, will fit inside. */
151 #define REDIS_RDB_6BITLEN 0
152 #define REDIS_RDB_14BITLEN 1
153 #define REDIS_RDB_32BITLEN 2
154 #define REDIS_RDB_ENCVAL 3
155 #define REDIS_RDB_LENERR UINT_MAX
157 /* When a length of a string object stored on disk has the first two bits
158 * set, the remaining six bits specify a special encoding for the object
159 * according to the following defines: */
160 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
161 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
162 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
163 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
165 /* Virtual memory object->where field. */
166 #define REDIS_VM_MEMORY 0 /* The object is on memory */
167 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
168 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
169 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
171 /* Virtual memory static configuration stuff.
172 * Check vmFindContiguousPages() to know more about these magic numbers. */
173 #define REDIS_VM_MAX_NEAR_PAGES 65536
174 #define REDIS_VM_MAX_RANDOM_JUMP 4096
175 #define REDIS_VM_MAX_THREADS 32
176 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
177 /* The following is the *percentage* of completed I/O jobs to process when the
178 * handler is called. While Virtual Memory I/O operations are performed by
179 * threads, these operations must be processed by the main thread when completed
180 * in order to take effect. */
181 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184 #define REDIS_SLAVE 1 /* This client is a slave server */
185 #define REDIS_MASTER 2 /* This client is a master server */
186 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
187 #define REDIS_MULTI 8 /* This client is in a MULTI context */
188 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
189 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
191 /* Slave replication state - slave side */
192 #define REDIS_REPL_NONE 0 /* No active replication */
193 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
194 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
196 /* Slave replication state - from the point of view of master
197 * Note that in SEND_BULK and ONLINE state the slave receives new updates
198 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
199 * to start the next background saving in order to send updates to it. */
200 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
201 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
202 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
203 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205 /* List related stuff */
209 /* Sort operations */
210 #define REDIS_SORT_GET 0
211 #define REDIS_SORT_ASC 1
212 #define REDIS_SORT_DESC 2
213 #define REDIS_SORTKEY_MAX 1024
216 #define REDIS_DEBUG 0
217 #define REDIS_VERBOSE 1
218 #define REDIS_NOTICE 2
219 #define REDIS_WARNING 3
221 /* Anti-warning macro... */
222 #define REDIS_NOTUSED(V) ((void) V)
224 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
225 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
227 /* Append only defines */
228 #define APPENDFSYNC_NO 0
229 #define APPENDFSYNC_ALWAYS 1
230 #define APPENDFSYNC_EVERYSEC 2
232 /* Hashes related defaults */
233 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
234 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236 /* We can print the stacktrace, so our assert is defined this way:
 *
 * redisAssert(_e) evaluates _e; when it is false it calls _redisAssert() with
 * the stringified expression, the current file and line, then terminates the
 * process with _exit(1). When _e is true it expands to (void)0.
 *
 * redisPanic(_e) unconditionally calls _redisPanic() with the stringified
 * argument, the current file and line, then terminates with _exit(1).
 *
 * Both expansions are fully parenthesized so that the comma expression cannot
 * be split by a neighbouring operator at the call site (for instance when
 * the macro is used as an operand of ?: or inside a larger expression). */
237 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
238 #define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
239 static void _redisAssert(char *estr, char *file, int line);
240 static void _redisPanic(char *msg, char *file, int line);
242 /*================================= Data types ============================== */
244 /* A redis object, that is a type able to hold a string / list / set */
246 /* The VM object structure */
247 struct redisObjectVM
{
248 off_t page
; /* the page at which the object is stored on disk */
249 off_t usedpages
; /* number of pages used on disk */
250 time_t atime
; /* Last access time */
253 /* The actual Redis Object */
254 typedef struct redisObject
{
257 unsigned char encoding
;
258 unsigned char storage
; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype
; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
263 /* VM fields, these are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm
;
270 /* Macro used to initialize a Redis object allocated on the stack.
271 * Note that this macro is kept near the structure definition to make sure
272 * we'll update it when the structure is changed, to avoid bugs like
273 * bug #85 introduced exactly in this way. */
274 #define initStaticStringObject(_var,_ptr) do { \
276 _var.type = REDIS_STRING; \
277 _var.encoding = REDIS_ENCODING_RAW; \
279 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
282 typedef struct redisDb
{
283 dict
*dict
; /* The keyspace for this DB */
284 dict
*expires
; /* Timeout of keys with a timeout set */
285 dict
*blockingkeys
; /* Keys with clients waiting for data (BLPOP) */
286 dict
*io_keys
; /* Keys with clients waiting for VM I/O */
290 /* Client MULTI/EXEC state */
291 typedef struct multiCmd
{
294 struct redisCommand
*cmd
;
297 typedef struct multiState
{
298 multiCmd
*commands
; /* Array of MULTI commands */
299 int count
; /* Total number of MULTI commands */
302 /* With multiplexing we need to keep per-client state.
303 * Clients are kept in a linked list. */
304 typedef struct redisClient
{
309 robj
**argv
, **mbargv
;
311 int bulklen
; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk
; /* multi bulk command format active */
315 time_t lastinteraction
; /* time of the last interaction, used for timeout */
316 int flags
; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb
; /* slave selected db, if this client is a slave */
318 int authenticated
; /* when requirepass is non-NULL */
319 int replstate
; /* replication state if this is a slave */
320 int repldbfd
; /* replication DB file descriptor */
321 long repldboff
; /* replication DB file offset */
322 off_t repldbsize
; /* replication DB file size */
323 multiState mstate
; /* MULTI/EXEC state */
324 robj
**blockingkeys
; /* The key we are waiting to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum
; /* Number of blocking keys */
327 time_t blockingto
; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list
*io_keys
; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict
*pubsub_channels
; /* channels a client is interested in (SUBSCRIBE) */
332 list
*pubsub_patterns
; /* patterns a client is interested in (SUBSCRIBE) */
340 /* Global server state structure */
345 long long dirty
; /* changes to DB from the last save */
347 list
*slaves
, *monitors
;
348 char neterr
[ANET_ERR_LEN
];
350 int cronloops
; /* number of times the cron function run */
351 list
*objfreelist
; /* A list of freed objects to avoid malloc() */
352 time_t lastsave
; /* Unix time of the last successful save */
353 /* Fields used only for stats */
354 time_t stat_starttime
; /* server start time */
355 long long stat_numcommands
; /* number of processed commands */
356 long long stat_numconnections
; /* number of connections received */
357 long long stat_expiredkeys
; /* number of expired keys */
370 pid_t bgsavechildpid
;
371 pid_t bgrewritechildpid
;
372 sds bgrewritebuf
; /* buffer taken by parent during append only rewrite */
373 sds aofbuf
; /* AOF buffer, written before entering the event loop */
374 struct saveparam
*saveparams
;
379 char *appendfilename
;
383 /* Replication related */
388 redisClient
*master
; /* client that is master for this slave */
390 unsigned int maxclients
;
391 unsigned long long maxmemory
;
392 unsigned int blpop_blocked_clients
;
393 unsigned int vm_blocked_clients
;
394 /* Sort parameters - qsort_r() is only available under BSD so we
395 * have to take this state global, in order to pass it to sortCompare() */
399 /* Virtual memory configuration */
404 unsigned long long vm_max_memory
;
406 size_t hash_max_zipmap_entries
;
407 size_t hash_max_zipmap_value
;
408 /* Virtual memory state */
411 off_t vm_next_page
; /* Next probably empty page */
412 off_t vm_near_pages
; /* Number of pages allocated sequentially */
413 unsigned char *vm_bitmap
; /* Bitmap of free/used pages */
414 time_t unixtime
; /* Unix time sampled every second. */
415 /* Virtual memory I/O threads stuff */
416 /* An I/O thread processes an element taken from the io_newjobs queue and
417 * puts the result of the operation in the io_processed list. While the
418 * job is being processed, it's put on the io_processing queue. */
419 list
*io_newjobs
; /* List of VM I/O jobs yet to be processed */
420 list
*io_processing
; /* List of VM I/O jobs being processed */
421 list
*io_processed
; /* List of VM I/O jobs already processed */
422 list
*io_ready_clients
; /* Clients ready to be unblocked. All keys loaded */
423 pthread_mutex_t io_mutex
; /* lock to access io_jobs/io_done/io_thread_job */
424 pthread_mutex_t obj_freelist_mutex
; /* safe redis objects creation/free */
425 pthread_mutex_t io_swapfile_mutex
; /* So we can lseek + write */
426 pthread_attr_t io_threads_attr
; /* attributes for threads creation */
427 int io_active_threads
; /* Number of running I/O threads */
428 int vm_max_threads
; /* Max number of I/O threads running at the same time */
429 /* Our main thread is blocked on the event loop, looking for sockets ready
430 * to be read or written, so when a threaded I/O operation is ready to be
431 * processed by the main thread, the I/O thread will use a unix pipe to
432 * awake the main thread. The following are the two pipe FDs. */
433 int io_ready_pipe_read
;
434 int io_ready_pipe_write
;
435 /* Virtual memory stats */
436 unsigned long long vm_stats_used_pages
;
437 unsigned long long vm_stats_swapped_objects
;
438 unsigned long long vm_stats_swapouts
;
439 unsigned long long vm_stats_swapins
;
441 dict
*pubsub_channels
; /* Map channels to list of subscribed clients */
442 list
*pubsub_patterns
; /* A list of pubsub_patterns */
447 typedef struct pubsubPattern
{
452 typedef void redisCommandProc(redisClient
*c
);
453 typedef void redisVmPreloadProc(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
);
454 struct redisCommand
{
456 redisCommandProc
*proc
;
459 /* Use a function to determine which keys need to be loaded
460 * in the background prior to executing this command. Takes precedence
461 * over vm_firstkey and others, ignored when NULL */
462 redisVmPreloadProc
*vm_preload_proc
;
463 /* What keys should be loaded in background when calling this command? */
464 int vm_firstkey
; /* The first argument that's a key (0 = no keys) */
465 int vm_lastkey
; /* The last argument that's a key */
466 int vm_keystep
; /* The step between first and last key */
469 struct redisFunctionSym
{
471 unsigned long pointer
;
474 typedef struct _redisSortObject
{
482 typedef struct _redisSortOperation
{
485 } redisSortOperation
;
487 /* ZSETs use a specialized version of Skiplists */
489 typedef struct zskiplistNode
{
490 struct zskiplistNode
**forward
;
491 struct zskiplistNode
*backward
;
497 typedef struct zskiplist
{
498 struct zskiplistNode
*header
, *tail
;
499 unsigned long length
;
503 typedef struct zset
{
508 /* Our shared "common" objects */
510 #define REDIS_SHARED_INTEGERS 10000
511 struct sharedObjectsStruct
{
512 robj
*crlf
, *ok
, *err
, *emptybulk
, *czero
, *cone
, *pong
, *space
,
513 *colon
, *nullbulk
, *nullmultibulk
, *queued
,
514 *emptymultibulk
, *wrongtypeerr
, *nokeyerr
, *syntaxerr
, *sameobjecterr
,
515 *outofrangeerr
, *plus
,
516 *select0
, *select1
, *select2
, *select3
, *select4
,
517 *select5
, *select6
, *select7
, *select8
, *select9
,
518 *messagebulk
, *pmessagebulk
, *subscribebulk
, *unsubscribebulk
, *mbulk3
,
519 *mbulk4
, *psubscribebulk
, *punsubscribebulk
,
520 *integers
[REDIS_SHARED_INTEGERS
];
523 /* Global vars that are actually used as constants. The following double
524 * values are used for double on-disk serialization, and are initialized
525 * at runtime to avoid strange compiler optimizations. */
527 static double R_Zero
, R_PosInf
, R_NegInf
, R_Nan
;
529 /* VM threaded I/O request message */
530 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
531 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
532 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
533 typedef struct iojob
{
534 int type
; /* Request type, REDIS_IOJOB_* */
535 redisDb
*db
;/* Redis database */
536 robj
*key
; /* This I/O request is about swapping this key */
536 robj
*val
; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
538 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
539 off_t page
; /* Swap page where to read/write the object */
540 off_t pages
; /* Swap pages needed to save object. PREPARE_SWAP return val */
541 int canceled
; /* True if this command was canceled by blocking side of VM */
542 pthread_t thread
; /* ID of the thread processing this entry */
545 /*================================ Prototypes =============================== */
/* Forward declarations of the file-local (static) helpers, one per line. */
547 static void freeStringObject(robj *o);
548 static void freeListObject(robj *o);
549 static void freeSetObject(robj *o);
550 static void decrRefCount(void *o);
551 static robj *createObject(int type, void *ptr);
552 static void freeClient(redisClient *c);
553 static int rdbLoad(char *filename);
554 static void addReply(redisClient *c, robj *obj);
555 static void addReplySds(redisClient *c, sds s);
556 static void incrRefCount(robj *o);
557 static int rdbSaveBackground(char *filename);
558 static robj *createStringObject(char *ptr, size_t len);
559 static robj *dupStringObject(robj *o);
560 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
561 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
562 static void flushAppendOnlyFile(void);
563 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
564 static int syncWithMaster(void);
565 static robj *tryObjectEncoding(robj *o);
566 static robj *getDecodedObject(robj *o);
567 static int removeExpire(redisDb *db, robj *key);
568 static int expireIfNeeded(redisDb *db, robj *key);
569 static int deleteIfVolatile(redisDb *db, robj *key);
570 static int deleteIfSwapped(redisDb *db, robj *key);
571 static int deleteKey(redisDb *db, robj *key);
572 static time_t getExpire(redisDb *db, robj *key);
573 static int setExpire(redisDb *db, robj *key, time_t when);
574 static void updateSlavesWaitingBgsave(int bgsaveerr);
575 static void freeMemoryIfNeeded(void);
576 static int processCommand(redisClient *c);
577 static void setupSigSegvAction(void);
578 static void rdbRemoveTempFile(pid_t childpid);
579 static void aofRemoveTempFile(pid_t childpid);
580 static size_t stringObjectLen(robj *o);
581 static void processInputBuffer(redisClient *c);
582 static zskiplist *zslCreate(void);
583 static void zslFree(zskiplist *zsl);
584 static void zslInsert(zskiplist *zsl, double score, robj *obj);
585 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
586 static void initClientMultiState(redisClient *c);
587 static void freeClientMultiState(redisClient *c);
588 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
589 static void unblockClientWaitingData(redisClient *c);
590 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
591 static void vmInit(void);
592 static void vmMarkPagesFree(off_t page, off_t count);
593 static robj *vmLoadObject(robj *key);
594 static robj *vmPreviewObject(robj *key);
595 static int vmSwapOneObjectBlocking(void);
596 static int vmSwapOneObjectThreaded(void);
597 static int vmCanSwapOut(void);
598 static int tryFreeOneObjectFromFreelist(void);
599 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
600 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
601 static void vmCancelThreadedIOJob(robj *o);
602 static void lockThreadedIO(void);
603 static void unlockThreadedIO(void);
604 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
605 static void freeIOJob(iojob *j);
606 static void queueIOJob(iojob *j);
607 static int vmWriteObjectOnSwap(robj *o, off_t page);
608 static robj *vmReadObjectFromSwap(off_t page, int type);
609 static void waitEmptyIOJobsQueue(void);
610 static void vmReopenSwapFile(void);
611 static int vmFreePage(off_t page);
612 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
613 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
614 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
615 static int dontWaitForSwappedKey(redisClient *c, robj *key);
616 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
617 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
618 static struct redisCommand *lookupCommand(char *name);
619 static void call(redisClient *c, struct redisCommand *cmd);
620 static void resetClient(redisClient *c);
621 static void convertToRealHash(robj *o);
622 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
623 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
624 static void freePubsubPattern(void *p);
625 static int listMatchPubsubPattern(void *a, void *b);
626 static int compareStringObjects(robj *a, robj *b);
628 static int rewriteAppendOnlyFileBackground(void);
630 static void authCommand(redisClient
*c
);
631 static void pingCommand(redisClient
*c
);
632 static void echoCommand(redisClient
*c
);
633 static void setCommand(redisClient
*c
);
634 static void setnxCommand(redisClient
*c
);
635 static void setexCommand(redisClient
*c
);
636 static void getCommand(redisClient
*c
);
637 static void delCommand(redisClient
*c
);
638 static void existsCommand(redisClient
*c
);
639 static void incrCommand(redisClient
*c
);
640 static void decrCommand(redisClient
*c
);
641 static void incrbyCommand(redisClient
*c
);
642 static void decrbyCommand(redisClient
*c
);
643 static void selectCommand(redisClient
*c
);
644 static void randomkeyCommand(redisClient
*c
);
645 static void keysCommand(redisClient
*c
);
646 static void dbsizeCommand(redisClient
*c
);
647 static void lastsaveCommand(redisClient
*c
);
648 static void saveCommand(redisClient
*c
);
649 static void bgsaveCommand(redisClient
*c
);
650 static void bgrewriteaofCommand(redisClient
*c
);
651 static void shutdownCommand(redisClient
*c
);
652 static void moveCommand(redisClient
*c
);
653 static void renameCommand(redisClient
*c
);
654 static void renamenxCommand(redisClient
*c
);
655 static void lpushCommand(redisClient
*c
);
656 static void rpushCommand(redisClient
*c
);
657 static void lpopCommand(redisClient
*c
);
658 static void rpopCommand(redisClient
*c
);
659 static void llenCommand(redisClient
*c
);
660 static void lindexCommand(redisClient
*c
);
661 static void lrangeCommand(redisClient
*c
);
662 static void ltrimCommand(redisClient
*c
);
663 static void typeCommand(redisClient
*c
);
664 static void lsetCommand(redisClient
*c
);
665 static void saddCommand(redisClient
*c
);
666 static void sremCommand(redisClient
*c
);
667 static void smoveCommand(redisClient
*c
);
668 static void sismemberCommand(redisClient
*c
);
669 static void scardCommand(redisClient
*c
);
670 static void spopCommand(redisClient
*c
);
671 static void srandmemberCommand(redisClient
*c
);
672 static void sinterCommand(redisClient
*c
);
673 static void sinterstoreCommand(redisClient
*c
);
674 static void sunionCommand(redisClient
*c
);
675 static void sunionstoreCommand(redisClient
*c
);
676 static void sdiffCommand(redisClient
*c
);
677 static void sdiffstoreCommand(redisClient
*c
);
678 static void syncCommand(redisClient
*c
);
679 static void flushdbCommand(redisClient
*c
);
680 static void flushallCommand(redisClient
*c
);
681 static void sortCommand(redisClient
*c
);
682 static void lremCommand(redisClient
*c
);
683 static void rpoplpushcommand(redisClient
*c
);
684 static void infoCommand(redisClient
*c
);
685 static void mgetCommand(redisClient
*c
);
686 static void monitorCommand(redisClient
*c
);
687 static void expireCommand(redisClient
*c
);
688 static void expireatCommand(redisClient
*c
);
689 static void getsetCommand(redisClient
*c
);
690 static void ttlCommand(redisClient
*c
);
691 static void slaveofCommand(redisClient
*c
);
692 static void debugCommand(redisClient
*c
);
693 static void msetCommand(redisClient
*c
);
694 static void msetnxCommand(redisClient
*c
);
695 static void zaddCommand(redisClient
*c
);
696 static void zincrbyCommand(redisClient
*c
);
697 static void zrangeCommand(redisClient
*c
);
698 static void zrangebyscoreCommand(redisClient
*c
);
699 static void zcountCommand(redisClient
*c
);
700 static void zrevrangeCommand(redisClient
*c
);
701 static void zcardCommand(redisClient
*c
);
702 static void zremCommand(redisClient
*c
);
703 static void zscoreCommand(redisClient
*c
);
704 static void zremrangebyscoreCommand(redisClient
*c
);
705 static void multiCommand(redisClient
*c
);
706 static void execCommand(redisClient
*c
);
707 static void discardCommand(redisClient
*c
);
708 static void blpopCommand(redisClient
*c
);
709 static void brpopCommand(redisClient
*c
);
710 static void appendCommand(redisClient
*c
);
711 static void substrCommand(redisClient
*c
);
712 static void zrankCommand(redisClient
*c
);
713 static void zrevrankCommand(redisClient
*c
);
714 static void hsetCommand(redisClient
*c
);
715 static void hsetnxCommand(redisClient
*c
);
716 static void hgetCommand(redisClient
*c
);
717 static void hmsetCommand(redisClient
*c
);
718 static void hmgetCommand(redisClient
*c
);
719 static void hdelCommand(redisClient
*c
);
720 static void hlenCommand(redisClient
*c
);
721 static void zremrangebyrankCommand(redisClient
*c
);
722 static void zunionCommand(redisClient
*c
);
723 static void zinterCommand(redisClient
*c
);
724 static void hkeysCommand(redisClient
*c
);
725 static void hvalsCommand(redisClient
*c
);
726 static void hgetallCommand(redisClient
*c
);
727 static void hexistsCommand(redisClient
*c
);
728 static void configCommand(redisClient
*c
);
729 static void hincrbyCommand(redisClient
*c
);
730 static void subscribeCommand(redisClient
*c
);
731 static void unsubscribeCommand(redisClient
*c
);
732 static void psubscribeCommand(redisClient
*c
);
733 static void punsubscribeCommand(redisClient
*c
);
734 static void publishCommand(redisClient
*c
);
736 /*================================= Globals ================================= */
739 static struct redisServer server
; /* server global state */
740 static struct redisCommand cmdTable
[] = {
741 {"get",getCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
742 {"set",setCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
743 {"setnx",setnxCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
744 {"setex",setexCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,0,0,0},
745 {"append",appendCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
746 {"substr",substrCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
747 {"del",delCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
748 {"exists",existsCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
749 {"incr",incrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
750 {"decr",decrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
751 {"mget",mgetCommand
,-2,REDIS_CMD_INLINE
,NULL
,1,-1,1},
752 {"rpush",rpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
753 {"lpush",lpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
754 {"rpop",rpopCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
755 {"lpop",lpopCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
756 {"brpop",brpopCommand
,-3,REDIS_CMD_INLINE
,NULL
,1,1,1},
757 {"blpop",blpopCommand
,-3,REDIS_CMD_INLINE
,NULL
,1,1,1},
758 {"llen",llenCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
759 {"lindex",lindexCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
760 {"lset",lsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
761 {"lrange",lrangeCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
762 {"ltrim",ltrimCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
763 {"lrem",lremCommand
,4,REDIS_CMD_BULK
,NULL
,1,1,1},
764 {"rpoplpush",rpoplpushcommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,2,1},
765 {"sadd",saddCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
766 {"srem",sremCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
767 {"smove",smoveCommand
,4,REDIS_CMD_BULK
,NULL
,1,2,1},
768 {"sismember",sismemberCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
769 {"scard",scardCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
770 {"spop",spopCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
771 {"srandmember",srandmemberCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
772 {"sinter",sinterCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,-1,1},
773 {"sinterstore",sinterstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,2,-1,1},
774 {"sunion",sunionCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,-1,1},
775 {"sunionstore",sunionstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,2,-1,1},
776 {"sdiff",sdiffCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,-1,1},
777 {"sdiffstore",sdiffstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,2,-1,1},
778 {"smembers",sinterCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
779 {"zadd",zaddCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
780 {"zincrby",zincrbyCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
781 {"zrem",zremCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
782 {"zremrangebyscore",zremrangebyscoreCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
783 {"zremrangebyrank",zremrangebyrankCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
784 {"zunion",zunionCommand
,-4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,zunionInterBlockClientOnSwappedKeys
,0,0,0},
785 {"zinter",zinterCommand
,-4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,zunionInterBlockClientOnSwappedKeys
,0,0,0},
786 {"zrange",zrangeCommand
,-4,REDIS_CMD_INLINE
,NULL
,1,1,1},
787 {"zrangebyscore",zrangebyscoreCommand
,-4,REDIS_CMD_INLINE
,NULL
,1,1,1},
788 {"zcount",zcountCommand
,4,REDIS_CMD_INLINE
,NULL
,1,1,1},
789 {"zrevrange",zrevrangeCommand
,-4,REDIS_CMD_INLINE
,NULL
,1,1,1},
790 {"zcard",zcardCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
791 {"zscore",zscoreCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
792 {"zrank",zrankCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
793 {"zrevrank",zrevrankCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
794 {"hset",hsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
795 {"hsetnx",hsetnxCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
796 {"hget",hgetCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
797 {"hmset",hmsetCommand
,-4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
798 {"hmget",hmgetCommand
,-3,REDIS_CMD_BULK
,NULL
,1,1,1},
799 {"hincrby",hincrbyCommand
,4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
800 {"hdel",hdelCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
801 {"hlen",hlenCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
802 {"hkeys",hkeysCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
803 {"hvals",hvalsCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
804 {"hgetall",hgetallCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
805 {"hexists",hexistsCommand
,3,REDIS_CMD_BULK
,NULL
,1,1,1},
806 {"incrby",incrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
807 {"decrby",decrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
808 {"getset",getsetCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
809 {"mset",msetCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,-1,2},
810 {"msetnx",msetnxCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,NULL
,1,-1,2},
811 {"randomkey",randomkeyCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
812 {"select",selectCommand
,2,REDIS_CMD_INLINE
,NULL
,0,0,0},
813 {"move",moveCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
814 {"rename",renameCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
815 {"renamenx",renamenxCommand
,3,REDIS_CMD_INLINE
,NULL
,1,1,1},
816 {"expire",expireCommand
,3,REDIS_CMD_INLINE
,NULL
,0,0,0},
817 {"expireat",expireatCommand
,3,REDIS_CMD_INLINE
,NULL
,0,0,0},
818 {"keys",keysCommand
,2,REDIS_CMD_INLINE
,NULL
,0,0,0},
819 {"dbsize",dbsizeCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
820 {"auth",authCommand
,2,REDIS_CMD_INLINE
,NULL
,0,0,0},
821 {"ping",pingCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
822 {"echo",echoCommand
,2,REDIS_CMD_BULK
,NULL
,0,0,0},
823 {"save",saveCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
824 {"bgsave",bgsaveCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
825 {"bgrewriteaof",bgrewriteaofCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
826 {"shutdown",shutdownCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
827 {"lastsave",lastsaveCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
828 {"type",typeCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
829 {"multi",multiCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
830 {"exec",execCommand
,1,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,execBlockClientOnSwappedKeys
,0,0,0},
831 {"discard",discardCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
832 {"sync",syncCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
833 {"flushdb",flushdbCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
834 {"flushall",flushallCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
835 {"sort",sortCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,NULL
,1,1,1},
836 {"info",infoCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
837 {"monitor",monitorCommand
,1,REDIS_CMD_INLINE
,NULL
,0,0,0},
838 {"ttl",ttlCommand
,2,REDIS_CMD_INLINE
,NULL
,1,1,1},
839 {"slaveof",slaveofCommand
,3,REDIS_CMD_INLINE
,NULL
,0,0,0},
840 {"debug",debugCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
841 {"config",configCommand
,-2,REDIS_CMD_BULK
,NULL
,0,0,0},
842 {"subscribe",subscribeCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
843 {"unsubscribe",unsubscribeCommand
,-1,REDIS_CMD_INLINE
,NULL
,0,0,0},
844 {"psubscribe",psubscribeCommand
,-2,REDIS_CMD_INLINE
,NULL
,0,0,0},
845 {"punsubscribe",punsubscribeCommand
,-1,REDIS_CMD_INLINE
,NULL
,0,0,0},
846 {"publish",publishCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_FORCE_REPLICATION
,NULL
,0,0,0},
847 {NULL
,NULL
,0,0,NULL
,0,0,0}
850 /*============================ Utility functions ============================ */
852 /* Glob-style pattern matching. */
853 static int stringmatchlen(const char *pattern
, int patternLen
,
854 const char *string
, int stringLen
, int nocase
)
859 while (pattern
[1] == '*') {
864 return 1; /* match */
866 if (stringmatchlen(pattern
+1, patternLen
-1,
867 string
, stringLen
, nocase
))
868 return 1; /* match */
872 return 0; /* no match */
876 return 0; /* no match */
886 not = pattern
[0] == '^';
893 if (pattern
[0] == '\\') {
896 if (pattern
[0] == string
[0])
898 } else if (pattern
[0] == ']') {
900 } else if (patternLen
== 0) {
904 } else if (pattern
[1] == '-' && patternLen
>= 3) {
905 int start
= pattern
[0];
906 int end
= pattern
[2];
914 start
= tolower(start
);
920 if (c
>= start
&& c
<= end
)
924 if (pattern
[0] == string
[0])
927 if (tolower((int)pattern
[0]) == tolower((int)string
[0]))
937 return 0; /* no match */
943 if (patternLen
>= 2) {
950 if (pattern
[0] != string
[0])
951 return 0; /* no match */
953 if (tolower((int)pattern
[0]) != tolower((int)string
[0]))
954 return 0; /* no match */
962 if (stringLen
== 0) {
963 while(*pattern
== '*') {
970 if (patternLen
== 0 && stringLen
== 0)
/* Glob-style match of two NUL-terminated strings.
 *
 * Thin wrapper around stringmatchlen(): both arguments are measured with
 * strlen() and 'nocase' is forwarded untouched. The result is whatever
 * stringmatchlen() returns (1 on match, 0 on no match). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    return stringmatchlen(pattern,strlen(pattern),string,strlen(string),nocase);
}
979 /* Convert a string representing an amount of memory into the number of
980 * bytes, so for instance memtoll("1Gb") will return 1073741824 that is
983 * On parsing error, if *err is not NULL, it's set to 1, otherwise it's
985 static long long memtoll(const char *p
, int *err
) {
988 long mul
; /* unit multiplier */
993 /* Search the first non digit character. */
996 while(*u
&& isdigit(*u
)) u
++;
997 if (*u
== '\0' || !strcasecmp(u
,"b")) {
999 } else if (!strcasecmp(u
,"k")) {
1001 } else if (!strcasecmp(u
,"kb")) {
1003 } else if (!strcasecmp(u
,"m")) {
1005 } else if (!strcasecmp(u
,"mb")) {
1007 } else if (!strcasecmp(u
,"g")) {
1008 mul
= 1000L*1000*1000;
1009 } else if (!strcasecmp(u
,"gb")) {
1010 mul
= 1024L*1024*1024;
1016 if (digits
>= sizeof(buf
)) {
1020 memcpy(buf
,p
,digits
);
1022 val
= strtoll(buf
,NULL
,10);
1026 static void redisLog(int level
, const char *fmt
, ...) {
1030 fp
= (server
.logfile
== NULL
) ? stdout
: fopen(server
.logfile
,"a");
1034 if (level
>= server
.verbosity
) {
1040 strftime(buf
,64,"%d %b %H:%M:%S",localtime(&now
));
1041 fprintf(fp
,"[%d] %s %c ",(int)getpid(),buf
,c
[level
]);
1042 vfprintf(fp
, fmt
, ap
);
1048 if (server
.logfile
) fclose(fp
);
1051 /*====================== Hash table type implementation ==================== */
1053 /* This is a hash table type that uses the SDS dynamic strings library as
1054 * keys and redis objects as values (objects can hold SDS strings,
1057 static void dictVanillaFree(void *privdata
, void *val
)
1059 DICT_NOTUSED(privdata
);
1063 static void dictListDestructor(void *privdata
, void *val
)
1065 DICT_NOTUSED(privdata
);
1066 listRelease((list
*)val
);
1069 static int sdsDictKeyCompare(void *privdata
, const void *key1
,
1073 DICT_NOTUSED(privdata
);
1075 l1
= sdslen((sds
)key1
);
1076 l2
= sdslen((sds
)key2
);
1077 if (l1
!= l2
) return 0;
1078 return memcmp(key1
, key2
, l1
) == 0;
1081 static void dictRedisObjectDestructor(void *privdata
, void *val
)
1083 DICT_NOTUSED(privdata
);
1085 if (val
== NULL
) return; /* Values of swapped out keys as set to NULL */
1089 static int dictObjKeyCompare(void *privdata
, const void *key1
,
1092 const robj
*o1
= key1
, *o2
= key2
;
1093 return sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
1096 static unsigned int dictObjHash(const void *key
) {
1097 const robj
*o
= key
;
1098 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1101 static int dictEncObjKeyCompare(void *privdata
, const void *key1
,
1104 robj
*o1
= (robj
*) key1
, *o2
= (robj
*) key2
;
1107 if (o1
->encoding
== REDIS_ENCODING_INT
&&
1108 o2
->encoding
== REDIS_ENCODING_INT
&&
1109 o1
->ptr
== o2
->ptr
) return 1;
1111 o1
= getDecodedObject(o1
);
1112 o2
= getDecodedObject(o2
);
1113 cmp
= sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
1119 static unsigned int dictEncObjHash(const void *key
) {
1120 robj
*o
= (robj
*) key
;
1122 if (o
->encoding
== REDIS_ENCODING_RAW
) {
1123 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1125 if (o
->encoding
== REDIS_ENCODING_INT
) {
1129 len
= snprintf(buf
,32,"%ld",(long)o
->ptr
);
1130 return dictGenHashFunction((unsigned char*)buf
, len
);
1134 o
= getDecodedObject(o
);
1135 hash
= dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1142 /* Sets type and expires */
1143 static dictType setDictType
= {
1144 dictEncObjHash
, /* hash function */
1147 dictEncObjKeyCompare
, /* key compare */
1148 dictRedisObjectDestructor
, /* key destructor */
1149 NULL
/* val destructor */
1152 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1153 static dictType zsetDictType
= {
1154 dictEncObjHash
, /* hash function */
1157 dictEncObjKeyCompare
, /* key compare */
1158 dictRedisObjectDestructor
, /* key destructor */
1159 dictVanillaFree
/* val destructor of malloc(sizeof(double)) */
1163 static dictType dbDictType
= {
1164 dictObjHash
, /* hash function */
1167 dictObjKeyCompare
, /* key compare */
1168 dictRedisObjectDestructor
, /* key destructor */
1169 dictRedisObjectDestructor
/* val destructor */
1173 static dictType keyptrDictType
= {
1174 dictObjHash
, /* hash function */
1177 dictObjKeyCompare
, /* key compare */
1178 dictRedisObjectDestructor
, /* key destructor */
1179 NULL
/* val destructor */
1182 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1183 static dictType hashDictType
= {
1184 dictEncObjHash
, /* hash function */
1187 dictEncObjKeyCompare
, /* key compare */
1188 dictRedisObjectDestructor
, /* key destructor */
1189 dictRedisObjectDestructor
/* val destructor */
1192 /* Keylist hash table type has unencoded redis objects as keys and
1193 * lists as values. It's used for blocking operations (BLPOP) and to
1194 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1195 static dictType keylistDictType
= {
1196 dictObjHash
, /* hash function */
1199 dictObjKeyCompare
, /* key compare */
1200 dictRedisObjectDestructor
, /* key destructor */
1201 dictListDestructor
/* val destructor */
1204 static void version();
1206 /* ========================= Random utility functions ======================= */
1208 /* Redis generally does not try to recover from out of memory conditions
1209 * when allocating objects or strings, it is not clear if it will be possible
1210 * to report this condition to the client since the networking layer itself
1211 * is based on heap allocation for send buffers, so we simply abort.
1212 * At least the code will be simpler to read... */
1213 static void oom(const char *msg
) {
1214 redisLog(REDIS_WARNING
, "%s: Out of memory\n",msg
);
1219 /* ====================== Redis server networking stuff ===================== */
1220 static void closeTimedoutClients(void) {
1223 time_t now
= time(NULL
);
1226 listRewind(server
.clients
,&li
);
1227 while ((ln
= listNext(&li
)) != NULL
) {
1228 c
= listNodeValue(ln
);
1229 if (server
.maxidletime
&&
1230 !(c
->flags
& REDIS_SLAVE
) && /* no timeout for slaves */
1231 !(c
->flags
& REDIS_MASTER
) && /* no timeout for masters */
1232 dictSize(c
->pubsub_channels
) == 0 && /* no timeout for pubsub */
1233 listLength(c
->pubsub_patterns
) == 0 &&
1234 (now
- c
->lastinteraction
> server
.maxidletime
))
1236 redisLog(REDIS_VERBOSE
,"Closing idle client");
1238 } else if (c
->flags
& REDIS_BLOCKED
) {
1239 if (c
->blockingto
!= 0 && c
->blockingto
< now
) {
1240 addReply(c
,shared
.nullmultibulk
);
1241 unblockClientWaitingData(c
);
1247 static int htNeedsResize(dict
*dict
) {
1248 long long size
, used
;
1250 size
= dictSlots(dict
);
1251 used
= dictSize(dict
);
1252 return (size
&& used
&& size
> DICT_HT_INITIAL_SIZE
&&
1253 (used
*100/size
< REDIS_HT_MINFILL
));
1256 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1257 * we resize the hash table to save memory */
1258 static void tryResizeHashTables(void) {
1261 for (j
= 0; j
< server
.dbnum
; j
++) {
1262 if (htNeedsResize(server
.db
[j
].dict
))
1263 dictResize(server
.db
[j
].dict
);
1264 if (htNeedsResize(server
.db
[j
].expires
))
1265 dictResize(server
.db
[j
].expires
);
1269 /* Our hash table implementation performs rehashing incrementally while
1270 * we write/read from the hash table. Still if the server is idle, the hash
1271 * table will use two tables for a long time. So we try to use 1 millisecond
1272 * of CPU time at every serverCron() loop in order to rehash some key. */
1273 static void incrementallyRehash(void) {
1276 for (j
= 0; j
< server
.dbnum
; j
++) {
1277 if (dictIsRehashing(server
.db
[j
].dict
)) {
1278 dictRehashMilliseconds(server
.db
[j
].dict
,1);
1279 break; /* already used our millisecond for this loop... */
1284 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1285 void backgroundSaveDoneHandler(int statloc
) {
1286 int exitcode
= WEXITSTATUS(statloc
);
1287 int bysignal
= WIFSIGNALED(statloc
);
1289 if (!bysignal
&& exitcode
== 0) {
1290 redisLog(REDIS_NOTICE
,
1291 "Background saving terminated with success");
1293 server
.lastsave
= time(NULL
);
1294 } else if (!bysignal
&& exitcode
!= 0) {
1295 redisLog(REDIS_WARNING
, "Background saving error");
1297 redisLog(REDIS_WARNING
,
1298 "Background saving terminated by signal %d", WTERMSIG(statloc
));
1299 rdbRemoveTempFile(server
.bgsavechildpid
);
1301 server
.bgsavechildpid
= -1;
1302 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1303 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1304 updateSlavesWaitingBgsave(exitcode
== 0 ? REDIS_OK
: REDIS_ERR
);
1307 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1309 void backgroundRewriteDoneHandler(int statloc
) {
1310 int exitcode
= WEXITSTATUS(statloc
);
1311 int bysignal
= WIFSIGNALED(statloc
);
1313 if (!bysignal
&& exitcode
== 0) {
1317 redisLog(REDIS_NOTICE
,
1318 "Background append only file rewriting terminated with success");
1319 /* Now it's time to flush the differences accumulated by the parent */
1320 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) server
.bgrewritechildpid
);
1321 fd
= open(tmpfile
,O_WRONLY
|O_APPEND
);
1323 redisLog(REDIS_WARNING
, "Not able to open the temp append only file produced by the child: %s", strerror(errno
));
1326 /* Flush our data... */
1327 if (write(fd
,server
.bgrewritebuf
,sdslen(server
.bgrewritebuf
)) !=
1328 (signed) sdslen(server
.bgrewritebuf
)) {
1329 redisLog(REDIS_WARNING
, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno
));
1333 redisLog(REDIS_NOTICE
,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server
.bgrewritebuf
));
1334 /* Now our work is to rename the temp file into the stable file. And
1335 * switch the file descriptor used by the server for append only. */
1336 if (rename(tmpfile
,server
.appendfilename
) == -1) {
1337 redisLog(REDIS_WARNING
,"Can't rename the temp append only file into the stable one: %s", strerror(errno
));
1341 /* Mission completed... almost */
1342 redisLog(REDIS_NOTICE
,"Append only file successfully rewritten.");
1343 if (server
.appendfd
!= -1) {
1344 /* If append only is actually enabled... */
1345 close(server
.appendfd
);
1346 server
.appendfd
= fd
;
1348 server
.appendseldb
= -1; /* Make sure it will issue SELECT */
1349 redisLog(REDIS_NOTICE
,"The new append only file was selected for future appends.");
1351 /* If append only is disabled we just generate a dump in this
1352 * format. Why not? */
1355 } else if (!bysignal
&& exitcode
!= 0) {
1356 redisLog(REDIS_WARNING
, "Background append only file rewriting error");
1358 redisLog(REDIS_WARNING
,
1359 "Background append only file rewriting terminated by signal %d",
1363 sdsfree(server
.bgrewritebuf
);
1364 server
.bgrewritebuf
= sdsempty();
1365 aofRemoveTempFile(server
.bgrewritechildpid
);
1366 server
.bgrewritechildpid
= -1;
1369 /* This function is called once a background process of some kind terminates,
1370 * as we want to avoid resizing the hash tables when there is a child in order
1371 * to play well with copy-on-write (otherwise when a resize happens lots of
1372 * memory pages are copied). The goal of this function is to update the ability
1373 * for dict.c to resize the hash tables according to whether or not we have
1374 * running children. */
1375 static void updateDictResizePolicy(void) {
1376 if (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1)
1379 dictDisableResize();
1382 static int serverCron(struct aeEventLoop
*eventLoop
, long long id
, void *clientData
) {
1383 int j
, loops
= server
.cronloops
++;
1384 REDIS_NOTUSED(eventLoop
);
1386 REDIS_NOTUSED(clientData
);
1388 /* We take a cached value of the unix time in the global state because
1389 * with virtual memory and aging there is to store the current time
1390 * in objects at every object access, and accuracy is not needed.
1391 * To access a global var is faster than calling time(NULL) */
1392 server
.unixtime
= time(NULL
);
1394 /* Show some info about non-empty databases */
1395 for (j
= 0; j
< server
.dbnum
; j
++) {
1396 long long size
, used
, vkeys
;
1398 size
= dictSlots(server
.db
[j
].dict
);
1399 used
= dictSize(server
.db
[j
].dict
);
1400 vkeys
= dictSize(server
.db
[j
].expires
);
1401 if (!(loops
% 50) && (used
|| vkeys
)) {
1402 redisLog(REDIS_VERBOSE
,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j
,used
,vkeys
,size
);
1403 /* dictPrintStats(server.dict); */
1407 /* We don't want to resize the hash tables while a bacground saving
1408 * is in progress: the saving child is created using fork() that is
1409 * implemented with a copy-on-write semantic in most modern systems, so
1410 * if we resize the HT while there is the saving child at work actually
1411 * a lot of memory movements in the parent will cause a lot of pages
1413 if (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1) {
1414 if (!(loops
% 10)) tryResizeHashTables();
1415 if (server
.activerehashing
) incrementallyRehash();
1418 /* Show information about connected clients */
1419 if (!(loops
% 50)) {
1420 redisLog(REDIS_VERBOSE
,"%d clients connected (%d slaves), %zu bytes in use",
1421 listLength(server
.clients
)-listLength(server
.slaves
),
1422 listLength(server
.slaves
),
1423 zmalloc_used_memory());
1426 /* Close connections of timedout clients */
1427 if ((server
.maxidletime
&& !(loops
% 100)) || server
.blpop_blocked_clients
)
1428 closeTimedoutClients();
1430 /* Check if a background saving or AOF rewrite in progress terminated */
1431 if (server
.bgsavechildpid
!= -1 || server
.bgrewritechildpid
!= -1) {
1435 if ((pid
= wait3(&statloc
,WNOHANG
,NULL
)) != 0) {
1436 if (pid
== server
.bgsavechildpid
) {
1437 backgroundSaveDoneHandler(statloc
);
1439 backgroundRewriteDoneHandler(statloc
);
1441 updateDictResizePolicy();
1444 /* If there is not a background saving in progress check if
1445 * we have to save now */
1446 time_t now
= time(NULL
);
1447 for (j
= 0; j
< server
.saveparamslen
; j
++) {
1448 struct saveparam
*sp
= server
.saveparams
+j
;
1450 if (server
.dirty
>= sp
->changes
&&
1451 now
-server
.lastsave
> sp
->seconds
) {
1452 redisLog(REDIS_NOTICE
,"%d changes in %d seconds. Saving...",
1453 sp
->changes
, sp
->seconds
);
1454 rdbSaveBackground(server
.dbfilename
);
1460 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1461 * will use few CPU cycles if there are few expiring keys, otherwise
1462 * it will get more aggressive to avoid that too much memory is used by
1463 * keys that can be removed from the keyspace. */
1464 for (j
= 0; j
< server
.dbnum
; j
++) {
1466 redisDb
*db
= server
.db
+j
;
1468 /* Continue to expire if at the end of the cycle more than 25%
1469 * of the keys were expired. */
1471 long num
= dictSize(db
->expires
);
1472 time_t now
= time(NULL
);
1475 if (num
> REDIS_EXPIRELOOKUPS_PER_CRON
)
1476 num
= REDIS_EXPIRELOOKUPS_PER_CRON
;
1481 if ((de
= dictGetRandomKey(db
->expires
)) == NULL
) break;
1482 t
= (time_t) dictGetEntryVal(de
);
1484 deleteKey(db
,dictGetEntryKey(de
));
1486 server
.stat_expiredkeys
++;
1489 } while (expired
> REDIS_EXPIRELOOKUPS_PER_CRON
/4);
1492 /* Swap a few keys on disk if we are over the memory limit and VM
1493 * is enbled. Try to free objects from the free list first. */
1494 if (vmCanSwapOut()) {
1495 while (server
.vm_enabled
&& zmalloc_used_memory() >
1496 server
.vm_max_memory
)
1500 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
1501 retval
= (server
.vm_max_threads
== 0) ?
1502 vmSwapOneObjectBlocking() :
1503 vmSwapOneObjectThreaded();
1504 if (retval
== REDIS_ERR
&& !(loops
% 300) &&
1505 zmalloc_used_memory() >
1506 (server
.vm_max_memory
+server
.vm_max_memory
/10))
1508 redisLog(REDIS_WARNING
,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1510 /* Note that when using threade I/O we free just one object,
1511 * because anyway when the I/O thread in charge to swap this
1512 * object out will finish, the handler of completed jobs
1513 * will try to swap more objects if we are still out of memory. */
1514 if (retval
== REDIS_ERR
|| server
.vm_max_threads
> 0) break;
1518 /* Check if we should connect to a MASTER */
1519 if (server
.replstate
== REDIS_REPL_CONNECT
&& !(loops
% 10)) {
1520 redisLog(REDIS_NOTICE
,"Connecting to MASTER...");
1521 if (syncWithMaster() == REDIS_OK
) {
1522 redisLog(REDIS_NOTICE
,"MASTER <-> SLAVE sync succeeded");
1523 if (server
.appendonly
) rewriteAppendOnlyFileBackground();
1529 /* This function gets called every time Redis is entering the
1530 * main loop of the event driven library, that is, before to sleep
1531 * for ready file descriptors. */
1532 static void beforeSleep(struct aeEventLoop
*eventLoop
) {
1533 REDIS_NOTUSED(eventLoop
);
1535 /* Awake clients that got all the swapped keys they requested */
1536 if (server
.vm_enabled
&& listLength(server
.io_ready_clients
)) {
1540 listRewind(server
.io_ready_clients
,&li
);
1541 while((ln
= listNext(&li
))) {
1542 redisClient
*c
= ln
->value
;
1543 struct redisCommand
*cmd
;
1545 /* Resume the client. */
1546 listDelNode(server
.io_ready_clients
,ln
);
1547 c
->flags
&= (~REDIS_IO_WAIT
);
1548 server
.vm_blocked_clients
--;
1549 aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
1550 readQueryFromClient
, c
);
1551 cmd
= lookupCommand(c
->argv
[0]->ptr
);
1552 assert(cmd
!= NULL
);
1555 /* There may be more data to process in the input buffer. */
1556 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0)
1557 processInputBuffer(c
);
1560 /* Write the AOF buffer on disk */
1561 flushAppendOnlyFile();
1564 static void createSharedObjects(void) {
1567 shared
.crlf
= createObject(REDIS_STRING
,sdsnew("\r\n"));
1568 shared
.ok
= createObject(REDIS_STRING
,sdsnew("+OK\r\n"));
1569 shared
.err
= createObject(REDIS_STRING
,sdsnew("-ERR\r\n"));
1570 shared
.emptybulk
= createObject(REDIS_STRING
,sdsnew("$0\r\n\r\n"));
1571 shared
.czero
= createObject(REDIS_STRING
,sdsnew(":0\r\n"));
1572 shared
.cone
= createObject(REDIS_STRING
,sdsnew(":1\r\n"));
1573 shared
.nullbulk
= createObject(REDIS_STRING
,sdsnew("$-1\r\n"));
1574 shared
.nullmultibulk
= createObject(REDIS_STRING
,sdsnew("*-1\r\n"));
1575 shared
.emptymultibulk
= createObject(REDIS_STRING
,sdsnew("*0\r\n"));
1576 shared
.pong
= createObject(REDIS_STRING
,sdsnew("+PONG\r\n"));
1577 shared
.queued
= createObject(REDIS_STRING
,sdsnew("+QUEUED\r\n"));
1578 shared
.wrongtypeerr
= createObject(REDIS_STRING
,sdsnew(
1579 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1580 shared
.nokeyerr
= createObject(REDIS_STRING
,sdsnew(
1581 "-ERR no such key\r\n"));
1582 shared
.syntaxerr
= createObject(REDIS_STRING
,sdsnew(
1583 "-ERR syntax error\r\n"));
1584 shared
.sameobjecterr
= createObject(REDIS_STRING
,sdsnew(
1585 "-ERR source and destination objects are the same\r\n"));
1586 shared
.outofrangeerr
= createObject(REDIS_STRING
,sdsnew(
1587 "-ERR index out of range\r\n"));
1588 shared
.space
= createObject(REDIS_STRING
,sdsnew(" "));
1589 shared
.colon
= createObject(REDIS_STRING
,sdsnew(":"));
1590 shared
.plus
= createObject(REDIS_STRING
,sdsnew("+"));
1591 shared
.select0
= createStringObject("select 0\r\n",10);
1592 shared
.select1
= createStringObject("select 1\r\n",10);
1593 shared
.select2
= createStringObject("select 2\r\n",10);
1594 shared
.select3
= createStringObject("select 3\r\n",10);
1595 shared
.select4
= createStringObject("select 4\r\n",10);
1596 shared
.select5
= createStringObject("select 5\r\n",10);
1597 shared
.select6
= createStringObject("select 6\r\n",10);
1598 shared
.select7
= createStringObject("select 7\r\n",10);
1599 shared
.select8
= createStringObject("select 8\r\n",10);
1600 shared
.select9
= createStringObject("select 9\r\n",10);
1601 shared
.messagebulk
= createStringObject("$7\r\nmessage\r\n",13);
1602 shared
.pmessagebulk
= createStringObject("$8\r\npmessage\r\n",14);
1603 shared
.subscribebulk
= createStringObject("$9\r\nsubscribe\r\n",15);
1604 shared
.unsubscribebulk
= createStringObject("$11\r\nunsubscribe\r\n",18);
1605 shared
.psubscribebulk
= createStringObject("$10\r\npsubscribe\r\n",17);
1606 shared
.punsubscribebulk
= createStringObject("$12\r\npunsubscribe\r\n",19);
1607 shared
.mbulk3
= createStringObject("*3\r\n",4);
1608 shared
.mbulk4
= createStringObject("*4\r\n",4);
1609 for (j
= 0; j
< REDIS_SHARED_INTEGERS
; j
++) {
1610 shared
.integers
[j
] = createObject(REDIS_STRING
,(void*)(long)j
);
1611 shared
.integers
[j
]->encoding
= REDIS_ENCODING_INT
;
1615 static void appendServerSaveParams(time_t seconds
, int changes
) {
1616 server
.saveparams
= zrealloc(server
.saveparams
,sizeof(struct saveparam
)*(server
.saveparamslen
+1));
1617 server
.saveparams
[server
.saveparamslen
].seconds
= seconds
;
1618 server
.saveparams
[server
.saveparamslen
].changes
= changes
;
1619 server
.saveparamslen
++;
1622 static void resetServerSaveParams() {
1623 zfree(server
.saveparams
);
1624 server
.saveparams
= NULL
;
1625 server
.saveparamslen
= 0;
1628 static void initServerConfig() {
1629 server
.dbnum
= REDIS_DEFAULT_DBNUM
;
1630 server
.port
= REDIS_SERVERPORT
;
1631 server
.verbosity
= REDIS_VERBOSE
;
1632 server
.maxidletime
= REDIS_MAXIDLETIME
;
1633 server
.saveparams
= NULL
;
1634 server
.logfile
= NULL
; /* NULL = log on standard output */
1635 server
.bindaddr
= NULL
;
1636 server
.glueoutputbuf
= 1;
1637 server
.daemonize
= 0;
1638 server
.appendonly
= 0;
1639 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1640 server
.lastfsync
= time(NULL
);
1641 server
.appendfd
= -1;
1642 server
.appendseldb
= -1; /* Make sure the first time will not match */
1643 server
.pidfile
= zstrdup("/var/run/redis.pid");
1644 server
.dbfilename
= zstrdup("dump.rdb");
1645 server
.appendfilename
= zstrdup("appendonly.aof");
1646 server
.requirepass
= NULL
;
1647 server
.rdbcompression
= 1;
1648 server
.activerehashing
= 1;
1649 server
.maxclients
= 0;
1650 server
.blpop_blocked_clients
= 0;
1651 server
.maxmemory
= 0;
1652 server
.vm_enabled
= 0;
1653 server
.vm_swap_file
= zstrdup("/tmp/redis-%p.vm");
1654 server
.vm_page_size
= 256; /* 256 bytes per page */
1655 server
.vm_pages
= 1024*1024*100; /* 104 millions of pages */
1656 server
.vm_max_memory
= 1024LL*1024*1024*1; /* 1 GB of RAM */
1657 server
.vm_max_threads
= 4;
1658 server
.vm_blocked_clients
= 0;
1659 server
.hash_max_zipmap_entries
= REDIS_HASH_MAX_ZIPMAP_ENTRIES
;
1660 server
.hash_max_zipmap_value
= REDIS_HASH_MAX_ZIPMAP_VALUE
;
1662 resetServerSaveParams();
1664 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1665 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1666 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1667 /* Replication related */
1669 server
.masterauth
= NULL
;
1670 server
.masterhost
= NULL
;
1671 server
.masterport
= 6379;
1672 server
.master
= NULL
;
1673 server
.replstate
= REDIS_REPL_NONE
;
1675 /* Double constants initialization */
1677 R_PosInf
= 1.0/R_Zero
;
1678 R_NegInf
= -1.0/R_Zero
;
1679 R_Nan
= R_Zero
/R_Zero
;
1682 static void initServer() {
1685 signal(SIGHUP
, SIG_IGN
);
1686 signal(SIGPIPE
, SIG_IGN
);
1687 setupSigSegvAction();
1689 server
.devnull
= fopen("/dev/null","w");
1690 if (server
.devnull
== NULL
) {
1691 redisLog(REDIS_WARNING
, "Can't open /dev/null: %s", server
.neterr
);
1694 server
.clients
= listCreate();
1695 server
.slaves
= listCreate();
1696 server
.monitors
= listCreate();
1697 server
.objfreelist
= listCreate();
1698 createSharedObjects();
1699 server
.el
= aeCreateEventLoop();
1700 server
.db
= zmalloc(sizeof(redisDb
)*server
.dbnum
);
1701 server
.fd
= anetTcpServer(server
.neterr
, server
.port
, server
.bindaddr
);
1702 if (server
.fd
== -1) {
1703 redisLog(REDIS_WARNING
, "Opening TCP port: %s", server
.neterr
);
1706 for (j
= 0; j
< server
.dbnum
; j
++) {
1707 server
.db
[j
].dict
= dictCreate(&dbDictType
,NULL
);
1708 server
.db
[j
].expires
= dictCreate(&keyptrDictType
,NULL
);
1709 server
.db
[j
].blockingkeys
= dictCreate(&keylistDictType
,NULL
);
1710 if (server
.vm_enabled
)
1711 server
.db
[j
].io_keys
= dictCreate(&keylistDictType
,NULL
);
1712 server
.db
[j
].id
= j
;
1714 server
.pubsub_channels
= dictCreate(&keylistDictType
,NULL
);
1715 server
.pubsub_patterns
= listCreate();
1716 listSetFreeMethod(server
.pubsub_patterns
,freePubsubPattern
);
1717 listSetMatchMethod(server
.pubsub_patterns
,listMatchPubsubPattern
);
1718 server
.cronloops
= 0;
1719 server
.bgsavechildpid
= -1;
1720 server
.bgrewritechildpid
= -1;
1721 server
.bgrewritebuf
= sdsempty();
1722 server
.aofbuf
= sdsempty();
1723 server
.lastsave
= time(NULL
);
1725 server
.stat_numcommands
= 0;
1726 server
.stat_numconnections
= 0;
1727 server
.stat_expiredkeys
= 0;
1728 server
.stat_starttime
= time(NULL
);
1729 server
.unixtime
= time(NULL
);
1730 aeCreateTimeEvent(server
.el
, 1, serverCron
, NULL
, NULL
);
1731 if (aeCreateFileEvent(server
.el
, server
.fd
, AE_READABLE
,
1732 acceptHandler
, NULL
) == AE_ERR
) oom("creating file event");
1734 if (server
.appendonly
) {
1735 server
.appendfd
= open(server
.appendfilename
,O_WRONLY
|O_APPEND
|O_CREAT
,0644);
1736 if (server
.appendfd
== -1) {
1737 redisLog(REDIS_WARNING
, "Can't open the append-only file: %s",
1743 if (server
.vm_enabled
) vmInit();
1746 /* Empty the whole database */
1747 static long long emptyDb() {
1749 long long removed
= 0;
1751 for (j
= 0; j
< server
.dbnum
; j
++) {
1752 removed
+= dictSize(server
.db
[j
].dict
);
1753 dictEmpty(server
.db
[j
].dict
);
1754 dictEmpty(server
.db
[j
].expires
);
1759 static int yesnotoi(char *s
) {
1760 if (!strcasecmp(s
,"yes")) return 1;
1761 else if (!strcasecmp(s
,"no")) return 0;
1765 /* I agree, this is a very rudimentary way to load a configuration...
1766 will improve later if the config gets more complex */
1767 static void loadServerConfig(char *filename
) {
1769 char buf
[REDIS_CONFIGLINE_MAX
+1], *err
= NULL
;
1773 if (filename
[0] == '-' && filename
[1] == '\0')
1776 if ((fp
= fopen(filename
,"r")) == NULL
) {
1777 redisLog(REDIS_WARNING
, "Fatal error, can't open config file '%s'", filename
);
1782 while(fgets(buf
,REDIS_CONFIGLINE_MAX
+1,fp
) != NULL
) {
1788 line
= sdstrim(line
," \t\r\n");
1790 /* Skip comments and blank lines*/
1791 if (line
[0] == '#' || line
[0] == '\0') {
1796 /* Split into arguments */
1797 argv
= sdssplitlen(line
,sdslen(line
)," ",1,&argc
);
1798 sdstolower(argv
[0]);
1800 /* Execute config directives */
1801 if (!strcasecmp(argv
[0],"timeout") && argc
== 2) {
1802 server
.maxidletime
= atoi(argv
[1]);
1803 if (server
.maxidletime
< 0) {
1804 err
= "Invalid timeout value"; goto loaderr
;
1806 } else if (!strcasecmp(argv
[0],"port") && argc
== 2) {
1807 server
.port
= atoi(argv
[1]);
1808 if (server
.port
< 1 || server
.port
> 65535) {
1809 err
= "Invalid port"; goto loaderr
;
1811 } else if (!strcasecmp(argv
[0],"bind") && argc
== 2) {
1812 server
.bindaddr
= zstrdup(argv
[1]);
1813 } else if (!strcasecmp(argv
[0],"save") && argc
== 3) {
1814 int seconds
= atoi(argv
[1]);
1815 int changes
= atoi(argv
[2]);
1816 if (seconds
< 1 || changes
< 0) {
1817 err
= "Invalid save parameters"; goto loaderr
;
1819 appendServerSaveParams(seconds
,changes
);
1820 } else if (!strcasecmp(argv
[0],"dir") && argc
== 2) {
1821 if (chdir(argv
[1]) == -1) {
1822 redisLog(REDIS_WARNING
,"Can't chdir to '%s': %s",
1823 argv
[1], strerror(errno
));
1826 } else if (!strcasecmp(argv
[0],"loglevel") && argc
== 2) {
1827 if (!strcasecmp(argv
[1],"debug")) server
.verbosity
= REDIS_DEBUG
;
1828 else if (!strcasecmp(argv
[1],"verbose")) server
.verbosity
= REDIS_VERBOSE
;
1829 else if (!strcasecmp(argv
[1],"notice")) server
.verbosity
= REDIS_NOTICE
;
1830 else if (!strcasecmp(argv
[1],"warning")) server
.verbosity
= REDIS_WARNING
;
1832 err
= "Invalid log level. Must be one of debug, notice, warning";
1835 } else if (!strcasecmp(argv
[0],"logfile") && argc
== 2) {
1838 server
.logfile
= zstrdup(argv
[1]);
1839 if (!strcasecmp(server
.logfile
,"stdout")) {
1840 zfree(server
.logfile
);
1841 server
.logfile
= NULL
;
1843 if (server
.logfile
) {
1844 /* Test if we are able to open the file. The server will not
1845 * be able to abort just for this problem later... */
1846 logfp
= fopen(server
.logfile
,"a");
1847 if (logfp
== NULL
) {
1848 err
= sdscatprintf(sdsempty(),
1849 "Can't open the log file: %s", strerror(errno
));
1854 } else if (!strcasecmp(argv
[0],"databases") && argc
== 2) {
1855 server
.dbnum
= atoi(argv
[1]);
1856 if (server
.dbnum
< 1) {
1857 err
= "Invalid number of databases"; goto loaderr
;
1859 } else if (!strcasecmp(argv
[0],"include") && argc
== 2) {
1860 loadServerConfig(argv
[1]);
1861 } else if (!strcasecmp(argv
[0],"maxclients") && argc
== 2) {
1862 server
.maxclients
= atoi(argv
[1]);
1863 } else if (!strcasecmp(argv
[0],"maxmemory") && argc
== 2) {
1864 server
.maxmemory
= memtoll(argv
[1],NULL
);
1865 } else if (!strcasecmp(argv
[0],"slaveof") && argc
== 3) {
1866 server
.masterhost
= sdsnew(argv
[1]);
1867 server
.masterport
= atoi(argv
[2]);
1868 server
.replstate
= REDIS_REPL_CONNECT
;
1869 } else if (!strcasecmp(argv
[0],"masterauth") && argc
== 2) {
1870 server
.masterauth
= zstrdup(argv
[1]);
1871 } else if (!strcasecmp(argv
[0],"glueoutputbuf") && argc
== 2) {
1872 if ((server
.glueoutputbuf
= yesnotoi(argv
[1])) == -1) {
1873 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1875 } else if (!strcasecmp(argv
[0],"rdbcompression") && argc
== 2) {
1876 if ((server
.rdbcompression
= yesnotoi(argv
[1])) == -1) {
1877 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1879 } else if (!strcasecmp(argv
[0],"activerehashing") && argc
== 2) {
1880 if ((server
.activerehashing
= yesnotoi(argv
[1])) == -1) {
1881 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1883 } else if (!strcasecmp(argv
[0],"daemonize") && argc
== 2) {
1884 if ((server
.daemonize
= yesnotoi(argv
[1])) == -1) {
1885 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1887 } else if (!strcasecmp(argv
[0],"appendonly") && argc
== 2) {
1888 if ((server
.appendonly
= yesnotoi(argv
[1])) == -1) {
1889 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1891 } else if (!strcasecmp(argv
[0],"appendfilename") && argc
== 2) {
1892 zfree(server
.appendfilename
);
1893 server
.appendfilename
= zstrdup(argv
[1]);
1894 } else if (!strcasecmp(argv
[0],"appendfsync") && argc
== 2) {
1895 if (!strcasecmp(argv
[1],"no")) {
1896 server
.appendfsync
= APPENDFSYNC_NO
;
1897 } else if (!strcasecmp(argv
[1],"always")) {
1898 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1899 } else if (!strcasecmp(argv
[1],"everysec")) {
1900 server
.appendfsync
= APPENDFSYNC_EVERYSEC
;
1902 err
= "argument must be 'no', 'always' or 'everysec'";
1905 } else if (!strcasecmp(argv
[0],"requirepass") && argc
== 2) {
1906 server
.requirepass
= zstrdup(argv
[1]);
1907 } else if (!strcasecmp(argv
[0],"pidfile") && argc
== 2) {
1908 zfree(server
.pidfile
);
1909 server
.pidfile
= zstrdup(argv
[1]);
1910 } else if (!strcasecmp(argv
[0],"dbfilename") && argc
== 2) {
1911 zfree(server
.dbfilename
);
1912 server
.dbfilename
= zstrdup(argv
[1]);
1913 } else if (!strcasecmp(argv
[0],"vm-enabled") && argc
== 2) {
1914 if ((server
.vm_enabled
= yesnotoi(argv
[1])) == -1) {
1915 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1917 } else if (!strcasecmp(argv
[0],"vm-swap-file") && argc
== 2) {
1918 zfree(server
.vm_swap_file
);
1919 server
.vm_swap_file
= zstrdup(argv
[1]);
1920 } else if (!strcasecmp(argv
[0],"vm-max-memory") && argc
== 2) {
1921 server
.vm_max_memory
= memtoll(argv
[1],NULL
);
1922 } else if (!strcasecmp(argv
[0],"vm-page-size") && argc
== 2) {
1923 server
.vm_page_size
= memtoll(argv
[1], NULL
);
1924 } else if (!strcasecmp(argv
[0],"vm-pages") && argc
== 2) {
1925 server
.vm_pages
= memtoll(argv
[1], NULL
);
1926 } else if (!strcasecmp(argv
[0],"vm-max-threads") && argc
== 2) {
1927 server
.vm_max_threads
= strtoll(argv
[1], NULL
, 10);
1928 } else if (!strcasecmp(argv
[0],"hash-max-zipmap-entries") && argc
== 2){
1929 server
.hash_max_zipmap_entries
= memtoll(argv
[1], NULL
);
1930 } else if (!strcasecmp(argv
[0],"hash-max-zipmap-value") && argc
== 2){
1931 server
.hash_max_zipmap_value
= memtoll(argv
[1], NULL
);
1933 err
= "Bad directive or wrong number of arguments"; goto loaderr
;
1935 for (j
= 0; j
< argc
; j
++)
1940 if (fp
!= stdin
) fclose(fp
);
1944 fprintf(stderr
, "\n*** FATAL CONFIG FILE ERROR ***\n");
1945 fprintf(stderr
, "Reading the configuration file, at line %d\n", linenum
);
1946 fprintf(stderr
, ">>> '%s'\n", line
);
1947 fprintf(stderr
, "%s\n", err
);
1951 static void freeClientArgv(redisClient
*c
) {
1954 for (j
= 0; j
< c
->argc
; j
++)
1955 decrRefCount(c
->argv
[j
]);
1956 for (j
= 0; j
< c
->mbargc
; j
++)
1957 decrRefCount(c
->mbargv
[j
]);
1962 static void freeClient(redisClient
*c
) {
1965 /* Note that if the client we are freeing is blocked in a blocking
1966 * call, we have to set querybuf to NULL *before* calling
1967 * unblockClientWaitingData(), so that processInputBuffer() does not get
1968 * called. Also it is important to remove the file events after
1969 * this, because this call adds the READABLE event. */
1970 sdsfree(c
->querybuf
);
1972 if (c
->flags
& REDIS_BLOCKED
)
1973 unblockClientWaitingData(c
);
1975 /* Unsubscribe from all the pubsub channels */
1976 pubsubUnsubscribeAllChannels(c
,0);
1977 pubsubUnsubscribeAllPatterns(c
,0);
1978 dictRelease(c
->pubsub_channels
);
1979 listRelease(c
->pubsub_patterns
);
1980 /* Obvious cleanup */
1981 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
1982 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1983 listRelease(c
->reply
);
1986 /* Remove from the list of clients */
1987 ln
= listSearchKey(server
.clients
,c
);
1988 redisAssert(ln
!= NULL
);
1989 listDelNode(server
.clients
,ln
);
1990 /* Remove from the list of clients waiting for swapped keys */
1991 if (c
->flags
& REDIS_IO_WAIT
&& listLength(c
->io_keys
) == 0) {
1992 ln
= listSearchKey(server
.io_ready_clients
,c
);
1994 listDelNode(server
.io_ready_clients
,ln
);
1995 server
.vm_blocked_clients
--;
1998 while (server
.vm_enabled
&& listLength(c
->io_keys
)) {
1999 ln
= listFirst(c
->io_keys
);
2000 dontWaitForSwappedKey(c
,ln
->value
);
2002 listRelease(c
->io_keys
);
2003 /* Master/slave cleanup */
2004 if (c
->flags
& REDIS_SLAVE
) {
2005 if (c
->replstate
== REDIS_REPL_SEND_BULK
&& c
->repldbfd
!= -1)
2007 list
*l
= (c
->flags
& REDIS_MONITOR
) ? server
.monitors
: server
.slaves
;
2008 ln
= listSearchKey(l
,c
);
2009 redisAssert(ln
!= NULL
);
2012 if (c
->flags
& REDIS_MASTER
) {
2013 server
.master
= NULL
;
2014 server
.replstate
= REDIS_REPL_CONNECT
;
2016 /* Release memory */
2019 freeClientMultiState(c
);
2023 #define GLUEREPLY_UP_TO (1024)
2024 static void glueReplyBuffersIfNeeded(redisClient
*c
) {
2026 char buf
[GLUEREPLY_UP_TO
];
2031 listRewind(c
->reply
,&li
);
2032 while((ln
= listNext(&li
))) {
2036 objlen
= sdslen(o
->ptr
);
2037 if (copylen
+ objlen
<= GLUEREPLY_UP_TO
) {
2038 memcpy(buf
+copylen
,o
->ptr
,objlen
);
2040 listDelNode(c
->reply
,ln
);
2042 if (copylen
== 0) return;
2046 /* Now the output buffer is empty, add the new single element */
2047 o
= createObject(REDIS_STRING
,sdsnewlen(buf
,copylen
));
2048 listAddNodeHead(c
->reply
,o
);
2051 static void sendReplyToClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2052 redisClient
*c
= privdata
;
2053 int nwritten
= 0, totwritten
= 0, objlen
;
2056 REDIS_NOTUSED(mask
);
2058 /* Use writev() if we have enough buffers to send */
2059 if (!server
.glueoutputbuf
&&
2060 listLength(c
->reply
) > REDIS_WRITEV_THRESHOLD
&&
2061 !(c
->flags
& REDIS_MASTER
))
2063 sendReplyToClientWritev(el
, fd
, privdata
, mask
);
2067 while(listLength(c
->reply
)) {
2068 if (server
.glueoutputbuf
&& listLength(c
->reply
) > 1)
2069 glueReplyBuffersIfNeeded(c
);
2071 o
= listNodeValue(listFirst(c
->reply
));
2072 objlen
= sdslen(o
->ptr
);
2075 listDelNode(c
->reply
,listFirst(c
->reply
));
2079 if (c
->flags
& REDIS_MASTER
) {
2080 /* Don't reply to a master */
2081 nwritten
= objlen
- c
->sentlen
;
2083 nwritten
= write(fd
, ((char*)o
->ptr
)+c
->sentlen
, objlen
- c
->sentlen
);
2084 if (nwritten
<= 0) break;
2086 c
->sentlen
+= nwritten
;
2087 totwritten
+= nwritten
;
2088 /* If we fully sent the object on head go to the next one */
2089 if (c
->sentlen
== objlen
) {
2090 listDelNode(c
->reply
,listFirst(c
->reply
));
2093 /* Note that we avoid sending more than REDIS_MAX_WRITE_PER_EVENT
2094 * bytes, in a single threaded server it's a good idea to serve
2095 * other clients as well, even if a very large request comes from a
2096 * super fast link that is always able to accept data (in a real world
2097 * scenario think about 'KEYS *' against the loopback interface) */
2098 if (totwritten
> REDIS_MAX_WRITE_PER_EVENT
) break;
2100 if (nwritten
== -1) {
2101 if (errno
== EAGAIN
) {
2104 redisLog(REDIS_VERBOSE
,
2105 "Error writing to client: %s", strerror(errno
));
2110 if (totwritten
> 0) c
->lastinteraction
= time(NULL
);
2111 if (listLength(c
->reply
) == 0) {
2113 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
2117 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
)
2119 redisClient
*c
= privdata
;
2120 int nwritten
= 0, totwritten
= 0, objlen
, willwrite
;
2122 struct iovec iov
[REDIS_WRITEV_IOVEC_COUNT
];
2123 int offset
, ion
= 0;
2125 REDIS_NOTUSED(mask
);
2128 while (listLength(c
->reply
)) {
2129 offset
= c
->sentlen
;
2133 /* fill-in the iov[] array */
2134 for(node
= listFirst(c
->reply
); node
; node
= listNextNode(node
)) {
2135 o
= listNodeValue(node
);
2136 objlen
= sdslen(o
->ptr
);
2138 if (totwritten
+ objlen
- offset
> REDIS_MAX_WRITE_PER_EVENT
)
2141 if(ion
== REDIS_WRITEV_IOVEC_COUNT
)
2142 break; /* no more iovecs */
2144 iov
[ion
].iov_base
= ((char*)o
->ptr
) + offset
;
2145 iov
[ion
].iov_len
= objlen
- offset
;
2146 willwrite
+= objlen
- offset
;
2147 offset
= 0; /* just for the first item */
2154 /* write all collected blocks at once */
2155 if((nwritten
= writev(fd
, iov
, ion
)) < 0) {
2156 if (errno
!= EAGAIN
) {
2157 redisLog(REDIS_VERBOSE
,
2158 "Error writing to client: %s", strerror(errno
));
2165 totwritten
+= nwritten
;
2166 offset
= c
->sentlen
;
2168 /* remove written robjs from c->reply */
2169 while (nwritten
&& listLength(c
->reply
)) {
2170 o
= listNodeValue(listFirst(c
->reply
));
2171 objlen
= sdslen(o
->ptr
);
2173 if(nwritten
>= objlen
- offset
) {
2174 listDelNode(c
->reply
, listFirst(c
->reply
));
2175 nwritten
-= objlen
- offset
;
2179 c
->sentlen
+= nwritten
;
2187 c
->lastinteraction
= time(NULL
);
2189 if (listLength(c
->reply
) == 0) {
2191 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
2195 static struct redisCommand
*lookupCommand(char *name
) {
2197 while(cmdTable
[j
].name
!= NULL
) {
2198 if (!strcasecmp(name
,cmdTable
[j
].name
)) return &cmdTable
[j
];
2204 /* resetClient prepares the client to process the next command */
2205 static void resetClient(redisClient
*c
) {
2211 /* Call() is the core of Redis execution of a command */
2212 static void call(redisClient
*c
, struct redisCommand
*cmd
) {
2215 dirty
= server
.dirty
;
2217 dirty
= server
.dirty
-dirty
;
2219 if (server
.appendonly
&& dirty
)
2220 feedAppendOnlyFile(cmd
,c
->db
->id
,c
->argv
,c
->argc
);
2221 if ((dirty
|| cmd
->flags
& REDIS_CMD_FORCE_REPLICATION
) &&
2222 listLength(server
.slaves
))
2223 replicationFeedSlaves(server
.slaves
,c
->db
->id
,c
->argv
,c
->argc
);
2224 if (listLength(server
.monitors
))
2225 replicationFeedMonitors(server
.monitors
,c
->db
->id
,c
->argv
,c
->argc
);
2226 server
.stat_numcommands
++;
2229 /* If this function gets called we already read a whole
2230 * command, arguments are in the client argv/argc fields.
2231 * processCommand() executes the command or prepares the
2232 * server for a bulk read from the client.
2234 * If 1 is returned the client is still alive and valid and
2235 * other operations can be performed by the caller. Otherwise
2236 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2237 static int processCommand(redisClient
*c
) {
2238 struct redisCommand
*cmd
;
2240 /* Free some memory if needed (maxmemory setting) */
2241 if (server
.maxmemory
) freeMemoryIfNeeded();
2243 /* Handle the multi bulk command type. This is an alternative protocol
2244 * supported by Redis in order to receive commands that are composed of
2245 * multiple binary-safe "bulk" arguments. The latency of processing is
2246 * a bit higher but this allows things like multi-sets, so if this
2247 * protocol is used only for MSET and similar commands this is a big win. */
2248 if (c
->multibulk
== 0 && c
->argc
== 1 && ((char*)(c
->argv
[0]->ptr
))[0] == '*') {
2249 c
->multibulk
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2250 if (c
->multibulk
<= 0) {
2254 decrRefCount(c
->argv
[c
->argc
-1]);
2258 } else if (c
->multibulk
) {
2259 if (c
->bulklen
== -1) {
2260 if (((char*)c
->argv
[0]->ptr
)[0] != '$') {
2261 addReplySds(c
,sdsnew("-ERR multi bulk protocol error\r\n"));
2265 int bulklen
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2266 decrRefCount(c
->argv
[0]);
2267 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2269 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2274 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2278 c
->mbargv
= zrealloc(c
->mbargv
,(sizeof(robj
*))*(c
->mbargc
+1));
2279 c
->mbargv
[c
->mbargc
] = c
->argv
[0];
2283 if (c
->multibulk
== 0) {
2287 /* Here we need to swap the multi-bulk argc/argv with the
2288 * normal argc/argv of the client structure. */
2290 c
->argv
= c
->mbargv
;
2291 c
->mbargv
= auxargv
;
2294 c
->argc
= c
->mbargc
;
2295 c
->mbargc
= auxargc
;
2297 /* We need to set bulklen to something different than -1
2298 * in order for the code below to process the command without
2299 * to try to read the last argument of a bulk command as
2300 * a special argument. */
2302 /* continue below and process the command */
2309 /* -- end of multi bulk commands processing -- */
2311 /* The QUIT command is handled as a special case. Normal command
2312 * procs are unable to close the client connection safely */
2313 if (!strcasecmp(c
->argv
[0]->ptr
,"quit")) {
2318 /* Now lookup the command and check ASAP about trivial error conditions
2319 * such wrong arity, bad command name and so forth. */
2320 cmd
= lookupCommand(c
->argv
[0]->ptr
);
2323 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2324 (char*)c
->argv
[0]->ptr
));
2327 } else if ((cmd
->arity
> 0 && cmd
->arity
!= c
->argc
) ||
2328 (c
->argc
< -cmd
->arity
)) {
2330 sdscatprintf(sdsempty(),
2331 "-ERR wrong number of arguments for '%s' command\r\n",
2335 } else if (cmd
->flags
& REDIS_CMD_BULK
&& c
->bulklen
== -1) {
2336 /* This is a bulk command, we have to read the last argument yet. */
2337 int bulklen
= atoi(c
->argv
[c
->argc
-1]->ptr
);
2339 decrRefCount(c
->argv
[c
->argc
-1]);
2340 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2342 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2347 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2348 /* It is possible that the bulk read is already in the
2349 * buffer. Check this condition and handle it accordingly.
2350 * This is just a fast path, alternative to call processInputBuffer().
2351 * It's a good idea since the code is small and this condition
2352 * happens most of the times. */
2353 if ((signed)sdslen(c
->querybuf
) >= c
->bulklen
) {
2354 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2356 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2358 /* Otherwise return... there is to read the last argument
2359 * from the socket. */
2363 /* Let's try to encode the bulk object to save space. */
2364 if (cmd
->flags
& REDIS_CMD_BULK
)
2365 c
->argv
[c
->argc
-1] = tryObjectEncoding(c
->argv
[c
->argc
-1]);
2367 /* Check if the user is authenticated */
2368 if (server
.requirepass
&& !c
->authenticated
&& cmd
->proc
!= authCommand
) {
2369 addReplySds(c
,sdsnew("-ERR operation not permitted\r\n"));
2374 /* Handle the maxmemory directive */
2375 if (server
.maxmemory
&& (cmd
->flags
& REDIS_CMD_DENYOOM
) &&
2376 zmalloc_used_memory() > server
.maxmemory
)
2378 addReplySds(c
,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2383 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2384 if ((dictSize(c
->pubsub_channels
) > 0 || listLength(c
->pubsub_patterns
) > 0)
2386 cmd
->proc
!= subscribeCommand
&& cmd
->proc
!= unsubscribeCommand
&&
2387 cmd
->proc
!= psubscribeCommand
&& cmd
->proc
!= punsubscribeCommand
) {
2388 addReplySds(c
,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2393 /* Exec the command */
2394 if (c
->flags
& REDIS_MULTI
&& cmd
->proc
!= execCommand
&& cmd
->proc
!= discardCommand
) {
2395 queueMultiCommand(c
,cmd
);
2396 addReply(c
,shared
.queued
);
2398 if (server
.vm_enabled
&& server
.vm_max_threads
> 0 &&
2399 blockClientOnSwappedKeys(cmd
,c
)) return 1;
2403 /* Prepare the client for the next command */
2408 static void replicationFeedSlaves(list
*slaves
, int dictid
, robj
**argv
, int argc
) {
2413 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2414 * and we need 1 object for the first "*<count>\r\n" multibulk count, then
2415 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2416 robj
*static_outv
[REDIS_STATIC_ARGS
*3+1];
2419 if (argc
<= REDIS_STATIC_ARGS
) {
2422 outv
= zmalloc(sizeof(robj
*)*(argc
*3+1));
2425 lenobj
= createObject(REDIS_STRING
,
2426 sdscatprintf(sdsempty(), "*%d\r\n", argc
));
2427 lenobj
->refcount
= 0;
2428 outv
[outc
++] = lenobj
;
2429 for (j
= 0; j
< argc
; j
++) {
2430 lenobj
= createObject(REDIS_STRING
,
2431 sdscatprintf(sdsempty(),"$%lu\r\n",
2432 (unsigned long) stringObjectLen(argv
[j
])));
2433 lenobj
->refcount
= 0;
2434 outv
[outc
++] = lenobj
;
2435 outv
[outc
++] = argv
[j
];
2436 outv
[outc
++] = shared
.crlf
;
2439 /* Increment all the refcounts at start and decrement at end in order to
2440 * be sure to free objects if there is no slave in a replication state
2441 * able to be fed with commands */
2442 for (j
= 0; j
< outc
; j
++) incrRefCount(outv
[j
]);
2443 listRewind(slaves
,&li
);
2444 while((ln
= listNext(&li
))) {
2445 redisClient
*slave
= ln
->value
;
2447 /* Don't feed slaves that are still waiting for BGSAVE to start */
2448 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) continue;
2450 /* Feed all the other slaves, MONITORs and so on */
2451 if (slave
->slaveseldb
!= dictid
) {
2455 case 0: selectcmd
= shared
.select0
; break;
2456 case 1: selectcmd
= shared
.select1
; break;
2457 case 2: selectcmd
= shared
.select2
; break;
2458 case 3: selectcmd
= shared
.select3
; break;
2459 case 4: selectcmd
= shared
.select4
; break;
2460 case 5: selectcmd
= shared
.select5
; break;
2461 case 6: selectcmd
= shared
.select6
; break;
2462 case 7: selectcmd
= shared
.select7
; break;
2463 case 8: selectcmd
= shared
.select8
; break;
2464 case 9: selectcmd
= shared
.select9
; break;
2466 selectcmd
= createObject(REDIS_STRING
,
2467 sdscatprintf(sdsempty(),"select %d\r\n",dictid
));
2468 selectcmd
->refcount
= 0;
2471 addReply(slave
,selectcmd
);
2472 slave
->slaveseldb
= dictid
;
2474 for (j
= 0; j
< outc
; j
++) addReply(slave
,outv
[j
]);
2476 for (j
= 0; j
< outc
; j
++) decrRefCount(outv
[j
]);
2477 if (outv
!= static_outv
) zfree(outv
);
2480 static sds
sdscatrepr(sds s
, char *p
, size_t len
) {
2481 s
= sdscatlen(s
,"\"",1);
2486 s
= sdscatprintf(s
,"\\%c",*p
);
2488 case '\n': s
= sdscatlen(s
,"\\n",1); break;
2489 case '\r': s
= sdscatlen(s
,"\\r",1); break;
2490 case '\t': s
= sdscatlen(s
,"\\t",1); break;
2491 case '\a': s
= sdscatlen(s
,"\\a",1); break;
2492 case '\b': s
= sdscatlen(s
,"\\b",1); break;
2495 s
= sdscatprintf(s
,"%c",*p
);
2497 s
= sdscatprintf(s
,"\\x%02x",(unsigned char)*p
);
2502 return sdscatlen(s
,"\"",1);
2505 static void replicationFeedMonitors(list
*monitors
, int dictid
, robj
**argv
, int argc
) {
2509 sds cmdrepr
= sdsnew("+");
2513 gettimeofday(&tv
,NULL
);
2514 cmdrepr
= sdscatprintf(cmdrepr
,"%ld.%ld ",(long)tv
.tv_sec
,(long)tv
.tv_usec
);
2515 if (dictid
!= 0) cmdrepr
= sdscatprintf(cmdrepr
,"(db %d) ", dictid
);
2517 for (j
= 0; j
< argc
; j
++) {
2518 if (argv
[j
]->encoding
== REDIS_ENCODING_INT
) {
2519 cmdrepr
= sdscatprintf(cmdrepr
, "%ld", (long)argv
[j
]->ptr
);
2521 cmdrepr
= sdscatrepr(cmdrepr
,(char*)argv
[j
]->ptr
,
2522 sdslen(argv
[j
]->ptr
));
2525 cmdrepr
= sdscatlen(cmdrepr
," ",1);
2527 cmdrepr
= sdscatlen(cmdrepr
,"\r\n",2);
2528 cmdobj
= createObject(REDIS_STRING
,cmdrepr
);
2530 listRewind(monitors
,&li
);
2531 while((ln
= listNext(&li
))) {
2532 redisClient
*monitor
= ln
->value
;
2533 addReply(monitor
,cmdobj
);
2535 decrRefCount(cmdobj
);
2538 static void processInputBuffer(redisClient
*c
) {
2540 /* Before processing the input buffer, make sure the client is not
2541 * waiting for a blocking operation such as BLPOP. Note that at the first
2542 * iteration the client is never blocked, otherwise processInputBuffer()
2543 * would not be called at all, but after the execution of the first commands
2544 * in the input buffer the client may be blocked, and the "goto again"
2545 * will try to reiterate. The following line will make it return asap. */
2546 if (c
->flags
& REDIS_BLOCKED
|| c
->flags
& REDIS_IO_WAIT
) return;
2547 if (c
->bulklen
== -1) {
2548 /* Read the first line of the query */
2549 char *p
= strchr(c
->querybuf
,'\n');
2556 query
= c
->querybuf
;
2557 c
->querybuf
= sdsempty();
2558 querylen
= 1+(p
-(query
));
2559 if (sdslen(query
) > querylen
) {
2560 /* leave data after the first line of the query in the buffer */
2561 c
->querybuf
= sdscatlen(c
->querybuf
,query
+querylen
,sdslen(query
)-querylen
);
2563 *p
= '\0'; /* remove "\n" */
2564 if (*(p
-1) == '\r') *(p
-1) = '\0'; /* and "\r" if any */
2565 sdsupdatelen(query
);
2567 /* Now we can split the query in arguments */
2568 argv
= sdssplitlen(query
,sdslen(query
)," ",1,&argc
);
2571 if (c
->argv
) zfree(c
->argv
);
2572 c
->argv
= zmalloc(sizeof(robj
*)*argc
);
2574 for (j
= 0; j
< argc
; j
++) {
2575 if (sdslen(argv
[j
])) {
2576 c
->argv
[c
->argc
] = createObject(REDIS_STRING
,argv
[j
]);
2584 /* Execute the command. If the client is still valid
2585 * after processCommand() return and there is something
2586 * on the query buffer try to process the next command. */
2587 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2589 /* Nothing to process, argc == 0. Just process the query
2590 * buffer if it's not empty or return to the caller */
2591 if (sdslen(c
->querybuf
)) goto again
;
2594 } else if (sdslen(c
->querybuf
) >= REDIS_REQUEST_MAX_SIZE
) {
2595 redisLog(REDIS_VERBOSE
, "Client protocol error");
2600 /* Bulk read handling. Note that if we are at this point
2601 the client already sent a command terminated with a newline,
2602 we are reading the bulk data that is actually the last
2603 argument of the command. */
2604 int qbl
= sdslen(c
->querybuf
);
2606 if (c
->bulklen
<= qbl
) {
2607 /* Copy everything but the final CRLF as final argument */
2608 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2610 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2611 /* Process the command. If the client is still valid after
2612 * the processing and there is more data in the buffer
2613 * try to parse it. */
2614 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2620 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2621 redisClient
*c
= (redisClient
*) privdata
;
2622 char buf
[REDIS_IOBUF_LEN
];
2625 REDIS_NOTUSED(mask
);
2627 nread
= read(fd
, buf
, REDIS_IOBUF_LEN
);
2629 if (errno
== EAGAIN
) {
2632 redisLog(REDIS_VERBOSE
, "Reading from client: %s",strerror(errno
));
2636 } else if (nread
== 0) {
2637 redisLog(REDIS_VERBOSE
, "Client closed connection");
2642 c
->querybuf
= sdscatlen(c
->querybuf
, buf
, nread
);
2643 c
->lastinteraction
= time(NULL
);
2647 processInputBuffer(c
);
2650 static int selectDb(redisClient
*c
, int id
) {
2651 if (id
< 0 || id
>= server
.dbnum
)
2653 c
->db
= &server
.db
[id
];
2657 static void *dupClientReplyValue(void *o
) {
2658 incrRefCount((robj
*)o
);
/* Match method for lists holding string objects: returns 1 when
 * compareStringObjects() reports the two values as equal (result 0),
 * and 0 in any other case. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2666 static redisClient
*createClient(int fd
) {
2667 redisClient
*c
= zmalloc(sizeof(*c
));
2669 anetNonBlock(NULL
,fd
);
2670 anetTcpNoDelay(NULL
,fd
);
2671 if (!c
) return NULL
;
2674 c
->querybuf
= sdsempty();
2683 c
->lastinteraction
= time(NULL
);
2684 c
->authenticated
= 0;
2685 c
->replstate
= REDIS_REPL_NONE
;
2686 c
->reply
= listCreate();
2687 listSetFreeMethod(c
->reply
,decrRefCount
);
2688 listSetDupMethod(c
->reply
,dupClientReplyValue
);
2689 c
->blockingkeys
= NULL
;
2690 c
->blockingkeysnum
= 0;
2691 c
->io_keys
= listCreate();
2692 listSetFreeMethod(c
->io_keys
,decrRefCount
);
2693 c
->pubsub_channels
= dictCreate(&setDictType
,NULL
);
2694 c
->pubsub_patterns
= listCreate();
2695 listSetFreeMethod(c
->pubsub_patterns
,decrRefCount
);
2696 listSetMatchMethod(c
->pubsub_patterns
,listMatchObjects
);
2697 if (aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
2698 readQueryFromClient
, c
) == AE_ERR
) {
2702 listAddNodeTail(server
.clients
,c
);
2703 initClientMultiState(c
);
2707 static void addReply(redisClient
*c
, robj
*obj
) {
2708 if (listLength(c
->reply
) == 0 &&
2709 (c
->replstate
== REDIS_REPL_NONE
||
2710 c
->replstate
== REDIS_REPL_ONLINE
) &&
2711 aeCreateFileEvent(server
.el
, c
->fd
, AE_WRITABLE
,
2712 sendReplyToClient
, c
) == AE_ERR
) return;
2714 if (server
.vm_enabled
&& obj
->storage
!= REDIS_VM_MEMORY
) {
2715 obj
= dupStringObject(obj
);
2716 obj
->refcount
= 0; /* getDecodedObject() will increment the refcount */
2718 listAddNodeTail(c
->reply
,getDecodedObject(obj
));
2721 static void addReplySds(redisClient
*c
, sds s
) {
2722 robj
*o
= createObject(REDIS_STRING
,s
);
2727 static void addReplyDouble(redisClient
*c
, double d
) {
2730 snprintf(buf
,sizeof(buf
),"%.17g",d
);
2731 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2732 (unsigned long) strlen(buf
),buf
));
2735 static void addReplyLong(redisClient
*c
, long l
) {
2740 addReply(c
,shared
.czero
);
2742 } else if (l
== 1) {
2743 addReply(c
,shared
.cone
);
2746 len
= snprintf(buf
,sizeof(buf
),":%ld\r\n",l
);
2747 addReplySds(c
,sdsnewlen(buf
,len
));
2750 static void addReplyLongLong(redisClient
*c
, long long ll
) {
2755 addReply(c
,shared
.czero
);
2757 } else if (ll
== 1) {
2758 addReply(c
,shared
.cone
);
2761 len
= snprintf(buf
,sizeof(buf
),":%lld\r\n",ll
);
2762 addReplySds(c
,sdsnewlen(buf
,len
));
2765 static void addReplyUlong(redisClient
*c
, unsigned long ul
) {
2770 addReply(c
,shared
.czero
);
2772 } else if (ul
== 1) {
2773 addReply(c
,shared
.cone
);
2776 len
= snprintf(buf
,sizeof(buf
),":%lu\r\n",ul
);
2777 addReplySds(c
,sdsnewlen(buf
,len
));
2780 static void addReplyBulkLen(redisClient
*c
, robj
*obj
) {
2783 if (obj
->encoding
== REDIS_ENCODING_RAW
) {
2784 len
= sdslen(obj
->ptr
);
2786 long n
= (long)obj
->ptr
;
2788 /* Compute how many bytes will take this integer as a radix 10 string */
2794 while((n
= n
/10) != 0) {
2798 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len
));
2801 static void addReplyBulk(redisClient
*c
, robj
*obj
) {
2802 addReplyBulkLen(c
,obj
);
2804 addReply(c
,shared
.crlf
);
2807 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2808 static void addReplyBulkCString(redisClient
*c
, char *s
) {
2810 addReply(c
,shared
.nullbulk
);
2812 robj
*o
= createStringObject(s
,strlen(s
));
2818 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2823 REDIS_NOTUSED(mask
);
2824 REDIS_NOTUSED(privdata
);
2826 cfd
= anetAccept(server
.neterr
, fd
, cip
, &cport
);
2827 if (cfd
== AE_ERR
) {
2828 redisLog(REDIS_VERBOSE
,"Accepting client connection: %s", server
.neterr
);
2831 redisLog(REDIS_VERBOSE
,"Accepted %s:%d", cip
, cport
);
2832 if ((c
= createClient(cfd
)) == NULL
) {
2833 redisLog(REDIS_WARNING
,"Error allocating resoures for the client");
2834 close(cfd
); /* May be already closed, just ingore errors */
2837 /* If maxclient directive is set and this is one client more... close the
2838 * connection. Note that we create the client instead to check before
2839 * for this condition, since now the socket is already set in nonblocking
2840 * mode and we can send an error for free using the Kernel I/O */
2841 if (server
.maxclients
&& listLength(server
.clients
) > server
.maxclients
) {
2842 char *err
= "-ERR max number of clients reached\r\n";
2844 /* That's a best effort error message, don't check write errors */
2845 if (write(c
->fd
,err
,strlen(err
)) == -1) {
2846 /* Nothing to do, Just to avoid the warning... */
2851 server
.stat_numconnections
++;
2854 /* ======================= Redis objects implementation ===================== */
2856 static robj
*createObject(int type
, void *ptr
) {
2859 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2860 if (listLength(server
.objfreelist
)) {
2861 listNode
*head
= listFirst(server
.objfreelist
);
2862 o
= listNodeValue(head
);
2863 listDelNode(server
.objfreelist
,head
);
2864 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2866 if (server
.vm_enabled
) {
2867 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2868 o
= zmalloc(sizeof(*o
));
2870 o
= zmalloc(sizeof(*o
)-sizeof(struct redisObjectVM
));
2874 o
->encoding
= REDIS_ENCODING_RAW
;
2877 if (server
.vm_enabled
) {
2878 /* Note that this code may run in the context of an I/O thread
2879 * and accessing to server.unixtime in theory is an error
2880 * (no locks). But in practice this is safe, and even if we read
2881 * garbage Redis will not fail, as it's just a statistical info */
2882 o
->vm
.atime
= server
.unixtime
;
2883 o
->storage
= REDIS_VM_MEMORY
;
2888 static robj
*createStringObject(char *ptr
, size_t len
) {
2889 return createObject(REDIS_STRING
,sdsnewlen(ptr
,len
));
2892 static robj
*createStringObjectFromLongLong(long long value
) {
2894 if (value
>= 0 && value
< REDIS_SHARED_INTEGERS
) {
2895 incrRefCount(shared
.integers
[value
]);
2896 o
= shared
.integers
[value
];
2898 o
= createObject(REDIS_STRING
, NULL
);
2899 if (value
>= LONG_MIN
&& value
<= LONG_MAX
) {
2900 o
->encoding
= REDIS_ENCODING_INT
;
2901 o
->ptr
= (void*)((long)value
);
2903 o
->ptr
= sdscatprintf(sdsempty(),"%lld",value
);
2909 static robj
*dupStringObject(robj
*o
) {
2910 assert(o
->encoding
== REDIS_ENCODING_RAW
);
2911 return createStringObject(o
->ptr
,sdslen(o
->ptr
));
2914 static robj
*createListObject(void) {
2915 list
*l
= listCreate();
2917 listSetFreeMethod(l
,decrRefCount
);
2918 return createObject(REDIS_LIST
,l
);
2921 static robj
*createSetObject(void) {
2922 dict
*d
= dictCreate(&setDictType
,NULL
);
2923 return createObject(REDIS_SET
,d
);
2926 static robj
*createHashObject(void) {
2927 /* All the Hashes start as zipmaps. Will be automatically converted
2928 * into hash tables if there are enough elements or big elements
2930 unsigned char *zm
= zipmapNew();
2931 robj
*o
= createObject(REDIS_HASH
,zm
);
2932 o
->encoding
= REDIS_ENCODING_ZIPMAP
;
2936 static robj
*createZsetObject(void) {
2937 zset
*zs
= zmalloc(sizeof(*zs
));
2939 zs
->dict
= dictCreate(&zsetDictType
,NULL
);
2940 zs
->zsl
= zslCreate();
2941 return createObject(REDIS_ZSET
,zs
);
2944 static void freeStringObject(robj
*o
) {
2945 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2950 static void freeListObject(robj
*o
) {
2951 listRelease((list
*) o
->ptr
);
2954 static void freeSetObject(robj
*o
) {
2955 dictRelease((dict
*) o
->ptr
);
2958 static void freeZsetObject(robj
*o
) {
2961 dictRelease(zs
->dict
);
2966 static void freeHashObject(robj
*o
) {
2967 switch (o
->encoding
) {
2968 case REDIS_ENCODING_HT
:
2969 dictRelease((dict
*) o
->ptr
);
2971 case REDIS_ENCODING_ZIPMAP
:
2975 redisPanic("Unknown hash encoding type");
2980 static void incrRefCount(robj
*o
) {
2984 static void decrRefCount(void *obj
) {
2987 if (o
->refcount
<= 0) redisPanic("decrRefCount against refcount <= 0");
2988 /* Object is a key of a swapped out value, or in the process of being
2990 if (server
.vm_enabled
&&
2991 (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
))
2993 if (o
->storage
== REDIS_VM_LOADING
) vmCancelThreadedIOJob(obj
);
2994 redisAssert(o
->type
== REDIS_STRING
);
2995 freeStringObject(o
);
2996 vmMarkPagesFree(o
->vm
.page
,o
->vm
.usedpages
);
2997 pthread_mutex_lock(&server
.obj_freelist_mutex
);
2998 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2999 !listAddNodeHead(server
.objfreelist
,o
))
3001 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
3002 server
.vm_stats_swapped_objects
--;
3005 /* Object is in memory, or in the process of being swapped out. */
3006 if (--(o
->refcount
) == 0) {
3007 if (server
.vm_enabled
&& o
->storage
== REDIS_VM_SWAPPING
)
3008 vmCancelThreadedIOJob(obj
);
3010 case REDIS_STRING
: freeStringObject(o
); break;
3011 case REDIS_LIST
: freeListObject(o
); break;
3012 case REDIS_SET
: freeSetObject(o
); break;
3013 case REDIS_ZSET
: freeZsetObject(o
); break;
3014 case REDIS_HASH
: freeHashObject(o
); break;
3015 default: redisPanic("Unknown object type"); break;
3017 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
3018 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
3019 !listAddNodeHead(server
.objfreelist
,o
))
3021 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
3025 static robj
*lookupKey(redisDb
*db
, robj
*key
) {
3026 dictEntry
*de
= dictFind(db
->dict
,key
);
3028 robj
*key
= dictGetEntryKey(de
);
3029 robj
*val
= dictGetEntryVal(de
);
3031 if (server
.vm_enabled
) {
3032 if (key
->storage
== REDIS_VM_MEMORY
||
3033 key
->storage
== REDIS_VM_SWAPPING
)
3035 /* If we were swapping the object out, stop it, this key
3037 if (key
->storage
== REDIS_VM_SWAPPING
)
3038 vmCancelThreadedIOJob(key
);
3039 /* Update the access time of the key for the aging algorithm. */
3040 key
->vm
.atime
= server
.unixtime
;
3042 int notify
= (key
->storage
== REDIS_VM_LOADING
);
3044 /* Our value was swapped on disk. Bring it at home. */
3045 redisAssert(val
== NULL
);
3046 val
= vmLoadObject(key
);
3047 dictGetEntryVal(de
) = val
;
3049 /* Clients blocked by the VM subsystem may be waiting for
3051 if (notify
) handleClientsBlockedOnSwappedKey(db
,key
);
3060 static robj
*lookupKeyRead(redisDb
*db
, robj
*key
) {
3061 expireIfNeeded(db
,key
);
3062 return lookupKey(db
,key
);
3065 static robj
*lookupKeyWrite(redisDb
*db
, robj
*key
) {
3066 deleteIfVolatile(db
,key
);
3067 return lookupKey(db
,key
);
3070 static robj
*lookupKeyReadOrReply(redisClient
*c
, robj
*key
, robj
*reply
) {
3071 robj
*o
= lookupKeyRead(c
->db
, key
);
3072 if (!o
) addReply(c
,reply
);
3076 static robj
*lookupKeyWriteOrReply(redisClient
*c
, robj
*key
, robj
*reply
) {
3077 robj
*o
= lookupKeyWrite(c
->db
, key
);
3078 if (!o
) addReply(c
,reply
);
3082 static int checkType(redisClient
*c
, robj
*o
, int type
) {
3083 if (o
->type
!= type
) {
3084 addReply(c
,shared
.wrongtypeerr
);
3090 static int deleteKey(redisDb
*db
, robj
*key
) {
3093 /* We need to protect key from destruction: after the first dictDelete()
3094 * it may happen that 'key' is no longer valid if we don't increment
3095 * it's count. This may happen when we get the object reference directly
3096 * from the hash table with dictRandomKey() or dict iterators */
3098 if (dictSize(db
->expires
)) dictDelete(db
->expires
,key
);
3099 retval
= dictDelete(db
->dict
,key
);
3102 return retval
== DICT_OK
;
3105 /* Check if the nul-terminated string 's' can be represented by a long
3106 * (that is, is a number that fits into long without any other space or
3107 * character before or after the digits).
3109 * If so, the function returns REDIS_OK and *longval is set to the value
3110 * of the number. Otherwise REDIS_ERR is returned */
3111 static int isStringRepresentableAsLong(sds s
, long *longval
) {
3112 char buf
[32], *endptr
;
3116 value
= strtol(s
, &endptr
, 10);
3117 if (endptr
[0] != '\0') return REDIS_ERR
;
3118 slen
= snprintf(buf
,32,"%ld",value
);
3120 /* If the number converted back into a string is not identical
3121 * then it's not possible to encode the string as integer */
3122 if (sdslen(s
) != (unsigned)slen
|| memcmp(buf
,s
,slen
)) return REDIS_ERR
;
3123 if (longval
) *longval
= value
;
3127 /* Try to encode a string object in order to save space */
3128 static robj
*tryObjectEncoding(robj
*o
) {
3132 if (o
->encoding
!= REDIS_ENCODING_RAW
)
3133 return o
; /* Already encoded */
3135 /* It's not safe to encode shared objects: shared objects can be shared
3136 * everywhere in the "object space" of Redis. Encoded objects can only
3137 * appear as "values" (and not, for instance, as keys) */
3138 if (o
->refcount
> 1) return o
;
3140 /* Currently we try to encode only strings */
3141 redisAssert(o
->type
== REDIS_STRING
);
3143 /* Check if we can represent this string as a long integer */
3144 if (isStringRepresentableAsLong(s
,&value
) == REDIS_ERR
) return o
;
3146 /* Ok, this object can be encoded */
3147 if (value
>= 0 && value
< REDIS_SHARED_INTEGERS
) {
3149 incrRefCount(shared
.integers
[value
]);
3150 return shared
.integers
[value
];
3152 o
->encoding
= REDIS_ENCODING_INT
;
3154 o
->ptr
= (void*) value
;
3159 /* Get a decoded version of an encoded object (returned as a new object).
3160 * If the object is already raw-encoded just increment the ref count. */
3161 static robj
*getDecodedObject(robj
*o
) {
3164 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3168 if (o
->type
== REDIS_STRING
&& o
->encoding
== REDIS_ENCODING_INT
) {
3171 snprintf(buf
,32,"%ld",(long)o
->ptr
);
3172 dec
= createStringObject(buf
,strlen(buf
));
3175 redisPanic("Unknown encoding type");
3179 /* Compare two string objects via strcmp() or alike.
3180 * Note that the objects may be integer-encoded. In such a case we
3181 * use snprintf() to get a string representation of the numbers on the stack
3182 * and compare the strings, it's much faster than calling getDecodedObject().
3184 * Important note: if objects are not integer encoded, but binary-safe strings,
3185 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3187 static int compareStringObjects(robj
*a
, robj
*b
) {
3188 redisAssert(a
->type
== REDIS_STRING
&& b
->type
== REDIS_STRING
);
3189 char bufa
[128], bufb
[128], *astr
, *bstr
;
3192 if (a
== b
) return 0;
3193 if (a
->encoding
!= REDIS_ENCODING_RAW
) {
3194 snprintf(bufa
,sizeof(bufa
),"%ld",(long) a
->ptr
);
3200 if (b
->encoding
!= REDIS_ENCODING_RAW
) {
3201 snprintf(bufb
,sizeof(bufb
),"%ld",(long) b
->ptr
);
3207 return bothsds
? sdscmp(astr
,bstr
) : strcmp(astr
,bstr
);
3210 static size_t stringObjectLen(robj
*o
) {
3211 redisAssert(o
->type
== REDIS_STRING
);
3212 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3213 return sdslen(o
->ptr
);
3217 return snprintf(buf
,32,"%ld",(long)o
->ptr
);
3221 static int getDoubleFromObject(robj
*o
, double *target
) {
3228 redisAssert(o
->type
== REDIS_STRING
);
3229 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3230 value
= strtod(o
->ptr
, &eptr
);
3231 if (eptr
[0] != '\0') return REDIS_ERR
;
3232 } else if (o
->encoding
== REDIS_ENCODING_INT
) {
3233 value
= (long)o
->ptr
;
3235 redisPanic("Unknown string encoding");
3243 static int getDoubleFromObjectOrReply(redisClient
*c
, robj
*o
, double *target
, const char *msg
) {
3245 if (getDoubleFromObject(o
, &value
) != REDIS_OK
) {
3247 addReplySds(c
, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg
));
3249 addReplySds(c
, sdsnew("-ERR value is not a double\r\n"));
3258 static int getLongLongFromObject(robj
*o
, long long *target
) {
3265 redisAssert(o
->type
== REDIS_STRING
);
3266 if (o
->encoding
== REDIS_ENCODING_RAW
) {
3267 value
= strtoll(o
->ptr
, &eptr
, 10);
3268 if (eptr
[0] != '\0') return REDIS_ERR
;
3269 } else if (o
->encoding
== REDIS_ENCODING_INT
) {
3270 value
= (long)o
->ptr
;
3272 redisPanic("Unknown string encoding");
3280 static int getLongLongFromObjectOrReply(redisClient
*c
, robj
*o
, long long *target
, const char *msg
) {
3282 if (getLongLongFromObject(o
, &value
) != REDIS_OK
) {
3284 addReplySds(c
, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg
));
3286 addReplySds(c
, sdsnew("-ERR value is not an integer\r\n"));
3295 static int getLongFromObjectOrReply(redisClient
*c
, robj
*o
, long *target
, const char *msg
) {
3298 if (getLongLongFromObjectOrReply(c
, o
, &value
, msg
) != REDIS_OK
) return REDIS_ERR
;
3299 if (value
< LONG_MIN
|| value
> LONG_MAX
) {
3301 addReplySds(c
, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg
));
3303 addReplySds(c
, sdsnew("-ERR value is out of range\r\n"));
3312 /*============================ RDB saving/loading =========================== */
/* Write the one byte 'type' opcode to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    if (fwrite(&type,1,1,fp) == 0) return -1;
    return 0;
}
/* Write the time 't' to 'fp' as a 4 byte signed integer in host byte
 * order (the value is truncated to 32 bits).
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;
    if (fwrite(&t32,4,1,fp) == 0) return -1;
    return 0;
}
3325 /* check rdbLoadLen() comments for more info */
3326 static int rdbSaveLen(FILE *fp
, uint32_t len
) {
3327 unsigned char buf
[2];
3330 /* Save a 6 bit len */
3331 buf
[0] = (len
&0xFF)|(REDIS_RDB_6BITLEN
<<6);
3332 if (fwrite(buf
,1,1,fp
) == 0) return -1;
3333 } else if (len
< (1<<14)) {
3334 /* Save a 14 bit len */
3335 buf
[0] = ((len
>>8)&0xFF)|(REDIS_RDB_14BITLEN
<<6);
3337 if (fwrite(buf
,2,1,fp
) == 0) return -1;
3339 /* Save a 32 bit len */
3340 buf
[0] = (REDIS_RDB_32BITLEN
<<6);
3341 if (fwrite(buf
,1,1,fp
) == 0) return -1;
3343 if (fwrite(&len
,4,1,fp
) == 0) return -1;
3348 /* String objects in the form "2391" "-100" without any space and with a
3349 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3350 * encoded as integers to save space */
3351 static int rdbTryIntegerEncoding(char *s
, size_t len
, unsigned char *enc
) {
3353 char *endptr
, buf
[32];
3355 /* Check if it's possible to encode this value as a number */
3356 value
= strtoll(s
, &endptr
, 10);
3357 if (endptr
[0] != '\0') return 0;
3358 snprintf(buf
,32,"%lld",value
);
3360 /* If the number converted back into a string is not identical
3361 * then it's not possible to encode the string as integer */
3362 if (strlen(buf
) != len
|| memcmp(buf
,s
,len
)) return 0;
3364 /* Finally check if it fits in our ranges */
3365 if (value
>= -(1<<7) && value
<= (1<<7)-1) {
3366 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT8
;
3367 enc
[1] = value
&0xFF;
3369 } else if (value
>= -(1<<15) && value
<= (1<<15)-1) {
3370 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT16
;
3371 enc
[1] = value
&0xFF;
3372 enc
[2] = (value
>>8)&0xFF;
3374 } else if (value
>= -((long long)1<<31) && value
<= ((long long)1<<31)-1) {
3375 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT32
;
3376 enc
[1] = value
&0xFF;
3377 enc
[2] = (value
>>8)&0xFF;
3378 enc
[3] = (value
>>16)&0xFF;
3379 enc
[4] = (value
>>24)&0xFF;
3386 static int rdbSaveLzfStringObject(FILE *fp
, unsigned char *s
, size_t len
) {
3387 size_t comprlen
, outlen
;
3391 /* We require at least four bytes compression for this to be worth it */
3392 if (len
<= 4) return 0;
3394 if ((out
= zmalloc(outlen
+1)) == NULL
) return 0;
3395 comprlen
= lzf_compress(s
, len
, out
, outlen
);
3396 if (comprlen
== 0) {
3400 /* Data compressed! Let's save it on disk */
3401 byte
= (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_LZF
;
3402 if (fwrite(&byte
,1,1,fp
) == 0) goto writeerr
;
3403 if (rdbSaveLen(fp
,comprlen
) == -1) goto writeerr
;
3404 if (rdbSaveLen(fp
,len
) == -1) goto writeerr
;
3405 if (fwrite(out
,comprlen
,1,fp
) == 0) goto writeerr
;
3414 /* Save a string objet as [len][data] on disk. If the object is a string
3415 * representation of an integer value we try to safe it in a special form */
3416 static int rdbSaveRawString(FILE *fp
, unsigned char *s
, size_t len
) {
3419 /* Try integer encoding */
3421 unsigned char buf
[5];
3422 if ((enclen
= rdbTryIntegerEncoding((char*)s
,len
,buf
)) > 0) {
3423 if (fwrite(buf
,enclen
,1,fp
) == 0) return -1;
3428 /* Try LZF compression - under 20 bytes it's unable to compress even
3429 * aaaaaaaaaaaaaaaaaa so skip it */
3430 if (server
.rdbcompression
&& len
> 20) {
3433 retval
= rdbSaveLzfStringObject(fp
,s
,len
);
3434 if (retval
== -1) return -1;
3435 if (retval
> 0) return 0;
3436 /* retval == 0 means data can't be compressed, save the old way */
3439 /* Store verbatim */
3440 if (rdbSaveLen(fp
,len
) == -1) return -1;
3441 if (len
&& fwrite(s
,len
,1,fp
) == 0) return -1;
3445 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3446 static int rdbSaveStringObject(FILE *fp
, robj
*obj
) {
3449 /* Avoid incr/decr ref count business when possible.
3450 * This plays well with copy-on-write given that we are probably
3451 * in a child process (BGSAVE). Also this makes sure key objects
3452 * of swapped objects are not incRefCount-ed (an assert does not allow
3453 * this in order to avoid bugs) */
3454 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
3455 obj
= getDecodedObject(obj
);
3456 retval
= rdbSaveRawString(fp
,obj
->ptr
,sdslen(obj
->ptr
));
3459 retval
= rdbSaveRawString(fp
,obj
->ptr
,sdslen(obj
->ptr
));
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions, in which case no string follows:
 *
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3491 /* Save a Redis object. */
3492 static int rdbSaveObject(FILE *fp
, robj
*o
) {
3493 if (o
->type
== REDIS_STRING
) {
3494 /* Save a string value */
3495 if (rdbSaveStringObject(fp
,o
) == -1) return -1;
3496 } else if (o
->type
== REDIS_LIST
) {
3497 /* Save a list value */
3498 list
*list
= o
->ptr
;
3502 if (rdbSaveLen(fp
,listLength(list
)) == -1) return -1;
3503 listRewind(list
,&li
);
3504 while((ln
= listNext(&li
))) {
3505 robj
*eleobj
= listNodeValue(ln
);
3507 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3509 } else if (o
->type
== REDIS_SET
) {
3510 /* Save a set value */
3512 dictIterator
*di
= dictGetIterator(set
);
3515 if (rdbSaveLen(fp
,dictSize(set
)) == -1) return -1;
3516 while((de
= dictNext(di
)) != NULL
) {
3517 robj
*eleobj
= dictGetEntryKey(de
);
3519 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3521 dictReleaseIterator(di
);
3522 } else if (o
->type
== REDIS_ZSET
) {
3523 /* Save a set value */
3525 dictIterator
*di
= dictGetIterator(zs
->dict
);
3528 if (rdbSaveLen(fp
,dictSize(zs
->dict
)) == -1) return -1;
3529 while((de
= dictNext(di
)) != NULL
) {
3530 robj
*eleobj
= dictGetEntryKey(de
);
3531 double *score
= dictGetEntryVal(de
);
3533 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3534 if (rdbSaveDoubleValue(fp
,*score
) == -1) return -1;
3536 dictReleaseIterator(di
);
3537 } else if (o
->type
== REDIS_HASH
) {
3538 /* Save a hash value */
3539 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
3540 unsigned char *p
= zipmapRewind(o
->ptr
);
3541 unsigned int count
= zipmapLen(o
->ptr
);
3542 unsigned char *key
, *val
;
3543 unsigned int klen
, vlen
;
3545 if (rdbSaveLen(fp
,count
) == -1) return -1;
3546 while((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) != NULL
) {
3547 if (rdbSaveRawString(fp
,key
,klen
) == -1) return -1;
3548 if (rdbSaveRawString(fp
,val
,vlen
) == -1) return -1;
3551 dictIterator
*di
= dictGetIterator(o
->ptr
);
3554 if (rdbSaveLen(fp
,dictSize((dict
*)o
->ptr
)) == -1) return -1;
3555 while((de
= dictNext(di
)) != NULL
) {
3556 robj
*key
= dictGetEntryKey(de
);
3557 robj
*val
= dictGetEntryVal(de
);
3559 if (rdbSaveStringObject(fp
,key
) == -1) return -1;
3560 if (rdbSaveStringObject(fp
,val
) == -1) return -1;
3562 dictReleaseIterator(di
);
3565 redisPanic("Unknown object type");
3570 /* Return the length the object will have on disk if saved with
3571 * the rdbSaveObject() function. Currently we use a trick to get
3572 * this length with very little changes to the code. In the future
3573 * we could switch to a faster solution. */
3574 static off_t
rdbSavedObjectLen(robj
*o
, FILE *fp
) {
3575 if (fp
== NULL
) fp
= server
.devnull
;
3577 assert(rdbSaveObject(fp
,o
) != 1);
3581 /* Return the number of pages required to save this object in the swap file */
3582 static off_t
rdbSavedObjectPages(robj
*o
, FILE *fp
) {
3583 off_t bytes
= rdbSavedObjectLen(o
,fp
);
3585 return (bytes
+(server
.vm_page_size
-1))/server
.vm_page_size
;
3588 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3589 static int rdbSave(char *filename
) {
3590 dictIterator
*di
= NULL
;
3595 time_t now
= time(NULL
);
3597 /* Wait for I/O therads to terminate, just in case this is a
3598 * foreground-saving, to avoid seeking the swap file descriptor at the
3600 if (server
.vm_enabled
)
3601 waitEmptyIOJobsQueue();
3603 snprintf(tmpfile
,256,"temp-%d.rdb", (int) getpid());
3604 fp
= fopen(tmpfile
,"w");
3606 redisLog(REDIS_WARNING
, "Failed saving the DB: %s", strerror(errno
));
3609 if (fwrite("REDIS0001",9,1,fp
) == 0) goto werr
;
3610 for (j
= 0; j
< server
.dbnum
; j
++) {
3611 redisDb
*db
= server
.db
+j
;
3613 if (dictSize(d
) == 0) continue;
3614 di
= dictGetIterator(d
);
3620 /* Write the SELECT DB opcode */
3621 if (rdbSaveType(fp
,REDIS_SELECTDB
) == -1) goto werr
;
3622 if (rdbSaveLen(fp
,j
) == -1) goto werr
;
3624 /* Iterate this DB writing every entry */
3625 while((de
= dictNext(di
)) != NULL
) {
3626 robj
*key
= dictGetEntryKey(de
);
3627 robj
*o
= dictGetEntryVal(de
);
3628 time_t expiretime
= getExpire(db
,key
);
3630 /* Save the expire time */
3631 if (expiretime
!= -1) {
3632 /* If this key is already expired skip it */
3633 if (expiretime
< now
) continue;
3634 if (rdbSaveType(fp
,REDIS_EXPIRETIME
) == -1) goto werr
;
3635 if (rdbSaveTime(fp
,expiretime
) == -1) goto werr
;
3637 /* Save the key and associated value. This requires special
3638 * handling if the value is swapped out. */
3639 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
3640 key
->storage
== REDIS_VM_SWAPPING
) {
3641 /* Save type, key, value */
3642 if (rdbSaveType(fp
,o
->type
) == -1) goto werr
;
3643 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3644 if (rdbSaveObject(fp
,o
) == -1) goto werr
;
3646 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3648 /* Get a preview of the object in memory */
3649 po
= vmPreviewObject(key
);
3650 /* Save type, key, value */
3651 if (rdbSaveType(fp
,key
->vtype
) == -1) goto werr
;
3652 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3653 if (rdbSaveObject(fp
,po
) == -1) goto werr
;
3654 /* Remove the loaded object from memory */
3658 dictReleaseIterator(di
);
3661 if (rdbSaveType(fp
,REDIS_EOF
) == -1) goto werr
;
3663 /* Make sure data will not remain on the OS's output buffers */
3668 /* Use RENAME to make sure the DB file is changed atomically only
3669 * if the generate DB file is ok. */
3670 if (rename(tmpfile
,filename
) == -1) {
3671 redisLog(REDIS_WARNING
,"Error moving temp DB file on the final destination: %s", strerror(errno
));
3675 redisLog(REDIS_NOTICE
,"DB saved on disk");
3677 server
.lastsave
= time(NULL
);
3683 redisLog(REDIS_WARNING
,"Write error saving DB on disk: %s", strerror(errno
));
3684 if (di
) dictReleaseIterator(di
);
3688 static int rdbSaveBackground(char *filename
) {
3691 if (server
.bgsavechildpid
!= -1) return REDIS_ERR
;
3692 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
3693 if ((childpid
= fork()) == 0) {
3695 if (server
.vm_enabled
) vmReopenSwapFile();
3697 if (rdbSave(filename
) == REDIS_OK
) {
3704 if (childpid
== -1) {
3705 redisLog(REDIS_WARNING
,"Can't save in background: fork: %s",
3709 redisLog(REDIS_NOTICE
,"Background saving started by pid %d",childpid
);
3710 server
.bgsavechildpid
= childpid
;
3711 updateDictResizePolicy();
3714 return REDIS_OK
; /* unreached */
/* Remove the temporary dump file "temp-<pid>.rdb" written by the saving
 * process with pid 'childpid'. Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char tmpfile[256];

    snprintf(tmpfile,256,"temp-%d.rdb", (int) childpid);
    unlink(tmpfile);
}
/* Read a one byte type opcode from 'fp'.
 * Returns the value (0-255), or -1 when nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;
    if (fread(&type,1,1,fp) == 0) return -1;
    return type;
}
/* Read a time stored by rdbSaveTime(): a 4 byte signed integer in host
 * byte order. Returns the time, or -1 when it could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t t32;
    if (fread(&t32,4,1,fp) == 0) return -1;
    return (time_t) t32;
}
3736 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3737 * of this file for a description of how this are stored on disk.
3739 * isencoded is set to 1 if the readed length is not actually a length but
3740 * an "encoding type", check the above comments for more info */
3741 static uint32_t rdbLoadLen(FILE *fp
, int *isencoded
) {
3742 unsigned char buf
[2];
3746 if (isencoded
) *isencoded
= 0;
3747 if (fread(buf
,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3748 type
= (buf
[0]&0xC0)>>6;
3749 if (type
== REDIS_RDB_6BITLEN
) {
3750 /* Read a 6 bit len */
3752 } else if (type
== REDIS_RDB_ENCVAL
) {
3753 /* Read a 6 bit len encoding type */
3754 if (isencoded
) *isencoded
= 1;
3756 } else if (type
== REDIS_RDB_14BITLEN
) {
3757 /* Read a 14 bit len */
3758 if (fread(buf
+1,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3759 return ((buf
[0]&0x3F)<<8)|buf
[1];
3761 /* Read a 32 bit len */
3762 if (fread(&len
,4,1,fp
) == 0) return REDIS_RDB_LENERR
;
3767 static robj
*rdbLoadIntegerObject(FILE *fp
, int enctype
) {
3768 unsigned char enc
[4];
3771 if (enctype
== REDIS_RDB_ENC_INT8
) {
3772 if (fread(enc
,1,1,fp
) == 0) return NULL
;
3773 val
= (signed char)enc
[0];
3774 } else if (enctype
== REDIS_RDB_ENC_INT16
) {
3776 if (fread(enc
,2,1,fp
) == 0) return NULL
;
3777 v
= enc
[0]|(enc
[1]<<8);
3779 } else if (enctype
== REDIS_RDB_ENC_INT32
) {
3781 if (fread(enc
,4,1,fp
) == 0) return NULL
;
3782 v
= enc
[0]|(enc
[1]<<8)|(enc
[2]<<16)|(enc
[3]<<24);
3785 val
= 0; /* anti-warning */
3786 redisPanic("Unknown RDB integer encoding type");
3788 return createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",val
));
3791 static robj
*rdbLoadLzfStringObject(FILE*fp
) {
3792 unsigned int len
, clen
;
3793 unsigned char *c
= NULL
;
3796 if ((clen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3797 if ((len
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3798 if ((c
= zmalloc(clen
)) == NULL
) goto err
;
3799 if ((val
= sdsnewlen(NULL
,len
)) == NULL
) goto err
;
3800 if (fread(c
,clen
,1,fp
) == 0) goto err
;
3801 if (lzf_decompress(c
,clen
,val
,len
) == 0) goto err
;
3803 return createObject(REDIS_STRING
,val
);
3810 static robj
*rdbLoadStringObject(FILE*fp
) {
3815 len
= rdbLoadLen(fp
,&isencoded
);
3818 case REDIS_RDB_ENC_INT8
:
3819 case REDIS_RDB_ENC_INT16
:
3820 case REDIS_RDB_ENC_INT32
:
3821 return rdbLoadIntegerObject(fp
,len
);
3822 case REDIS_RDB_ENC_LZF
:
3823 return rdbLoadLzfStringObject(fp
);
3825 redisPanic("Unknown RDB encoding type");
3829 if (len
== REDIS_RDB_LENERR
) return NULL
;
3830 val
= sdsnewlen(NULL
,len
);
3831 if (len
&& fread(val
,len
,1,fp
) == 0) {
3835 return createObject(REDIS_STRING
,val
);
3838 /* For information about double serialization check rdbSaveDoubleValue() */
3839 static int rdbLoadDoubleValue(FILE *fp
, double *val
) {
3843 if (fread(&len
,1,1,fp
) == 0) return -1;
3845 case 255: *val
= R_NegInf
; return 0;
3846 case 254: *val
= R_PosInf
; return 0;
3847 case 253: *val
= R_Nan
; return 0;
3849 if (fread(buf
,len
,1,fp
) == 0) return -1;
3851 sscanf(buf
, "%lg", val
);
3856 /* Load a Redis object of the specified type from the specified file.
3857 * On success a newly allocated object is returned, otherwise NULL. */
3858 static robj
*rdbLoadObject(int type
, FILE *fp
) {
3861 redisLog(REDIS_DEBUG
,"LOADING OBJECT %d (at %d)\n",type
,ftell(fp
));
3862 if (type
== REDIS_STRING
) {
3863 /* Read string value */
3864 if ((o
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3865 o
= tryObjectEncoding(o
);
3866 } else if (type
== REDIS_LIST
|| type
== REDIS_SET
) {
3867 /* Read list/set value */
3870 if ((listlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3871 o
= (type
== REDIS_LIST
) ? createListObject() : createSetObject();
3872 /* It's faster to expand the dict to the right size asap in order
3873 * to avoid rehashing */
3874 if (type
== REDIS_SET
&& listlen
> DICT_HT_INITIAL_SIZE
)
3875 dictExpand(o
->ptr
,listlen
);
3876 /* Load every single element of the list/set */
3880 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3881 ele
= tryObjectEncoding(ele
);
3882 if (type
== REDIS_LIST
) {
3883 listAddNodeTail((list
*)o
->ptr
,ele
);
3885 dictAdd((dict
*)o
->ptr
,ele
,NULL
);
3888 } else if (type
== REDIS_ZSET
) {
3889 /* Read list/set value */
3893 if ((zsetlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3894 o
= createZsetObject();
3896 /* Load every single element of the list/set */
3899 double *score
= zmalloc(sizeof(double));
3901 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3902 ele
= tryObjectEncoding(ele
);
3903 if (rdbLoadDoubleValue(fp
,score
) == -1) return NULL
;
3904 dictAdd(zs
->dict
,ele
,score
);
3905 zslInsert(zs
->zsl
,*score
,ele
);
3906 incrRefCount(ele
); /* added to skiplist */
3908 } else if (type
== REDIS_HASH
) {
3911 if ((hashlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3912 o
= createHashObject();
3913 /* Too many entries? Use an hash table. */
3914 if (hashlen
> server
.hash_max_zipmap_entries
)
3915 convertToRealHash(o
);
3916 /* Load every key/value, then set it into the zipmap or hash
3917 * table, as needed. */
3921 if ((key
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3922 if ((val
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3923 /* If we are using a zipmap and there are too big values
3924 * the object is converted to real hash table encoding. */
3925 if (o
->encoding
!= REDIS_ENCODING_HT
&&
3926 (sdslen(key
->ptr
) > server
.hash_max_zipmap_value
||
3927 sdslen(val
->ptr
) > server
.hash_max_zipmap_value
))
3929 convertToRealHash(o
);
3932 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
3933 unsigned char *zm
= o
->ptr
;
3935 zm
= zipmapSet(zm
,key
->ptr
,sdslen(key
->ptr
),
3936 val
->ptr
,sdslen(val
->ptr
),NULL
);
3941 key
= tryObjectEncoding(key
);
3942 val
= tryObjectEncoding(val
);
3943 dictAdd((dict
*)o
->ptr
,key
,val
);
3947 redisPanic("Unknown object type");
3952 static int rdbLoad(char *filename
) {
3954 robj
*keyobj
= NULL
;
3956 int type
, retval
, rdbver
;
3957 dict
*d
= server
.db
[0].dict
;
3958 redisDb
*db
= server
.db
+0;
3960 time_t expiretime
= -1, now
= time(NULL
);
3961 long long loadedkeys
= 0;
3963 fp
= fopen(filename
,"r");
3964 if (!fp
) return REDIS_ERR
;
3965 if (fread(buf
,9,1,fp
) == 0) goto eoferr
;
3967 if (memcmp(buf
,"REDIS",5) != 0) {
3969 redisLog(REDIS_WARNING
,"Wrong signature trying to load DB from file");
3972 rdbver
= atoi(buf
+5);
3975 redisLog(REDIS_WARNING
,"Can't handle RDB format version %d",rdbver
);
3982 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3983 if (type
== REDIS_EXPIRETIME
) {
3984 if ((expiretime
= rdbLoadTime(fp
)) == -1) goto eoferr
;
3985 /* We read the time so we need to read the object type again */
3986 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3988 if (type
== REDIS_EOF
) break;
3989 /* Handle SELECT DB opcode as a special case */
3990 if (type
== REDIS_SELECTDB
) {
3991 if ((dbid
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
)
3993 if (dbid
>= (unsigned)server
.dbnum
) {
3994 redisLog(REDIS_WARNING
,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server
.dbnum
);
3997 db
= server
.db
+dbid
;
4002 if ((keyobj
= rdbLoadStringObject(fp
)) == NULL
) goto eoferr
;
4004 if ((o
= rdbLoadObject(type
,fp
)) == NULL
) goto eoferr
;
4005 /* Add the new object in the hash table */
4006 retval
= dictAdd(d
,keyobj
,o
);
4007 if (retval
== DICT_ERR
) {
4008 redisLog(REDIS_WARNING
,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj
->ptr
);
4011 /* Set the expire time if needed */
4012 if (expiretime
!= -1) {
4013 setExpire(db
,keyobj
,expiretime
);
4014 /* Delete this key if already expired */
4015 if (expiretime
< now
) deleteKey(db
,keyobj
);
4019 /* Handle swapping while loading big datasets when VM is on */
4021 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
4022 while (zmalloc_used_memory() > server
.vm_max_memory
) {
4023 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
4030 eoferr
: /* unexpected end of file is handled here with a fatal exit */
4031 if (keyobj
) decrRefCount(keyobj
);
4032 redisLog(REDIS_WARNING
,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4034 return REDIS_ERR
; /* Just to avoid warning */
4037 /*================================== Commands =============================== */
4039 static void authCommand(redisClient
*c
) {
4040 if (!server
.requirepass
|| !strcmp(c
->argv
[1]->ptr
, server
.requirepass
)) {
4041 c
->authenticated
= 1;
4042 addReply(c
,shared
.ok
);
4044 c
->authenticated
= 0;
4045 addReplySds(c
,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4049 static void pingCommand(redisClient
*c
) {
4050 addReply(c
,shared
.pong
);
4053 static void echoCommand(redisClient
*c
) {
4054 addReplyBulk(c
,c
->argv
[1]);
4057 /*=================================== Strings =============================== */
4059 static void setGenericCommand(redisClient
*c
, int nx
, robj
*key
, robj
*val
, robj
*expire
) {
4061 long seconds
= 0; /* initialized to avoid an harmness warning */
4064 if (getLongFromObjectOrReply(c
, expire
, &seconds
, NULL
) != REDIS_OK
)
4067 addReplySds(c
,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4072 if (nx
) deleteIfVolatile(c
->db
,key
);
4073 retval
= dictAdd(c
->db
->dict
,key
,val
);
4074 if (retval
== DICT_ERR
) {
4076 /* If the key is about a swapped value, we want a new key object
4077 * to overwrite the old. So we delete the old key in the database.
4078 * This will also make sure that swap pages about the old object
4079 * will be marked as free. */
4080 if (server
.vm_enabled
&& deleteIfSwapped(c
->db
,key
))
4082 dictReplace(c
->db
->dict
,key
,val
);
4085 addReply(c
,shared
.czero
);
4093 removeExpire(c
->db
,key
);
4094 if (expire
) setExpire(c
->db
,key
,time(NULL
)+seconds
);
4095 addReply(c
, nx
? shared
.cone
: shared
.ok
);
4098 static void setCommand(redisClient
*c
) {
4099 setGenericCommand(c
,0,c
->argv
[1],c
->argv
[2],NULL
);
4102 static void setnxCommand(redisClient
*c
) {
4103 setGenericCommand(c
,1,c
->argv
[1],c
->argv
[2],NULL
);
4106 static void setexCommand(redisClient
*c
) {
4107 setGenericCommand(c
,0,c
->argv
[1],c
->argv
[3],c
->argv
[2]);
4110 static int getGenericCommand(redisClient
*c
) {
4113 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
)
4116 if (o
->type
!= REDIS_STRING
) {
4117 addReply(c
,shared
.wrongtypeerr
);
4125 static void getCommand(redisClient
*c
) {
4126 getGenericCommand(c
);
4129 static void getsetCommand(redisClient
*c
) {
4130 if (getGenericCommand(c
) == REDIS_ERR
) return;
4131 if (dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]) == DICT_ERR
) {
4132 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
4134 incrRefCount(c
->argv
[1]);
4136 incrRefCount(c
->argv
[2]);
4138 removeExpire(c
->db
,c
->argv
[1]);
4141 static void mgetCommand(redisClient
*c
) {
4144 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-1));
4145 for (j
= 1; j
< c
->argc
; j
++) {
4146 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[j
]);
4148 addReply(c
,shared
.nullbulk
);
4150 if (o
->type
!= REDIS_STRING
) {
4151 addReply(c
,shared
.nullbulk
);
4159 static void msetGenericCommand(redisClient
*c
, int nx
) {
4160 int j
, busykeys
= 0;
4162 if ((c
->argc
% 2) == 0) {
4163 addReplySds(c
,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4166 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4167 * set nothing at all if at least one already key exists. */
4169 for (j
= 1; j
< c
->argc
; j
+= 2) {
4170 if (lookupKeyWrite(c
->db
,c
->argv
[j
]) != NULL
) {
4176 addReply(c
, shared
.czero
);
4180 for (j
= 1; j
< c
->argc
; j
+= 2) {
4183 c
->argv
[j
+1] = tryObjectEncoding(c
->argv
[j
+1]);
4184 retval
= dictAdd(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
4185 if (retval
== DICT_ERR
) {
4186 dictReplace(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
4187 incrRefCount(c
->argv
[j
+1]);
4189 incrRefCount(c
->argv
[j
]);
4190 incrRefCount(c
->argv
[j
+1]);
4192 removeExpire(c
->db
,c
->argv
[j
]);
4194 server
.dirty
+= (c
->argc
-1)/2;
4195 addReply(c
, nx
? shared
.cone
: shared
.ok
);
4198 static void msetCommand(redisClient
*c
) {
4199 msetGenericCommand(c
,0);
4202 static void msetnxCommand(redisClient
*c
) {
4203 msetGenericCommand(c
,1);
4206 static void incrDecrCommand(redisClient
*c
, long long incr
) {
4211 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4213 if (getLongLongFromObjectOrReply(c
, o
, &value
, NULL
) != REDIS_OK
) return;
4216 o
= createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",value
));
4217 o
= tryObjectEncoding(o
);
4218 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],o
);
4219 if (retval
== DICT_ERR
) {
4220 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
4221 removeExpire(c
->db
,c
->argv
[1]);
4223 incrRefCount(c
->argv
[1]);
4226 addReply(c
,shared
.colon
);
4228 addReply(c
,shared
.crlf
);
4231 static void incrCommand(redisClient
*c
) {
4232 incrDecrCommand(c
,1);
4235 static void decrCommand(redisClient
*c
) {
4236 incrDecrCommand(c
,-1);
4239 static void incrbyCommand(redisClient
*c
) {
4242 if (getLongLongFromObjectOrReply(c
, c
->argv
[2], &incr
, NULL
) != REDIS_OK
) return;
4243 incrDecrCommand(c
,incr
);
4246 static void decrbyCommand(redisClient
*c
) {
4249 if (getLongLongFromObjectOrReply(c
, c
->argv
[2], &incr
, NULL
) != REDIS_OK
) return;
4250 incrDecrCommand(c
,-incr
);
4253 static void appendCommand(redisClient
*c
) {
4258 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4260 /* Create the key */
4261 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
4262 incrRefCount(c
->argv
[1]);
4263 incrRefCount(c
->argv
[2]);
4264 totlen
= stringObjectLen(c
->argv
[2]);
4268 de
= dictFind(c
->db
->dict
,c
->argv
[1]);
4271 o
= dictGetEntryVal(de
);
4272 if (o
->type
!= REDIS_STRING
) {
4273 addReply(c
,shared
.wrongtypeerr
);
4276 /* If the object is specially encoded or shared we have to make
4278 if (o
->refcount
!= 1 || o
->encoding
!= REDIS_ENCODING_RAW
) {
4279 robj
*decoded
= getDecodedObject(o
);
4281 o
= createStringObject(decoded
->ptr
, sdslen(decoded
->ptr
));
4282 decrRefCount(decoded
);
4283 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
4286 if (c
->argv
[2]->encoding
== REDIS_ENCODING_RAW
) {
4287 o
->ptr
= sdscatlen(o
->ptr
,
4288 c
->argv
[2]->ptr
, sdslen(c
->argv
[2]->ptr
));
4290 o
->ptr
= sdscatprintf(o
->ptr
, "%ld",
4291 (unsigned long) c
->argv
[2]->ptr
);
4293 totlen
= sdslen(o
->ptr
);
4296 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen
));
4299 static void substrCommand(redisClient
*c
) {
4301 long start
= atoi(c
->argv
[2]->ptr
);
4302 long end
= atoi(c
->argv
[3]->ptr
);
4303 size_t rangelen
, strlen
;
4306 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4307 checkType(c
,o
,REDIS_STRING
)) return;
4309 o
= getDecodedObject(o
);
4310 strlen
= sdslen(o
->ptr
);
4312 /* convert negative indexes */
4313 if (start
< 0) start
= strlen
+start
;
4314 if (end
< 0) end
= strlen
+end
;
4315 if (start
< 0) start
= 0;
4316 if (end
< 0) end
= 0;
4318 /* indexes sanity checks */
4319 if (start
> end
|| (size_t)start
>= strlen
) {
4320 /* Out of range start or start > end result in null reply */
4321 addReply(c
,shared
.nullbulk
);
4325 if ((size_t)end
>= strlen
) end
= strlen
-1;
4326 rangelen
= (end
-start
)+1;
4328 /* Return the result */
4329 addReplySds(c
,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen
));
4330 range
= sdsnewlen((char*)o
->ptr
+start
,rangelen
);
4331 addReplySds(c
,range
);
4332 addReply(c
,shared
.crlf
);
4336 /* ========================= Type agnostic commands ========================= */
4338 static void delCommand(redisClient
*c
) {
4341 for (j
= 1; j
< c
->argc
; j
++) {
4342 if (deleteKey(c
->db
,c
->argv
[j
])) {
4347 addReplyLong(c
,deleted
);
4350 static void existsCommand(redisClient
*c
) {
4351 expireIfNeeded(c
->db
,c
->argv
[1]);
4352 if (dictFind(c
->db
->dict
,c
->argv
[1])) {
4353 addReply(c
, shared
.cone
);
4355 addReply(c
, shared
.czero
);
4359 static void selectCommand(redisClient
*c
) {
4360 int id
= atoi(c
->argv
[1]->ptr
);
4362 if (selectDb(c
,id
) == REDIS_ERR
) {
4363 addReplySds(c
,sdsnew("-ERR invalid DB index\r\n"));
4365 addReply(c
,shared
.ok
);
4369 static void randomkeyCommand(redisClient
*c
) {
4374 de
= dictGetRandomKey(c
->db
->dict
);
4375 if (!de
|| expireIfNeeded(c
->db
,dictGetEntryKey(de
)) == 0) break;
4379 addReply(c
,shared
.nullbulk
);
4383 key
= dictGetEntryKey(de
);
4384 if (server
.vm_enabled
) {
4385 key
= dupStringObject(key
);
4386 addReplyBulk(c
,key
);
4389 addReplyBulk(c
,key
);
4393 static void keysCommand(redisClient
*c
) {
4396 sds pattern
= c
->argv
[1]->ptr
;
4397 int plen
= sdslen(pattern
);
4398 unsigned long numkeys
= 0;
4399 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
4401 di
= dictGetIterator(c
->db
->dict
);
4403 decrRefCount(lenobj
);
4404 while((de
= dictNext(di
)) != NULL
) {
4405 robj
*keyobj
= dictGetEntryKey(de
);
4407 sds key
= keyobj
->ptr
;
4408 if ((pattern
[0] == '*' && pattern
[1] == '\0') ||
4409 stringmatchlen(pattern
,plen
,key
,sdslen(key
),0)) {
4410 if (expireIfNeeded(c
->db
,keyobj
) == 0) {
4411 addReplyBulk(c
,keyobj
);
4416 dictReleaseIterator(di
);
4417 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",numkeys
);
4420 static void dbsizeCommand(redisClient
*c
) {
4422 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c
->db
->dict
)));
4425 static void lastsaveCommand(redisClient
*c
) {
4427 sdscatprintf(sdsempty(),":%lu\r\n",server
.lastsave
));
4430 static void typeCommand(redisClient
*c
) {
4434 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4439 case REDIS_STRING
: type
= "+string"; break;
4440 case REDIS_LIST
: type
= "+list"; break;
4441 case REDIS_SET
: type
= "+set"; break;
4442 case REDIS_ZSET
: type
= "+zset"; break;
4443 case REDIS_HASH
: type
= "+hash"; break;
4444 default: type
= "+unknown"; break;
4447 addReplySds(c
,sdsnew(type
));
4448 addReply(c
,shared
.crlf
);
4451 static void saveCommand(redisClient
*c
) {
4452 if (server
.bgsavechildpid
!= -1) {
4453 addReplySds(c
,sdsnew("-ERR background save in progress\r\n"));
4456 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
4457 addReply(c
,shared
.ok
);
4459 addReply(c
,shared
.err
);
4463 static void bgsaveCommand(redisClient
*c
) {
4464 if (server
.bgsavechildpid
!= -1) {
4465 addReplySds(c
,sdsnew("-ERR background save already in progress\r\n"));
4468 if (rdbSaveBackground(server
.dbfilename
) == REDIS_OK
) {
4469 char *status
= "+Background saving started\r\n";
4470 addReplySds(c
,sdsnew(status
));
4472 addReply(c
,shared
.err
);
4476 static void shutdownCommand(redisClient
*c
) {
4477 redisLog(REDIS_WARNING
,"User requested shutdown, saving DB...");
4478 /* Kill the saving child if there is a background saving in progress.
4479 We want to avoid race conditions, for instance our saving child may
4480 overwrite the synchronous saving did by SHUTDOWN. */
4481 if (server
.bgsavechildpid
!= -1) {
4482 redisLog(REDIS_WARNING
,"There is a live saving child. Killing it!");
4483 kill(server
.bgsavechildpid
,SIGKILL
);
4484 rdbRemoveTempFile(server
.bgsavechildpid
);
4486 if (server
.appendonly
) {
4487 /* Append only file: fsync() the AOF and exit */
4488 fsync(server
.appendfd
);
4489 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
4492 /* Snapshotting. Perform a SYNC SAVE and exit */
4493 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
4494 if (server
.daemonize
)
4495 unlink(server
.pidfile
);
4496 redisLog(REDIS_WARNING
,"%zu bytes used at exit",zmalloc_used_memory());
4497 redisLog(REDIS_WARNING
,"Server exit now, bye bye...");
4500 /* Ooops.. error saving! The best we can do is to continue
4501 * operating. Note that if there was a background saving process,
4502 * in the next cron() Redis will be notified that the background
4503 * saving aborted, handling special stuff like slaves pending for
4504 * synchronization... */
4505 redisLog(REDIS_WARNING
,"Error trying to save the DB, can't exit");
4507 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4512 static void renameGenericCommand(redisClient
*c
, int nx
) {
4515 /* To use the same key as src and dst is probably an error */
4516 if (sdscmp(c
->argv
[1]->ptr
,c
->argv
[2]->ptr
) == 0) {
4517 addReply(c
,shared
.sameobjecterr
);
4521 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nokeyerr
)) == NULL
)
4525 deleteIfVolatile(c
->db
,c
->argv
[2]);
4526 if (dictAdd(c
->db
->dict
,c
->argv
[2],o
) == DICT_ERR
) {
4529 addReply(c
,shared
.czero
);
4532 dictReplace(c
->db
->dict
,c
->argv
[2],o
);
4534 incrRefCount(c
->argv
[2]);
4536 deleteKey(c
->db
,c
->argv
[1]);
4538 addReply(c
,nx
? shared
.cone
: shared
.ok
);
4541 static void renameCommand(redisClient
*c
) {
4542 renameGenericCommand(c
,0);
4545 static void renamenxCommand(redisClient
*c
) {
4546 renameGenericCommand(c
,1);
4549 static void moveCommand(redisClient
*c
) {
4554 /* Obtain source and target DB pointers */
4557 if (selectDb(c
,atoi(c
->argv
[2]->ptr
)) == REDIS_ERR
) {
4558 addReply(c
,shared
.outofrangeerr
);
4562 selectDb(c
,srcid
); /* Back to the source DB */
4564 /* If the user is moving using as target the same
4565 * DB as the source DB it is probably an error. */
4567 addReply(c
,shared
.sameobjecterr
);
4571 /* Check if the element exists and get a reference */
4572 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4574 addReply(c
,shared
.czero
);
4578 /* Try to add the element to the target DB */
4579 deleteIfVolatile(dst
,c
->argv
[1]);
4580 if (dictAdd(dst
->dict
,c
->argv
[1],o
) == DICT_ERR
) {
4581 addReply(c
,shared
.czero
);
4584 incrRefCount(c
->argv
[1]);
4587 /* OK! key moved, free the entry in the source DB */
4588 deleteKey(src
,c
->argv
[1]);
4590 addReply(c
,shared
.cone
);
4593 /* =================================== Lists ================================ */
4594 static void pushGenericCommand(redisClient
*c
, int where
) {
4598 lobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4600 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4601 addReply(c
,shared
.cone
);
4604 lobj
= createListObject();
4606 if (where
== REDIS_HEAD
) {
4607 listAddNodeHead(list
,c
->argv
[2]);
4609 listAddNodeTail(list
,c
->argv
[2]);
4611 dictAdd(c
->db
->dict
,c
->argv
[1],lobj
);
4612 incrRefCount(c
->argv
[1]);
4613 incrRefCount(c
->argv
[2]);
4615 if (lobj
->type
!= REDIS_LIST
) {
4616 addReply(c
,shared
.wrongtypeerr
);
4619 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4620 addReply(c
,shared
.cone
);
4624 if (where
== REDIS_HEAD
) {
4625 listAddNodeHead(list
,c
->argv
[2]);
4627 listAddNodeTail(list
,c
->argv
[2]);
4629 incrRefCount(c
->argv
[2]);
4632 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",listLength(list
)));
4635 static void lpushCommand(redisClient
*c
) {
4636 pushGenericCommand(c
,REDIS_HEAD
);
4639 static void rpushCommand(redisClient
*c
) {
4640 pushGenericCommand(c
,REDIS_TAIL
);
4643 static void llenCommand(redisClient
*c
) {
4647 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
4648 checkType(c
,o
,REDIS_LIST
)) return;
4651 addReplyUlong(c
,listLength(l
));
4654 static void lindexCommand(redisClient
*c
) {
4656 int index
= atoi(c
->argv
[2]->ptr
);
4660 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4661 checkType(c
,o
,REDIS_LIST
)) return;
4664 ln
= listIndex(list
, index
);
4666 addReply(c
,shared
.nullbulk
);
4668 robj
*ele
= listNodeValue(ln
);
4669 addReplyBulk(c
,ele
);
4673 static void lsetCommand(redisClient
*c
) {
4675 int index
= atoi(c
->argv
[2]->ptr
);
4679 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nokeyerr
)) == NULL
||
4680 checkType(c
,o
,REDIS_LIST
)) return;
4683 ln
= listIndex(list
, index
);
4685 addReply(c
,shared
.outofrangeerr
);
4687 robj
*ele
= listNodeValue(ln
);
4690 listNodeValue(ln
) = c
->argv
[3];
4691 incrRefCount(c
->argv
[3]);
4692 addReply(c
,shared
.ok
);
4697 static void popGenericCommand(redisClient
*c
, int where
) {
4702 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4703 checkType(c
,o
,REDIS_LIST
)) return;
4706 if (where
== REDIS_HEAD
)
4707 ln
= listFirst(list
);
4709 ln
= listLast(list
);
4712 addReply(c
,shared
.nullbulk
);
4714 robj
*ele
= listNodeValue(ln
);
4715 addReplyBulk(c
,ele
);
4716 listDelNode(list
,ln
);
4717 if (listLength(list
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4722 static void lpopCommand(redisClient
*c
) {
4723 popGenericCommand(c
,REDIS_HEAD
);
4726 static void rpopCommand(redisClient
*c
) {
4727 popGenericCommand(c
,REDIS_TAIL
);
4730 static void lrangeCommand(redisClient
*c
) {
4732 int start
= atoi(c
->argv
[2]->ptr
);
4733 int end
= atoi(c
->argv
[3]->ptr
);
4740 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.emptymultibulk
)) == NULL
4741 || checkType(c
,o
,REDIS_LIST
)) return;
4743 llen
= listLength(list
);
4745 /* convert negative indexes */
4746 if (start
< 0) start
= llen
+start
;
4747 if (end
< 0) end
= llen
+end
;
4748 if (start
< 0) start
= 0;
4749 if (end
< 0) end
= 0;
4751 /* indexes sanity checks */
4752 if (start
> end
|| start
>= llen
) {
4753 /* Out of range start or start > end result in empty list */
4754 addReply(c
,shared
.emptymultibulk
);
4757 if (end
>= llen
) end
= llen
-1;
4758 rangelen
= (end
-start
)+1;
4760 /* Return the result in form of a multi-bulk reply */
4761 ln
= listIndex(list
, start
);
4762 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",rangelen
));
4763 for (j
= 0; j
< rangelen
; j
++) {
4764 ele
= listNodeValue(ln
);
4765 addReplyBulk(c
,ele
);
4770 static void ltrimCommand(redisClient
*c
) {
4772 int start
= atoi(c
->argv
[2]->ptr
);
4773 int end
= atoi(c
->argv
[3]->ptr
);
4775 int j
, ltrim
, rtrim
;
4779 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.ok
)) == NULL
||
4780 checkType(c
,o
,REDIS_LIST
)) return;
4782 llen
= listLength(list
);
4784 /* convert negative indexes */
4785 if (start
< 0) start
= llen
+start
;
4786 if (end
< 0) end
= llen
+end
;
4787 if (start
< 0) start
= 0;
4788 if (end
< 0) end
= 0;
4790 /* indexes sanity checks */
4791 if (start
> end
|| start
>= llen
) {
4792 /* Out of range start or start > end result in empty list */
4796 if (end
>= llen
) end
= llen
-1;
4801 /* Remove list elements to perform the trim */
4802 for (j
= 0; j
< ltrim
; j
++) {
4803 ln
= listFirst(list
);
4804 listDelNode(list
,ln
);
4806 for (j
= 0; j
< rtrim
; j
++) {
4807 ln
= listLast(list
);
4808 listDelNode(list
,ln
);
4810 if (listLength(list
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4812 addReply(c
,shared
.ok
);
4815 static void lremCommand(redisClient
*c
) {
4818 listNode
*ln
, *next
;
4819 int toremove
= atoi(c
->argv
[2]->ptr
);
4823 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
4824 checkType(c
,o
,REDIS_LIST
)) return;
4828 toremove
= -toremove
;
4831 ln
= fromtail
? list
->tail
: list
->head
;
4833 robj
*ele
= listNodeValue(ln
);
4835 next
= fromtail
? ln
->prev
: ln
->next
;
4836 if (compareStringObjects(ele
,c
->argv
[3]) == 0) {
4837 listDelNode(list
,ln
);
4840 if (toremove
&& removed
== toremove
) break;
4844 if (listLength(list
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4845 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",removed
));
4848 /* This is the semantic of this command:
4849 * RPOPLPUSH srclist dstlist:
4850 * IF LLEN(srclist) > 0
4851 * element = RPOP srclist
4852 * LPUSH dstlist element
4859 * The idea is to be able to get an element from a list in a reliable way
4860 * since the element is not just returned but pushed against another list
4861 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4863 static void rpoplpushcommand(redisClient
*c
) {
4868 if ((sobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
4869 checkType(c
,sobj
,REDIS_LIST
)) return;
4870 srclist
= sobj
->ptr
;
4871 ln
= listLast(srclist
);
4874 addReply(c
,shared
.nullbulk
);
4876 robj
*dobj
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4877 robj
*ele
= listNodeValue(ln
);
4880 if (dobj
&& dobj
->type
!= REDIS_LIST
) {
4881 addReply(c
,shared
.wrongtypeerr
);
4885 /* Add the element to the target list (unless it's directly
4886 * passed to some BLPOP-ing client */
4887 if (!handleClientsWaitingListPush(c
,c
->argv
[2],ele
)) {
4889 /* Create the list if the key does not exist */
4890 dobj
= createListObject();
4891 dictAdd(c
->db
->dict
,c
->argv
[2],dobj
);
4892 incrRefCount(c
->argv
[2]);
4894 dstlist
= dobj
->ptr
;
4895 listAddNodeHead(dstlist
,ele
);
4899 /* Send the element to the client as reply as well */
4900 addReplyBulk(c
,ele
);
4902 /* Finally remove the element from the source list */
4903 listDelNode(srclist
,ln
);
4904 if (listLength(srclist
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4909 /* ==================================== Sets ================================ */
4911 static void saddCommand(redisClient
*c
) {
4914 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4916 set
= createSetObject();
4917 dictAdd(c
->db
->dict
,c
->argv
[1],set
);
4918 incrRefCount(c
->argv
[1]);
4920 if (set
->type
!= REDIS_SET
) {
4921 addReply(c
,shared
.wrongtypeerr
);
4925 if (dictAdd(set
->ptr
,c
->argv
[2],NULL
) == DICT_OK
) {
4926 incrRefCount(c
->argv
[2]);
4928 addReply(c
,shared
.cone
);
4930 addReply(c
,shared
.czero
);
4934 static void sremCommand(redisClient
*c
) {
4937 if ((set
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
4938 checkType(c
,set
,REDIS_SET
)) return;
4940 if (dictDelete(set
->ptr
,c
->argv
[2]) == DICT_OK
) {
4942 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4943 if (dictSize((dict
*)set
->ptr
) == 0) deleteKey(c
->db
,c
->argv
[1]);
4944 addReply(c
,shared
.cone
);
4946 addReply(c
,shared
.czero
);
4950 static void smoveCommand(redisClient
*c
) {
4951 robj
*srcset
, *dstset
;
4953 srcset
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4954 dstset
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4956 /* If the source key does not exist return 0, if it's of the wrong type
4958 if (srcset
== NULL
|| srcset
->type
!= REDIS_SET
) {
4959 addReply(c
, srcset
? shared
.wrongtypeerr
: shared
.czero
);
4962 /* Error if the destination key is not a set as well */
4963 if (dstset
&& dstset
->type
!= REDIS_SET
) {
4964 addReply(c
,shared
.wrongtypeerr
);
4967 /* Remove the element from the source set */
4968 if (dictDelete(srcset
->ptr
,c
->argv
[3]) == DICT_ERR
) {
4969 /* Key not found in the src set! return zero */
4970 addReply(c
,shared
.czero
);
4973 if (dictSize((dict
*)srcset
->ptr
) == 0 && srcset
!= dstset
)
4974 deleteKey(c
->db
,c
->argv
[1]);
4976 /* Add the element to the destination set */
4978 dstset
= createSetObject();
4979 dictAdd(c
->db
->dict
,c
->argv
[2],dstset
);
4980 incrRefCount(c
->argv
[2]);
4982 if (dictAdd(dstset
->ptr
,c
->argv
[3],NULL
) == DICT_OK
)
4983 incrRefCount(c
->argv
[3]);
4984 addReply(c
,shared
.cone
);
4987 static void sismemberCommand(redisClient
*c
) {
4990 if ((set
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
4991 checkType(c
,set
,REDIS_SET
)) return;
4993 if (dictFind(set
->ptr
,c
->argv
[2]))
4994 addReply(c
,shared
.cone
);
4996 addReply(c
,shared
.czero
);
4999 static void scardCommand(redisClient
*c
) {
5003 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5004 checkType(c
,o
,REDIS_SET
)) return;
5007 addReplyUlong(c
,dictSize(s
));
5010 static void spopCommand(redisClient
*c
) {
5014 if ((set
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
5015 checkType(c
,set
,REDIS_SET
)) return;
5017 de
= dictGetRandomKey(set
->ptr
);
5019 addReply(c
,shared
.nullbulk
);
5021 robj
*ele
= dictGetEntryKey(de
);
5023 addReplyBulk(c
,ele
);
5024 dictDelete(set
->ptr
,ele
);
5025 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
5026 if (dictSize((dict
*)set
->ptr
) == 0) deleteKey(c
->db
,c
->argv
[1]);
5031 static void srandmemberCommand(redisClient
*c
) {
5035 if ((set
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
5036 checkType(c
,set
,REDIS_SET
)) return;
5038 de
= dictGetRandomKey(set
->ptr
);
5040 addReply(c
,shared
.nullbulk
);
5042 robj
*ele
= dictGetEntryKey(de
);
5044 addReplyBulk(c
,ele
);
5048 static int qsortCompareSetsByCardinality(const void *s1
, const void *s2
) {
5049 dict
**d1
= (void*) s1
, **d2
= (void*) s2
;
5051 return dictSize(*d1
)-dictSize(*d2
);
5054 static void sinterGenericCommand(redisClient
*c
, robj
**setskeys
, unsigned long setsnum
, robj
*dstkey
) {
5055 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
5058 robj
*lenobj
= NULL
, *dstset
= NULL
;
5059 unsigned long j
, cardinality
= 0;
5061 for (j
= 0; j
< setsnum
; j
++) {
5065 lookupKeyWrite(c
->db
,setskeys
[j
]) :
5066 lookupKeyRead(c
->db
,setskeys
[j
]);
5070 if (deleteKey(c
->db
,dstkey
))
5072 addReply(c
,shared
.czero
);
5074 addReply(c
,shared
.emptymultibulk
);
5078 if (setobj
->type
!= REDIS_SET
) {
5080 addReply(c
,shared
.wrongtypeerr
);
5083 dv
[j
] = setobj
->ptr
;
5085 /* Sort sets from the smallest to largest, this will improve our
5086 * algorithm's performace */
5087 qsort(dv
,setsnum
,sizeof(dict
*),qsortCompareSetsByCardinality
);
5089 /* The first thing we should output is the total number of elements...
5090 * since this is a multi-bulk write, but at this stage we don't know
5091 * the intersection set size, so we use a trick, append an empty object
5092 * to the output list and save the pointer to later modify it with the
5095 lenobj
= createObject(REDIS_STRING
,NULL
);
5097 decrRefCount(lenobj
);
5099 /* If we have a target key where to store the resulting set
5100 * create this key with an empty set inside */
5101 dstset
= createSetObject();
5104 /* Iterate all the elements of the first (smallest) set, and test
5105 * the element against all the other sets, if at least one set does
5106 * not include the element it is discarded */
5107 di
= dictGetIterator(dv
[0]);
5109 while((de
= dictNext(di
)) != NULL
) {
5112 for (j
= 1; j
< setsnum
; j
++)
5113 if (dictFind(dv
[j
],dictGetEntryKey(de
)) == NULL
) break;
5115 continue; /* at least one set does not contain the member */
5116 ele
= dictGetEntryKey(de
);
5118 addReplyBulk(c
,ele
);
5121 dictAdd(dstset
->ptr
,ele
,NULL
);
5125 dictReleaseIterator(di
);
5128 /* Store the resulting set into the target, if the intersection
5129 * is not an empty set. */
5130 deleteKey(c
->db
,dstkey
);
5131 if (dictSize((dict
*)dstset
->ptr
) > 0) {
5132 dictAdd(c
->db
->dict
,dstkey
,dstset
);
5133 incrRefCount(dstkey
);
5134 addReplyLong(c
,dictSize((dict
*)dstset
->ptr
));
5136 decrRefCount(dstset
);
5137 addReply(c
,shared
.czero
);
5141 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",cardinality
);
5146 static void sinterCommand(redisClient
*c
) {
5147 sinterGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
);
5150 static void sinterstoreCommand(redisClient
*c
) {
5151 sinterGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1]);
5154 #define REDIS_OP_UNION 0
5155 #define REDIS_OP_DIFF 1
5156 #define REDIS_OP_INTER 2
5158 static void sunionDiffGenericCommand(redisClient
*c
, robj
**setskeys
, int setsnum
, robj
*dstkey
, int op
) {
5159 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
5162 robj
*dstset
= NULL
;
5163 int j
, cardinality
= 0;
5165 for (j
= 0; j
< setsnum
; j
++) {
5169 lookupKeyWrite(c
->db
,setskeys
[j
]) :
5170 lookupKeyRead(c
->db
,setskeys
[j
]);
5175 if (setobj
->type
!= REDIS_SET
) {
5177 addReply(c
,shared
.wrongtypeerr
);
5180 dv
[j
] = setobj
->ptr
;
5183 /* We need a temp set object to store our union. If the dstkey
5184 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5185 * this set object will be the resulting object to set into the target key*/
5186 dstset
= createSetObject();
5188 /* Iterate all the elements of all the sets, add every element a single
5189 * time to the result set */
5190 for (j
= 0; j
< setsnum
; j
++) {
5191 if (op
== REDIS_OP_DIFF
&& j
== 0 && !dv
[j
]) break; /* result set is empty */
5192 if (!dv
[j
]) continue; /* non existing keys are like empty sets */
5194 di
= dictGetIterator(dv
[j
]);
5196 while((de
= dictNext(di
)) != NULL
) {
5199 /* dictAdd will not add the same element multiple times */
5200 ele
= dictGetEntryKey(de
);
5201 if (op
== REDIS_OP_UNION
|| j
== 0) {
5202 if (dictAdd(dstset
->ptr
,ele
,NULL
) == DICT_OK
) {
5206 } else if (op
== REDIS_OP_DIFF
) {
5207 if (dictDelete(dstset
->ptr
,ele
) == DICT_OK
) {
5212 dictReleaseIterator(di
);
5214 /* result set is empty? Exit asap. */
5215 if (op
== REDIS_OP_DIFF
&& cardinality
== 0) break;
5218 /* Output the content of the resulting set, if not in STORE mode */
5220 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",cardinality
));
5221 di
= dictGetIterator(dstset
->ptr
);
5222 while((de
= dictNext(di
)) != NULL
) {
5225 ele
= dictGetEntryKey(de
);
5226 addReplyBulk(c
,ele
);
5228 dictReleaseIterator(di
);
5229 decrRefCount(dstset
);
5231 /* If we have a target key where to store the resulting set
5232 * create this key with the result set inside */
5233 deleteKey(c
->db
,dstkey
);
5234 if (dictSize((dict
*)dstset
->ptr
) > 0) {
5235 dictAdd(c
->db
->dict
,dstkey
,dstset
);
5236 incrRefCount(dstkey
);
5237 addReplyLong(c
,dictSize((dict
*)dstset
->ptr
));
5239 decrRefCount(dstset
);
5240 addReply(c
,shared
.czero
);
5247 static void sunionCommand(redisClient
*c
) {
5248 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_UNION
);
5251 static void sunionstoreCommand(redisClient
*c
) {
5252 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_UNION
);
5255 static void sdiffCommand(redisClient
*c
) {
5256 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_DIFF
);
5259 static void sdiffstoreCommand(redisClient
*c
) {
5260 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_DIFF
);
5263 /* ==================================== ZSets =============================== */
5265 /* ZSETs are ordered sets using two data structures to hold the same elements
5266 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5269 * The elements are added to an hash table mapping Redis objects to scores.
5270 * At the same time the elements are added to a skip list mapping scores
5271 * to Redis objects (so objects are sorted by scores in this "view"). */
5273 /* This skiplist implementation is almost a C translation of the original
5274 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5275 * Alternative to Balanced Trees", modified in three ways:
5276 * a) this implementation allows for repeated values.
5277 * b) the comparison is not just by key (our 'score') but by satellite data.
5278 * c) there is a back pointer, so it's a doubly linked list with the back
5279 * pointers being only at "level 1". This allows to traverse the list
5280 * from tail to head, useful for ZREVRANGE. */
5282 static zskiplistNode
*zslCreateNode(int level
, double score
, robj
*obj
) {
5283 zskiplistNode
*zn
= zmalloc(sizeof(*zn
));
5285 zn
->forward
= zmalloc(sizeof(zskiplistNode
*) * level
);
5287 zn
->span
= zmalloc(sizeof(unsigned int) * (level
- 1));
5293 static zskiplist
*zslCreate(void) {
5297 zsl
= zmalloc(sizeof(*zsl
));
5300 zsl
->header
= zslCreateNode(ZSKIPLIST_MAXLEVEL
,0,NULL
);
5301 for (j
= 0; j
< ZSKIPLIST_MAXLEVEL
; j
++) {
5302 zsl
->header
->forward
[j
] = NULL
;
5304 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5305 if (j
< ZSKIPLIST_MAXLEVEL
-1)
5306 zsl
->header
->span
[j
] = 0;
5308 zsl
->header
->backward
= NULL
;
5313 static void zslFreeNode(zskiplistNode
*node
) {
5314 decrRefCount(node
->obj
);
5315 zfree(node
->forward
);
5320 static void zslFree(zskiplist
*zsl
) {
5321 zskiplistNode
*node
= zsl
->header
->forward
[0], *next
;
5323 zfree(zsl
->header
->forward
);
5324 zfree(zsl
->header
->span
);
5327 next
= node
->forward
[0];
5334 static int zslRandomLevel(void) {
5336 while ((random()&0xFFFF) < (ZSKIPLIST_P
* 0xFFFF))
5338 return (level
<ZSKIPLIST_MAXLEVEL
) ? level
: ZSKIPLIST_MAXLEVEL
;
5341 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
) {
5342 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5343 unsigned int rank
[ZSKIPLIST_MAXLEVEL
];
5347 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5348 /* store rank that is crossed to reach the insert position */
5349 rank
[i
] = i
== (zsl
->level
-1) ? 0 : rank
[i
+1];
5351 while (x
->forward
[i
] &&
5352 (x
->forward
[i
]->score
< score
||
5353 (x
->forward
[i
]->score
== score
&&
5354 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0))) {
5355 rank
[i
] += i
> 0 ? x
->span
[i
-1] : 1;
5360 /* we assume the key is not already inside, since we allow duplicated
5361 * scores, and the re-insertion of score and redis object should never
5362 * happpen since the caller of zslInsert() should test in the hash table
5363 * if the element is already inside or not. */
5364 level
= zslRandomLevel();
5365 if (level
> zsl
->level
) {
5366 for (i
= zsl
->level
; i
< level
; i
++) {
5368 update
[i
] = zsl
->header
;
5369 update
[i
]->span
[i
-1] = zsl
->length
;
5373 x
= zslCreateNode(level
,score
,obj
);
5374 for (i
= 0; i
< level
; i
++) {
5375 x
->forward
[i
] = update
[i
]->forward
[i
];
5376 update
[i
]->forward
[i
] = x
;
5378 /* update span covered by update[i] as x is inserted here */
5380 x
->span
[i
-1] = update
[i
]->span
[i
-1] - (rank
[0] - rank
[i
]);
5381 update
[i
]->span
[i
-1] = (rank
[0] - rank
[i
]) + 1;
5385 /* increment span for untouched levels */
5386 for (i
= level
; i
< zsl
->level
; i
++) {
5387 update
[i
]->span
[i
-1]++;
5390 x
->backward
= (update
[0] == zsl
->header
) ? NULL
: update
[0];
5392 x
->forward
[0]->backward
= x
;
5398 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5399 void zslDeleteNode(zskiplist
*zsl
, zskiplistNode
*x
, zskiplistNode
**update
) {
5401 for (i
= 0; i
< zsl
->level
; i
++) {
5402 if (update
[i
]->forward
[i
] == x
) {
5404 update
[i
]->span
[i
-1] += x
->span
[i
-1] - 1;
5406 update
[i
]->forward
[i
] = x
->forward
[i
];
5408 /* invariant: i > 0, because update[0]->forward[0]
5409 * is always equal to x */
5410 update
[i
]->span
[i
-1] -= 1;
5413 if (x
->forward
[0]) {
5414 x
->forward
[0]->backward
= x
->backward
;
5416 zsl
->tail
= x
->backward
;
5418 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
5423 /* Delete an element with matching score/object from the skiplist. */
5424 static int zslDelete(zskiplist
*zsl
, double score
, robj
*obj
) {
5425 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5429 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5430 while (x
->forward
[i
] &&
5431 (x
->forward
[i
]->score
< score
||
5432 (x
->forward
[i
]->score
== score
&&
5433 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0)))
5437 /* We may have multiple elements with the same score, what we need
5438 * is to find the element with both the right score and object. */
5440 if (x
&& score
== x
->score
&& compareStringObjects(x
->obj
,obj
) == 0) {
5441 zslDeleteNode(zsl
, x
, update
);
5445 return 0; /* not found */
5447 return 0; /* not found */
5450 /* Delete all the elements with score between min and max from the skiplist.
5451 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5452 * Note that this function takes the reference to the hash table view of the
5453 * sorted set, in order to remove the elements from the hash table too. */
5454 static unsigned long zslDeleteRangeByScore(zskiplist
*zsl
, double min
, double max
, dict
*dict
) {
5455 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5456 unsigned long removed
= 0;
5460 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5461 while (x
->forward
[i
] && x
->forward
[i
]->score
< min
)
5465 /* We may have multiple elements with the same score, what we need
5466 * is to find the element with both the right score and object. */
5468 while (x
&& x
->score
<= max
) {
5469 zskiplistNode
*next
= x
->forward
[0];
5470 zslDeleteNode(zsl
, x
, update
);
5471 dictDelete(dict
,x
->obj
);
5476 return removed
; /* not found */
5479 /* Delete all the elements with rank between start and end from the skiplist.
5480 * Start and end are inclusive. Note that start and end need to be 1-based */
5481 static unsigned long zslDeleteRangeByRank(zskiplist
*zsl
, unsigned int start
, unsigned int end
, dict
*dict
) {
5482 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5483 unsigned long traversed
= 0, removed
= 0;
5487 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5488 while (x
->forward
[i
] && (traversed
+ (i
> 0 ? x
->span
[i
-1] : 1)) < start
) {
5489 traversed
+= i
> 0 ? x
->span
[i
-1] : 1;
5497 while (x
&& traversed
<= end
) {
5498 zskiplistNode
*next
= x
->forward
[0];
5499 zslDeleteNode(zsl
, x
, update
);
5500 dictDelete(dict
,x
->obj
);
5509 /* Find the first node having a score equal or greater than the specified one.
5510 * Returns NULL if there is no match. */
5511 static zskiplistNode
*zslFirstWithScore(zskiplist
*zsl
, double score
) {
5516 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5517 while (x
->forward
[i
] && x
->forward
[i
]->score
< score
)
5520 /* We may have multiple elements with the same score, what we need
5521 * is to find the element with both the right score and object. */
5522 return x
->forward
[0];
5525 /* Find the rank for an element by both score and key.
5526 * Returns 0 when the element cannot be found, rank otherwise.
5527 * Note that the rank is 1-based due to the span of zsl->header to the
5529 static unsigned long zslGetRank(zskiplist
*zsl
, double score
, robj
*o
) {
5531 unsigned long rank
= 0;
5535 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5536 while (x
->forward
[i
] &&
5537 (x
->forward
[i
]->score
< score
||
5538 (x
->forward
[i
]->score
== score
&&
5539 compareStringObjects(x
->forward
[i
]->obj
,o
) <= 0))) {
5540 rank
+= i
> 0 ? x
->span
[i
-1] : 1;
5544 /* x might be equal to zsl->header, so test if obj is non-NULL */
5545 if (x
->obj
&& compareStringObjects(x
->obj
,o
) == 0) {
5552 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5553 zskiplistNode
* zslGetElementByRank(zskiplist
*zsl
, unsigned long rank
) {
5555 unsigned long traversed
= 0;
5559 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5560 while (x
->forward
[i
] && (traversed
+ (i
>0 ? x
->span
[i
-1] : 1)) <= rank
)
5562 traversed
+= i
> 0 ? x
->span
[i
-1] : 1;
5565 if (traversed
== rank
) {
5572 /* The actual Z-commands implementations */
5574 /* This generic command implements both ZADD and ZINCRBY.
5575 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5576 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5577 static void zaddGenericCommand(redisClient
*c
, robj
*key
, robj
*ele
, double scoreval
, int doincrement
) {
5582 zsetobj
= lookupKeyWrite(c
->db
,key
);
5583 if (zsetobj
== NULL
) {
5584 zsetobj
= createZsetObject();
5585 dictAdd(c
->db
->dict
,key
,zsetobj
);
5588 if (zsetobj
->type
!= REDIS_ZSET
) {
5589 addReply(c
,shared
.wrongtypeerr
);
5595 /* Ok now since we implement both ZADD and ZINCRBY here the code
5596 * needs to handle the two different conditions. It's all about setting
5597 * '*score', that is, the new score to set, to the right value. */
5598 score
= zmalloc(sizeof(double));
5602 /* Read the old score. If the element was not present starts from 0 */
5603 de
= dictFind(zs
->dict
,ele
);
5605 double *oldscore
= dictGetEntryVal(de
);
5606 *score
= *oldscore
+ scoreval
;
5614 /* What follows is a simple remove and re-insert operation that is common
5615 * to both ZADD and ZINCRBY... */
5616 if (dictAdd(zs
->dict
,ele
,score
) == DICT_OK
) {
5617 /* case 1: New element */
5618 incrRefCount(ele
); /* added to hash */
5619 zslInsert(zs
->zsl
,*score
,ele
);
5620 incrRefCount(ele
); /* added to skiplist */
5623 addReplyDouble(c
,*score
);
5625 addReply(c
,shared
.cone
);
5630 /* case 2: Score update operation */
5631 de
= dictFind(zs
->dict
,ele
);
5632 redisAssert(de
!= NULL
);
5633 oldscore
= dictGetEntryVal(de
);
5634 if (*score
!= *oldscore
) {
5637 /* Remove and insert the element in the skip list with new score */
5638 deleted
= zslDelete(zs
->zsl
,*oldscore
,ele
);
5639 redisAssert(deleted
!= 0);
5640 zslInsert(zs
->zsl
,*score
,ele
);
5642 /* Update the score in the hash table */
5643 dictReplace(zs
->dict
,ele
,score
);
5649 addReplyDouble(c
,*score
);
5651 addReply(c
,shared
.czero
);
5655 static void zaddCommand(redisClient
*c
) {
5658 if (getDoubleFromObjectOrReply(c
, c
->argv
[2], &scoreval
, NULL
) != REDIS_OK
) return;
5659 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,0);
5662 static void zincrbyCommand(redisClient
*c
) {
5665 if (getDoubleFromObjectOrReply(c
, c
->argv
[2], &scoreval
, NULL
) != REDIS_OK
) return;
5666 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,1);
5669 static void zremCommand(redisClient
*c
) {
5676 if ((zsetobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5677 checkType(c
,zsetobj
,REDIS_ZSET
)) return;
5680 de
= dictFind(zs
->dict
,c
->argv
[2]);
5682 addReply(c
,shared
.czero
);
5685 /* Delete from the skiplist */
5686 oldscore
= dictGetEntryVal(de
);
5687 deleted
= zslDelete(zs
->zsl
,*oldscore
,c
->argv
[2]);
5688 redisAssert(deleted
!= 0);
5690 /* Delete from the hash table */
5691 dictDelete(zs
->dict
,c
->argv
[2]);
5692 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5693 if (dictSize(zs
->dict
) == 0) deleteKey(c
->db
,c
->argv
[1]);
5695 addReply(c
,shared
.cone
);
5698 static void zremrangebyscoreCommand(redisClient
*c
) {
5705 if ((getDoubleFromObjectOrReply(c
, c
->argv
[2], &min
, NULL
) != REDIS_OK
) ||
5706 (getDoubleFromObjectOrReply(c
, c
->argv
[3], &max
, NULL
) != REDIS_OK
)) return;
5708 if ((zsetobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5709 checkType(c
,zsetobj
,REDIS_ZSET
)) return;
5712 deleted
= zslDeleteRangeByScore(zs
->zsl
,min
,max
,zs
->dict
);
5713 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5714 if (dictSize(zs
->dict
) == 0) deleteKey(c
->db
,c
->argv
[1]);
5715 server
.dirty
+= deleted
;
5716 addReplyLong(c
,deleted
);
5719 static void zremrangebyrankCommand(redisClient
*c
) {
5727 if ((getLongFromObjectOrReply(c
, c
->argv
[2], &start
, NULL
) != REDIS_OK
) ||
5728 (getLongFromObjectOrReply(c
, c
->argv
[3], &end
, NULL
) != REDIS_OK
)) return;
5730 if ((zsetobj
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
5731 checkType(c
,zsetobj
,REDIS_ZSET
)) return;
5733 llen
= zs
->zsl
->length
;
5735 /* convert negative indexes */
5736 if (start
< 0) start
= llen
+start
;
5737 if (end
< 0) end
= llen
+end
;
5738 if (start
< 0) start
= 0;
5739 if (end
< 0) end
= 0;
5741 /* indexes sanity checks */
5742 if (start
> end
|| start
>= llen
) {
5743 addReply(c
,shared
.czero
);
5746 if (end
>= llen
) end
= llen
-1;
5748 /* increment start and end because zsl*Rank functions
5749 * use 1-based rank */
5750 deleted
= zslDeleteRangeByRank(zs
->zsl
,start
+1,end
+1,zs
->dict
);
5751 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5752 if (dictSize(zs
->dict
) == 0) deleteKey(c
->db
,c
->argv
[1]);
5753 server
.dirty
+= deleted
;
5754 addReplyLong(c
, deleted
);
5762 static int qsortCompareZsetopsrcByCardinality(const void *s1
, const void *s2
) {
5763 zsetopsrc
*d1
= (void*) s1
, *d2
= (void*) s2
;
5764 unsigned long size1
, size2
;
5765 size1
= d1
->dict
? dictSize(d1
->dict
) : 0;
5766 size2
= d2
->dict
? dictSize(d2
->dict
) : 0;
5767 return size1
- size2
;
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate': sum the two values,
 * or keep the smaller / the larger one. Any other aggregate type is a
 * programming error and triggers a panic. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
        break;
    }
}
5787 static void zunionInterGenericCommand(redisClient
*c
, robj
*dstkey
, int op
) {
5789 int aggregate
= REDIS_AGGR_SUM
;
5796 /* expect zsetnum input keys to be given */
5797 zsetnum
= atoi(c
->argv
[2]->ptr
);
5799 addReplySds(c
,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5803 /* test if the expected number of keys would overflow */
5804 if (3+zsetnum
> c
->argc
) {
5805 addReply(c
,shared
.syntaxerr
);
5809 /* read keys to be used for input */
5810 src
= zmalloc(sizeof(zsetopsrc
) * zsetnum
);
5811 for (i
= 0, j
= 3; i
< zsetnum
; i
++, j
++) {
5812 robj
*zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
5816 if (zsetobj
->type
!= REDIS_ZSET
) {
5818 addReply(c
,shared
.wrongtypeerr
);
5821 src
[i
].dict
= ((zset
*)zsetobj
->ptr
)->dict
;
5824 /* default all weights to 1 */
5825 src
[i
].weight
= 1.0;
5828 /* parse optional extra arguments */
5830 int remaining
= c
->argc
- j
;
5833 if (remaining
>= (zsetnum
+ 1) && !strcasecmp(c
->argv
[j
]->ptr
,"weights")) {
5835 for (i
= 0; i
< zsetnum
; i
++, j
++, remaining
--) {
5836 if (getDoubleFromObjectOrReply(c
, c
->argv
[j
], &src
[i
].weight
, NULL
) != REDIS_OK
)
5839 } else if (remaining
>= 2 && !strcasecmp(c
->argv
[j
]->ptr
,"aggregate")) {
5841 if (!strcasecmp(c
->argv
[j
]->ptr
,"sum")) {
5842 aggregate
= REDIS_AGGR_SUM
;
5843 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"min")) {
5844 aggregate
= REDIS_AGGR_MIN
;
5845 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"max")) {
5846 aggregate
= REDIS_AGGR_MAX
;
5849 addReply(c
,shared
.syntaxerr
);
5855 addReply(c
,shared
.syntaxerr
);
5861 /* sort sets from the smallest to largest, this will improve our
5862 * algorithm's performance */
5863 qsort(src
,zsetnum
,sizeof(zsetopsrc
), qsortCompareZsetopsrcByCardinality
);
5865 dstobj
= createZsetObject();
5866 dstzset
= dstobj
->ptr
;
5868 if (op
== REDIS_OP_INTER
) {
5869 /* skip going over all entries if the smallest zset is NULL or empty */
5870 if (src
[0].dict
&& dictSize(src
[0].dict
) > 0) {
5871 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5872 * from small to large, all src[i > 0].dict are non-empty too */
5873 di
= dictGetIterator(src
[0].dict
);
5874 while((de
= dictNext(di
)) != NULL
) {
5875 double *score
= zmalloc(sizeof(double)), value
;
5876 *score
= src
[0].weight
* (*(double*)dictGetEntryVal(de
));
5878 for (j
= 1; j
< zsetnum
; j
++) {
5879 dictEntry
*other
= dictFind(src
[j
].dict
,dictGetEntryKey(de
));
5881 value
= src
[j
].weight
* (*(double*)dictGetEntryVal(other
));
5882 zunionInterAggregate(score
, value
, aggregate
);
5888 /* skip entry when not present in every source dict */
5892 robj
*o
= dictGetEntryKey(de
);
5893 dictAdd(dstzset
->dict
,o
,score
);
5894 incrRefCount(o
); /* added to dictionary */
5895 zslInsert(dstzset
->zsl
,*score
,o
);
5896 incrRefCount(o
); /* added to skiplist */
5899 dictReleaseIterator(di
);
5901 } else if (op
== REDIS_OP_UNION
) {
5902 for (i
= 0; i
< zsetnum
; i
++) {
5903 if (!src
[i
].dict
) continue;
5905 di
= dictGetIterator(src
[i
].dict
);
5906 while((de
= dictNext(di
)) != NULL
) {
5907 /* skip key when already processed */
5908 if (dictFind(dstzset
->dict
,dictGetEntryKey(de
)) != NULL
) continue;
5910 double *score
= zmalloc(sizeof(double)), value
;
5911 *score
= src
[i
].weight
* (*(double*)dictGetEntryVal(de
));
5913 /* because the zsets are sorted by size, its only possible
5914 * for sets at larger indices to hold this entry */
5915 for (j
= (i
+1); j
< zsetnum
; j
++) {
5916 dictEntry
*other
= dictFind(src
[j
].dict
,dictGetEntryKey(de
));
5918 value
= src
[j
].weight
* (*(double*)dictGetEntryVal(other
));
5919 zunionInterAggregate(score
, value
, aggregate
);
5923 robj
*o
= dictGetEntryKey(de
);
5924 dictAdd(dstzset
->dict
,o
,score
);
5925 incrRefCount(o
); /* added to dictionary */
5926 zslInsert(dstzset
->zsl
,*score
,o
);
5927 incrRefCount(o
); /* added to skiplist */
5929 dictReleaseIterator(di
);
5932 /* unknown operator */
5933 redisAssert(op
== REDIS_OP_INTER
|| op
== REDIS_OP_UNION
);
5936 deleteKey(c
->db
,dstkey
);
5937 if (dstzset
->zsl
->length
) {
5938 dictAdd(c
->db
->dict
,dstkey
,dstobj
);
5939 incrRefCount(dstkey
);
5940 addReplyLong(c
, dstzset
->zsl
->length
);
5943 decrRefCount(dstobj
);
5944 addReply(c
, shared
.czero
);
5949 static void zunionCommand(redisClient
*c
) {
5950 zunionInterGenericCommand(c
,c
->argv
[1], REDIS_OP_UNION
);
5953 static void zinterCommand(redisClient
*c
) {
5954 zunionInterGenericCommand(c
,c
->argv
[1], REDIS_OP_INTER
);
5957 static void zrangeGenericCommand(redisClient
*c
, int reverse
) {
5969 if ((getLongFromObjectOrReply(c
, c
->argv
[2], &start
, NULL
) != REDIS_OK
) ||
5970 (getLongFromObjectOrReply(c
, c
->argv
[3], &end
, NULL
) != REDIS_OK
)) return;
5972 if (c
->argc
== 5 && !strcasecmp(c
->argv
[4]->ptr
,"withscores")) {
5974 } else if (c
->argc
>= 5) {
5975 addReply(c
,shared
.syntaxerr
);
5979 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.emptymultibulk
)) == NULL
5980 || checkType(c
,o
,REDIS_ZSET
)) return;
5985 /* convert negative indexes */
5986 if (start
< 0) start
= llen
+start
;
5987 if (end
< 0) end
= llen
+end
;
5988 if (start
< 0) start
= 0;
5989 if (end
< 0) end
= 0;
5991 /* indexes sanity checks */
5992 if (start
> end
|| start
>= llen
) {
5993 /* Out of range start or start > end result in empty list */
5994 addReply(c
,shared
.emptymultibulk
);
5997 if (end
>= llen
) end
= llen
-1;
5998 rangelen
= (end
-start
)+1;
6000 /* check if starting point is trivial, before searching
6001 * the element in log(N) time */
6003 ln
= start
== 0 ? zsl
->tail
: zslGetElementByRank(zsl
, llen
-start
);
6006 zsl
->header
->forward
[0] : zslGetElementByRank(zsl
, start
+1);
6009 /* Return the result in form of a multi-bulk reply */
6010 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",
6011 withscores
? (rangelen
*2) : rangelen
));
6012 for (j
= 0; j
< rangelen
; j
++) {
6014 addReplyBulk(c
,ele
);
6016 addReplyDouble(c
,ln
->score
);
6017 ln
= reverse
? ln
->backward
: ln
->forward
[0];
6021 static void zrangeCommand(redisClient
*c
) {
6022 zrangeGenericCommand(c
,0);
6025 static void zrevrangeCommand(redisClient
*c
) {
6026 zrangeGenericCommand(c
,1);
6029 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6030 * If justcount is non-zero, just the count is returned. */
6031 static void genericZrangebyscoreCommand(redisClient
*c
, int justcount
) {
6034 int minex
= 0, maxex
= 0; /* are min or max exclusive? */
6035 int offset
= 0, limit
= -1;
6039 /* Parse the min-max interval. If one of the values is prefixed
6040 * by the "(" character, it's considered "open". For instance
6041 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6042 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6043 if (((char*)c
->argv
[2]->ptr
)[0] == '(') {
6044 min
= strtod((char*)c
->argv
[2]->ptr
+1,NULL
);
6047 min
= strtod(c
->argv
[2]->ptr
,NULL
);
6049 if (((char*)c
->argv
[3]->ptr
)[0] == '(') {
6050 max
= strtod((char*)c
->argv
[3]->ptr
+1,NULL
);
6053 max
= strtod(c
->argv
[3]->ptr
,NULL
);
6056 /* Parse "WITHSCORES": note that if the command was called with
6057 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6058 * enter the following paths to parse WITHSCORES and LIMIT. */
6059 if (c
->argc
== 5 || c
->argc
== 8) {
6060 if (strcasecmp(c
->argv
[c
->argc
-1]->ptr
,"withscores") == 0)
6065 if (c
->argc
!= (4 + withscores
) && c
->argc
!= (7 + withscores
))
6069 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6074 if (c
->argc
== (7 + withscores
) && strcasecmp(c
->argv
[4]->ptr
,"limit")) {
6075 addReply(c
,shared
.syntaxerr
);
6077 } else if (c
->argc
== (7 + withscores
)) {
6078 offset
= atoi(c
->argv
[5]->ptr
);
6079 limit
= atoi(c
->argv
[6]->ptr
);
6080 if (offset
< 0) offset
= 0;
6083 /* Ok, lookup the key and get the range */
6084 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
6086 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
6088 if (o
->type
!= REDIS_ZSET
) {
6089 addReply(c
,shared
.wrongtypeerr
);
6091 zset
*zsetobj
= o
->ptr
;
6092 zskiplist
*zsl
= zsetobj
->zsl
;
6094 robj
*ele
, *lenobj
= NULL
;
6095 unsigned long rangelen
= 0;
6097 /* Get the first node with the score >= min, or with
6098 * score > min if 'minex' is true. */
6099 ln
= zslFirstWithScore(zsl
,min
);
6100 while (minex
&& ln
&& ln
->score
== min
) ln
= ln
->forward
[0];
6103 /* No element matching the speciifed interval */
6104 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
6108 /* We don't know in advance how many matching elements there
6109 * are in the list, so we push this object that will represent
6110 * the multi-bulk length in the output buffer, and will "fix"
6113 lenobj
= createObject(REDIS_STRING
,NULL
);
6115 decrRefCount(lenobj
);
6118 while(ln
&& (maxex
? (ln
->score
< max
) : (ln
->score
<= max
))) {
6121 ln
= ln
->forward
[0];
6124 if (limit
== 0) break;
6127 addReplyBulk(c
,ele
);
6129 addReplyDouble(c
,ln
->score
);
6131 ln
= ln
->forward
[0];
6133 if (limit
> 0) limit
--;
6136 addReplyLong(c
,(long)rangelen
);
6138 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",
6139 withscores
? (rangelen
*2) : rangelen
);
6145 static void zrangebyscoreCommand(redisClient
*c
) {
6146 genericZrangebyscoreCommand(c
,0);
6149 static void zcountCommand(redisClient
*c
) {
6150 genericZrangebyscoreCommand(c
,1);
6153 static void zcardCommand(redisClient
*c
) {
6157 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6158 checkType(c
,o
,REDIS_ZSET
)) return;
6161 addReplyUlong(c
,zs
->zsl
->length
);
6164 static void zscoreCommand(redisClient
*c
) {
6169 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
6170 checkType(c
,o
,REDIS_ZSET
)) return;
6173 de
= dictFind(zs
->dict
,c
->argv
[2]);
6175 addReply(c
,shared
.nullbulk
);
6177 double *score
= dictGetEntryVal(de
);
6179 addReplyDouble(c
,*score
);
6183 static void zrankGenericCommand(redisClient
*c
, int reverse
) {
6191 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
6192 checkType(c
,o
,REDIS_ZSET
)) return;
6196 de
= dictFind(zs
->dict
,c
->argv
[2]);
6198 addReply(c
,shared
.nullbulk
);
6202 score
= dictGetEntryVal(de
);
6203 rank
= zslGetRank(zsl
, *score
, c
->argv
[2]);
6206 addReplyLong(c
, zsl
->length
- rank
);
6208 addReplyLong(c
, rank
-1);
6211 addReply(c
,shared
.nullbulk
);
6215 static void zrankCommand(redisClient
*c
) {
6216 zrankGenericCommand(c
, 0);
6219 static void zrevrankCommand(redisClient
*c
) {
6220 zrankGenericCommand(c
, 1);
6223 /* ========================= Hashes utility functions ======================= */
6224 #define REDIS_HASH_KEY 1
6225 #define REDIS_HASH_VALUE 2
6227 /* Check the length of a number of objects to see if we need to convert a
6228 * zipmap to a real hash. Note that we only check string encoded objects
6229 * as their string length can be queried in constant time. */
6230 static void hashTryConversion(robj
*subject
, robj
**argv
, int start
, int end
) {
6232 if (subject
->encoding
!= REDIS_ENCODING_ZIPMAP
) return;
6234 for (i
= start
; i
<= end
; i
++) {
6235 if (argv
[i
]->encoding
== REDIS_ENCODING_RAW
&&
6236 sdslen(argv
[i
]->ptr
) > server
.hash_max_zipmap_value
)
6238 convertToRealHash(subject
);
6244 /* Encode given objects in-place when the hash uses a dict. */
6245 static void hashTryObjectEncoding(robj
*subject
, robj
**o1
, robj
**o2
) {
6246 if (subject
->encoding
== REDIS_ENCODING_HT
) {
6247 if (o1
) *o1
= tryObjectEncoding(*o1
);
6248 if (o2
) *o2
= tryObjectEncoding(*o2
);
6252 /* Get the value from a hash identified by key. Returns either a string
6253 * object or NULL if the value cannot be found. The refcount of the object
6254 * is always increased by 1 when the value was found. */
6255 static robj
*hashGet(robj
*o
, robj
*key
) {
6257 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6260 key
= getDecodedObject(key
);
6261 if (zipmapGet(o
->ptr
,key
->ptr
,sdslen(key
->ptr
),&v
,&vlen
)) {
6262 value
= createStringObject((char*)v
,vlen
);
6266 dictEntry
*de
= dictFind(o
->ptr
,key
);
6268 value
= dictGetEntryVal(de
);
6269 incrRefCount(value
);
6275 /* Test if the key exists in the given hash. Returns 1 if the key
6276 * exists and 0 when it doesn't. */
6277 static int hashExists(robj
*o
, robj
*key
) {
6278 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6279 key
= getDecodedObject(key
);
6280 if (zipmapExists(o
->ptr
,key
->ptr
,sdslen(key
->ptr
))) {
6286 if (dictFind(o
->ptr
,key
) != NULL
) {
6293 /* Add an element, discard the old if the key already exists.
6294 * Return 0 on insert and 1 on update. */
6295 static int hashSet(robj
*o
, robj
*key
, robj
*value
) {
6297 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6298 key
= getDecodedObject(key
);
6299 value
= getDecodedObject(value
);
6300 o
->ptr
= zipmapSet(o
->ptr
,
6301 key
->ptr
,sdslen(key
->ptr
),
6302 value
->ptr
,sdslen(value
->ptr
), &update
);
6304 decrRefCount(value
);
6306 /* Check if the zipmap needs to be upgraded to a real hash table */
6307 if (zipmapLen(o
->ptr
) > server
.hash_max_zipmap_entries
)
6308 convertToRealHash(o
);
6310 if (dictReplace(o
->ptr
,key
,value
)) {
6317 incrRefCount(value
);
6322 /* Delete an element from a hash.
6323 * Return 1 on deleted and 0 on not found. */
6324 static int hashDelete(robj
*o
, robj
*key
) {
6326 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6327 key
= getDecodedObject(key
);
6328 o
->ptr
= zipmapDel(o
->ptr
,key
->ptr
,sdslen(key
->ptr
), &deleted
);
6331 deleted
= dictDelete((dict
*)o
->ptr
,key
) == DICT_OK
;
6332 /* Always check if the dictionary needs a resize after a delete. */
6333 if (deleted
&& htNeedsResize(o
->ptr
)) dictResize(o
->ptr
);
6338 /* Return the number of elements in a hash. */
6339 static unsigned long hashLength(robj
*o
) {
6340 return (o
->encoding
== REDIS_ENCODING_ZIPMAP
) ?
6341 zipmapLen((unsigned char*)o
->ptr
) : dictSize((dict
*)o
->ptr
);
6344 /* Structure to hold hash iteration abstration. Note that iteration over
6345 * hashes involves both fields and values. Because it is possible that
6346 * not both are required, store pointers in the iterator to avoid
6347 * unnecessary memory allocation for fields/values. */
6351 unsigned char *zk
, *zv
;
6352 unsigned int zklen
, zvlen
;
6358 static hashIterator
*hashInitIterator(robj
*subject
) {
6359 hashIterator
*hi
= zmalloc(sizeof(hashIterator
));
6360 hi
->encoding
= subject
->encoding
;
6361 if (hi
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6362 hi
->zi
= zipmapRewind(subject
->ptr
);
6363 } else if (hi
->encoding
== REDIS_ENCODING_HT
) {
6364 hi
->di
= dictGetIterator(subject
->ptr
);
6371 static void hashReleaseIterator(hashIterator
*hi
) {
6372 if (hi
->encoding
== REDIS_ENCODING_HT
) {
6373 dictReleaseIterator(hi
->di
);
6378 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6379 * could be found and REDIS_ERR when the iterator reaches the end. */
6380 static int hashNext(hashIterator
*hi
) {
6381 if (hi
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6382 if ((hi
->zi
= zipmapNext(hi
->zi
, &hi
->zk
, &hi
->zklen
,
6383 &hi
->zv
, &hi
->zvlen
)) == NULL
) return REDIS_ERR
;
6385 if ((hi
->de
= dictNext(hi
->di
)) == NULL
) return REDIS_ERR
;
6390 /* Get key or value object at current iteration position.
6391 * This increases the refcount of the field object by 1. */
6392 static robj
*hashCurrent(hashIterator
*hi
, int what
) {
6394 if (hi
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6395 if (what
& REDIS_HASH_KEY
) {
6396 o
= createStringObject((char*)hi
->zk
,hi
->zklen
);
6398 o
= createStringObject((char*)hi
->zv
,hi
->zvlen
);
6401 if (what
& REDIS_HASH_KEY
) {
6402 o
= dictGetEntryKey(hi
->de
);
6404 o
= dictGetEntryVal(hi
->de
);
6411 static robj
*hashLookupWriteOrCreate(redisClient
*c
, robj
*key
) {
6412 robj
*o
= lookupKeyWrite(c
->db
,key
);
6414 o
= createHashObject();
6415 dictAdd(c
->db
->dict
,key
,o
);
6418 if (o
->type
!= REDIS_HASH
) {
6419 addReply(c
,shared
.wrongtypeerr
);
6426 /* ============================= Hash commands ============================== */
6427 static void hsetCommand(redisClient
*c
) {
6431 if ((o
= hashLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
6432 hashTryConversion(o
,c
->argv
,2,3);
6433 hashTryObjectEncoding(o
,&c
->argv
[2], &c
->argv
[3]);
6434 update
= hashSet(o
,c
->argv
[2],c
->argv
[3]);
6435 addReply(c
, update
? shared
.czero
: shared
.cone
);
6439 static void hsetnxCommand(redisClient
*c
) {
6441 if ((o
= hashLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
6442 hashTryConversion(o
,c
->argv
,2,3);
6444 if (hashExists(o
, c
->argv
[2])) {
6445 addReply(c
, shared
.czero
);
6447 hashTryObjectEncoding(o
,&c
->argv
[2], &c
->argv
[3]);
6448 hashSet(o
,c
->argv
[2],c
->argv
[3]);
6449 addReply(c
, shared
.cone
);
6454 static void hmsetCommand(redisClient
*c
) {
6458 if ((c
->argc
% 2) == 1) {
6459 addReplySds(c
,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6463 if ((o
= hashLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
6464 hashTryConversion(o
,c
->argv
,2,c
->argc
-1);
6465 for (i
= 2; i
< c
->argc
; i
+= 2) {
6466 hashTryObjectEncoding(o
,&c
->argv
[i
], &c
->argv
[i
+1]);
6467 hashSet(o
,c
->argv
[i
],c
->argv
[i
+1]);
6469 addReply(c
, shared
.ok
);
6473 static void hincrbyCommand(redisClient
*c
) {
6474 long long value
, incr
;
6475 robj
*o
, *current
, *new;
6477 if (getLongLongFromObjectOrReply(c
,c
->argv
[3],&incr
,NULL
) != REDIS_OK
) return;
6478 if ((o
= hashLookupWriteOrCreate(c
,c
->argv
[1])) == NULL
) return;
6479 if ((current
= hashGet(o
,c
->argv
[2])) != NULL
) {
6480 if (getLongLongFromObjectOrReply(c
,current
,&value
,
6481 "hash value is not an integer") != REDIS_OK
) {
6482 decrRefCount(current
);
6485 decrRefCount(current
);
6491 new = createStringObjectFromLongLong(value
);
6492 hashTryObjectEncoding(o
,&c
->argv
[2],NULL
);
6493 hashSet(o
,c
->argv
[2],new);
6495 addReplyLongLong(c
,value
);
6499 static void hgetCommand(redisClient
*c
) {
6501 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.nullbulk
)) == NULL
||
6502 checkType(c
,o
,REDIS_HASH
)) return;
6504 if ((value
= hashGet(o
,c
->argv
[2])) != NULL
) {
6505 addReplyBulk(c
,value
);
6506 decrRefCount(value
);
6508 addReply(c
,shared
.nullbulk
);
6512 static void hmgetCommand(redisClient
*c
) {
6515 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
6516 if (o
!= NULL
&& o
->type
!= REDIS_HASH
) {
6517 addReply(c
,shared
.wrongtypeerr
);
6520 /* Note the check for o != NULL happens inside the loop. This is
6521 * done because objects that cannot be found are considered to be
6522 * an empty hash. The reply should then be a series of NULLs. */
6523 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-2));
6524 for (i
= 2; i
< c
->argc
; i
++) {
6525 if (o
!= NULL
&& (value
= hashGet(o
,c
->argv
[i
])) != NULL
) {
6526 addReplyBulk(c
,value
);
6527 decrRefCount(value
);
6529 addReply(c
,shared
.nullbulk
);
6534 static void hdelCommand(redisClient
*c
) {
6536 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6537 checkType(c
,o
,REDIS_HASH
)) return;
6539 if (hashDelete(o
,c
->argv
[2])) {
6540 if (hashLength(o
) == 0) deleteKey(c
->db
,c
->argv
[1]);
6541 addReply(c
,shared
.cone
);
6544 addReply(c
,shared
.czero
);
6548 static void hlenCommand(redisClient
*c
) {
6550 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6551 checkType(c
,o
,REDIS_HASH
)) return;
6553 addReplyUlong(c
,hashLength(o
));
6556 static void genericHgetallCommand(redisClient
*c
, int flags
) {
6557 robj
*o
, *lenobj
, *obj
;
6558 unsigned long count
= 0;
6561 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.emptymultibulk
)) == NULL
6562 || checkType(c
,o
,REDIS_HASH
)) return;
6564 lenobj
= createObject(REDIS_STRING
,NULL
);
6566 decrRefCount(lenobj
);
6568 hi
= hashInitIterator(o
);
6569 while (hashNext(hi
) != REDIS_ERR
) {
6570 if (flags
& REDIS_HASH_KEY
) {
6571 obj
= hashCurrent(hi
,REDIS_HASH_KEY
);
6572 addReplyBulk(c
,obj
);
6576 if (flags
& REDIS_HASH_VALUE
) {
6577 obj
= hashCurrent(hi
,REDIS_HASH_VALUE
);
6578 addReplyBulk(c
,obj
);
6583 hashReleaseIterator(hi
);
6585 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",count
);
6588 static void hkeysCommand(redisClient
*c
) {
6589 genericHgetallCommand(c
,REDIS_HASH_KEY
);
6592 static void hvalsCommand(redisClient
*c
) {
6593 genericHgetallCommand(c
,REDIS_HASH_VALUE
);
6596 static void hgetallCommand(redisClient
*c
) {
6597 genericHgetallCommand(c
,REDIS_HASH_KEY
|REDIS_HASH_VALUE
);
6600 static void hexistsCommand(redisClient
*c
) {
6602 if ((o
= lookupKeyReadOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6603 checkType(c
,o
,REDIS_HASH
)) return;
6605 addReply(c
, hashExists(o
,c
->argv
[2]) ? shared
.cone
: shared
.czero
);
6608 static void convertToRealHash(robj
*o
) {
6609 unsigned char *key
, *val
, *p
, *zm
= o
->ptr
;
6610 unsigned int klen
, vlen
;
6611 dict
*dict
= dictCreate(&hashDictType
,NULL
);
6613 assert(o
->type
== REDIS_HASH
&& o
->encoding
!= REDIS_ENCODING_HT
);
6614 p
= zipmapRewind(zm
);
6615 while((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) != NULL
) {
6616 robj
*keyobj
, *valobj
;
6618 keyobj
= createStringObject((char*)key
,klen
);
6619 valobj
= createStringObject((char*)val
,vlen
);
6620 keyobj
= tryObjectEncoding(keyobj
);
6621 valobj
= tryObjectEncoding(valobj
);
6622 dictAdd(dict
,keyobj
,valobj
);
6624 o
->encoding
= REDIS_ENCODING_HT
;
6629 /* ========================= Non type-specific commands ==================== */
6631 static void flushdbCommand(redisClient
*c
) {
6632 server
.dirty
+= dictSize(c
->db
->dict
);
6633 dictEmpty(c
->db
->dict
);
6634 dictEmpty(c
->db
->expires
);
6635 addReply(c
,shared
.ok
);
6638 static void flushallCommand(redisClient
*c
) {
6639 server
.dirty
+= emptyDb();
6640 addReply(c
,shared
.ok
);
6641 if (server
.bgsavechildpid
!= -1) {
6642 kill(server
.bgsavechildpid
,SIGKILL
);
6643 rdbRemoveTempFile(server
.bgsavechildpid
);
6645 rdbSave(server
.dbfilename
);
6649 static redisSortOperation
*createSortOperation(int type
, robj
*pattern
) {
6650 redisSortOperation
*so
= zmalloc(sizeof(*so
));
6652 so
->pattern
= pattern
;
6656 /* Return the value associated to the key with a name obtained
6657 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6658 * The returned object will always have its refcount increased by 1
6659 * when it is non-NULL. */
6660 static robj
*lookupKeyByPattern(redisDb
*db
, robj
*pattern
, robj
*subst
) {
6663 robj keyobj
, fieldobj
, *o
;
6664 int prefixlen
, sublen
, postfixlen
, fieldlen
;
6665 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6669 char buf
[REDIS_SORTKEY_MAX
+1];
6670 } keyname
, fieldname
;
6672 /* If the pattern is "#" return the substitution object itself in order
6673 * to implement the "SORT ... GET #" feature. */
6674 spat
= pattern
->ptr
;
6675 if (spat
[0] == '#' && spat
[1] == '\0') {
6676 incrRefCount(subst
);
6680 /* The substitution object may be specially encoded. If so we create
6681 * a decoded object on the fly. Otherwise getDecodedObject will just
6682 * increment the ref count, that we'll decrement later. */
6683 subst
= getDecodedObject(subst
);
6686 if (sdslen(spat
)+sdslen(ssub
)-1 > REDIS_SORTKEY_MAX
) return NULL
;
6687 p
= strchr(spat
,'*');
6689 decrRefCount(subst
);
6693 /* Find out if we're dealing with a hash dereference. */
6694 if ((f
= strstr(p
+1, "->")) != NULL
) {
6695 fieldlen
= sdslen(spat
)-(f
-spat
);
6696 /* this also copies \0 character */
6697 memcpy(fieldname
.buf
,f
+2,fieldlen
-1);
6698 fieldname
.len
= fieldlen
-2;
6704 sublen
= sdslen(ssub
);
6705 postfixlen
= sdslen(spat
)-(prefixlen
+1)-fieldlen
;
6706 memcpy(keyname
.buf
,spat
,prefixlen
);
6707 memcpy(keyname
.buf
+prefixlen
,ssub
,sublen
);
6708 memcpy(keyname
.buf
+prefixlen
+sublen
,p
+1,postfixlen
);
6709 keyname
.buf
[prefixlen
+sublen
+postfixlen
] = '\0';
6710 keyname
.len
= prefixlen
+sublen
+postfixlen
;
6711 decrRefCount(subst
);
6713 /* Lookup substituted key */
6714 initStaticStringObject(keyobj
,((char*)&keyname
)+(sizeof(long)*2));
6715 o
= lookupKeyRead(db
,&keyobj
);
6716 if (o
== NULL
) return NULL
;
6719 if (o
->type
!= REDIS_HASH
|| fieldname
.len
< 1) return NULL
;
6721 /* Retrieve value from hash by the field name. This operation
6722 * already increases the refcount of the returned object. */
6723 initStaticStringObject(fieldobj
,((char*)&fieldname
)+(sizeof(long)*2));
6724 o
= hashGet(o
, &fieldobj
);
6726 if (o
->type
!= REDIS_STRING
) return NULL
;
6728 /* Every object that this function returns needs to have its refcount
6729 * increased. sortCommand decreases it again. */
6736 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6737 * the additional parameter is not standard but a BSD-specific we have to
6738 * pass sorting parameters via the global 'server' structure */
6739 static int sortCompare(const void *s1
, const void *s2
) {
6740 const redisSortObject
*so1
= s1
, *so2
= s2
;
6743 if (!server
.sort_alpha
) {
6744 /* Numeric sorting. Here it's trivial as we precomputed scores */
6745 if (so1
->u
.score
> so2
->u
.score
) {
6747 } else if (so1
->u
.score
< so2
->u
.score
) {
6753 /* Alphanumeric sorting */
6754 if (server
.sort_bypattern
) {
6755 if (!so1
->u
.cmpobj
|| !so2
->u
.cmpobj
) {
6756 /* At least one compare object is NULL */
6757 if (so1
->u
.cmpobj
== so2
->u
.cmpobj
)
6759 else if (so1
->u
.cmpobj
== NULL
)
6764 /* We have both the objects, use strcoll */
6765 cmp
= strcoll(so1
->u
.cmpobj
->ptr
,so2
->u
.cmpobj
->ptr
);
6768 /* Compare elements directly. */
6769 cmp
= compareStringObjects(so1
->obj
,so2
->obj
);
6772 return server
.sort_desc
? -cmp
: cmp
;
6775 /* The SORT command is the most complex command in Redis. Warning: this code
6776 * is optimized for speed and a bit less for readability */
6777 static void sortCommand(redisClient
*c
) {
6780 int desc
= 0, alpha
= 0;
6781 int limit_start
= 0, limit_count
= -1, start
, end
;
6782 int j
, dontsort
= 0, vectorlen
;
6783 int getop
= 0; /* GET operation counter */
6784 robj
*sortval
, *sortby
= NULL
, *storekey
= NULL
;
6785 redisSortObject
*vector
; /* Resulting vector to sort */
6787 /* Lookup the key to sort. It must be of the right types */
6788 sortval
= lookupKeyRead(c
->db
,c
->argv
[1]);
6789 if (sortval
== NULL
) {
6790 addReply(c
,shared
.emptymultibulk
);
6793 if (sortval
->type
!= REDIS_SET
&& sortval
->type
!= REDIS_LIST
&&
6794 sortval
->type
!= REDIS_ZSET
)
6796 addReply(c
,shared
.wrongtypeerr
);
6800 /* Create a list of operations to perform for every sorted element.
6801 * Operations can be GET/DEL/INCR/DECR */
6802 operations
= listCreate();
6803 listSetFreeMethod(operations
,zfree
);
6806 /* Now we need to protect sortval incrementing its count, in the future
6807 * SORT may have options able to overwrite/delete keys during the sorting
6808 * and the sorted key itself may get destroyed */
6809 incrRefCount(sortval
);
6811 /* The SORT command has an SQL-alike syntax, parse it */
6812 while(j
< c
->argc
) {
6813 int leftargs
= c
->argc
-j
-1;
6814 if (!strcasecmp(c
->argv
[j
]->ptr
,"asc")) {
6816 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"desc")) {
6818 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"alpha")) {
6820 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"limit") && leftargs
>= 2) {
6821 limit_start
= atoi(c
->argv
[j
+1]->ptr
);
6822 limit_count
= atoi(c
->argv
[j
+2]->ptr
);
6824 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"store") && leftargs
>= 1) {
6825 storekey
= c
->argv
[j
+1];
6827 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"by") && leftargs
>= 1) {
6828 sortby
= c
->argv
[j
+1];
6829 /* If the BY pattern does not contain '*', i.e. it is constant,
6830 * we don't need to sort nor to lookup the weight keys. */
6831 if (strchr(c
->argv
[j
+1]->ptr
,'*') == NULL
) dontsort
= 1;
6833 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"get") && leftargs
>= 1) {
6834 listAddNodeTail(operations
,createSortOperation(
6835 REDIS_SORT_GET
,c
->argv
[j
+1]));
6839 decrRefCount(sortval
);
6840 listRelease(operations
);
6841 addReply(c
,shared
.syntaxerr
);
6847 /* Load the sorting vector with all the objects to sort */
6848 switch(sortval
->type
) {
6849 case REDIS_LIST
: vectorlen
= listLength((list
*)sortval
->ptr
); break;
6850 case REDIS_SET
: vectorlen
= dictSize((dict
*)sortval
->ptr
); break;
6851 case REDIS_ZSET
: vectorlen
= dictSize(((zset
*)sortval
->ptr
)->dict
); break;
6852 default: vectorlen
= 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6854 vector
= zmalloc(sizeof(redisSortObject
)*vectorlen
);
6857 if (sortval
->type
== REDIS_LIST
) {
6858 list
*list
= sortval
->ptr
;
6862 listRewind(list
,&li
);
6863 while((ln
= listNext(&li
))) {
6864 robj
*ele
= ln
->value
;
6865 vector
[j
].obj
= ele
;
6866 vector
[j
].u
.score
= 0;
6867 vector
[j
].u
.cmpobj
= NULL
;
6875 if (sortval
->type
== REDIS_SET
) {
6878 zset
*zs
= sortval
->ptr
;
6882 di
= dictGetIterator(set
);
6883 while((setele
= dictNext(di
)) != NULL
) {
6884 vector
[j
].obj
= dictGetEntryKey(setele
);
6885 vector
[j
].u
.score
= 0;
6886 vector
[j
].u
.cmpobj
= NULL
;
6889 dictReleaseIterator(di
);
6891 redisAssert(j
== vectorlen
);
6893 /* Now it's time to load the right scores in the sorting vector */
6894 if (dontsort
== 0) {
6895 for (j
= 0; j
< vectorlen
; j
++) {
6898 /* lookup value to sort by */
6899 byval
= lookupKeyByPattern(c
->db
,sortby
,vector
[j
].obj
);
6900 if (!byval
) continue;
6902 /* use object itself to sort by */
6903 byval
= vector
[j
].obj
;
6907 if (sortby
) vector
[j
].u
.cmpobj
= getDecodedObject(byval
);
6909 if (byval
->encoding
== REDIS_ENCODING_RAW
) {
6910 vector
[j
].u
.score
= strtod(byval
->ptr
,NULL
);
6911 } else if (byval
->encoding
== REDIS_ENCODING_INT
) {
6912 /* Don't need to decode the object if it's
6913 * integer-encoded (the only encoding supported) so
6914 * far. We can just cast it */
6915 vector
[j
].u
.score
= (long)byval
->ptr
;
6917 redisAssert(1 != 1);
6921 /* when the object was retrieved using lookupKeyByPattern,
6922 * its refcount needs to be decreased. */
6924 decrRefCount(byval
);
6929 /* We are ready to sort the vector... perform a bit of sanity check
6930 * on the LIMIT option too. We'll use a partial version of quicksort. */
6931 start
= (limit_start
< 0) ? 0 : limit_start
;
6932 end
= (limit_count
< 0) ? vectorlen
-1 : start
+limit_count
-1;
6933 if (start
>= vectorlen
) {
6934 start
= vectorlen
-1;
6937 if (end
>= vectorlen
) end
= vectorlen
-1;
6939 if (dontsort
== 0) {
6940 server
.sort_desc
= desc
;
6941 server
.sort_alpha
= alpha
;
6942 server
.sort_bypattern
= sortby
? 1 : 0;
6943 if (sortby
&& (start
!= 0 || end
!= vectorlen
-1))
6944 pqsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
, start
,end
);
6946 qsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
);
6949 /* Send command output to the output buffer, performing the specified
6950 * GET/DEL/INCR/DECR operations if any. */
6951 outputlen
= getop
? getop
*(end
-start
+1) : end
-start
+1;
6952 if (storekey
== NULL
) {
6953 /* STORE option not specified, send the sorting result to client */
6954 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",outputlen
));
6955 for (j
= start
; j
<= end
; j
++) {
6959 if (!getop
) addReplyBulk(c
,vector
[j
].obj
);
6960 listRewind(operations
,&li
);
6961 while((ln
= listNext(&li
))) {
6962 redisSortOperation
*sop
= ln
->value
;
6963 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
6966 if (sop
->type
== REDIS_SORT_GET
) {
6968 addReply(c
,shared
.nullbulk
);
6970 addReplyBulk(c
,val
);
6974 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
6979 robj
*listObject
= createListObject();
6980 list
*listPtr
= (list
*) listObject
->ptr
;
6982 /* STORE option specified, set the sorting result as a List object */
6983 for (j
= start
; j
<= end
; j
++) {
6988 listAddNodeTail(listPtr
,vector
[j
].obj
);
6989 incrRefCount(vector
[j
].obj
);
6991 listRewind(operations
,&li
);
6992 while((ln
= listNext(&li
))) {
6993 redisSortOperation
*sop
= ln
->value
;
6994 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
6997 if (sop
->type
== REDIS_SORT_GET
) {
6999 listAddNodeTail(listPtr
,createStringObject("",0));
7001 /* We should do a incrRefCount on val because it is
7002 * added to the list, but also a decrRefCount because
7003 * it is returned by lookupKeyByPattern. This results
7004 * in doing nothing at all. */
7005 listAddNodeTail(listPtr
,val
);
7008 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
7012 if (dictReplace(c
->db
->dict
,storekey
,listObject
)) {
7013 incrRefCount(storekey
);
7015 /* Note: we add 1 because the DB is dirty anyway since even if the
7016 * SORT result is empty a new key is set and maybe the old content
7018 server
.dirty
+= 1+outputlen
;
7019 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",outputlen
));
7023 decrRefCount(sortval
);
7024 listRelease(operations
);
7025 for (j
= 0; j
< vectorlen
; j
++) {
7026 if (alpha
&& vector
[j
].u
.cmpobj
)
7027 decrRefCount(vector
[j
].u
.cmpobj
);
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and always receives a nul terminated
 * string; the longest possible output is a 20 digit byte count followed by
 * 'B'. Amounts of one tebibyte or more are covered too (T and P suffixes),
 * and anything beyond the P range is printed as a plain byte count, so the
 * buffer is never left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kib = 1024ULL;
    const unsigned long long mib = kib*1024;
    const unsigned long long gib = mib*1024;
    const unsigned long long tib = gib*1024;
    const unsigned long long pib = tib*1024;
    double d;

    if (n < kib) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mib) {
        d = (double)n/kib;
        sprintf(s,"%.2fK",d);
    } else if (n < gib) {
        d = (double)n/mib;
        sprintf(s,"%.2fM",d);
    } else if (n < tib) {
        d = (double)n/gib;
        sprintf(s,"%.2fG",d);
    } else if (n < pib) {
        d = (double)n/tib;
        sprintf(s,"%.2fT",d);
    } else if (n < pib*1024) {
        d = (double)n/pib;
        sprintf(s,"%.2fP",d);
    } else {
        /* Too large for the suffixes above: fall back to raw bytes. */
        sprintf(s,"%lluB",n);
    }
}
7053 /* Create the string returned by the INFO command. This is decoupled
7054 * by the INFO command itself as we need to report the same information
7055 * on memory corruption problems. */
7056 static sds
genRedisInfoString(void) {
7058 time_t uptime
= time(NULL
)-server
.stat_starttime
;
7062 bytesToHuman(hmem
,zmalloc_used_memory());
7063 info
= sdscatprintf(sdsempty(),
7064 "redis_version:%s\r\n"
7066 "multiplexing_api:%s\r\n"
7067 "process_id:%ld\r\n"
7068 "uptime_in_seconds:%ld\r\n"
7069 "uptime_in_days:%ld\r\n"
7070 "connected_clients:%d\r\n"
7071 "connected_slaves:%d\r\n"
7072 "blocked_clients:%d\r\n"
7073 "used_memory:%zu\r\n"
7074 "used_memory_human:%s\r\n"
7075 "changes_since_last_save:%lld\r\n"
7076 "bgsave_in_progress:%d\r\n"
7077 "last_save_time:%ld\r\n"
7078 "bgrewriteaof_in_progress:%d\r\n"
7079 "total_connections_received:%lld\r\n"
7080 "total_commands_processed:%lld\r\n"
7081 "expired_keys:%lld\r\n"
7082 "hash_max_zipmap_entries:%ld\r\n"
7083 "hash_max_zipmap_value:%ld\r\n"
7084 "pubsub_channels:%ld\r\n"
7085 "pubsub_patterns:%u\r\n"
7089 (sizeof(long) == 8) ? "64" : "32",
7094 listLength(server
.clients
)-listLength(server
.slaves
),
7095 listLength(server
.slaves
),
7096 server
.blpop_blocked_clients
,
7097 zmalloc_used_memory(),
7100 server
.bgsavechildpid
!= -1,
7102 server
.bgrewritechildpid
!= -1,
7103 server
.stat_numconnections
,
7104 server
.stat_numcommands
,
7105 server
.stat_expiredkeys
,
7106 server
.hash_max_zipmap_entries
,
7107 server
.hash_max_zipmap_value
,
7108 dictSize(server
.pubsub_channels
),
7109 listLength(server
.pubsub_patterns
),
7110 server
.vm_enabled
!= 0,
7111 server
.masterhost
== NULL
? "master" : "slave"
7113 if (server
.masterhost
) {
7114 info
= sdscatprintf(info
,
7115 "master_host:%s\r\n"
7116 "master_port:%d\r\n"
7117 "master_link_status:%s\r\n"
7118 "master_last_io_seconds_ago:%d\r\n"
7121 (server
.replstate
== REDIS_REPL_CONNECTED
) ?
7123 server
.master
? ((int)(time(NULL
)-server
.master
->lastinteraction
)) : -1
7126 if (server
.vm_enabled
) {
7128 info
= sdscatprintf(info
,
7129 "vm_conf_max_memory:%llu\r\n"
7130 "vm_conf_page_size:%llu\r\n"
7131 "vm_conf_pages:%llu\r\n"
7132 "vm_stats_used_pages:%llu\r\n"
7133 "vm_stats_swapped_objects:%llu\r\n"
7134 "vm_stats_swappin_count:%llu\r\n"
7135 "vm_stats_swappout_count:%llu\r\n"
7136 "vm_stats_io_newjobs_len:%lu\r\n"
7137 "vm_stats_io_processing_len:%lu\r\n"
7138 "vm_stats_io_processed_len:%lu\r\n"
7139 "vm_stats_io_active_threads:%lu\r\n"
7140 "vm_stats_blocked_clients:%lu\r\n"
7141 ,(unsigned long long) server
.vm_max_memory
,
7142 (unsigned long long) server
.vm_page_size
,
7143 (unsigned long long) server
.vm_pages
,
7144 (unsigned long long) server
.vm_stats_used_pages
,
7145 (unsigned long long) server
.vm_stats_swapped_objects
,
7146 (unsigned long long) server
.vm_stats_swapins
,
7147 (unsigned long long) server
.vm_stats_swapouts
,
7148 (unsigned long) listLength(server
.io_newjobs
),
7149 (unsigned long) listLength(server
.io_processing
),
7150 (unsigned long) listLength(server
.io_processed
),
7151 (unsigned long) server
.io_active_threads
,
7152 (unsigned long) server
.vm_blocked_clients
7156 for (j
= 0; j
< server
.dbnum
; j
++) {
7157 long long keys
, vkeys
;
7159 keys
= dictSize(server
.db
[j
].dict
);
7160 vkeys
= dictSize(server
.db
[j
].expires
);
7161 if (keys
|| vkeys
) {
7162 info
= sdscatprintf(info
, "db%d:keys=%lld,expires=%lld\r\n",
7169 static void infoCommand(redisClient
*c
) {
7170 sds info
= genRedisInfoString();
7171 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",
7172 (unsigned long)sdslen(info
)));
7173 addReplySds(c
,info
);
7174 addReply(c
,shared
.crlf
);
7177 static void monitorCommand(redisClient
*c
) {
7178 /* ignore MONITOR if aleady slave or in monitor mode */
7179 if (c
->flags
& REDIS_SLAVE
) return;
7181 c
->flags
|= (REDIS_SLAVE
|REDIS_MONITOR
);
7183 listAddNodeTail(server
.monitors
,c
);
7184 addReply(c
,shared
.ok
);
7187 /* ================================= Expire ================================= */
7188 static int removeExpire(redisDb
*db
, robj
*key
) {
7189 if (dictDelete(db
->expires
,key
) == DICT_OK
) {
7196 static int setExpire(redisDb
*db
, robj
*key
, time_t when
) {
7197 if (dictAdd(db
->expires
,key
,(void*)when
) == DICT_ERR
) {
7205 /* Return the expire time of the specified key, or -1 if no expire
7206 * is associated with this key (i.e. the key is non volatile) */
7207 static time_t getExpire(redisDb
*db
, robj
*key
) {
7210 /* No expire? return ASAP */
7211 if (dictSize(db
->expires
) == 0 ||
7212 (de
= dictFind(db
->expires
,key
)) == NULL
) return -1;
7214 return (time_t) dictGetEntryVal(de
);
7217 static int expireIfNeeded(redisDb
*db
, robj
*key
) {
7221 /* No expire? return ASAP */
7222 if (dictSize(db
->expires
) == 0 ||
7223 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
7225 /* Lookup the expire */
7226 when
= (time_t) dictGetEntryVal(de
);
7227 if (time(NULL
) <= when
) return 0;
7229 /* Delete the key */
7230 dictDelete(db
->expires
,key
);
7231 server
.stat_expiredkeys
++;
7232 return dictDelete(db
->dict
,key
) == DICT_OK
;
7235 static int deleteIfVolatile(redisDb
*db
, robj
*key
) {
7238 /* No expire? return ASAP */
7239 if (dictSize(db
->expires
) == 0 ||
7240 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
7242 /* Delete the key */
7244 server
.stat_expiredkeys
++;
7245 dictDelete(db
->expires
,key
);
7246 return dictDelete(db
->dict
,key
) == DICT_OK
;
7249 static void expireGenericCommand(redisClient
*c
, robj
*key
, robj
*param
, long offset
) {
7253 if (getLongFromObjectOrReply(c
, param
, &seconds
, NULL
) != REDIS_OK
) return;
7257 de
= dictFind(c
->db
->dict
,key
);
7259 addReply(c
,shared
.czero
);
7263 if (deleteKey(c
->db
,key
)) server
.dirty
++;
7264 addReply(c
, shared
.cone
);
7267 time_t when
= time(NULL
)+seconds
;
7268 if (setExpire(c
->db
,key
,when
)) {
7269 addReply(c
,shared
.cone
);
7272 addReply(c
,shared
.czero
);
7278 static void expireCommand(redisClient
*c
) {
7279 expireGenericCommand(c
,c
->argv
[1],c
->argv
[2],0);
7282 static void expireatCommand(redisClient
*c
) {
7283 expireGenericCommand(c
,c
->argv
[1],c
->argv
[2],time(NULL
));
7286 static void ttlCommand(redisClient
*c
) {
7290 expire
= getExpire(c
->db
,c
->argv
[1]);
7292 ttl
= (int) (expire
-time(NULL
));
7293 if (ttl
< 0) ttl
= -1;
7295 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",ttl
));
7298 /* ================================ MULTI/EXEC ============================== */
7300 /* Client state initialization for MULTI/EXEC */
7301 static void initClientMultiState(redisClient
*c
) {
7302 c
->mstate
.commands
= NULL
;
7303 c
->mstate
.count
= 0;
7306 /* Release all the resources associated with MULTI/EXEC state */
7307 static void freeClientMultiState(redisClient
*c
) {
7310 for (j
= 0; j
< c
->mstate
.count
; j
++) {
7312 multiCmd
*mc
= c
->mstate
.commands
+j
;
7314 for (i
= 0; i
< mc
->argc
; i
++)
7315 decrRefCount(mc
->argv
[i
]);
7318 zfree(c
->mstate
.commands
);
7321 /* Add a new command into the MULTI commands queue */
7322 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
) {
7326 c
->mstate
.commands
= zrealloc(c
->mstate
.commands
,
7327 sizeof(multiCmd
)*(c
->mstate
.count
+1));
7328 mc
= c
->mstate
.commands
+c
->mstate
.count
;
7331 mc
->argv
= zmalloc(sizeof(robj
*)*c
->argc
);
7332 memcpy(mc
->argv
,c
->argv
,sizeof(robj
*)*c
->argc
);
7333 for (j
= 0; j
< c
->argc
; j
++)
7334 incrRefCount(mc
->argv
[j
]);
7338 static void multiCommand(redisClient
*c
) {
7339 c
->flags
|= REDIS_MULTI
;
7340 addReply(c
,shared
.ok
);
7343 static void discardCommand(redisClient
*c
) {
7344 if (!(c
->flags
& REDIS_MULTI
)) {
7345 addReplySds(c
,sdsnew("-ERR DISCARD without MULTI\r\n"));
7349 freeClientMultiState(c
);
7350 initClientMultiState(c
);
7351 c
->flags
&= (~REDIS_MULTI
);
7352 addReply(c
,shared
.ok
);
7355 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7356 * implememntation for more information. */
7357 static void execCommandReplicateMulti(redisClient
*c
) {
7358 struct redisCommand
*cmd
;
7359 robj
*multistring
= createStringObject("MULTI",5);
7361 cmd
= lookupCommand("multi");
7362 if (server
.appendonly
)
7363 feedAppendOnlyFile(cmd
,c
->db
->id
,&multistring
,1);
7364 if (listLength(server
.slaves
))
7365 replicationFeedSlaves(server
.slaves
,c
->db
->id
,&multistring
,1);
7366 decrRefCount(multistring
);
7369 static void execCommand(redisClient
*c
) {
7374 if (!(c
->flags
& REDIS_MULTI
)) {
7375 addReplySds(c
,sdsnew("-ERR EXEC without MULTI\r\n"));
7379 /* Replicate a MULTI request now that we are sure the block is executed.
7380 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7381 * both the AOF and the replication link will have the same consistency
7382 * and atomicity guarantees. */
7383 execCommandReplicateMulti(c
);
7385 /* Exec all the queued commands */
7386 orig_argv
= c
->argv
;
7387 orig_argc
= c
->argc
;
7388 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->mstate
.count
));
7389 for (j
= 0; j
< c
->mstate
.count
; j
++) {
7390 c
->argc
= c
->mstate
.commands
[j
].argc
;
7391 c
->argv
= c
->mstate
.commands
[j
].argv
;
7392 call(c
,c
->mstate
.commands
[j
].cmd
);
7394 c
->argv
= orig_argv
;
7395 c
->argc
= orig_argc
;
7396 freeClientMultiState(c
);
7397 initClientMultiState(c
);
7398 c
->flags
&= (~REDIS_MULTI
);
7399 /* Make sure the EXEC command is always replicated / AOF, since we
7400 * always send the MULTI command (we can't know beforehand if the
7401 * next operations will contain at least a modification to the DB). */
7405 /* =========================== Blocking Operations ========================= */
7407 /* Currently Redis blocking operations support is limited to list POP ops,
7408 * so the current implementation is not fully generic, but it is also not
7409 * completely specific so it will not require a rewrite to support new
7410 * kind of blocking operations in the future.
7412 * Still it's important to note that list blocking operations can be already
7413 * used as a notification mechanism in order to implement other blocking
7414 * operations at application level, so there must be a very strong evidence
7415 * of usefulness and generality before new blocking operations are implemented.
7417 * This is how the current blocking POP works, we use BLPOP as example:
7418 * - If the user calls BLPOP and the key exists and contains a non empty list
7419 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7420 * if there is no need to block.
7421 * - If instead BLPOP is called and the key does not exist or the list is
7422 * empty we need to block. In order to do so we remove the notification for
7423 * new data to read in the client socket (so that we'll not serve new
7424 * requests if the blocking request is not served). Also we put the client
7425 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
7426 * blocking for these keys.
7427 * - If a PUSH operation against a key with blocked clients waiting is
7428 * performed, we serve the first in the list: basically instead of pushing
7429 * the new element inside the list we return it to the (first / oldest)
7430 * blocking client, unblock the client, and remove it from the list.
7432 * The above comment and the source code should be enough in order to understand
7433 * the implementation and modify / fix it later.
7436 /* Set a client in blocking mode for the specified key, with the specified
7438 static void blockForKeys(redisClient
*c
, robj
**keys
, int numkeys
, time_t timeout
) {
7443 c
->blockingkeys
= zmalloc(sizeof(robj
*)*numkeys
);
7444 c
->blockingkeysnum
= numkeys
;
7445 c
->blockingto
= timeout
;
7446 for (j
= 0; j
< numkeys
; j
++) {
7447 /* Add the key in the client structure, to map clients -> keys */
7448 c
->blockingkeys
[j
] = keys
[j
];
7449 incrRefCount(keys
[j
]);
7451 /* And in the other "side", to map keys -> clients */
7452 de
= dictFind(c
->db
->blockingkeys
,keys
[j
]);
7456 /* For every key we take a list of clients blocked for it */
7458 retval
= dictAdd(c
->db
->blockingkeys
,keys
[j
],l
);
7459 incrRefCount(keys
[j
]);
7460 assert(retval
== DICT_OK
);
7462 l
= dictGetEntryVal(de
);
7464 listAddNodeTail(l
,c
);
7466 /* Mark the client as a blocked client */
7467 c
->flags
|= REDIS_BLOCKED
;
7468 server
.blpop_blocked_clients
++;
7471 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7472 static void unblockClientWaitingData(redisClient
*c
) {
7477 assert(c
->blockingkeys
!= NULL
);
7478 /* The client may wait for multiple keys, so unblock it for every key. */
7479 for (j
= 0; j
< c
->blockingkeysnum
; j
++) {
7480 /* Remove this client from the list of clients waiting for this key. */
7481 de
= dictFind(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
7483 l
= dictGetEntryVal(de
);
7484 listDelNode(l
,listSearchKey(l
,c
));
7485 /* If the list is empty we need to remove it to avoid wasting memory */
7486 if (listLength(l
) == 0)
7487 dictDelete(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
7488 decrRefCount(c
->blockingkeys
[j
]);
7490 /* Cleanup the client structure */
7491 zfree(c
->blockingkeys
);
7492 c
->blockingkeys
= NULL
;
7493 c
->flags
&= (~REDIS_BLOCKED
);
7494 server
.blpop_blocked_clients
--;
7495 /* We want to process data if there is some command waiting
7496 * in the input buffer. Note that this is safe even if
7497 * unblockClientWaitingData() gets called from freeClient() because
7498 * freeClient() will be smart enough to call this function
7499 * *after* c->querybuf was set to NULL. */
7500 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0) processInputBuffer(c
);
7503 /* This should be called from any function PUSHing into lists.
7504 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7505 * 'ele' is the element pushed.
7507 * If the function returns 0 there was no client waiting for a list push
7510 * If the function returns 1 there was a client waiting for a list push
7511 * against this key, the element was passed to this client thus it's not
7512 * needed to actually add it to the list and the caller should return asap. */
7513 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
) {
7514 struct dictEntry
*de
;
7515 redisClient
*receiver
;
7519 de
= dictFind(c
->db
->blockingkeys
,key
);
7520 if (de
== NULL
) return 0;
7521 l
= dictGetEntryVal(de
);
7524 receiver
= ln
->value
;
7526 addReplySds(receiver
,sdsnew("*2\r\n"));
7527 addReplyBulk(receiver
,key
);
7528 addReplyBulk(receiver
,ele
);
7529 unblockClientWaitingData(receiver
);
7533 /* Blocking RPOP/LPOP */
7534 static void blockingPopGenericCommand(redisClient
*c
, int where
) {
7539 for (j
= 1; j
< c
->argc
-1; j
++) {
7540 o
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
7542 if (o
->type
!= REDIS_LIST
) {
7543 addReply(c
,shared
.wrongtypeerr
);
7546 list
*list
= o
->ptr
;
7547 if (listLength(list
) != 0) {
7548 /* If the list contains elements fall back to the usual
7549 * non-blocking POP operation */
7550 robj
*argv
[2], **orig_argv
;
7553 /* We need to alter the command arguments before to call
7554 * popGenericCommand() as the command takes a single key. */
7555 orig_argv
= c
->argv
;
7556 orig_argc
= c
->argc
;
7557 argv
[1] = c
->argv
[j
];
7561 /* Also the return value is different, we need to output
7562 * the multi bulk reply header and the key name. The
7563 * "real" command will add the last element (the value)
7564 * for us. If this souds like an hack to you it's just
7565 * because it is... */
7566 addReplySds(c
,sdsnew("*2\r\n"));
7567 addReplyBulk(c
,argv
[1]);
7568 popGenericCommand(c
,where
);
7570 /* Fix the client structure with the original stuff */
7571 c
->argv
= orig_argv
;
7572 c
->argc
= orig_argc
;
7578 /* If the list is empty or the key does not exists we must block */
7579 timeout
= strtol(c
->argv
[c
->argc
-1]->ptr
,NULL
,10);
7580 if (timeout
> 0) timeout
+= time(NULL
);
7581 blockForKeys(c
,c
->argv
+1,c
->argc
-2,timeout
);
7584 static void blpopCommand(redisClient
*c
) {
7585 blockingPopGenericCommand(c
,REDIS_HEAD
);
7588 static void brpopCommand(redisClient
*c
) {
7589 blockingPopGenericCommand(c
,REDIS_TAIL
);
7592 /* =============================== Replication ============================= */
7594 static int syncWrite(int fd
, char *ptr
, ssize_t size
, int timeout
) {
7595 ssize_t nwritten
, ret
= size
;
7596 time_t start
= time(NULL
);
7600 if (aeWait(fd
,AE_WRITABLE
,1000) & AE_WRITABLE
) {
7601 nwritten
= write(fd
,ptr
,size
);
7602 if (nwritten
== -1) return -1;
7606 if ((time(NULL
)-start
) > timeout
) {
7614 static int syncRead(int fd
, char *ptr
, ssize_t size
, int timeout
) {
7615 ssize_t nread
, totread
= 0;
7616 time_t start
= time(NULL
);
7620 if (aeWait(fd
,AE_READABLE
,1000) & AE_READABLE
) {
7621 nread
= read(fd
,ptr
,size
);
7622 if (nread
== -1) return -1;
7627 if ((time(NULL
)-start
) > timeout
) {
/* Read a line from 'fd' one byte at a time, storing at most size-1
 * characters plus the nul terminator into 'ptr'. The trailing "\n" (and a
 * "\r" right before it, if any) is not stored.
 *
 * Returns the number of characters stored before the terminator, or -1 if
 * syncRead() fails. A line longer than the buffer is truncated: reading
 * stops as soon as the buffer is full. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve one byte for the nul terminator. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One more byte of the buffer is used. Without this the loop
         * never stops on long lines and writes past the buffer. */
        size--;
    }
    return nread;
}
7656 static void syncCommand(redisClient
*c
) {
7657 /* ignore SYNC if aleady slave or in monitor mode */
7658 if (c
->flags
& REDIS_SLAVE
) return;
7660 /* SYNC can't be issued when the server has pending data to send to
7661 * the client about already issued commands. We need a fresh reply
7662 * buffer registering the differences between the BGSAVE and the current
7663 * dataset, so that we can copy to other slaves if needed. */
7664 if (listLength(c
->reply
) != 0) {
7665 addReplySds(c
,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7669 redisLog(REDIS_NOTICE
,"Slave ask for synchronization");
7670 /* Here we need to check if there is a background saving operation
7671 * in progress, or if it is required to start one */
7672 if (server
.bgsavechildpid
!= -1) {
7673 /* Ok a background save is in progress. Let's check if it is a good
7674 * one for replication, i.e. if there is another slave that is
7675 * registering differences since the server forked to save */
7680 listRewind(server
.slaves
,&li
);
7681 while((ln
= listNext(&li
))) {
7683 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) break;
7686 /* Perfect, the server is already registering differences for
7687 * another slave. Set the right state, and copy the buffer. */
7688 listRelease(c
->reply
);
7689 c
->reply
= listDup(slave
->reply
);
7690 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
7691 redisLog(REDIS_NOTICE
,"Waiting for end of BGSAVE for SYNC");
7693 /* No way, we need to wait for the next BGSAVE in order to
7694 * register differences */
7695 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
7696 redisLog(REDIS_NOTICE
,"Waiting for next BGSAVE for SYNC");
7699 /* Ok we don't have a BGSAVE in progress, let's start one */
7700 redisLog(REDIS_NOTICE
,"Starting BGSAVE for SYNC");
7701 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
7702 redisLog(REDIS_NOTICE
,"Replication failed, can't BGSAVE");
7703 addReplySds(c
,sdsnew("-ERR Unalbe to perform background save\r\n"));
7706 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
7709 c
->flags
|= REDIS_SLAVE
;
7711 listAddNodeTail(server
.slaves
,c
);
7715 static void sendBulkToSlave(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
7716 redisClient
*slave
= privdata
;
7718 REDIS_NOTUSED(mask
);
7719 char buf
[REDIS_IOBUF_LEN
];
7720 ssize_t nwritten
, buflen
;
7722 if (slave
->repldboff
== 0) {
7723 /* Write the bulk write count before transferring the DB. In theory here
7724 * we don't know how much room there is in the output buffer of the
7725 * socket, but in practice SO_SNDLOWAT (the minimum count for output
7726 * operations) will never be smaller than the few bytes we need. */
7729 bulkcount
= sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7731 if (write(fd
,bulkcount
,sdslen(bulkcount
)) != (signed)sdslen(bulkcount
))
7739 lseek(slave
->repldbfd
,slave
->repldboff
,SEEK_SET
);
7740 buflen
= read(slave
->repldbfd
,buf
,REDIS_IOBUF_LEN
);
7742 redisLog(REDIS_WARNING
,"Read error sending DB to slave: %s",
7743 (buflen
== 0) ? "premature EOF" : strerror(errno
));
7747 if ((nwritten
= write(fd
,buf
,buflen
)) == -1) {
7748 redisLog(REDIS_VERBOSE
,"Write error sending DB to slave: %s",
7753 slave
->repldboff
+= nwritten
;
7754 if (slave
->repldboff
== slave
->repldbsize
) {
7755 close(slave
->repldbfd
);
7756 slave
->repldbfd
= -1;
7757 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
7758 slave
->replstate
= REDIS_REPL_ONLINE
;
7759 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
,
7760 sendReplyToClient
, slave
) == AE_ERR
) {
7764 addReplySds(slave
,sdsempty());
7765 redisLog(REDIS_NOTICE
,"Synchronization with slave succeeded");
7769 /* This function is called at the end of every background saving.
7770 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7771 * otherwise REDIS_ERR is passed to the function.
7773 * The goal of this function is to handle slaves waiting for a successful
7774 * background saving in order to perform non-blocking synchronization. */
7775 static void updateSlavesWaitingBgsave(int bgsaveerr
) {
7777 int startbgsave
= 0;
7780 listRewind(server
.slaves
,&li
);
7781 while((ln
= listNext(&li
))) {
7782 redisClient
*slave
= ln
->value
;
7784 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) {
7786 slave
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
7787 } else if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) {
7788 struct redis_stat buf
;
7790 if (bgsaveerr
!= REDIS_OK
) {
7792 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE child returned an error");
7795 if ((slave
->repldbfd
= open(server
.dbfilename
,O_RDONLY
)) == -1 ||
7796 redis_fstat(slave
->repldbfd
,&buf
) == -1) {
7798 redisLog(REDIS_WARNING
,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno
));
7801 slave
->repldboff
= 0;
7802 slave
->repldbsize
= buf
.st_size
;
7803 slave
->replstate
= REDIS_REPL_SEND_BULK
;
7804 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
7805 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
, sendBulkToSlave
, slave
) == AE_ERR
) {
7812 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
7815 listRewind(server
.slaves
,&li
);
7816 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE failed");
7817 while((ln
= listNext(&li
))) {
7818 redisClient
*slave
= ln
->value
;
7820 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
)
7827 static int syncWithMaster(void) {
7828 char buf
[1024], tmpfile
[256], authcmd
[1024];
7830 int fd
= anetTcpConnect(NULL
,server
.masterhost
,server
.masterport
);
7831 int dfd
, maxtries
= 5;
7834 redisLog(REDIS_WARNING
,"Unable to connect to MASTER: %s",
7839 /* AUTH with the master if required. */
7840 if(server
.masterauth
) {
7841 snprintf(authcmd
, 1024, "AUTH %s\r\n", server
.masterauth
);
7842 if (syncWrite(fd
, authcmd
, strlen(server
.masterauth
)+7, 5) == -1) {
7844 redisLog(REDIS_WARNING
,"Unable to AUTH to MASTER: %s",
7848 /* Read the AUTH result. */
7849 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
7851 redisLog(REDIS_WARNING
,"I/O error reading auth result from MASTER: %s",
7855 if (buf
[0] != '+') {
7857 redisLog(REDIS_WARNING
,"Cannot AUTH to MASTER, is the masterauth password correct?");
7862 /* Issue the SYNC command */
7863 if (syncWrite(fd
,"SYNC \r\n",7,5) == -1) {
7865 redisLog(REDIS_WARNING
,"I/O error writing to MASTER: %s",
7869 /* Read the bulk write count */
7870 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
7872 redisLog(REDIS_WARNING
,"I/O error reading bulk count from MASTER: %s",
7876 if (buf
[0] != '$') {
7878 redisLog(REDIS_WARNING
,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7881 dumpsize
= strtol(buf
+1,NULL
,10);
7882 redisLog(REDIS_NOTICE
,"Receiving %ld bytes data dump from MASTER",dumpsize
);
7883 /* Read the bulk write data on a temp file */
7885 snprintf(tmpfile
,256,
7886 "temp-%d.%ld.rdb",(int)time(NULL
),(long int)getpid());
7887 dfd
= open(tmpfile
,O_CREAT
|O_WRONLY
|O_EXCL
,0644);
7888 if (dfd
!= -1) break;
7893 redisLog(REDIS_WARNING
,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno
));
7897 int nread
, nwritten
;
7899 nread
= read(fd
,buf
,(dumpsize
< 1024)?dumpsize
:1024);
7901 redisLog(REDIS_WARNING
,"I/O error trying to sync with MASTER: %s",
7907 nwritten
= write(dfd
,buf
,nread
);
7908 if (nwritten
== -1) {
7909 redisLog(REDIS_WARNING
,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno
));
7917 if (rename(tmpfile
,server
.dbfilename
) == -1) {
7918 redisLog(REDIS_WARNING
,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno
));
7924 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
7925 redisLog(REDIS_WARNING
,"Failed trying to load the MASTER synchronization DB from disk");
7929 server
.master
= createClient(fd
);
7930 server
.master
->flags
|= REDIS_MASTER
;
7931 server
.master
->authenticated
= 1;
7932 server
.replstate
= REDIS_REPL_CONNECTED
;
7936 static void slaveofCommand(redisClient
*c
) {
7937 if (!strcasecmp(c
->argv
[1]->ptr
,"no") &&
7938 !strcasecmp(c
->argv
[2]->ptr
,"one")) {
7939 if (server
.masterhost
) {
7940 sdsfree(server
.masterhost
);
7941 server
.masterhost
= NULL
;
7942 if (server
.master
) freeClient(server
.master
);
7943 server
.replstate
= REDIS_REPL_NONE
;
7944 redisLog(REDIS_NOTICE
,"MASTER MODE enabled (user request)");
7947 sdsfree(server
.masterhost
);
7948 server
.masterhost
= sdsdup(c
->argv
[1]->ptr
);
7949 server
.masterport
= atoi(c
->argv
[2]->ptr
);
7950 if (server
.master
) freeClient(server
.master
);
7951 server
.replstate
= REDIS_REPL_CONNECT
;
7952 redisLog(REDIS_NOTICE
,"SLAVE OF %s:%d enabled (user request)",
7953 server
.masterhost
, server
.masterport
);
7955 addReply(c
,shared
.ok
);
7958 /* ============================ Maxmemory directive ======================== */
7960 /* Try to free one object from the pre-allocated objects free list.
7961 * This is useful under low mem conditions as by default we take 1 million
7962 * free objects allocated. On success REDIS_OK is returned, otherwise
7964 static int tryFreeOneObjectFromFreelist(void) {
7967 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
7968 if (listLength(server
.objfreelist
)) {
7969 listNode
*head
= listFirst(server
.objfreelist
);
7970 o
= listNodeValue(head
);
7971 listDelNode(server
.objfreelist
,head
);
7972 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
7976 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
7981 /* This function gets called when 'maxmemory' is set on the config file to limit
7982 * the max memory used by the server, and we are out of memory.
7983 * This function will try to, in order:
7985 * - Free objects from the free list
7986 * - Try to remove keys with an EXPIRE set
7988 * If it is not possible to free enough memory to reach used-memory < maxmemory,
7989 * the server will start refusing commands that will enlarge even more the
7992 static void freeMemoryIfNeeded(void) {
7993 while (server
.maxmemory
&& zmalloc_used_memory() > server
.maxmemory
) {
7994 int j
, k
, freed
= 0;
7996 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
7997 for (j
= 0; j
< server
.dbnum
; j
++) {
7999 robj
*minkey
= NULL
;
8000 struct dictEntry
*de
;
8002 if (dictSize(server
.db
[j
].expires
)) {
8004 /* From a sample of three keys drop the one nearest to
8005 * the natural expire */
8006 for (k
= 0; k
< 3; k
++) {
8009 de
= dictGetRandomKey(server
.db
[j
].expires
);
8010 t
= (time_t) dictGetEntryVal(de
);
8011 if (minttl
== -1 || t
< minttl
) {
8012 minkey
= dictGetEntryKey(de
);
8016 deleteKey(server
.db
+j
,minkey
);
8019 if (!freed
) return; /* nothing to free... */
8023 /* ============================== Append Only file ========================== */
8025 /* Write the append only file buffer on disk.
8027 * Since we are required to write the AOF before replying to the client,
8028 * and the only way the client socket can get a write is when entering
8029 * the event loop, we accumulate all the AOF writes in a memory
8030 * buffer and write it on disk using this function just before entering
8031 * the event loop again. */
8032 static void flushAppendOnlyFile(void) {
8036 if (sdslen(server
.aofbuf
) == 0) return;
8038 /* We want to perform a single write. This should be guaranteed atomic
8039 * at least if the filesystem we are writing is a real physical one.
8040 * While this will save us against the server being killed I don't think
8041 * there is much to do about the whole server stopping for power problems
8043 nwritten
= write(server
.appendfd
,server
.aofbuf
,sdslen(server
.aofbuf
));
8044 if (nwritten
!= (signed)sdslen(server
.aofbuf
)) {
8045 /* Ooops, we are in troubles. The best thing to do for now is
8046 * aborting instead of giving the illusion that everything is
8047 * working as expected. */
8048 if (nwritten
== -1) {
8049 redisLog(REDIS_WARNING
,"Exiting on error writing to the append-only file: %s",strerror(errno
));
8051 redisLog(REDIS_WARNING
,"Exiting on short write while writing to the append-only file: %s",strerror(errno
));
8055 sdsfree(server
.aofbuf
);
8056 server
.aofbuf
= sdsempty();
8058 /* Fsync if needed */
8060 if (server
.appendfsync
== APPENDFSYNC_ALWAYS
||
8061 (server
.appendfsync
== APPENDFSYNC_EVERYSEC
&&
8062 now
-server
.lastfsync
> 1))
8064 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8065 * flushing metadata. */
8066 aof_fsync(server
.appendfd
); /* Let's try to get this data on the disk */
8067 server
.lastfsync
= now
;
8071 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
8072 sds buf
= sdsempty();
8076 /* The DB this command was targeting is not the same as the last command
8077 * we appended. Issuing a SELECT command is needed. */
8078 if (dictid
!= server
.appendseldb
) {
8081 snprintf(seldb
,sizeof(seldb
),"%d",dictid
);
8082 buf
= sdscatprintf(buf
,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8083 (unsigned long)strlen(seldb
),seldb
);
8084 server
.appendseldb
= dictid
;
8087 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
8088 * EXPIREs into EXPIREATs calls */
8089 if (cmd
->proc
== expireCommand
) {
8092 tmpargv
[0] = createStringObject("EXPIREAT",8);
8093 tmpargv
[1] = argv
[1];
8094 incrRefCount(argv
[1]);
8095 when
= time(NULL
)+strtol(argv
[2]->ptr
,NULL
,10);
8096 tmpargv
[2] = createObject(REDIS_STRING
,
8097 sdscatprintf(sdsempty(),"%ld",when
));
8101 /* Append the actual command */
8102 buf
= sdscatprintf(buf
,"*%d\r\n",argc
);
8103 for (j
= 0; j
< argc
; j
++) {
8106 o
= getDecodedObject(o
);
8107 buf
= sdscatprintf(buf
,"$%lu\r\n",(unsigned long)sdslen(o
->ptr
));
8108 buf
= sdscatlen(buf
,o
->ptr
,sdslen(o
->ptr
));
8109 buf
= sdscatlen(buf
,"\r\n",2);
8113 /* Free the objects from the modified argv for EXPIREAT */
8114 if (cmd
->proc
== expireCommand
) {
8115 for (j
= 0; j
< 3; j
++)
8116 decrRefCount(argv
[j
]);
8119 /* Append to the AOF buffer. This will be flushed on disk just before
8120 * re-entering the event loop, so before the client will get a
8121 * positive reply about the operation performed. */
8122 server
.aofbuf
= sdscatlen(server
.aofbuf
,buf
,sdslen(buf
));
8124 /* If a background append only file rewriting is in progress we want to
8125 * accumulate the differences between the child DB and the current one
8126 * in a buffer, so that when the child process will do its work we
8127 * can append the differences to the new append only file. */
8128 if (server
.bgrewritechildpid
!= -1)
8129 server
.bgrewritebuf
= sdscatlen(server
.bgrewritebuf
,buf
,sdslen(buf
));
8134 /* In Redis commands are always executed in the context of a client, so in
8135 * order to load the append only file we need to create a fake client. */
8136 static struct redisClient
*createFakeClient(void) {
8137 struct redisClient
*c
= zmalloc(sizeof(*c
));
8141 c
->querybuf
= sdsempty();
8145 /* We set the fake client as a slave waiting for the synchronization
8146 * so that Redis will not try to send replies to this client. */
8147 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
8148 c
->reply
= listCreate();
8149 listSetFreeMethod(c
->reply
,decrRefCount
);
8150 listSetDupMethod(c
->reply
,dupClientReplyValue
);
8151 initClientMultiState(c
);
8155 static void freeFakeClient(struct redisClient
*c
) {
8156 sdsfree(c
->querybuf
);
8157 listRelease(c
->reply
);
8158 freeClientMultiState(c
);
8162 /* Replay the append log file. On success REDIS_OK is returned. On non fatal
8163 * error (the append only file is zero-length) REDIS_ERR is returned. On
8164 * fatal error an error message is logged and the program exits. */
8165 int loadAppendOnlyFile(char *filename
) {
8166 struct redisClient
*fakeClient
;
8167 FILE *fp
= fopen(filename
,"r");
8168 struct redis_stat sb
;
8169 unsigned long long loadedkeys
= 0;
8170 int appendonly
= server
.appendonly
;
8172 if (redis_fstat(fileno(fp
),&sb
) != -1 && sb
.st_size
== 0)
8176 redisLog(REDIS_WARNING
,"Fatal error: can't open the append log file for reading: %s",strerror(errno
));
8180 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8181 * to the same file we're about to read. */
8182 server
.appendonly
= 0;
8184 fakeClient
= createFakeClient();
8191 struct redisCommand
*cmd
;
8193 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) {
8199 if (buf
[0] != '*') goto fmterr
;
8201 argv
= zmalloc(sizeof(robj
*)*argc
);
8202 for (j
= 0; j
< argc
; j
++) {
8203 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) goto readerr
;
8204 if (buf
[0] != '$') goto fmterr
;
8205 len
= strtol(buf
+1,NULL
,10);
8206 argsds
= sdsnewlen(NULL
,len
);
8207 if (len
&& fread(argsds
,len
,1,fp
) == 0) goto fmterr
;
8208 argv
[j
] = createObject(REDIS_STRING
,argsds
);
8209 if (fread(buf
,2,1,fp
) == 0) goto fmterr
; /* discard CRLF */
8212 /* Command lookup */
8213 cmd
= lookupCommand(argv
[0]->ptr
);
8215 redisLog(REDIS_WARNING
,"Unknown command '%s' reading the append only file", argv
[0]->ptr
);
8218 /* Try object encoding */
8219 if (cmd
->flags
& REDIS_CMD_BULK
)
8220 argv
[argc
-1] = tryObjectEncoding(argv
[argc
-1]);
8221 /* Run the command in the context of a fake client */
8222 fakeClient
->argc
= argc
;
8223 fakeClient
->argv
= argv
;
8224 cmd
->proc(fakeClient
);
8225 /* Discard the reply objects list from the fake client */
8226 while(listLength(fakeClient
->reply
))
8227 listDelNode(fakeClient
->reply
,listFirst(fakeClient
->reply
));
8228 /* Clean up, ready for the next command */
8229 for (j
= 0; j
< argc
; j
++) decrRefCount(argv
[j
]);
8231 /* Handle swapping while loading big datasets when VM is on */
8233 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
8234 while (zmalloc_used_memory() > server
.vm_max_memory
) {
8235 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
8240 /* This point can only be reached when EOF is reached without errors.
8241 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8242 if (fakeClient
->flags
& REDIS_MULTI
) goto readerr
;
8245 freeFakeClient(fakeClient
);
8246 server
.appendonly
= appendonly
;
8251 redisLog(REDIS_WARNING
,"Unexpected end of file reading the append only file");
8253 redisLog(REDIS_WARNING
,"Unrecoverable error reading the append only file: %s", strerror(errno
));
8257 redisLog(REDIS_WARNING
,"Bad file format reading the append only file");
8261 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8262 static int fwriteBulkObject(FILE *fp
, robj
*obj
) {
8266 /* Avoid the incr/decr ref count business if possible to help
8267 * copy-on-write (we are often in a child process when this function
8269 * Also makes sure that key objects don't get incrRefCount-ed when VM
8271 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
8272 obj
= getDecodedObject(obj
);
8275 snprintf(buf
,sizeof(buf
),"$%ld\r\n",(long)sdslen(obj
->ptr
));
8276 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) goto err
;
8277 if (sdslen(obj
->ptr
) && fwrite(obj
->ptr
,sdslen(obj
->ptr
),1,fp
) == 0)
8279 if (fwrite("\r\n",2,1,fp
) == 0) goto err
;
8280 if (decrrc
) decrRefCount(obj
);
8283 if (decrrc
) decrRefCount(obj
);
8287 /* Write binary-safe string into a file in the bulk format
8288 * $<count>\r\n<payload>\r\n */
8289 static int fwriteBulkString(FILE *fp
, char *s
, unsigned long len
) {
8292 snprintf(buf
,sizeof(buf
),"$%ld\r\n",(unsigned long)len
);
8293 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) return 0;
8294 if (len
&& fwrite(s
,len
,1,fp
) == 0) return 0;
8295 if (fwrite("\r\n",2,1,fp
) == 0) return 0;
8299 /* Write a double value in bulk format $<count>\r\n<payload>\r\n */
8300 static int fwriteBulkDouble(FILE *fp
, double d
) {
8301 char buf
[128], dbuf
[128];
8303 snprintf(dbuf
,sizeof(dbuf
),"%.17g\r\n",d
);
8304 snprintf(buf
,sizeof(buf
),"$%lu\r\n",(unsigned long)strlen(dbuf
)-2);
8305 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) return 0;
8306 if (fwrite(dbuf
,strlen(dbuf
),1,fp
) == 0) return 0;
8310 /* Write a long value in bulk format $<count>\r\n<payload>\r\n */
8311 static int fwriteBulkLong(FILE *fp
, long l
) {
8312 char buf
[128], lbuf
[128];
8314 snprintf(lbuf
,sizeof(lbuf
),"%ld\r\n",l
);
8315 snprintf(buf
,sizeof(buf
),"$%lu\r\n",(unsigned long)strlen(lbuf
)-2);
8316 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) return 0;
8317 if (fwrite(lbuf
,strlen(lbuf
),1,fp
) == 0) return 0;
8321 /* Write a sequence of commands able to fully rebuild the dataset into
8322 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8323 static int rewriteAppendOnlyFile(char *filename
) {
8324 dictIterator
*di
= NULL
;
8329 time_t now
= time(NULL
);
8331 /* Note that we have to use a different temp name here compared to the
8332 * one used by rewriteAppendOnlyFileBackground() function. */
8333 snprintf(tmpfile
,256,"temp-rewriteaof-%d.aof", (int) getpid());
8334 fp
= fopen(tmpfile
,"w");
8336 redisLog(REDIS_WARNING
, "Failed rewriting the append only file: %s", strerror(errno
));
8339 for (j
= 0; j
< server
.dbnum
; j
++) {
8340 char selectcmd
[] = "*2\r\n$6\r\nSELECT\r\n";
8341 redisDb
*db
= server
.db
+j
;
8343 if (dictSize(d
) == 0) continue;
8344 di
= dictGetIterator(d
);
8350 /* SELECT the new DB */
8351 if (fwrite(selectcmd
,sizeof(selectcmd
)-1,1,fp
) == 0) goto werr
;
8352 if (fwriteBulkLong(fp
,j
) == 0) goto werr
;
8354 /* Iterate this DB writing every entry */
8355 while((de
= dictNext(di
)) != NULL
) {
8360 key
= dictGetEntryKey(de
);
8361 /* If the value for this key is swapped, load a preview in memory.
8362 * We use a "swapped" flag to remember if we need to free the
8363 * value object instead of just incrementing the ref count anyway
8364 * in order to avoid copy-on-write of pages if we are forked() */
8365 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
8366 key
->storage
== REDIS_VM_SWAPPING
) {
8367 o
= dictGetEntryVal(de
);
8370 o
= vmPreviewObject(key
);
8373 expiretime
= getExpire(db
,key
);
8375 /* Save the key and associated value */
8376 if (o
->type
== REDIS_STRING
) {
8377 /* Emit a SET command */
8378 char cmd
[]="*3\r\n$3\r\nSET\r\n";
8379 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8381 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8382 if (fwriteBulkObject(fp
,o
) == 0) goto werr
;
8383 } else if (o
->type
== REDIS_LIST
) {
8384 /* Emit the RPUSHes needed to rebuild the list */
8385 list
*list
= o
->ptr
;
8389 listRewind(list
,&li
);
8390 while((ln
= listNext(&li
))) {
8391 char cmd
[]="*3\r\n$5\r\nRPUSH\r\n";
8392 robj
*eleobj
= listNodeValue(ln
);
8394 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8395 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8396 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
8398 } else if (o
->type
== REDIS_SET
) {
8399 /* Emit the SADDs needed to rebuild the set */
8401 dictIterator
*di
= dictGetIterator(set
);
8404 while((de
= dictNext(di
)) != NULL
) {
8405 char cmd
[]="*3\r\n$4\r\nSADD\r\n";
8406 robj
*eleobj
= dictGetEntryKey(de
);
8408 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8409 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8410 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
8412 dictReleaseIterator(di
);
8413 } else if (o
->type
== REDIS_ZSET
) {
8414 /* Emit the ZADDs needed to rebuild the sorted set */
8416 dictIterator
*di
= dictGetIterator(zs
->dict
);
8419 while((de
= dictNext(di
)) != NULL
) {
8420 char cmd
[]="*4\r\n$4\r\nZADD\r\n";
8421 robj
*eleobj
= dictGetEntryKey(de
);
8422 double *score
= dictGetEntryVal(de
);
8424 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8425 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8426 if (fwriteBulkDouble(fp
,*score
) == 0) goto werr
;
8427 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
8429 dictReleaseIterator(di
);
8430 } else if (o
->type
== REDIS_HASH
) {
8431 char cmd
[]="*4\r\n$4\r\nHSET\r\n";
8433 /* Emit the HSETs needed to rebuild the hash */
8434 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
8435 unsigned char *p
= zipmapRewind(o
->ptr
);
8436 unsigned char *field
, *val
;
8437 unsigned int flen
, vlen
;
8439 while((p
= zipmapNext(p
,&field
,&flen
,&val
,&vlen
)) != NULL
) {
8440 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8441 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8442 if (fwriteBulkString(fp
,(char*)field
,flen
) == -1)
8444 if (fwriteBulkString(fp
,(char*)val
,vlen
) == -1)
8448 dictIterator
*di
= dictGetIterator(o
->ptr
);
8451 while((de
= dictNext(di
)) != NULL
) {
8452 robj
*field
= dictGetEntryKey(de
);
8453 robj
*val
= dictGetEntryVal(de
);
8455 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8456 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8457 if (fwriteBulkObject(fp
,field
) == -1) return -1;
8458 if (fwriteBulkObject(fp
,val
) == -1) return -1;
8460 dictReleaseIterator(di
);
8463 redisPanic("Unknown object type");
8465 /* Save the expire time */
8466 if (expiretime
!= -1) {
8467 char cmd
[]="*3\r\n$8\r\nEXPIREAT\r\n";
8468 /* If this key is already expired skip it */
8469 if (expiretime
< now
) continue;
8470 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
8471 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
8472 if (fwriteBulkLong(fp
,expiretime
) == 0) goto werr
;
8474 if (swapped
) decrRefCount(o
);
8476 dictReleaseIterator(di
);
8479 /* Make sure data will not remain on the OS's output buffers */
8484 /* Use RENAME to make sure the DB file is changed atomically only
8485 * if the generated DB file is ok. */
8486 if (rename(tmpfile
,filename
) == -1) {
8487 redisLog(REDIS_WARNING
,"Error moving temp append only file on the final destination: %s", strerror(errno
));
8491 redisLog(REDIS_NOTICE
,"SYNC append only file rewrite performed");
8497 redisLog(REDIS_WARNING
,"Write error writing append only file on disk: %s", strerror(errno
));
8498 if (di
) dictReleaseIterator(di
);
8502 /* This is how rewriting of the append only file in background works:
8504 * 1) The user calls BGREWRITEAOF
8505 * 2) Redis calls this function, that forks():
8506 * 2a) the child rewrites the append only file in a temp file.
8507 * 2b) the parent accumulates differences in server.bgrewritebuf.
8508 * 3) When the child finishes '2a' it exits.
8509 * 4) The parent will trap the exit code, if it's OK, will append the
8510 * data accumulated into server.bgrewritebuf into the temp file, and
8511 * finally will rename(2) the temp file in the actual file name.
8512 * Then the new file is reopened as the new append only file. Profit!
8514 static int rewriteAppendOnlyFileBackground(void) {
8517 if (server
.bgrewritechildpid
!= -1) return REDIS_ERR
;
8518 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
8519 if ((childpid
= fork()) == 0) {
8523 if (server
.vm_enabled
) vmReopenSwapFile();
8525 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8526 if (rewriteAppendOnlyFile(tmpfile
) == REDIS_OK
) {
8533 if (childpid
== -1) {
8534 redisLog(REDIS_WARNING
,
8535 "Can't rewrite append only file in background: fork: %s",
8539 redisLog(REDIS_NOTICE
,
8540 "Background append only file rewriting started by pid %d",childpid
);
8541 server
.bgrewritechildpid
= childpid
;
8542 updateDictResizePolicy();
8543 /* We set appendseldb to -1 in order to force the next call to the
8544 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8545 * accumulated by the parent into server.bgrewritebuf will start
8546 * with a SELECT statement and it will be safe to merge. */
8547 server
.appendseldb
= -1;
8550 return REDIS_OK
; /* unreached */
8553 static void bgrewriteaofCommand(redisClient
*c
) {
8554 if (server
.bgrewritechildpid
!= -1) {
8555 addReplySds(c
,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8558 if (rewriteAppendOnlyFileBackground() == REDIS_OK
) {
8559 char *status
= "+Background append only file rewriting started\r\n";
8560 addReplySds(c
,sdsnew(status
));
8562 addReply(c
,shared
.err
);
8566 static void aofRemoveTempFile(pid_t childpid
) {
8569 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) childpid
);
8573 /* Virtual Memory is composed mainly of two subsystems:
8574 * - Blocking Virtual Memory
8575 * - Threaded Virtual Memory I/O
8576 * The two parts are not fully decoupled, but functions are split among two
8577 * different sections of the source code (delimited by comments) in order to
8578 * make more clear what functionality is about the blocking VM and what about
8579 * the threaded (not blocking) VM.
8583 * Redis VM is a blocking VM (one that blocks reading swapped values from
8584 * disk into memory when a value swapped out is needed in memory) that is made
8585 * unblocking by trying to examine the command argument vector in order to
8586 * load in background values that will likely be needed in order to exec
8587 * the command. The command is executed only once all the relevant keys
8588 * are loaded into memory.
8590 * This basically is almost as simple as a blocking VM, but almost as parallel
8591 * as a fully non-blocking VM.
8594 /* =================== Virtual Memory - Blocking Side ====================== */
8596 static void vmInit(void) {
8602 if (server
.vm_max_threads
!= 0)
8603 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8605 redisLog(REDIS_NOTICE
,"Using '%s' as swap file",server
.vm_swap_file
);
8606 /* Try to open the old swap file, otherwise create it */
8607 if ((server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b")) == NULL
) {
8608 server
.vm_fp
= fopen(server
.vm_swap_file
,"w+b");
8610 if (server
.vm_fp
== NULL
) {
8611 redisLog(REDIS_WARNING
,
8612 "Can't open the swap file: %s. Exiting.",
8616 server
.vm_fd
= fileno(server
.vm_fp
);
8617 /* Lock the swap file for writing, this is useful in order to avoid
8618 * another instance to use the same swap file for a config error. */
8619 fl
.l_type
= F_WRLCK
;
8620 fl
.l_whence
= SEEK_SET
;
8621 fl
.l_start
= fl
.l_len
= 0;
8622 if (fcntl(server
.vm_fd
,F_SETLK
,&fl
) == -1) {
8623 redisLog(REDIS_WARNING
,
8624 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server
.vm_swap_file
, strerror(errno
));
8628 server
.vm_next_page
= 0;
8629 server
.vm_near_pages
= 0;
8630 server
.vm_stats_used_pages
= 0;
8631 server
.vm_stats_swapped_objects
= 0;
8632 server
.vm_stats_swapouts
= 0;
8633 server
.vm_stats_swapins
= 0;
8634 totsize
= server
.vm_pages
*server
.vm_page_size
;
8635 redisLog(REDIS_NOTICE
,"Allocating %lld bytes of swap file",totsize
);
8636 if (ftruncate(server
.vm_fd
,totsize
) == -1) {
8637 redisLog(REDIS_WARNING
,"Can't ftruncate swap file: %s. Exiting.",
8641 redisLog(REDIS_NOTICE
,"Swap file allocated with success");
8643 server
.vm_bitmap
= zmalloc((server
.vm_pages
+7)/8);
8644 redisLog(REDIS_VERBOSE
,"Allocated %lld bytes page table for %lld pages",
8645 (long long) (server
.vm_pages
+7)/8, server
.vm_pages
);
8646 memset(server
.vm_bitmap
,0,(server
.vm_pages
+7)/8);
8648 /* Initialize threaded I/O (used by Virtual Memory) */
8649 server
.io_newjobs
= listCreate();
8650 server
.io_processing
= listCreate();
8651 server
.io_processed
= listCreate();
8652 server
.io_ready_clients
= listCreate();
8653 pthread_mutex_init(&server
.io_mutex
,NULL
);
8654 pthread_mutex_init(&server
.obj_freelist_mutex
,NULL
);
8655 pthread_mutex_init(&server
.io_swapfile_mutex
,NULL
);
8656 server
.io_active_threads
= 0;
8657 if (pipe(pipefds
) == -1) {
8658 redisLog(REDIS_WARNING
,"Unable to intialized VM: pipe(2): %s. Exiting."
8662 server
.io_ready_pipe_read
= pipefds
[0];
8663 server
.io_ready_pipe_write
= pipefds
[1];
8664 redisAssert(anetNonBlock(NULL
,server
.io_ready_pipe_read
) != ANET_ERR
);
8665 /* LZF requires a lot of stack */
8666 pthread_attr_init(&server
.io_threads_attr
);
8667 pthread_attr_getstacksize(&server
.io_threads_attr
, &stacksize
);
8668 while (stacksize
< REDIS_THREAD_STACK_SIZE
) stacksize
*= 2;
8669 pthread_attr_setstacksize(&server
.io_threads_attr
, stacksize
);
8670 /* Listen for events in the threaded I/O pipe */
8671 if (aeCreateFileEvent(server
.el
, server
.io_ready_pipe_read
, AE_READABLE
,
8672 vmThreadedIOCompletedJob
, NULL
) == AE_ERR
)
8673 oom("creating file event");
8676 /* Mark the page as used */
8677 static void vmMarkPageUsed(off_t page
) {
8678 off_t byte
= page
/8;
8680 redisAssert(vmFreePage(page
) == 1);
8681 server
.vm_bitmap
[byte
] |= 1<<bit
;
8684 /* Mark N contiguous pages as used, with 'page' being the first. */
8685 static void vmMarkPagesUsed(off_t page
, off_t count
) {
8688 for (j
= 0; j
< count
; j
++)
8689 vmMarkPageUsed(page
+j
);
8690 server
.vm_stats_used_pages
+= count
;
8691 redisLog(REDIS_DEBUG
,"Mark USED pages: %lld pages at %lld\n",
8692 (long long)count
, (long long)page
);
8695 /* Mark the page as free */
8696 static void vmMarkPageFree(off_t page
) {
8697 off_t byte
= page
/8;
8699 redisAssert(vmFreePage(page
) == 0);
8700 server
.vm_bitmap
[byte
] &= ~(1<<bit
);
8703 /* Mark N contiguous pages as free, with 'page' being the first. */
8704 static void vmMarkPagesFree(off_t page
, off_t count
) {
8707 for (j
= 0; j
< count
; j
++)
8708 vmMarkPageFree(page
+j
);
8709 server
.vm_stats_used_pages
-= count
;
8710 redisLog(REDIS_DEBUG
,"Mark FREE pages: %lld pages at %lld\n",
8711 (long long)count
, (long long)page
);
8714 /* Test if the page is free */
8715 static int vmFreePage(off_t page
) {
8716 off_t byte
= page
/8;
8718 return (server
.vm_bitmap
[byte
] & (1<<bit
)) == 0;
8721 /* Find N contiguous free pages storing the first page of the cluster in *first.
8722 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8723 * REDIS_ERR is returned.
8725 * This function uses a simple algorithm: we try to allocate
8726 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8727 * again from the start of the swap file searching for free spaces.
8729 * If it looks pretty clear that there are no free pages near our offset
8730 * we try to find less populated places doing a forward jump of
8731 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8732 * without hurry, and then we jump again and so forth...
8734 * This function can be improved using a free list to avoid to guess
8735 * too much, since we could collect data about freed pages.
8737 * note: I implemented this function just after watching an episode of
8738 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8740 static int vmFindContiguousPages(off_t
*first
, off_t n
) {
8741 off_t base
, offset
= 0, since_jump
= 0, numfree
= 0;
8743 if (server
.vm_near_pages
== REDIS_VM_MAX_NEAR_PAGES
) {
8744 server
.vm_near_pages
= 0;
8745 server
.vm_next_page
= 0;
8747 server
.vm_near_pages
++; /* Yet another try for pages near to the old ones */
8748 base
= server
.vm_next_page
;
8750 while(offset
< server
.vm_pages
) {
8751 off_t
this = base
+offset
;
8753 /* If we overflow, restart from page zero */
8754 if (this >= server
.vm_pages
) {
8755 this -= server
.vm_pages
;
8757 /* Just overflowed, what we found on tail is no longer
8758 * interesting, as it's no longer contiguous. */
8762 if (vmFreePage(this)) {
8763 /* This is a free page */
8765 /* Already got N free pages? Return to the caller, with success */
8767 *first
= this-(n
-1);
8768 server
.vm_next_page
= this+1;
8769 redisLog(REDIS_DEBUG
, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n
, (long long) *first
);
8773 /* The current one is not a free page */
8777 /* Fast-forward if the current page is not free and we already
8778 * searched enough near this place. */
8780 if (!numfree
&& since_jump
>= REDIS_VM_MAX_RANDOM_JUMP
/4) {
8781 offset
+= random() % REDIS_VM_MAX_RANDOM_JUMP
;
8783 /* Note that even if we rewind after the jump, we are don't need
8784 * to make sure numfree is set to zero as we only jump *if* it
8785 * is set to zero. */
8787 /* Otherwise just check the next page */
8794 /* Write the specified object at the specified page of the swap file */
8795 static int vmWriteObjectOnSwap(robj
*o
, off_t page
) {
8796 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
8797 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
8798 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
8799 redisLog(REDIS_WARNING
,
8800 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8804 rdbSaveObject(server
.vm_fp
,o
);
8805 fflush(server
.vm_fp
);
8806 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
8810 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8811 * needed to later retrieve the object into the key object.
8812 * If we can't find enough contiguous empty pages to swap the object on disk
8813 * REDIS_ERR is returned. */
8814 static int vmSwapObjectBlocking(robj
*key
, robj
*val
) {
8815 off_t pages
= rdbSavedObjectPages(val
,NULL
);
8818 assert(key
->storage
== REDIS_VM_MEMORY
);
8819 assert(key
->refcount
== 1);
8820 if (vmFindContiguousPages(&page
,pages
) == REDIS_ERR
) return REDIS_ERR
;
8821 if (vmWriteObjectOnSwap(val
,page
) == REDIS_ERR
) return REDIS_ERR
;
8822 key
->vm
.page
= page
;
8823 key
->vm
.usedpages
= pages
;
8824 key
->storage
= REDIS_VM_SWAPPED
;
8825 key
->vtype
= val
->type
;
8826 decrRefCount(val
); /* Deallocate the object from memory. */
8827 vmMarkPagesUsed(page
,pages
);
8828 redisLog(REDIS_DEBUG
,"VM: object %s swapped out at %lld (%lld pages)",
8829 (unsigned char*) key
->ptr
,
8830 (unsigned long long) page
, (unsigned long long) pages
);
8831 server
.vm_stats_swapped_objects
++;
8832 server
.vm_stats_swapouts
++;
8836 static robj
*vmReadObjectFromSwap(off_t page
, int type
) {
8839 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
8840 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
8841 redisLog(REDIS_WARNING
,
8842 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8846 o
= rdbLoadObject(type
,server
.vm_fp
);
8848 redisLog(REDIS_WARNING
, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno
));
8851 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
8855 /* Load the value object relative to the 'key' object from swap to memory.
8856 * The newly allocated object is returned.
8858 * If preview is true the unserialized object is returned to the caller but
8859 * no changes are made to the key object, nor the pages are marked as freed */
8860 static robj
*vmGenericLoadObject(robj
*key
, int preview
) {
8863 redisAssert(key
->storage
== REDIS_VM_SWAPPED
|| key
->storage
== REDIS_VM_LOADING
);
8864 val
= vmReadObjectFromSwap(key
->vm
.page
,key
->vtype
);
8866 key
->storage
= REDIS_VM_MEMORY
;
8867 key
->vm
.atime
= server
.unixtime
;
8868 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
8869 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk",
8870 (unsigned char*) key
->ptr
);
8871 server
.vm_stats_swapped_objects
--;
8873 redisLog(REDIS_DEBUG
, "VM: object %s previewed from disk",
8874 (unsigned char*) key
->ptr
);
8876 server
.vm_stats_swapins
++;
8880 /* Plain object loading, from swap to memory */
8881 static robj
*vmLoadObject(robj
*key
) {
8882 /* If we are loading the object in background, stop it, we
8883 * need to load this object synchronously ASAP. */
8884 if (key
->storage
== REDIS_VM_LOADING
)
8885 vmCancelThreadedIOJob(key
);
8886 return vmGenericLoadObject(key
,0);
8889 /* Just load the value on disk, without to modify the key.
8890 * This is useful when we want to perform some operation on the value
8891 * without to really bring it from swap to memory, like while saving the
8892 * dataset or rewriting the append only log. */
8893 static robj
*vmPreviewObject(robj
*key
) {
8894 return vmGenericLoadObject(key
,1);
8897 /* How a good candidate is this object for swapping?
8898 * The better candidate it is, the greater the returned value.
8900 * Currently we try to perform a fast estimation of the object size in
8901 * memory, and combine it with aging informations.
8903 * Basically swappability = idle-time * log(estimated size)
8905 * Bigger objects are preferred over smaller objects, but not
8906 * proportionally, this is why we use the logarithm. This algorithm is
8907 * just a first try and will probably be tuned later. */
8908 static double computeObjectSwappability(robj
*o
) {
8909 time_t age
= server
.unixtime
- o
->vm
.atime
;
8913 struct dictEntry
*de
;
8916 if (age
<= 0) return 0;
8919 if (o
->encoding
!= REDIS_ENCODING_RAW
) {
8922 asize
= sdslen(o
->ptr
)+sizeof(*o
)+sizeof(long)*2;
8927 listNode
*ln
= listFirst(l
);
8929 asize
= sizeof(list
);
8931 robj
*ele
= ln
->value
;
8934 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8935 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8937 asize
+= (sizeof(listNode
)+elesize
)*listLength(l
);
8942 z
= (o
->type
== REDIS_ZSET
);
8943 d
= z
? ((zset
*)o
->ptr
)->dict
: o
->ptr
;
8945 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
8946 if (z
) asize
+= sizeof(zset
)-sizeof(dict
);
8951 de
= dictGetRandomKey(d
);
8952 ele
= dictGetEntryKey(de
);
8953 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8954 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8956 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
8957 if (z
) asize
+= sizeof(zskiplistNode
)*dictSize(d
);
8961 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
8962 unsigned char *p
= zipmapRewind((unsigned char*)o
->ptr
);
8963 unsigned int len
= zipmapLen((unsigned char*)o
->ptr
);
8964 unsigned int klen
, vlen
;
8965 unsigned char *key
, *val
;
8967 if ((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) == NULL
) {
8971 asize
= len
*(klen
+vlen
+3);
8972 } else if (o
->encoding
== REDIS_ENCODING_HT
) {
8974 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
8979 de
= dictGetRandomKey(d
);
8980 ele
= dictGetEntryKey(de
);
8981 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8982 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8984 ele
= dictGetEntryVal(de
);
8985 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8986 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8988 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
8993 return (double)age
*log(1+asize
);
8996 /* Try to swap an object that's a good candidate for swapping.
8997 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8998 * to swap any object at all.
9000 * If 'usethreaded' is true, Redis will try to swap the object in background
9001 * using I/O threads. */
9002 static int vmSwapOneObject(int usethreads
) {
9004 struct dictEntry
*best
= NULL
;
9005 double best_swappability
= 0;
9006 redisDb
*best_db
= NULL
;
9009 for (j
= 0; j
< server
.dbnum
; j
++) {
9010 redisDb
*db
= server
.db
+j
;
9011 /* Why maxtries is set to 100?
9012 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9013 * are swappable objects */
9016 if (dictSize(db
->dict
) == 0) continue;
9017 for (i
= 0; i
< 5; i
++) {
9019 double swappability
;
9021 if (maxtries
) maxtries
--;
9022 de
= dictGetRandomKey(db
->dict
);
9023 key
= dictGetEntryKey(de
);
9024 val
= dictGetEntryVal(de
);
9025 /* Only swap objects that are currently in memory.
9027 * Also don't swap shared objects if threaded VM is on, as we
9028 * try to ensure that the main thread does not touch the
9029 * object while the I/O thread is using it, but we can't
9030 * control other keys without adding additional mutex. */
9031 if (key
->storage
!= REDIS_VM_MEMORY
||
9032 (server
.vm_max_threads
!= 0 && val
->refcount
!= 1)) {
9033 if (maxtries
) i
--; /* don't count this try */
9036 swappability
= computeObjectSwappability(val
);
9037 if (!best
|| swappability
> best_swappability
) {
9039 best_swappability
= swappability
;
9044 if (best
== NULL
) return REDIS_ERR
;
9045 key
= dictGetEntryKey(best
);
9046 val
= dictGetEntryVal(best
);
9048 redisLog(REDIS_DEBUG
,"Key with best swappability: %s, %f",
9049 key
->ptr
, best_swappability
);
9051 /* Unshare the key if needed */
9052 if (key
->refcount
> 1) {
9053 robj
*newkey
= dupStringObject(key
);
9055 key
= dictGetEntryKey(best
) = newkey
;
9059 vmSwapObjectThreaded(key
,val
,best_db
);
9062 if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
9063 dictGetEntryVal(best
) = NULL
;
/* Swap out one object synchronously. See vmSwapOneObject() for the return
 * value. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
/* Swap out one object using the I/O threads. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9079 /* Return true if it's safe to swap out objects in a given moment.
9080 * Basically we don't want to swap objects out while there is a BGSAVE
9081 * or a BGAEOREWRITE running in backgroud. */
9082 static int vmCanSwapOut(void) {
9083 return (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1);
9086 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9087 * and was deleted. Otherwise 0 is returned. */
9088 static int deleteIfSwapped(redisDb
*db
, robj
*key
) {
9092 if ((de
= dictFind(db
->dict
,key
)) == NULL
) return 0;
9093 foundkey
= dictGetEntryKey(de
);
9094 if (foundkey
->storage
== REDIS_VM_MEMORY
) return 0;
9099 /* =================== Virtual Memory - Threaded I/O ======================= */
9101 static void freeIOJob(iojob
*j
) {
9102 if ((j
->type
== REDIS_IOJOB_PREPARE_SWAP
||
9103 j
->type
== REDIS_IOJOB_DO_SWAP
||
9104 j
->type
== REDIS_IOJOB_LOAD
) && j
->val
!= NULL
)
9105 decrRefCount(j
->val
);
9106 /* We don't decrRefCount the j->key field as we did't incremented
9107 * the count creating IO Jobs. This is because the key field here is
9108 * just used as an indentifier and if a key is removed the Job should
9109 * never be touched again. */
9113 /* Every time a thread finished a Job, it writes a byte into the write side
9114 * of an unix pipe in order to "awake" the main thread, and this function
9116 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
,
9120 int retval
, processed
= 0, toprocess
= -1, trytoswap
= 1;
9122 REDIS_NOTUSED(mask
);
9123 REDIS_NOTUSED(privdata
);
9125 /* For every byte we read in the read side of the pipe, there is one
9126 * I/O job completed to process. */
9127 while((retval
= read(fd
,buf
,1)) == 1) {
9131 struct dictEntry
*de
;
9133 redisLog(REDIS_DEBUG
,"Processing I/O completed job");
9135 /* Get the processed element (the oldest one) */
9137 assert(listLength(server
.io_processed
) != 0);
9138 if (toprocess
== -1) {
9139 toprocess
= (listLength(server
.io_processed
)*REDIS_MAX_COMPLETED_JOBS_PROCESSED
)/100;
9140 if (toprocess
<= 0) toprocess
= 1;
9142 ln
= listFirst(server
.io_processed
);
9144 listDelNode(server
.io_processed
,ln
);
9146 /* If this job is marked as canceled, just ignore it */
9151 /* Post process it in the main thread, as there are things we
9152 * can do just here to avoid race conditions and/or invasive locks */
9153 redisLog(REDIS_DEBUG
,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j
, j
->type
, (void*)j
->key
, (char*)j
->key
->ptr
, j
->key
->refcount
);
9154 de
= dictFind(j
->db
->dict
,j
->key
);
9156 key
= dictGetEntryKey(de
);
9157 if (j
->type
== REDIS_IOJOB_LOAD
) {
9160 /* Key loaded, bring it at home */
9161 key
->storage
= REDIS_VM_MEMORY
;
9162 key
->vm
.atime
= server
.unixtime
;
9163 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
9164 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk (threaded)",
9165 (unsigned char*) key
->ptr
);
9166 server
.vm_stats_swapped_objects
--;
9167 server
.vm_stats_swapins
++;
9168 dictGetEntryVal(de
) = j
->val
;
9169 incrRefCount(j
->val
);
9172 /* Handle clients waiting for this key to be loaded. */
9173 handleClientsBlockedOnSwappedKey(db
,key
);
9174 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
9175 /* Now we know the amount of pages required to swap this object.
9176 * Let's find some space for it, and queue this task again
9177 * rebranded as REDIS_IOJOB_DO_SWAP. */
9178 if (!vmCanSwapOut() ||
9179 vmFindContiguousPages(&j
->page
,j
->pages
) == REDIS_ERR
)
9181 /* Ooops... no space or we can't swap as there is
9182 * a fork()ed Redis trying to save stuff on disk. */
9184 key
->storage
= REDIS_VM_MEMORY
; /* undo operation */
9186 /* Note that we need to mark this pages as used now,
9187 * if the job will be canceled, we'll mark them as freed
9189 vmMarkPagesUsed(j
->page
,j
->pages
);
9190 j
->type
= REDIS_IOJOB_DO_SWAP
;
9195 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
9198 /* Key swapped. We can finally free some memory. */
9199 if (key
->storage
!= REDIS_VM_SWAPPING
) {
9200 printf("key->storage: %d\n",key
->storage
);
9201 printf("key->name: %s\n",(char*)key
->ptr
);
9202 printf("key->refcount: %d\n",key
->refcount
);
9203 printf("val: %p\n",(void*)j
->val
);
9204 printf("val->type: %d\n",j
->val
->type
);
9205 printf("val->ptr: %s\n",(char*)j
->val
->ptr
);
9207 redisAssert(key
->storage
== REDIS_VM_SWAPPING
);
9208 val
= dictGetEntryVal(de
);
9209 key
->vm
.page
= j
->page
;
9210 key
->vm
.usedpages
= j
->pages
;
9211 key
->storage
= REDIS_VM_SWAPPED
;
9212 key
->vtype
= j
->val
->type
;
9213 decrRefCount(val
); /* Deallocate the object from memory. */
9214 dictGetEntryVal(de
) = NULL
;
9215 redisLog(REDIS_DEBUG
,
9216 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9217 (unsigned char*) key
->ptr
,
9218 (unsigned long long) j
->page
, (unsigned long long) j
->pages
);
9219 server
.vm_stats_swapped_objects
++;
9220 server
.vm_stats_swapouts
++;
9222 /* Put a few more swap requests in queue if we are still
9224 if (trytoswap
&& vmCanSwapOut() &&
9225 zmalloc_used_memory() > server
.vm_max_memory
)
9230 more
= listLength(server
.io_newjobs
) <
9231 (unsigned) server
.vm_max_threads
;
9233 /* Don't waste CPU time if swappable objects are rare. */
9234 if (vmSwapOneObjectThreaded() == REDIS_ERR
) {
9242 if (processed
== toprocess
) return;
9244 if (retval
< 0 && errno
!= EAGAIN
) {
9245 redisLog(REDIS_WARNING
,
9246 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9251 static void lockThreadedIO(void) {
9252 pthread_mutex_lock(&server
.io_mutex
);
9255 static void unlockThreadedIO(void) {
9256 pthread_mutex_unlock(&server
.io_mutex
);
9259 /* Remove the specified object from the threaded I/O queue if still not
9260 * processed, otherwise make sure to flag it as canceled. */
9261 static void vmCancelThreadedIOJob(robj
*o
) {
9263 server
.io_newjobs
, /* 0 */
9264 server
.io_processing
, /* 1 */
9265 server
.io_processed
/* 2 */
9269 assert(o
->storage
== REDIS_VM_LOADING
|| o
->storage
== REDIS_VM_SWAPPING
);
9272 /* Search for a matching key in one of the queues */
9273 for (i
= 0; i
< 3; i
++) {
9277 listRewind(lists
[i
],&li
);
9278 while ((ln
= listNext(&li
)) != NULL
) {
9279 iojob
*job
= ln
->value
;
9281 if (job
->canceled
) continue; /* Skip this, already canceled. */
9282 if (job
->key
== o
) {
9283 redisLog(REDIS_DEBUG
,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9284 (void*)job
, (char*)o
->ptr
, job
->type
, i
);
9285 /* Mark the pages as free since the swap didn't happened
9286 * or happened but is now discarded. */
9287 if (i
!= 1 && job
->type
== REDIS_IOJOB_DO_SWAP
)
9288 vmMarkPagesFree(job
->page
,job
->pages
);
9289 /* Cancel the job. It depends on the list the job is
9292 case 0: /* io_newjobs */
9293 /* If the job was yet not processed the best thing to do
9294 * is to remove it from the queue at all */
9296 listDelNode(lists
[i
],ln
);
9298 case 1: /* io_processing */
9299 /* Oh Shi- the thread is messing with the Job:
9301 * Probably it's accessing the object if this is a
9302 * PREPARE_SWAP or DO_SWAP job.
9303 * If it's a LOAD job it may be reading from disk and
9304 * if we don't wait for the job to terminate before to
9305 * cancel it, maybe in a few microseconds data can be
9306 * corrupted in this pages. So the short story is:
9308 * Better to wait for the job to move into the
9309 * next queue (processed)... */
9311 /* We try again and again until the job is completed. */
9313 /* But let's wait some time for the I/O thread
9314 * to finish with this job. After all this condition
9315 * should be very rare. */
9318 case 2: /* io_processed */
9319 /* The job was already processed, that's easy...
9320 * just mark it as canceled so that we'll ignore it
9321 * when processing completed jobs. */
9325 /* Finally we have to adjust the storage type of the object
9326 * in order to "UNDO" the operaiton. */
9327 if (o
->storage
== REDIS_VM_LOADING
)
9328 o
->storage
= REDIS_VM_SWAPPED
;
9329 else if (o
->storage
== REDIS_VM_SWAPPING
)
9330 o
->storage
= REDIS_VM_MEMORY
;
9337 assert(1 != 1); /* We should never reach this */
9340 static void *IOThreadEntryPoint(void *arg
) {
9345 pthread_detach(pthread_self());
9347 /* Get a new job to process */
9349 if (listLength(server
.io_newjobs
) == 0) {
9350 /* No new jobs in queue, exit. */
9351 redisLog(REDIS_DEBUG
,"Thread %ld exiting, nothing to do",
9352 (long) pthread_self());
9353 server
.io_active_threads
--;
9357 ln
= listFirst(server
.io_newjobs
);
9359 listDelNode(server
.io_newjobs
,ln
);
9360 /* Add the job in the processing queue */
9361 j
->thread
= pthread_self();
9362 listAddNodeTail(server
.io_processing
,j
);
9363 ln
= listLast(server
.io_processing
); /* We use ln later to remove it */
9365 redisLog(REDIS_DEBUG
,"Thread %ld got a new job (type %d): %p about key '%s'",
9366 (long) pthread_self(), j
->type
, (void*)j
, (char*)j
->key
->ptr
);
9368 /* Process the Job */
9369 if (j
->type
== REDIS_IOJOB_LOAD
) {
9370 j
->val
= vmReadObjectFromSwap(j
->page
,j
->key
->vtype
);
9371 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
9372 FILE *fp
= fopen("/dev/null","w+");
9373 j
->pages
= rdbSavedObjectPages(j
->val
,fp
);
9375 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
9376 if (vmWriteObjectOnSwap(j
->val
,j
->page
) == REDIS_ERR
)
9380 /* Done: insert the job into the processed queue */
9381 redisLog(REDIS_DEBUG
,"Thread %ld completed the job: %p (key %s)",
9382 (long) pthread_self(), (void*)j
, (char*)j
->key
->ptr
);
9384 listDelNode(server
.io_processing
,ln
);
9385 listAddNodeTail(server
.io_processed
,j
);
9388 /* Signal the main thread there is new stuff to process */
9389 assert(write(server
.io_ready_pipe_write
,"x",1) == 1);
9391 return NULL
; /* never reached */
9394 static void spawnIOThread(void) {
9396 sigset_t mask
, omask
;
9400 sigaddset(&mask
,SIGCHLD
);
9401 sigaddset(&mask
,SIGHUP
);
9402 sigaddset(&mask
,SIGPIPE
);
9403 pthread_sigmask(SIG_SETMASK
, &mask
, &omask
);
9404 while ((err
= pthread_create(&thread
,&server
.io_threads_attr
,IOThreadEntryPoint
,NULL
)) != 0) {
9405 redisLog(REDIS_WARNING
,"Unable to spawn an I/O thread: %s",
9409 pthread_sigmask(SIG_SETMASK
, &omask
, NULL
);
9410 server
.io_active_threads
++;
9413 /* We need to wait for the last thread to exit before we are able to
9414 * fork() in order to BGSAVE or BGREWRITEAOF. */
9415 static void waitEmptyIOJobsQueue(void) {
9417 int io_processed_len
;
9420 if (listLength(server
.io_newjobs
) == 0 &&
9421 listLength(server
.io_processing
) == 0 &&
9422 server
.io_active_threads
== 0)
9427 /* While waiting for empty jobs queue condition we post-process some
9428 * finshed job, as I/O threads may be hanging trying to write against
9429 * the io_ready_pipe_write FD but there are so much pending jobs that
9431 io_processed_len
= listLength(server
.io_processed
);
9433 if (io_processed_len
) {
9434 vmThreadedIOCompletedJob(NULL
,server
.io_ready_pipe_read
,NULL
,0);
9435 usleep(1000); /* 1 millisecond */
9437 usleep(10000); /* 10 milliseconds */
9442 static void vmReopenSwapFile(void) {
9443 /* Note: we don't close the old one as we are in the child process
9444 * and don't want to mess at all with the original file object. */
9445 server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b");
9446 if (server
.vm_fp
== NULL
) {
9447 redisLog(REDIS_WARNING
,"Can't re-open the VM swap file: %s. Exiting.",
9448 server
.vm_swap_file
);
9451 server
.vm_fd
= fileno(server
.vm_fp
);
9454 /* This function must be called while with threaded IO locked */
9455 static void queueIOJob(iojob
*j
) {
9456 redisLog(REDIS_DEBUG
,"Queued IO Job %p type %d about key '%s'\n",
9457 (void*)j
, j
->type
, (char*)j
->key
->ptr
);
9458 listAddNodeTail(server
.io_newjobs
,j
);
9459 if (server
.io_active_threads
< server
.vm_max_threads
)
9463 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
) {
9466 assert(key
->storage
== REDIS_VM_MEMORY
);
9467 assert(key
->refcount
== 1);
9469 j
= zmalloc(sizeof(*j
));
9470 j
->type
= REDIS_IOJOB_PREPARE_SWAP
;
9476 j
->thread
= (pthread_t
) -1;
9477 key
->storage
= REDIS_VM_SWAPPING
;
9485 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9487 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9488 * If there is not already a job loading the key, it is craeted.
9489 * The key is added to the io_keys list in the client structure, and also
9490 * in the hash table mapping swapped keys to waiting clients, that is,
9491 * server.io_waited_keys. */
9492 static int waitForSwappedKey(redisClient
*c
, robj
*key
) {
9493 struct dictEntry
*de
;
9497 /* If the key does not exist or is already in RAM we don't need to
9498 * block the client at all. */
9499 de
= dictFind(c
->db
->dict
,key
);
9500 if (de
== NULL
) return 0;
9501 o
= dictGetEntryKey(de
);
9502 if (o
->storage
== REDIS_VM_MEMORY
) {
9504 } else if (o
->storage
== REDIS_VM_SWAPPING
) {
9505 /* We were swapping the key, undo it! */
9506 vmCancelThreadedIOJob(o
);
9510 /* OK: the key is either swapped, or being loaded just now. */
9512 /* Add the key to the list of keys this client is waiting for.
9513 * This maps clients to keys they are waiting for. */
9514 listAddNodeTail(c
->io_keys
,key
);
9517 /* Add the client to the swapped keys => clients waiting map. */
9518 de
= dictFind(c
->db
->io_keys
,key
);
9522 /* For every key we take a list of clients blocked for it */
9524 retval
= dictAdd(c
->db
->io_keys
,key
,l
);
9526 assert(retval
== DICT_OK
);
9528 l
= dictGetEntryVal(de
);
9530 listAddNodeTail(l
,c
);
9532 /* Are we already loading the key from disk? If not create a job */
9533 if (o
->storage
== REDIS_VM_SWAPPED
) {
9536 o
->storage
= REDIS_VM_LOADING
;
9537 j
= zmalloc(sizeof(*j
));
9538 j
->type
= REDIS_IOJOB_LOAD
;
9541 j
->key
->vtype
= o
->vtype
;
9542 j
->page
= o
->vm
.page
;
9545 j
->thread
= (pthread_t
) -1;
9553 /* Preload keys for any command with first, last and step values for
9554 * the command keys prototype, as defined in the command table. */
9555 static void waitForMultipleSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
) {
9557 if (cmd
->vm_firstkey
== 0) return;
9558 last
= cmd
->vm_lastkey
;
9559 if (last
< 0) last
= argc
+last
;
9560 for (j
= cmd
->vm_firstkey
; j
<= last
; j
+= cmd
->vm_keystep
) {
9561 redisAssert(j
< argc
);
9562 waitForSwappedKey(c
,argv
[j
]);
9566 /* Preload keys needed for the ZUNION and ZINTER commands.
9567 * Note that the number of keys to preload is user-defined, so we need to
9568 * apply a sanity check against argc. */
9569 static void zunionInterBlockClientOnSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
) {
9573 num
= atoi(argv
[2]->ptr
);
9574 if (num
> (argc
-3)) return;
9575 for (i
= 0; i
< num
; i
++) {
9576 waitForSwappedKey(c
,argv
[3+i
]);
9580 /* Preload keys needed to execute the entire MULTI/EXEC block.
9582 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9583 * and will block the client when any command requires a swapped out value. */
9584 static void execBlockClientOnSwappedKeys(redisClient
*c
, struct redisCommand
*cmd
, int argc
, robj
**argv
) {
9586 struct redisCommand
*mcmd
;
9589 REDIS_NOTUSED(argc
);
9590 REDIS_NOTUSED(argv
);
9592 if (!(c
->flags
& REDIS_MULTI
)) return;
9593 for (i
= 0; i
< c
->mstate
.count
; i
++) {
9594 mcmd
= c
->mstate
.commands
[i
].cmd
;
9595 margc
= c
->mstate
.commands
[i
].argc
;
9596 margv
= c
->mstate
.commands
[i
].argv
;
9598 if (mcmd
->vm_preload_proc
!= NULL
) {
9599 mcmd
->vm_preload_proc(c
,mcmd
,margc
,margv
);
9601 waitForMultipleSwappedKeys(c
,mcmd
,margc
,margv
);
9606 /* Is this client attempting to run a command against swapped keys?
9607 * If so, block it ASAP, load the keys in background, then resume it.
9609 * The important idea about this function is that it can fail! If keys will
9610 * still be swapped when the client is resumed, this key lookups will
9611 * just block loading keys from disk. In practical terms this should only
9612 * happen with SORT BY command or if there is a bug in this function.
9614 * Return 1 if the client is marked as blocked, 0 if the client can
9615 * continue as the keys it is going to access appear to be in memory. */
9616 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
) {
9617 if (cmd
->vm_preload_proc
!= NULL
) {
9618 cmd
->vm_preload_proc(c
,cmd
,c
->argc
,c
->argv
);
9620 waitForMultipleSwappedKeys(c
,cmd
,c
->argc
,c
->argv
);
9623 /* If the client was blocked for at least one key, mark it as blocked. */
9624 if (listLength(c
->io_keys
)) {
9625 c
->flags
|= REDIS_IO_WAIT
;
9626 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
9627 server
.vm_blocked_clients
++;
9634 /* Remove the 'key' from the list of blocked keys for a given client.
9636 * The function returns 1 when there are no longer blocking keys after
9637 * the current one was removed (and the client can be unblocked). */
9638 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
) {
9642 struct dictEntry
*de
;
9644 /* Remove the key from the list of keys this client is waiting for. */
9645 listRewind(c
->io_keys
,&li
);
9646 while ((ln
= listNext(&li
)) != NULL
) {
9647 if (compareStringObjects(ln
->value
,key
) == 0) {
9648 listDelNode(c
->io_keys
,ln
);
9654 /* Remove the client form the key => waiting clients map. */
9655 de
= dictFind(c
->db
->io_keys
,key
);
9657 l
= dictGetEntryVal(de
);
9658 ln
= listSearchKey(l
,c
);
9661 if (listLength(l
) == 0)
9662 dictDelete(c
->db
->io_keys
,key
);
9664 return listLength(c
->io_keys
) == 0;
9667 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
) {
9668 struct dictEntry
*de
;
9673 de
= dictFind(db
->io_keys
,key
);
9676 l
= dictGetEntryVal(de
);
9677 len
= listLength(l
);
9678 /* Note: we can't use something like while(listLength(l)) as the list
9679 * can be freed by the calling function when we remove the last element. */
9682 redisClient
*c
= ln
->value
;
9684 if (dontWaitForSwappedKey(c
,key
)) {
9685 /* Put the client in the list of clients ready to go as we
9686 * loaded all the keys about it. */
9687 listAddNodeTail(server
.io_ready_clients
,c
);
9692 /* =========================== Remote Configuration ========================= */
9694 static void configSetCommand(redisClient
*c
) {
9695 robj
*o
= getDecodedObject(c
->argv
[3]);
9696 if (!strcasecmp(c
->argv
[2]->ptr
,"dbfilename")) {
9697 zfree(server
.dbfilename
);
9698 server
.dbfilename
= zstrdup(o
->ptr
);
9699 } else if (!strcasecmp(c
->argv
[2]->ptr
,"requirepass")) {
9700 zfree(server
.requirepass
);
9701 server
.requirepass
= zstrdup(o
->ptr
);
9702 } else if (!strcasecmp(c
->argv
[2]->ptr
,"masterauth")) {
9703 zfree(server
.masterauth
);
9704 server
.masterauth
= zstrdup(o
->ptr
);
9705 } else if (!strcasecmp(c
->argv
[2]->ptr
,"maxmemory")) {
9706 server
.maxmemory
= strtoll(o
->ptr
, NULL
, 10);
9708 addReplySds(c
,sdscatprintf(sdsempty(),
9709 "-ERR not supported CONFIG parameter %s\r\n",
9710 (char*)c
->argv
[2]->ptr
));
9715 addReply(c
,shared
.ok
);
9718 static void configGetCommand(redisClient
*c
) {
9719 robj
*o
= getDecodedObject(c
->argv
[2]);
9720 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
9721 char *pattern
= o
->ptr
;
9725 decrRefCount(lenobj
);
9727 if (stringmatch(pattern
,"dbfilename",0)) {
9728 addReplyBulkCString(c
,"dbfilename");
9729 addReplyBulkCString(c
,server
.dbfilename
);
9732 if (stringmatch(pattern
,"requirepass",0)) {
9733 addReplyBulkCString(c
,"requirepass");
9734 addReplyBulkCString(c
,server
.requirepass
);
9737 if (stringmatch(pattern
,"masterauth",0)) {
9738 addReplyBulkCString(c
,"masterauth");
9739 addReplyBulkCString(c
,server
.masterauth
);
9742 if (stringmatch(pattern
,"maxmemory",0)) {
9745 snprintf(buf
,128,"%llu\n",server
.maxmemory
);
9746 addReplyBulkCString(c
,"maxmemory");
9747 addReplyBulkCString(c
,buf
);
9751 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%d\r\n",matches
*2);
9754 static void configCommand(redisClient
*c
) {
9755 if (!strcasecmp(c
->argv
[1]->ptr
,"set")) {
9756 if (c
->argc
!= 4) goto badarity
;
9757 configSetCommand(c
);
9758 } else if (!strcasecmp(c
->argv
[1]->ptr
,"get")) {
9759 if (c
->argc
!= 3) goto badarity
;
9760 configGetCommand(c
);
9761 } else if (!strcasecmp(c
->argv
[1]->ptr
,"resetstat")) {
9762 if (c
->argc
!= 2) goto badarity
;
9763 server
.stat_numcommands
= 0;
9764 server
.stat_numconnections
= 0;
9765 server
.stat_expiredkeys
= 0;
9766 server
.stat_starttime
= time(NULL
);
9767 addReply(c
,shared
.ok
);
9769 addReplySds(c
,sdscatprintf(sdsempty(),
9770 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9775 addReplySds(c
,sdscatprintf(sdsempty(),
9776 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9777 (char*) c
->argv
[1]->ptr
));
9780 /* =========================== Pubsub implementation ======================== */
9782 static void freePubsubPattern(void *p
) {
9783 pubsubPattern
*pat
= p
;
9785 decrRefCount(pat
->pattern
);
9789 static int listMatchPubsubPattern(void *a
, void *b
) {
9790 pubsubPattern
*pa
= a
, *pb
= b
;
9792 return (pa
->client
== pb
->client
) &&
9793 (compareStringObjects(pa
->pattern
,pb
->pattern
) == 0);
9796 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9797 * 0 if the client was already subscribed to that channel. */
9798 static int pubsubSubscribeChannel(redisClient
*c
, robj
*channel
) {
9799 struct dictEntry
*de
;
9800 list
*clients
= NULL
;
9803 /* Add the channel to the client -> channels hash table */
9804 if (dictAdd(c
->pubsub_channels
,channel
,NULL
) == DICT_OK
) {
9806 incrRefCount(channel
);
9807 /* Add the client to the channel -> list of clients hash table */
9808 de
= dictFind(server
.pubsub_channels
,channel
);
9810 clients
= listCreate();
9811 dictAdd(server
.pubsub_channels
,channel
,clients
);
9812 incrRefCount(channel
);
9814 clients
= dictGetEntryVal(de
);
9816 listAddNodeTail(clients
,c
);
9818 /* Notify the client */
9819 addReply(c
,shared
.mbulk3
);
9820 addReply(c
,shared
.subscribebulk
);
9821 addReplyBulk(c
,channel
);
9822 addReplyLong(c
,dictSize(c
->pubsub_channels
)+listLength(c
->pubsub_patterns
));
9826 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9827 * 0 if the client was not subscribed to the specified channel. */
9828 static int pubsubUnsubscribeChannel(redisClient
*c
, robj
*channel
, int notify
) {
9829 struct dictEntry
*de
;
9834 /* Remove the channel from the client -> channels hash table */
9835 incrRefCount(channel
); /* channel may be just a pointer to the same object
9836 we have in the hash tables. Protect it... */
9837 if (dictDelete(c
->pubsub_channels
,channel
) == DICT_OK
) {
9839 /* Remove the client from the channel -> clients list hash table */
9840 de
= dictFind(server
.pubsub_channels
,channel
);
9842 clients
= dictGetEntryVal(de
);
9843 ln
= listSearchKey(clients
,c
);
9845 listDelNode(clients
,ln
);
9846 if (listLength(clients
) == 0) {
9847 /* Free the list and associated hash entry at all if this was
9848 * the latest client, so that it will be possible to abuse
9849 * Redis PUBSUB creating millions of channels. */
9850 dictDelete(server
.pubsub_channels
,channel
);
9853 /* Notify the client */
9855 addReply(c
,shared
.mbulk3
);
9856 addReply(c
,shared
.unsubscribebulk
);
9857 addReplyBulk(c
,channel
);
9858 addReplyLong(c
,dictSize(c
->pubsub_channels
)+
9859 listLength(c
->pubsub_patterns
));
9862 decrRefCount(channel
); /* it is finally safe to release it */
9866 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9867 static int pubsubSubscribePattern(redisClient
*c
, robj
*pattern
) {
9870 if (listSearchKey(c
->pubsub_patterns
,pattern
) == NULL
) {
9873 listAddNodeTail(c
->pubsub_patterns
,pattern
);
9874 incrRefCount(pattern
);
9875 pat
= zmalloc(sizeof(*pat
));
9876 pat
->pattern
= getDecodedObject(pattern
);
9878 listAddNodeTail(server
.pubsub_patterns
,pat
);
9880 /* Notify the client */
9881 addReply(c
,shared
.mbulk3
);
9882 addReply(c
,shared
.psubscribebulk
);
9883 addReplyBulk(c
,pattern
);
9884 addReplyLong(c
,dictSize(c
->pubsub_channels
)+listLength(c
->pubsub_patterns
));
9888 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9889 * 0 if the client was not subscribed to the specified channel. */
9890 static int pubsubUnsubscribePattern(redisClient
*c
, robj
*pattern
, int notify
) {
9895 incrRefCount(pattern
); /* Protect the object. May be the same we remove */
9896 if ((ln
= listSearchKey(c
->pubsub_patterns
,pattern
)) != NULL
) {
9898 listDelNode(c
->pubsub_patterns
,ln
);
9900 pat
.pattern
= pattern
;
9901 ln
= listSearchKey(server
.pubsub_patterns
,&pat
);
9902 listDelNode(server
.pubsub_patterns
,ln
);
9904 /* Notify the client */
9906 addReply(c
,shared
.mbulk3
);
9907 addReply(c
,shared
.punsubscribebulk
);
9908 addReplyBulk(c
,pattern
);
9909 addReplyLong(c
,dictSize(c
->pubsub_channels
)+
9910 listLength(c
->pubsub_patterns
));
9912 decrRefCount(pattern
);
9916 /* Unsubscribe from all the channels. Return the number of channels the
9917 * client was subscribed from. */
9918 static int pubsubUnsubscribeAllChannels(redisClient
*c
, int notify
) {
9919 dictIterator
*di
= dictGetIterator(c
->pubsub_channels
);
9923 while((de
= dictNext(di
)) != NULL
) {
9924 robj
*channel
= dictGetEntryKey(de
);
9926 count
+= pubsubUnsubscribeChannel(c
,channel
,notify
);
9928 dictReleaseIterator(di
);
9932 /* Unsubscribe from all the patterns. Return the number of patterns the
9933 * client was subscribed from. */
9934 static int pubsubUnsubscribeAllPatterns(redisClient
*c
, int notify
) {
9939 listRewind(c
->pubsub_patterns
,&li
);
9940 while ((ln
= listNext(&li
)) != NULL
) {
9941 robj
*pattern
= ln
->value
;
9943 count
+= pubsubUnsubscribePattern(c
,pattern
,notify
);
9948 /* Publish a message */
9949 static int pubsubPublishMessage(robj
*channel
, robj
*message
) {
9951 struct dictEntry
*de
;
9955 /* Send to clients listening for that channel */
9956 de
= dictFind(server
.pubsub_channels
,channel
);
9958 list
*list
= dictGetEntryVal(de
);
9962 listRewind(list
,&li
);
9963 while ((ln
= listNext(&li
)) != NULL
) {
9964 redisClient
*c
= ln
->value
;
9966 addReply(c
,shared
.mbulk3
);
9967 addReply(c
,shared
.messagebulk
);
9968 addReplyBulk(c
,channel
);
9969 addReplyBulk(c
,message
);
9973 /* Send to clients listening to matching channels */
9974 if (listLength(server
.pubsub_patterns
)) {
9975 listRewind(server
.pubsub_patterns
,&li
);
9976 channel
= getDecodedObject(channel
);
9977 while ((ln
= listNext(&li
)) != NULL
) {
9978 pubsubPattern
*pat
= ln
->value
;
9980 if (stringmatchlen((char*)pat
->pattern
->ptr
,
9981 sdslen(pat
->pattern
->ptr
),
9982 (char*)channel
->ptr
,
9983 sdslen(channel
->ptr
),0)) {
9984 addReply(pat
->client
,shared
.mbulk4
);
9985 addReply(pat
->client
,shared
.pmessagebulk
);
9986 addReplyBulk(pat
->client
,pat
->pattern
);
9987 addReplyBulk(pat
->client
,channel
);
9988 addReplyBulk(pat
->client
,message
);
9992 decrRefCount(channel
);
9997 static void subscribeCommand(redisClient
*c
) {
10000 for (j
= 1; j
< c
->argc
; j
++)
10001 pubsubSubscribeChannel(c
,c
->argv
[j
]);
10004 static void unsubscribeCommand(redisClient
*c
) {
10005 if (c
->argc
== 1) {
10006 pubsubUnsubscribeAllChannels(c
,1);
10011 for (j
= 1; j
< c
->argc
; j
++)
10012 pubsubUnsubscribeChannel(c
,c
->argv
[j
],1);
10016 static void psubscribeCommand(redisClient
*c
) {
10019 for (j
= 1; j
< c
->argc
; j
++)
10020 pubsubSubscribePattern(c
,c
->argv
[j
]);
10023 static void punsubscribeCommand(redisClient
*c
) {
10024 if (c
->argc
== 1) {
10025 pubsubUnsubscribeAllPatterns(c
,1);
10030 for (j
= 1; j
< c
->argc
; j
++)
10031 pubsubUnsubscribePattern(c
,c
->argv
[j
],1);
10035 static void publishCommand(redisClient
*c
) {
10036 int receivers
= pubsubPublishMessage(c
->argv
[1],c
->argv
[2]);
10037 addReplyLong(c
,receivers
);
10040 /* ================================= Debugging ============================== */
10042 static void debugCommand(redisClient
*c
) {
10043 if (!strcasecmp(c
->argv
[1]->ptr
,"segfault")) {
10044 *((char*)-1) = 'x';
10045 } else if (!strcasecmp(c
->argv
[1]->ptr
,"reload")) {
10046 if (rdbSave(server
.dbfilename
) != REDIS_OK
) {
10047 addReply(c
,shared
.err
);
10051 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
10052 addReply(c
,shared
.err
);
10055 redisLog(REDIS_WARNING
,"DB reloaded by DEBUG RELOAD");
10056 addReply(c
,shared
.ok
);
10057 } else if (!strcasecmp(c
->argv
[1]->ptr
,"loadaof")) {
10059 if (loadAppendOnlyFile(server
.appendfilename
) != REDIS_OK
) {
10060 addReply(c
,shared
.err
);
10063 redisLog(REDIS_WARNING
,"Append Only File loaded by DEBUG LOADAOF");
10064 addReply(c
,shared
.ok
);
10065 } else if (!strcasecmp(c
->argv
[1]->ptr
,"object") && c
->argc
== 3) {
10066 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
10070 addReply(c
,shared
.nokeyerr
);
10073 key
= dictGetEntryKey(de
);
10074 val
= dictGetEntryVal(de
);
10075 if (!server
.vm_enabled
|| (key
->storage
== REDIS_VM_MEMORY
||
10076 key
->storage
== REDIS_VM_SWAPPING
)) {
10080 if (val
->encoding
< (sizeof(strencoding
)/sizeof(char*))) {
10081 strenc
= strencoding
[val
->encoding
];
10083 snprintf(buf
,64,"unknown encoding %d\n", val
->encoding
);
10086 addReplySds(c
,sdscatprintf(sdsempty(),
10087 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10088 "encoding:%s serializedlength:%lld\r\n",
10089 (void*)key
, key
->refcount
, (void*)val
, val
->refcount
,
10090 strenc
, (long long) rdbSavedObjectLen(val
,NULL
)));
10092 addReplySds(c
,sdscatprintf(sdsempty(),
10093 "+Key at:%p refcount:%d, value swapped at: page %llu "
10094 "using %llu pages\r\n",
10095 (void*)key
, key
->refcount
, (unsigned long long) key
->vm
.page
,
10096 (unsigned long long) key
->vm
.usedpages
));
10098 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapin") && c
->argc
== 3) {
10099 lookupKeyRead(c
->db
,c
->argv
[2]);
10100 addReply(c
,shared
.ok
);
10101 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapout") && c
->argc
== 3) {
10102 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
10105 if (!server
.vm_enabled
) {
10106 addReplySds(c
,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10110 addReply(c
,shared
.nokeyerr
);
10113 key
= dictGetEntryKey(de
);
10114 val
= dictGetEntryVal(de
);
10115 /* If the key is shared we want to create a copy */
10116 if (key
->refcount
> 1) {
10117 robj
*newkey
= dupStringObject(key
);
10119 key
= dictGetEntryKey(de
) = newkey
;
10122 if (key
->storage
!= REDIS_VM_MEMORY
) {
10123 addReplySds(c
,sdsnew("-ERR This key is not in memory\r\n"));
10124 } else if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
10125 dictGetEntryVal(de
) = NULL
;
10126 addReply(c
,shared
.ok
);
10128 addReply(c
,shared
.err
);
10131 addReplySds(c
,sdsnew(
10132 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10136 static void _redisAssert(char *estr
, char *file
, int line
) {
10137 redisLog(REDIS_WARNING
,"=== ASSERTION FAILED ===");
10138 redisLog(REDIS_WARNING
,"==> %s:%d '%s' is not true\n",file
,line
,estr
);
10139 #ifdef HAVE_BACKTRACE
10140 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
10141 *((char*)-1) = 'x';
10145 static void _redisPanic(char *msg
, char *file
, int line
) {
10146 redisLog(REDIS_WARNING
,"!!! Software Failure. Press left mouse button to continue");
10147 redisLog(REDIS_WARNING
,"Guru Meditation: %s #%s:%d",msg
,file
,line
);
10148 #ifdef HAVE_BACKTRACE
10149 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
10150 *((char*)-1) = 'x';
10154 /* =================================== Main! ================================ */
10157 int linuxOvercommitMemoryValue(void) {
10158 FILE *fp
= fopen("/proc/sys/vm/overcommit_memory","r");
10161 if (!fp
) return -1;
10162 if (fgets(buf
,64,fp
) == NULL
) {
10171 void linuxOvercommitMemoryWarning(void) {
10172 if (linuxOvercommitMemoryValue() == 0) {
10173 redisLog(REDIS_WARNING
,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10176 #endif /* __linux__ */
10178 static void daemonize(void) {
10182 if (fork() != 0) exit(0); /* parent exits */
10183 setsid(); /* create a new session */
10185 /* Every output goes to /dev/null. If Redis is daemonized but
10186 * the 'logfile' is set to 'stdout' in the configuration file
10187 * it will not log at all. */
10188 if ((fd
= open("/dev/null", O_RDWR
, 0)) != -1) {
10189 dup2(fd
, STDIN_FILENO
);
10190 dup2(fd
, STDOUT_FILENO
);
10191 dup2(fd
, STDERR_FILENO
);
10192 if (fd
> STDERR_FILENO
) close(fd
);
10194 /* Try to write the pid file */
10195 fp
= fopen(server
.pidfile
,"w");
10197 fprintf(fp
,"%d\n",getpid());
10202 static void version() {
10203 printf("Redis server version %s\n", REDIS_VERSION
);
/* Print the command line synopsis on standard error and terminate the
 * process with exit status 1. Never returns. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n"
          " ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
10213 int main(int argc
, char **argv
) {
10216 initServerConfig();
10218 if (strcmp(argv
[1], "-v") == 0 ||
10219 strcmp(argv
[1], "--version") == 0) version();
10220 if (strcmp(argv
[1], "--help") == 0) usage();
10221 resetServerSaveParams();
10222 loadServerConfig(argv
[1]);
10223 } else if ((argc
> 2)) {
10226 redisLog(REDIS_WARNING
,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10228 if (server
.daemonize
) daemonize();
10230 redisLog(REDIS_NOTICE
,"Server started, Redis version " REDIS_VERSION
);
10232 linuxOvercommitMemoryWarning();
10234 start
= time(NULL
);
10235 if (server
.appendonly
) {
10236 if (loadAppendOnlyFile(server
.appendfilename
) == REDIS_OK
)
10237 redisLog(REDIS_NOTICE
,"DB loaded from append only file: %ld seconds",time(NULL
)-start
);
10239 if (rdbLoad(server
.dbfilename
) == REDIS_OK
)
10240 redisLog(REDIS_NOTICE
,"DB loaded from disk: %ld seconds",time(NULL
)-start
);
10242 redisLog(REDIS_NOTICE
,"The server is now ready to accept connections on port %d", server
.port
);
10243 aeSetBeforeSleepProc(server
.el
,beforeSleep
);
10245 aeDeleteEventLoop(server
.el
);
10249 /* ============================= Backtrace support ========================= */
10251 #ifdef HAVE_BACKTRACE
10252 static char *findFuncName(void *pointer
, unsigned long *offset
);
10254 static void *getMcontextEip(ucontext_t
*uc
) {
10255 #if defined(__FreeBSD__)
10256 return (void*) uc
->uc_mcontext
.mc_eip
;
10257 #elif defined(__dietlibc__)
10258 return (void*) uc
->uc_mcontext
.eip
;
10259 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10261 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
10263 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
10265 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10266 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10267 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
10269 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
10271 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10272 return (void*) uc
->uc_mcontext
.gregs
[REG_EIP
]; /* Linux 32/64 bit */
10273 #elif defined(__ia64__) /* Linux IA64 */
10274 return (void*) uc
->uc_mcontext
.sc_ip
;
10280 static void segvHandler(int sig
, siginfo_t
*info
, void *secret
) {
10282 char **messages
= NULL
;
10283 int i
, trace_size
= 0;
10284 unsigned long offset
=0;
10285 ucontext_t
*uc
= (ucontext_t
*) secret
;
10287 REDIS_NOTUSED(info
);
10289 redisLog(REDIS_WARNING
,
10290 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION
, sig
);
10291 infostring
= genRedisInfoString();
10292 redisLog(REDIS_WARNING
, "%s",infostring
);
10293 /* It's not safe to sdsfree() the returned string under memory
10294 * corruption conditions. Let it leak as we are going to abort */
10296 trace_size
= backtrace(trace
, 100);
10297 /* overwrite sigaction with caller's address */
10298 if (getMcontextEip(uc
) != NULL
) {
10299 trace
[1] = getMcontextEip(uc
);
10301 messages
= backtrace_symbols(trace
, trace_size
);
10303 for (i
=1; i
<trace_size
; ++i
) {
10304 char *fn
= findFuncName(trace
[i
], &offset
), *p
;
10306 p
= strchr(messages
[i
],'+');
10307 if (!fn
|| (p
&& ((unsigned long)strtol(p
+1,NULL
,10)) < offset
)) {
10308 redisLog(REDIS_WARNING
,"%s", messages
[i
]);
10310 redisLog(REDIS_WARNING
,"%d redis-server %p %s + %d", i
, trace
[i
], fn
, (unsigned int)offset
);
10313 /* free(messages); Don't call free() with possibly corrupted memory. */
10317 static void setupSigSegvAction(void) {
10318 struct sigaction act
;
10320 sigemptyset (&act
.sa_mask
);
10321 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10322 * is used. Otherwise, sa_handler is used */
10323 act
.sa_flags
= SA_NODEFER
| SA_ONSTACK
| SA_RESETHAND
| SA_SIGINFO
;
10324 act
.sa_sigaction
= segvHandler
;
10325 sigaction (SIGSEGV
, &act
, NULL
);
10326 sigaction (SIGBUS
, &act
, NULL
);
10327 sigaction (SIGFPE
, &act
, NULL
);
10328 sigaction (SIGILL
, &act
, NULL
);
10329 sigaction (SIGBUS
, &act
, NULL
);
10333 #include "staticsymbols.h"
10334 /* This function try to convert a pointer into a function name. It's used in
10335 * oreder to provide a backtrace under segmentation fault that's able to
10336 * display functions declared as static (otherwise the backtrace is useless). */
10337 static char *findFuncName(void *pointer
, unsigned long *offset
){
10339 unsigned long off
, minoff
= 0;
10341 /* Try to match against the Symbol with the smallest offset */
10342 for (i
=0; symsTable
[i
].pointer
; i
++) {
10343 unsigned long lp
= (unsigned long) pointer
;
10345 if (lp
!= (unsigned long)-1 && lp
>= symsTable
[i
].pointer
) {
10346 off
=lp
-symsTable
[i
].pointer
;
10347 if (ret
< 0 || off
< minoff
) {
10353 if (ret
== -1) return NULL
;
10355 return symsTable
[ret
].name
;
10357 #else /* HAVE_BACKTRACE */
/* Without backtrace support there is nothing to install: keep an empty
 * function so that callers do not need their own conditional. */
static void setupSigSegvAction(void) {
}
10360 #endif /* HAVE_BACKTRACE */