2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
30 #define REDIS_VERSION "1.3.4"
40 #define __USE_POSIX199309
47 #endif /* HAVE_BACKTRACE */
55 #include <arpa/inet.h>
59 #include <sys/resource.h>
66 #include "solarisfixes.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 4
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
116 #define REDIS_STRING 0
122 /* Object encodings. Some kinds of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
125 #define REDIS_ENCODING_RAW 0 /* Raw representation */
126 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
127 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
130 static char* strencoding
[] = {
131 "raw", "int", "zipmap", "hashtable"
134 /* Object types only used for dumping to disk */
135 #define REDIS_EXPIRETIME 253
136 #define REDIS_SELECTDB 254
137 #define REDIS_EOF 255
139 /* Defines related to the dump file format. Storing 32 bit lengths for short
140 * keys requires a lot of space, so we check the most significant 2 bits of
141 * the first byte to interpret the length:
143 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
144 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
145 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
146 * 11|000000 this means: specially encoded object will follow. The six-bit
147 * number specifies the kind of object that follows.
148 * See the REDIS_RDB_ENC_* defines.
150 * Lengths up to 63 are stored using a single byte, most DB keys, and many
151 * values, will fit inside. */
152 #define REDIS_RDB_6BITLEN 0
153 #define REDIS_RDB_14BITLEN 1
154 #define REDIS_RDB_32BITLEN 2
155 #define REDIS_RDB_ENCVAL 3
156 #define REDIS_RDB_LENERR UINT_MAX
158 /* When a length of a string object stored on disk has the first two bits
159 * set, the remaining six bits specify a special encoding for the object
160 * according to the following defines: */
161 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
162 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
163 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
164 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
166 /* Virtual memory object->where field. */
167 #define REDIS_VM_MEMORY 0 /* The object is on memory */
168 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
169 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
170 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172 /* Virtual memory static configuration stuff.
173 * Check vmFindContiguousPages() to know more about these magic numbers. */
174 #define REDIS_VM_MAX_NEAR_PAGES 65536
175 #define REDIS_VM_MAX_RANDOM_JUMP 4096
176 #define REDIS_VM_MAX_THREADS 32
177 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
178 /* The following is the *percentage* of completed I/O jobs to process when the
179 * handler is called. While Virtual Memory I/O operations are performed by
180 * threads, these operations must be processed by the main thread when completed
181 * in order to take effect. */
182 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185 #define REDIS_SLAVE 1 /* This client is a slave server */
186 #define REDIS_MASTER 2 /* This client is a master server */
187 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
188 #define REDIS_MULTI 8 /* This client is in a MULTI context */
189 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
190 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
192 /* Slave replication state - slave side */
193 #define REDIS_REPL_NONE 0 /* No active replication */
194 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
195 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
197 /* Slave replication state - from the point of view of master
198 * Note that in SEND_BULK and ONLINE state the slave receives new updates
199 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
200 * to start the next background saving in order to send updates to it. */
201 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
202 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
203 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
204 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206 /* List related stuff */
210 /* Sort operations */
211 #define REDIS_SORT_GET 0
212 #define REDIS_SORT_ASC 1
213 #define REDIS_SORT_DESC 2
214 #define REDIS_SORTKEY_MAX 1024
217 #define REDIS_DEBUG 0
218 #define REDIS_VERBOSE 1
219 #define REDIS_NOTICE 2
220 #define REDIS_WARNING 3
/* Anti-warning macro: evaluates V and discards the result, so that an
 * otherwise unused variable or parameter does not trigger a compiler
 * warning. The argument is parenthesized so that an arbitrary expression
 * (e.g. REDIS_NOTUSED(a + b)) expands correctly. */
#define REDIS_NOTUSED(V) ((void)(V))
225 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
226 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
228 /* Append only defines */
229 #define APPENDFSYNC_NO 0
230 #define APPENDFSYNC_ALWAYS 1
231 #define APPENDFSYNC_EVERYSEC 2
233 /* Hashes related defaults */
234 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
235 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237 /* We can print the stacktrace, so our assert is defined this way:
 *
 * redisAssert(_e) evaluates _e once. When it is true the macro is a no-op
 * ((void)0). When it is false it calls _redisAssert() with the stringified
 * expression, the current source file name and the current line number,
 * and then terminates the process with _exit(1). */
238 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
/* Helper invoked by redisAssert() on failure; it receives the expression
 * text, file name and line number. Its definition is not part of this
 * section of the file. */
239 static void _redisAssert(char *estr
, char *file
, int line
);
241 /*================================= Data types ============================== */
243 /* A redis object, that is a type able to hold a string / list / set */
245 /* The VM object structure */
246 struct redisObjectVM
{
247 off_t page
; /* the page at which the object is stored on disk */
248 off_t usedpages
; /* number of pages used on disk */
249 time_t atime
; /* Last access time */
252 /* The actual Redis Object */
253 typedef struct redisObject
{
256 unsigned char encoding
;
257 unsigned char storage
; /* If this object is a key, where is the value?
258 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
259 unsigned char vtype
; /* If this object is a key, and value is swapped out,
260 * this is the type of the swapped out object. */
262 /* VM fields, these are only allocated if VM is active, otherwise the
263 * object allocation function will just allocate
264 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
265 * Redis without VM active will not have any overhead. */
266 struct redisObjectVM vm
;
269 /* Macro used to initialize a Redis object allocated on the stack.
270 * Note that this macro is kept near the structure definition to make sure
271 * we'll update it when the structure is changed, to avoid bugs like
272 * bug #85 introduced exactly in this way. */
273 #define initStaticStringObject(_var,_ptr) do { \
275 _var.type = REDIS_STRING; \
276 _var.encoding = REDIS_ENCODING_RAW; \
278 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
281 typedef struct redisDb
{
282 dict
*dict
; /* The keyspace for this DB */
283 dict
*expires
; /* Timeout of keys with a timeout set */
284 dict
*blockingkeys
; /* Keys with clients waiting for data (BLPOP) */
285 dict
*io_keys
; /* Keys with clients waiting for VM I/O */
289 /* Client MULTI/EXEC state */
290 typedef struct multiCmd
{
293 struct redisCommand
*cmd
;
296 typedef struct multiState
{
297 multiCmd
*commands
; /* Array of MULTI commands */
298 int count
; /* Total number of MULTI commands */
301 /* With multiplexing we need to keep per-client state.
302 * Clients are kept in a linked list. */
303 typedef struct redisClient
{
308 robj
**argv
, **mbargv
;
310 int bulklen
; /* bulk read len. -1 if not in bulk read mode */
311 int multibulk
; /* multi bulk command format active */
314 time_t lastinteraction
; /* time of the last interaction, used for timeout */
315 int flags
; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
316 int slaveseldb
; /* slave selected db, if this client is a slave */
317 int authenticated
; /* when requirepass is non-NULL */
318 int replstate
; /* replication state if this is a slave */
319 int repldbfd
; /* replication DB file descriptor */
320 long repldboff
; /* replication DB file offset */
321 off_t repldbsize
; /* replication DB file size */
322 multiState mstate
; /* MULTI/EXEC state */
323 robj
**blockingkeys
; /* The key we are waiting to terminate a blocking
324 * operation such as BLPOP. Otherwise NULL. */
325 int blockingkeysnum
; /* Number of blocking keys */
326 time_t blockingto
; /* Blocking operation timeout. If UNIX current time
327 * is >= blockingto then the operation timed out. */
328 list
*io_keys
; /* Keys this client is waiting to be loaded from the
329 * swap file in order to continue. */
337 /* Global server state structure */
342 dict
*sharingpool
; /* Pool used for object sharing */
343 unsigned int sharingpoolsize
;
344 long long dirty
; /* changes to DB from the last save */
346 list
*slaves
, *monitors
;
347 char neterr
[ANET_ERR_LEN
];
349 int cronloops
; /* number of times the cron function run */
350 list
*objfreelist
; /* A list of freed objects to avoid malloc() */
351 time_t lastsave
; /* Unix time of last successful save */
352 /* Fields used only for stats */
353 time_t stat_starttime
; /* server start time */
354 long long stat_numcommands
; /* number of processed commands */
355 long long stat_numconnections
; /* number of connections received */
368 pid_t bgsavechildpid
;
369 pid_t bgrewritechildpid
;
370 sds bgrewritebuf
; /* buffer taken by parent during append only rewrite */
371 struct saveparam
*saveparams
;
376 char *appendfilename
;
380 /* Replication related */
385 redisClient
*master
; /* client that is master for this slave */
387 unsigned int maxclients
;
388 unsigned long long maxmemory
;
389 unsigned int blpop_blocked_clients
;
390 unsigned int vm_blocked_clients
;
391 /* Sort parameters - qsort_r() is only available under BSD so we
392 * have to take this state global, in order to pass it to sortCompare() */
396 /* Virtual memory configuration */
401 unsigned long long vm_max_memory
;
403 size_t hash_max_zipmap_entries
;
404 size_t hash_max_zipmap_value
;
405 /* Virtual memory state */
408 off_t vm_next_page
; /* Next probably empty page */
409 off_t vm_near_pages
; /* Number of pages allocated sequentially */
410 unsigned char *vm_bitmap
; /* Bitmap of free/used pages */
411 time_t unixtime
; /* Unix time sampled every second. */
412 /* Virtual memory I/O threads stuff */
413 /* An I/O thread processes an element taken from the io_newjobs queue and
414 * puts the result of the operation in the io_processed list. While the
415 * job is being processed, it's put on the io_processing queue. */
416 list
*io_newjobs
; /* List of VM I/O jobs yet to be processed */
417 list
*io_processing
; /* List of VM I/O jobs being processed */
418 list
*io_processed
; /* List of VM I/O jobs already processed */
419 list
*io_ready_clients
; /* Clients ready to be unblocked. All keys loaded */
420 pthread_mutex_t io_mutex
; /* lock to access io_jobs/io_done/io_thread_job */
421 pthread_mutex_t obj_freelist_mutex
; /* safe redis objects creation/free */
422 pthread_mutex_t io_swapfile_mutex
; /* So we can lseek + write */
423 pthread_attr_t io_threads_attr
; /* attributes for threads creation */
424 int io_active_threads
; /* Number of running I/O threads */
425 int vm_max_threads
; /* Max number of I/O threads running at the same time */
426 /* Our main thread is blocked on the event loop, looking for sockets ready
427 * to be read or written, so when a threaded I/O operation is ready to be
428 * processed by the main thread, the I/O thread will use a unix pipe to
429 * awake the main thread. The following are the two pipe FDs. */
430 int io_ready_pipe_read
;
431 int io_ready_pipe_write
;
432 /* Virtual memory stats */
433 unsigned long long vm_stats_used_pages
;
434 unsigned long long vm_stats_swapped_objects
;
435 unsigned long long vm_stats_swapouts
;
436 unsigned long long vm_stats_swapins
;
440 typedef void redisCommandProc(redisClient
*c
);
441 struct redisCommand
{
443 redisCommandProc
*proc
;
446 /* What keys should be loaded in background when calling this command? */
447 int vm_firstkey
; /* The first argument that's a key (0 = no keys) */
448 int vm_lastkey
; /* The last argument that's a key */
449 int vm_keystep
; /* The step between first and last key */
452 struct redisFunctionSym
{
454 unsigned long pointer
;
457 typedef struct _redisSortObject
{
465 typedef struct _redisSortOperation
{
468 } redisSortOperation
;
470 /* ZSETs use a specialized version of Skiplists */
472 typedef struct zskiplistNode
{
473 struct zskiplistNode
**forward
;
474 struct zskiplistNode
*backward
;
480 typedef struct zskiplist
{
481 struct zskiplistNode
*header
, *tail
;
482 unsigned long length
;
486 typedef struct zset
{
491 /* Our shared "common" objects */
493 struct sharedObjectsStruct
{
494 robj
*crlf
, *ok
, *err
, *emptybulk
, *czero
, *cone
, *pong
, *space
,
495 *colon
, *nullbulk
, *nullmultibulk
, *queued
,
496 *emptymultibulk
, *wrongtypeerr
, *nokeyerr
, *syntaxerr
, *sameobjecterr
,
497 *outofrangeerr
, *plus
,
498 *select0
, *select1
, *select2
, *select3
, *select4
,
499 *select5
, *select6
, *select7
, *select8
, *select9
;
502 /* Global vars that are actually used as constants. The following double
503 * values are used for double on-disk serialization, and are initialized
504 * at runtime to avoid strange compiler optimizations. */
506 static double R_Zero
, R_PosInf
, R_NegInf
, R_Nan
;
508 /* VM threaded I/O request message */
509 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
510 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
511 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
512 typedef struct iojob
{
513 int type
; /* Request type, REDIS_IOJOB_* */
514 redisDb
*db
;/* Redis database */
515 robj
*key
; /* This I/O request is about swapping this key */
516 robj
*val
; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
517 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
518 off_t page
; /* Swap page where to read/write the object */
519 off_t pages
; /* Swap pages needed to save the object. PREPARE_SWAP return val */
520 int canceled
; /* True if this command was canceled by blocking side of VM */
521 pthread_t thread
; /* ID of the thread processing this entry */
524 /*================================ Prototypes =============================== */
526 static void freeStringObject(robj
*o
);
527 static void freeListObject(robj
*o
);
528 static void freeSetObject(robj
*o
);
529 static void decrRefCount(void *o
);
530 static robj
*createObject(int type
, void *ptr
);
531 static void freeClient(redisClient
*c
);
532 static int rdbLoad(char *filename
);
533 static void addReply(redisClient
*c
, robj
*obj
);
534 static void addReplySds(redisClient
*c
, sds s
);
535 static void incrRefCount(robj
*o
);
536 static int rdbSaveBackground(char *filename
);
537 static robj
*createStringObject(char *ptr
, size_t len
);
538 static robj
*dupStringObject(robj
*o
);
539 static void replicationFeedSlaves(list
*slaves
, struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
540 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
);
541 static int syncWithMaster(void);
542 static robj
*tryObjectSharing(robj
*o
);
543 static int tryObjectEncoding(robj
*o
);
544 static robj
*getDecodedObject(robj
*o
);
545 static int removeExpire(redisDb
*db
, robj
*key
);
546 static int expireIfNeeded(redisDb
*db
, robj
*key
);
547 static int deleteIfVolatile(redisDb
*db
, robj
*key
);
548 static int deleteIfSwapped(redisDb
*db
, robj
*key
);
549 static int deleteKey(redisDb
*db
, robj
*key
);
550 static time_t getExpire(redisDb
*db
, robj
*key
);
551 static int setExpire(redisDb
*db
, robj
*key
, time_t when
);
552 static void updateSlavesWaitingBgsave(int bgsaveerr
);
553 static void freeMemoryIfNeeded(void);
554 static int processCommand(redisClient
*c
);
555 static void setupSigSegvAction(void);
556 static void rdbRemoveTempFile(pid_t childpid
);
557 static void aofRemoveTempFile(pid_t childpid
);
558 static size_t stringObjectLen(robj
*o
);
559 static void processInputBuffer(redisClient
*c
);
560 static zskiplist
*zslCreate(void);
561 static void zslFree(zskiplist
*zsl
);
562 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
);
563 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
564 static void initClientMultiState(redisClient
*c
);
565 static void freeClientMultiState(redisClient
*c
);
566 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
);
567 static void unblockClientWaitingData(redisClient
*c
);
568 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
);
569 static void vmInit(void);
570 static void vmMarkPagesFree(off_t page
, off_t count
);
571 static robj
*vmLoadObject(robj
*key
);
572 static robj
*vmPreviewObject(robj
*key
);
573 static int vmSwapOneObjectBlocking(void);
574 static int vmSwapOneObjectThreaded(void);
575 static int vmCanSwapOut(void);
576 static int tryFreeOneObjectFromFreelist(void);
577 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
578 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
579 static void vmCancelThreadedIOJob(robj
*o
);
580 static void lockThreadedIO(void);
581 static void unlockThreadedIO(void);
582 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
);
583 static void freeIOJob(iojob
*j
);
584 static void queueIOJob(iojob
*j
);
585 static int vmWriteObjectOnSwap(robj
*o
, off_t page
);
586 static robj
*vmReadObjectFromSwap(off_t page
, int type
);
587 static void waitEmptyIOJobsQueue(void);
588 static void vmReopenSwapFile(void);
589 static int vmFreePage(off_t page
);
590 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
);
591 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
);
592 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
);
593 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
);
594 static struct redisCommand
*lookupCommand(char *name
);
595 static void call(redisClient
*c
, struct redisCommand
*cmd
);
596 static void resetClient(redisClient
*c
);
597 static void convertToRealHash(robj
*o
);
599 static void authCommand(redisClient
*c
);
600 static void pingCommand(redisClient
*c
);
601 static void echoCommand(redisClient
*c
);
602 static void setCommand(redisClient
*c
);
603 static void setnxCommand(redisClient
*c
);
604 static void getCommand(redisClient
*c
);
605 static void delCommand(redisClient
*c
);
606 static void existsCommand(redisClient
*c
);
607 static void incrCommand(redisClient
*c
);
608 static void decrCommand(redisClient
*c
);
609 static void incrbyCommand(redisClient
*c
);
610 static void decrbyCommand(redisClient
*c
);
611 static void selectCommand(redisClient
*c
);
612 static void randomkeyCommand(redisClient
*c
);
613 static void keysCommand(redisClient
*c
);
614 static void dbsizeCommand(redisClient
*c
);
615 static void lastsaveCommand(redisClient
*c
);
616 static void saveCommand(redisClient
*c
);
617 static void bgsaveCommand(redisClient
*c
);
618 static void bgrewriteaofCommand(redisClient
*c
);
619 static void shutdownCommand(redisClient
*c
);
620 static void moveCommand(redisClient
*c
);
621 static void renameCommand(redisClient
*c
);
622 static void renamenxCommand(redisClient
*c
);
623 static void lpushCommand(redisClient
*c
);
624 static void rpushCommand(redisClient
*c
);
625 static void lpopCommand(redisClient
*c
);
626 static void rpopCommand(redisClient
*c
);
627 static void llenCommand(redisClient
*c
);
628 static void lindexCommand(redisClient
*c
);
629 static void lrangeCommand(redisClient
*c
);
630 static void ltrimCommand(redisClient
*c
);
631 static void typeCommand(redisClient
*c
);
632 static void lsetCommand(redisClient
*c
);
633 static void saddCommand(redisClient
*c
);
634 static void sremCommand(redisClient
*c
);
635 static void smoveCommand(redisClient
*c
);
636 static void sismemberCommand(redisClient
*c
);
637 static void scardCommand(redisClient
*c
);
638 static void spopCommand(redisClient
*c
);
639 static void srandmemberCommand(redisClient
*c
);
640 static void sinterCommand(redisClient
*c
);
641 static void sinterstoreCommand(redisClient
*c
);
642 static void sunionCommand(redisClient
*c
);
643 static void sunionstoreCommand(redisClient
*c
);
644 static void sdiffCommand(redisClient
*c
);
645 static void sdiffstoreCommand(redisClient
*c
);
646 static void syncCommand(redisClient
*c
);
647 static void flushdbCommand(redisClient
*c
);
648 static void flushallCommand(redisClient
*c
);
649 static void sortCommand(redisClient
*c
);
650 static void lremCommand(redisClient
*c
);
651 static void rpoplpushcommand(redisClient
*c
);
652 static void infoCommand(redisClient
*c
);
653 static void mgetCommand(redisClient
*c
);
654 static void monitorCommand(redisClient
*c
);
655 static void expireCommand(redisClient
*c
);
656 static void expireatCommand(redisClient
*c
);
657 static void getsetCommand(redisClient
*c
);
658 static void ttlCommand(redisClient
*c
);
659 static void slaveofCommand(redisClient
*c
);
660 static void debugCommand(redisClient
*c
);
661 static void msetCommand(redisClient
*c
);
662 static void msetnxCommand(redisClient
*c
);
663 static void zaddCommand(redisClient
*c
);
664 static void zincrbyCommand(redisClient
*c
);
665 static void zrangeCommand(redisClient
*c
);
666 static void zrangebyscoreCommand(redisClient
*c
);
667 static void zcountCommand(redisClient
*c
);
668 static void zrevrangeCommand(redisClient
*c
);
669 static void zcardCommand(redisClient
*c
);
670 static void zremCommand(redisClient
*c
);
671 static void zscoreCommand(redisClient
*c
);
672 static void zremrangebyscoreCommand(redisClient
*c
);
673 static void multiCommand(redisClient
*c
);
674 static void execCommand(redisClient
*c
);
675 static void discardCommand(redisClient
*c
);
676 static void blpopCommand(redisClient
*c
);
677 static void brpopCommand(redisClient
*c
);
678 static void appendCommand(redisClient
*c
);
679 static void substrCommand(redisClient
*c
);
680 static void zrankCommand(redisClient
*c
);
681 static void zrevrankCommand(redisClient
*c
);
682 static void hsetCommand(redisClient
*c
);
683 static void hgetCommand(redisClient
*c
);
684 static void hdelCommand(redisClient
*c
);
685 static void hlenCommand(redisClient
*c
);
686 static void zremrangebyrankCommand(redisClient
*c
);
687 static void zunionCommand(redisClient
*c
);
688 static void zinterCommand(redisClient
*c
);
690 /*================================= Globals ================================= */
693 static struct redisServer server
; /* server global state */
694 static struct redisCommand cmdTable
[] = {
695 {"get",getCommand
,2,REDIS_CMD_INLINE
,1,1,1},
696 {"set",setCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,0,0,0},
697 {"setnx",setnxCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,0,0,0},
698 {"append",appendCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
699 {"substr",substrCommand
,4,REDIS_CMD_INLINE
,1,1,1},
700 {"del",delCommand
,-2,REDIS_CMD_INLINE
,0,0,0},
701 {"exists",existsCommand
,2,REDIS_CMD_INLINE
,1,1,1},
702 {"incr",incrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
703 {"decr",decrCommand
,2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
704 {"mget",mgetCommand
,-2,REDIS_CMD_INLINE
,1,-1,1},
705 {"rpush",rpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
706 {"lpush",lpushCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
707 {"rpop",rpopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
708 {"lpop",lpopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
709 {"brpop",brpopCommand
,-3,REDIS_CMD_INLINE
,1,1,1},
710 {"blpop",blpopCommand
,-3,REDIS_CMD_INLINE
,1,1,1},
711 {"llen",llenCommand
,2,REDIS_CMD_INLINE
,1,1,1},
712 {"lindex",lindexCommand
,3,REDIS_CMD_INLINE
,1,1,1},
713 {"lset",lsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
714 {"lrange",lrangeCommand
,4,REDIS_CMD_INLINE
,1,1,1},
715 {"ltrim",ltrimCommand
,4,REDIS_CMD_INLINE
,1,1,1},
716 {"lrem",lremCommand
,4,REDIS_CMD_BULK
,1,1,1},
717 {"rpoplpush",rpoplpushcommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,2,1},
718 {"sadd",saddCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
719 {"srem",sremCommand
,3,REDIS_CMD_BULK
,1,1,1},
720 {"smove",smoveCommand
,4,REDIS_CMD_BULK
,1,2,1},
721 {"sismember",sismemberCommand
,3,REDIS_CMD_BULK
,1,1,1},
722 {"scard",scardCommand
,2,REDIS_CMD_INLINE
,1,1,1},
723 {"spop",spopCommand
,2,REDIS_CMD_INLINE
,1,1,1},
724 {"srandmember",srandmemberCommand
,2,REDIS_CMD_INLINE
,1,1,1},
725 {"sinter",sinterCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
726 {"sinterstore",sinterstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
727 {"sunion",sunionCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
728 {"sunionstore",sunionstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
729 {"sdiff",sdiffCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,-1,1},
730 {"sdiffstore",sdiffstoreCommand
,-3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,2,-1,1},
731 {"smembers",sinterCommand
,2,REDIS_CMD_INLINE
,1,1,1},
732 {"zadd",zaddCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
733 {"zincrby",zincrbyCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
734 {"zrem",zremCommand
,3,REDIS_CMD_BULK
,1,1,1},
735 {"zremrangebyscore",zremrangebyscoreCommand
,4,REDIS_CMD_INLINE
,1,1,1},
736 {"zremrangebyrank",zremrangebyrankCommand
,4,REDIS_CMD_INLINE
,1,1,1},
737 {"zunion",zunionCommand
,-4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,0,0,0},
738 {"zinter",zinterCommand
,-4,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,0,0,0},
739 {"zrange",zrangeCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
740 {"zrangebyscore",zrangebyscoreCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
741 {"zcount",zcountCommand
,4,REDIS_CMD_INLINE
,1,1,1},
742 {"zrevrange",zrevrangeCommand
,-4,REDIS_CMD_INLINE
,1,1,1},
743 {"zcard",zcardCommand
,2,REDIS_CMD_INLINE
,1,1,1},
744 {"zscore",zscoreCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
745 {"zrank",zrankCommand
,3,REDIS_CMD_BULK
,1,1,1},
746 {"zrevrank",zrevrankCommand
,3,REDIS_CMD_BULK
,1,1,1},
747 {"hset",hsetCommand
,4,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
748 {"hget",hgetCommand
,3,REDIS_CMD_BULK
,1,1,1},
749 {"hdel",hdelCommand
,3,REDIS_CMD_BULK
,1,1,1},
750 {"hlen",hlenCommand
,2,REDIS_CMD_INLINE
,1,1,1},
751 {"incrby",incrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
752 {"decrby",decrbyCommand
,3,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
753 {"getset",getsetCommand
,3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,1,1},
754 {"mset",msetCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,-1,2},
755 {"msetnx",msetnxCommand
,-3,REDIS_CMD_BULK
|REDIS_CMD_DENYOOM
,1,-1,2},
756 {"randomkey",randomkeyCommand
,1,REDIS_CMD_INLINE
,0,0,0},
757 {"select",selectCommand
,2,REDIS_CMD_INLINE
,0,0,0},
758 {"move",moveCommand
,3,REDIS_CMD_INLINE
,1,1,1},
759 {"rename",renameCommand
,3,REDIS_CMD_INLINE
,1,1,1},
760 {"renamenx",renamenxCommand
,3,REDIS_CMD_INLINE
,1,1,1},
761 {"expire",expireCommand
,3,REDIS_CMD_INLINE
,0,0,0},
762 {"expireat",expireatCommand
,3,REDIS_CMD_INLINE
,0,0,0},
763 {"keys",keysCommand
,2,REDIS_CMD_INLINE
,0,0,0},
764 {"dbsize",dbsizeCommand
,1,REDIS_CMD_INLINE
,0,0,0},
765 {"auth",authCommand
,2,REDIS_CMD_INLINE
,0,0,0},
766 {"ping",pingCommand
,1,REDIS_CMD_INLINE
,0,0,0},
767 {"echo",echoCommand
,2,REDIS_CMD_BULK
,0,0,0},
768 {"save",saveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
769 {"bgsave",bgsaveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
770 {"bgrewriteaof",bgrewriteaofCommand
,1,REDIS_CMD_INLINE
,0,0,0},
771 {"shutdown",shutdownCommand
,1,REDIS_CMD_INLINE
,0,0,0},
772 {"lastsave",lastsaveCommand
,1,REDIS_CMD_INLINE
,0,0,0},
773 {"type",typeCommand
,2,REDIS_CMD_INLINE
,1,1,1},
774 {"multi",multiCommand
,1,REDIS_CMD_INLINE
,0,0,0},
775 {"exec",execCommand
,1,REDIS_CMD_INLINE
,0,0,0},
776 {"discard",discardCommand
,1,REDIS_CMD_INLINE
,0,0,0},
777 {"sync",syncCommand
,1,REDIS_CMD_INLINE
,0,0,0},
778 {"flushdb",flushdbCommand
,1,REDIS_CMD_INLINE
,0,0,0},
779 {"flushall",flushallCommand
,1,REDIS_CMD_INLINE
,0,0,0},
780 {"sort",sortCommand
,-2,REDIS_CMD_INLINE
|REDIS_CMD_DENYOOM
,1,1,1},
781 {"info",infoCommand
,1,REDIS_CMD_INLINE
,0,0,0},
782 {"monitor",monitorCommand
,1,REDIS_CMD_INLINE
,0,0,0},
783 {"ttl",ttlCommand
,2,REDIS_CMD_INLINE
,1,1,1},
784 {"slaveof",slaveofCommand
,3,REDIS_CMD_INLINE
,0,0,0},
785 {"debug",debugCommand
,-2,REDIS_CMD_INLINE
,0,0,0},
786 {NULL
,NULL
,0,0,0,0,0}
789 /*============================ Utility functions ============================ */
791 /* Glob-style pattern matching. */
792 int stringmatchlen(const char *pattern
, int patternLen
,
793 const char *string
, int stringLen
, int nocase
)
798 while (pattern
[1] == '*') {
803 return 1; /* match */
805 if (stringmatchlen(pattern
+1, patternLen
-1,
806 string
, stringLen
, nocase
))
807 return 1; /* match */
811 return 0; /* no match */
815 return 0; /* no match */
825 not = pattern
[0] == '^';
832 if (pattern
[0] == '\\') {
835 if (pattern
[0] == string
[0])
837 } else if (pattern
[0] == ']') {
839 } else if (patternLen
== 0) {
843 } else if (pattern
[1] == '-' && patternLen
>= 3) {
844 int start
= pattern
[0];
845 int end
= pattern
[2];
853 start
= tolower(start
);
859 if (c
>= start
&& c
<= end
)
863 if (pattern
[0] == string
[0])
866 if (tolower((int)pattern
[0]) == tolower((int)string
[0]))
876 return 0; /* no match */
882 if (patternLen
>= 2) {
889 if (pattern
[0] != string
[0])
890 return 0; /* no match */
892 if (tolower((int)pattern
[0]) != tolower((int)string
[0]))
893 return 0; /* no match */
901 if (stringLen
== 0) {
902 while(*pattern
== '*') {
909 if (patternLen
== 0 && stringLen
== 0)
914 static void redisLog(int level
, const char *fmt
, ...) {
918 fp
= (server
.logfile
== NULL
) ? stdout
: fopen(server
.logfile
,"a");
922 if (level
>= server
.verbosity
) {
928 strftime(buf
,64,"%d %b %H:%M:%S",localtime(&now
));
929 fprintf(fp
,"[%d] %s %c ",(int)getpid(),buf
,c
[level
]);
930 vfprintf(fp
, fmt
, ap
);
936 if (server
.logfile
) fclose(fp
);
939 /*====================== Hash table type implementation ==================== */
941 /* This is a hash table type that uses the SDS dynamic strings library as
942 * keys and redis objects as values (objects can hold SDS strings,
945 static void dictVanillaFree(void *privdata
, void *val
)
947 DICT_NOTUSED(privdata
);
951 static void dictListDestructor(void *privdata
, void *val
)
953 DICT_NOTUSED(privdata
);
954 listRelease((list
*)val
);
957 static int sdsDictKeyCompare(void *privdata
, const void *key1
,
961 DICT_NOTUSED(privdata
);
963 l1
= sdslen((sds
)key1
);
964 l2
= sdslen((sds
)key2
);
965 if (l1
!= l2
) return 0;
966 return memcmp(key1
, key2
, l1
) == 0;
969 static void dictRedisObjectDestructor(void *privdata
, void *val
)
971 DICT_NOTUSED(privdata
);
973 if (val
== NULL
) return; /* Values of swapped out keys as set to NULL */
977 static int dictObjKeyCompare(void *privdata
, const void *key1
,
980 const robj
*o1
= key1
, *o2
= key2
;
981 return sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
984 static unsigned int dictObjHash(const void *key
) {
986 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
989 static int dictEncObjKeyCompare(void *privdata
, const void *key1
,
992 robj
*o1
= (robj
*) key1
, *o2
= (robj
*) key2
;
995 o1
= getDecodedObject(o1
);
996 o2
= getDecodedObject(o2
);
997 cmp
= sdsDictKeyCompare(privdata
,o1
->ptr
,o2
->ptr
);
1003 static unsigned int dictEncObjHash(const void *key
) {
1004 robj
*o
= (robj
*) key
;
1006 if (o
->encoding
== REDIS_ENCODING_RAW
) {
1007 return dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1009 if (o
->encoding
== REDIS_ENCODING_INT
) {
1013 len
= snprintf(buf
,32,"%ld",(long)o
->ptr
);
1014 return dictGenHashFunction((unsigned char*)buf
, len
);
1018 o
= getDecodedObject(o
);
1019 hash
= dictGenHashFunction(o
->ptr
, sdslen((sds
)o
->ptr
));
1026 /* Sets type and expires */
1027 static dictType setDictType
= {
1028 dictEncObjHash
, /* hash function */
1031 dictEncObjKeyCompare
, /* key compare */
1032 dictRedisObjectDestructor
, /* key destructor */
1033 NULL
/* val destructor */
1036 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1037 static dictType zsetDictType
= {
1038 dictEncObjHash
, /* hash function */
1041 dictEncObjKeyCompare
, /* key compare */
1042 dictRedisObjectDestructor
, /* key destructor */
1043 dictVanillaFree
/* val destructor of malloc(sizeof(double)) */
1047 static dictType dbDictType
= {
1048 dictObjHash
, /* hash function */
1051 dictObjKeyCompare
, /* key compare */
1052 dictRedisObjectDestructor
, /* key destructor */
1053 dictRedisObjectDestructor
/* val destructor */
1057 static dictType keyptrDictType
= {
1058 dictObjHash
, /* hash function */
1061 dictObjKeyCompare
, /* key compare */
1062 dictRedisObjectDestructor
, /* key destructor */
1063 NULL
/* val destructor */
1066 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1067 static dictType hashDictType
= {
1068 dictEncObjHash
, /* hash function */
1071 dictEncObjKeyCompare
, /* key compare */
1072 dictRedisObjectDestructor
, /* key destructor */
1073 dictRedisObjectDestructor
/* val destructor */
1076 /* Keylist hash table type has unencoded redis objects as keys and
1077 * lists as values. It's used for blocking operations (BLPOP) and to
1078 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1079 static dictType keylistDictType
= {
1080 dictObjHash
, /* hash function */
1083 dictObjKeyCompare
, /* key compare */
1084 dictRedisObjectDestructor
, /* key destructor */
1085 dictListDestructor
/* val destructor */
1088 /* ========================= Random utility functions ======================= */
1090 /* Redis generally does not try to recover from out of memory conditions
1091 * when allocating objects or strings, it is not clear if it will be possible
1092 * to report this condition to the client since the networking layer itself
1093 * is based on heap allocation for send buffers, so we simply abort.
1094 * At least the code will be simpler to read... */
1095 static void oom(const char *msg
) {
1096 redisLog(REDIS_WARNING
, "%s: Out of memory\n",msg
);
1101 /* ====================== Redis server networking stuff ===================== */
1102 static void closeTimedoutClients(void) {
1105 time_t now
= time(NULL
);
1108 listRewind(server
.clients
,&li
);
1109 while ((ln
= listNext(&li
)) != NULL
) {
1110 c
= listNodeValue(ln
);
1111 if (server
.maxidletime
&&
1112 !(c
->flags
& REDIS_SLAVE
) && /* no timeout for slaves */
1113 !(c
->flags
& REDIS_MASTER
) && /* no timeout for masters */
1114 (now
- c
->lastinteraction
> server
.maxidletime
))
1116 redisLog(REDIS_VERBOSE
,"Closing idle client");
1118 } else if (c
->flags
& REDIS_BLOCKED
) {
1119 if (c
->blockingto
!= 0 && c
->blockingto
< now
) {
1120 addReply(c
,shared
.nullmultibulk
);
1121 unblockClientWaitingData(c
);
1127 static int htNeedsResize(dict
*dict
) {
1128 long long size
, used
;
1130 size
= dictSlots(dict
);
1131 used
= dictSize(dict
);
1132 return (size
&& used
&& size
> DICT_HT_INITIAL_SIZE
&&
1133 (used
*100/size
< REDIS_HT_MINFILL
));
1136 /* If the percentage of used slots in the HT drops below REDIS_HT_MINFILL
1137 * we resize the hash table to save memory */
1138 static void tryResizeHashTables(void) {
1141 for (j
= 0; j
< server
.dbnum
; j
++) {
1142 if (htNeedsResize(server
.db
[j
].dict
)) {
1143 redisLog(REDIS_VERBOSE
,"The hash table %d is too sparse, resize it...",j
);
1144 dictResize(server
.db
[j
].dict
);
1145 redisLog(REDIS_VERBOSE
,"Hash table %d resized.",j
);
1147 if (htNeedsResize(server
.db
[j
].expires
))
1148 dictResize(server
.db
[j
].expires
);
1152 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1153 void backgroundSaveDoneHandler(int statloc
) {
1154 int exitcode
= WEXITSTATUS(statloc
);
1155 int bysignal
= WIFSIGNALED(statloc
);
1157 if (!bysignal
&& exitcode
== 0) {
1158 redisLog(REDIS_NOTICE
,
1159 "Background saving terminated with success");
1161 server
.lastsave
= time(NULL
);
1162 } else if (!bysignal
&& exitcode
!= 0) {
1163 redisLog(REDIS_WARNING
, "Background saving error");
1165 redisLog(REDIS_WARNING
,
1166 "Background saving terminated by signal");
1167 rdbRemoveTempFile(server
.bgsavechildpid
);
1169 server
.bgsavechildpid
= -1;
1170 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1171 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1172 updateSlavesWaitingBgsave(exitcode
== 0 ? REDIS_OK
: REDIS_ERR
);
1175 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1177 void backgroundRewriteDoneHandler(int statloc
) {
1178 int exitcode
= WEXITSTATUS(statloc
);
1179 int bysignal
= WIFSIGNALED(statloc
);
1181 if (!bysignal
&& exitcode
== 0) {
1185 redisLog(REDIS_NOTICE
,
1186 "Background append only file rewriting terminated with success");
1187 /* Now it's time to flush the differences accumulated by the parent */
1188 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) server
.bgrewritechildpid
);
1189 fd
= open(tmpfile
,O_WRONLY
|O_APPEND
);
1191 redisLog(REDIS_WARNING
, "Not able to open the temp append only file produced by the child: %s", strerror(errno
));
1194 /* Flush our data... */
1195 if (write(fd
,server
.bgrewritebuf
,sdslen(server
.bgrewritebuf
)) !=
1196 (signed) sdslen(server
.bgrewritebuf
)) {
1197 redisLog(REDIS_WARNING
, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno
));
1201 redisLog(REDIS_NOTICE
,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server
.bgrewritebuf
));
1202 /* Now our work is to rename the temp file into the stable file. And
1203 * switch the file descriptor used by the server for append only. */
1204 if (rename(tmpfile
,server
.appendfilename
) == -1) {
1205 redisLog(REDIS_WARNING
,"Can't rename the temp append only file into the stable one: %s", strerror(errno
));
1209 /* Mission completed... almost */
1210 redisLog(REDIS_NOTICE
,"Append only file successfully rewritten.");
1211 if (server
.appendfd
!= -1) {
1212 /* If append only is actually enabled... */
1213 close(server
.appendfd
);
1214 server
.appendfd
= fd
;
1216 server
.appendseldb
= -1; /* Make sure it will issue SELECT */
1217 redisLog(REDIS_NOTICE
,"The new append only file was selected for future appends.");
1219 /* If append only is disabled we just generate a dump in this
1220 * format. Why not? */
1223 } else if (!bysignal
&& exitcode
!= 0) {
1224 redisLog(REDIS_WARNING
, "Background append only file rewriting error");
1226 redisLog(REDIS_WARNING
,
1227 "Background append only file rewriting terminated by signal");
1230 sdsfree(server
.bgrewritebuf
);
1231 server
.bgrewritebuf
= sdsempty();
1232 aofRemoveTempFile(server
.bgrewritechildpid
);
1233 server
.bgrewritechildpid
= -1;
1236 static int serverCron(struct aeEventLoop
*eventLoop
, long long id
, void *clientData
) {
1237 int j
, loops
= server
.cronloops
++;
1238 REDIS_NOTUSED(eventLoop
);
1240 REDIS_NOTUSED(clientData
);
1242 /* We take a cached value of the unix time in the global state because
1243 * with virtual memory and aging there is the need to store the current time
1244 * in objects at every object access, and accuracy is not needed.
1245 * To access a global var is faster than calling time(NULL) */
1246 server
.unixtime
= time(NULL
);
1248 /* Show some info about non-empty databases */
1249 for (j
= 0; j
< server
.dbnum
; j
++) {
1250 long long size
, used
, vkeys
;
1252 size
= dictSlots(server
.db
[j
].dict
);
1253 used
= dictSize(server
.db
[j
].dict
);
1254 vkeys
= dictSize(server
.db
[j
].expires
);
1255 if (!(loops
% 5) && (used
|| vkeys
)) {
1256 redisLog(REDIS_VERBOSE
,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j
,used
,vkeys
,size
);
1257 /* dictPrintStats(server.dict); */
1261 /* We don't want to resize the hash tables while a background saving
1262 * is in progress: the saving child is created using fork() that is
1263 * implemented with a copy-on-write semantic in most modern systems, so
1264 * if we resize the HT while there is the saving child at work actually
1265 * a lot of memory movements in the parent will cause a lot of pages
1267 if (server
.bgsavechildpid
== -1) tryResizeHashTables();
1269 /* Show information about connected clients */
1271 redisLog(REDIS_VERBOSE
,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1272 listLength(server
.clients
)-listLength(server
.slaves
),
1273 listLength(server
.slaves
),
1274 zmalloc_used_memory(),
1275 dictSize(server
.sharingpool
));
1278 /* Close connections of timedout clients */
1279 if ((server
.maxidletime
&& !(loops
% 10)) || server
.blpop_blocked_clients
)
1280 closeTimedoutClients();
1282 /* Check if a background saving or AOF rewrite in progress terminated */
1283 if (server
.bgsavechildpid
!= -1 || server
.bgrewritechildpid
!= -1) {
1287 if ((pid
= wait3(&statloc
,WNOHANG
,NULL
)) != 0) {
1288 if (pid
== server
.bgsavechildpid
) {
1289 backgroundSaveDoneHandler(statloc
);
1291 backgroundRewriteDoneHandler(statloc
);
1295 /* If there is not a background saving in progress check if
1296 * we have to save now */
1297 time_t now
= time(NULL
);
1298 for (j
= 0; j
< server
.saveparamslen
; j
++) {
1299 struct saveparam
*sp
= server
.saveparams
+j
;
1301 if (server
.dirty
>= sp
->changes
&&
1302 now
-server
.lastsave
> sp
->seconds
) {
1303 redisLog(REDIS_NOTICE
,"%d changes in %d seconds. Saving...",
1304 sp
->changes
, sp
->seconds
);
1305 rdbSaveBackground(server
.dbfilename
);
1311 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1312 * will use few CPU cycles if there are few expiring keys, otherwise
1313 * it will get more aggressive to avoid that too much memory is used by
1314 * keys that can be removed from the keyspace. */
1315 for (j
= 0; j
< server
.dbnum
; j
++) {
1317 redisDb
*db
= server
.db
+j
;
1319 /* Continue to expire if at the end of the cycle more than 25%
1320 * of the keys were expired. */
1322 long num
= dictSize(db
->expires
);
1323 time_t now
= time(NULL
);
1326 if (num
> REDIS_EXPIRELOOKUPS_PER_CRON
)
1327 num
= REDIS_EXPIRELOOKUPS_PER_CRON
;
1332 if ((de
= dictGetRandomKey(db
->expires
)) == NULL
) break;
1333 t
= (time_t) dictGetEntryVal(de
);
1335 deleteKey(db
,dictGetEntryKey(de
));
1339 } while (expired
> REDIS_EXPIRELOOKUPS_PER_CRON
/4);
1342 /* Swap a few keys on disk if we are over the memory limit and VM
1343 * is enabled. Try to free objects from the free list first. */
1344 if (vmCanSwapOut()) {
1345 while (server
.vm_enabled
&& zmalloc_used_memory() >
1346 server
.vm_max_memory
)
1350 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
1351 retval
= (server
.vm_max_threads
== 0) ?
1352 vmSwapOneObjectBlocking() :
1353 vmSwapOneObjectThreaded();
1354 if (retval
== REDIS_ERR
&& (loops
% 30) == 0 &&
1355 zmalloc_used_memory() >
1356 (server
.vm_max_memory
+server
.vm_max_memory
/10))
1358 redisLog(REDIS_WARNING
,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1360 /* Note that when using threaded I/O we free just one object,
1361 * because anyway when the I/O thread in charge to swap this
1362 * object out will finish, the handler of completed jobs
1363 * will try to swap more objects if we are still out of memory. */
1364 if (retval
== REDIS_ERR
|| server
.vm_max_threads
> 0) break;
1368 /* Check if we should connect to a MASTER */
1369 if (server
.replstate
== REDIS_REPL_CONNECT
) {
1370 redisLog(REDIS_NOTICE
,"Connecting to MASTER...");
1371 if (syncWithMaster() == REDIS_OK
) {
1372 redisLog(REDIS_NOTICE
,"MASTER <-> SLAVE sync succeeded");
1378 /* This function gets called every time Redis is entering the
1379 * main loop of the event driven library, that is, before sleeping
1380 * while waiting for ready file descriptors. */
1381 static void beforeSleep(struct aeEventLoop
*eventLoop
) {
1382 REDIS_NOTUSED(eventLoop
);
1384 if (server
.vm_enabled
&& listLength(server
.io_ready_clients
)) {
1388 listRewind(server
.io_ready_clients
,&li
);
1389 while((ln
= listNext(&li
))) {
1390 redisClient
*c
= ln
->value
;
1391 struct redisCommand
*cmd
;
1393 /* Resume the client. */
1394 listDelNode(server
.io_ready_clients
,ln
);
1395 c
->flags
&= (~REDIS_IO_WAIT
);
1396 server
.vm_blocked_clients
--;
1397 aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
1398 readQueryFromClient
, c
);
1399 cmd
= lookupCommand(c
->argv
[0]->ptr
);
1400 assert(cmd
!= NULL
);
1403 /* There may be more data to process in the input buffer. */
1404 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0)
1405 processInputBuffer(c
);
1410 static void createSharedObjects(void) {
1411 shared
.crlf
= createObject(REDIS_STRING
,sdsnew("\r\n"));
1412 shared
.ok
= createObject(REDIS_STRING
,sdsnew("+OK\r\n"));
1413 shared
.err
= createObject(REDIS_STRING
,sdsnew("-ERR\r\n"));
1414 shared
.emptybulk
= createObject(REDIS_STRING
,sdsnew("$0\r\n\r\n"));
1415 shared
.czero
= createObject(REDIS_STRING
,sdsnew(":0\r\n"));
1416 shared
.cone
= createObject(REDIS_STRING
,sdsnew(":1\r\n"));
1417 shared
.nullbulk
= createObject(REDIS_STRING
,sdsnew("$-1\r\n"));
1418 shared
.nullmultibulk
= createObject(REDIS_STRING
,sdsnew("*-1\r\n"));
1419 shared
.emptymultibulk
= createObject(REDIS_STRING
,sdsnew("*0\r\n"));
1420 shared
.pong
= createObject(REDIS_STRING
,sdsnew("+PONG\r\n"));
1421 shared
.queued
= createObject(REDIS_STRING
,sdsnew("+QUEUED\r\n"));
1422 shared
.wrongtypeerr
= createObject(REDIS_STRING
,sdsnew(
1423 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1424 shared
.nokeyerr
= createObject(REDIS_STRING
,sdsnew(
1425 "-ERR no such key\r\n"));
1426 shared
.syntaxerr
= createObject(REDIS_STRING
,sdsnew(
1427 "-ERR syntax error\r\n"));
1428 shared
.sameobjecterr
= createObject(REDIS_STRING
,sdsnew(
1429 "-ERR source and destination objects are the same\r\n"));
1430 shared
.outofrangeerr
= createObject(REDIS_STRING
,sdsnew(
1431 "-ERR index out of range\r\n"));
1432 shared
.space
= createObject(REDIS_STRING
,sdsnew(" "));
1433 shared
.colon
= createObject(REDIS_STRING
,sdsnew(":"));
1434 shared
.plus
= createObject(REDIS_STRING
,sdsnew("+"));
1435 shared
.select0
= createStringObject("select 0\r\n",10);
1436 shared
.select1
= createStringObject("select 1\r\n",10);
1437 shared
.select2
= createStringObject("select 2\r\n",10);
1438 shared
.select3
= createStringObject("select 3\r\n",10);
1439 shared
.select4
= createStringObject("select 4\r\n",10);
1440 shared
.select5
= createStringObject("select 5\r\n",10);
1441 shared
.select6
= createStringObject("select 6\r\n",10);
1442 shared
.select7
= createStringObject("select 7\r\n",10);
1443 shared
.select8
= createStringObject("select 8\r\n",10);
1444 shared
.select9
= createStringObject("select 9\r\n",10);
1447 static void appendServerSaveParams(time_t seconds
, int changes
) {
1448 server
.saveparams
= zrealloc(server
.saveparams
,sizeof(struct saveparam
)*(server
.saveparamslen
+1));
1449 server
.saveparams
[server
.saveparamslen
].seconds
= seconds
;
1450 server
.saveparams
[server
.saveparamslen
].changes
= changes
;
1451 server
.saveparamslen
++;
1454 static void resetServerSaveParams() {
1455 zfree(server
.saveparams
);
1456 server
.saveparams
= NULL
;
1457 server
.saveparamslen
= 0;
1460 static void initServerConfig() {
1461 server
.dbnum
= REDIS_DEFAULT_DBNUM
;
1462 server
.port
= REDIS_SERVERPORT
;
1463 server
.verbosity
= REDIS_VERBOSE
;
1464 server
.maxidletime
= REDIS_MAXIDLETIME
;
1465 server
.saveparams
= NULL
;
1466 server
.logfile
= NULL
; /* NULL = log on standard output */
1467 server
.bindaddr
= NULL
;
1468 server
.glueoutputbuf
= 1;
1469 server
.daemonize
= 0;
1470 server
.appendonly
= 0;
1471 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1472 server
.lastfsync
= time(NULL
);
1473 server
.appendfd
= -1;
1474 server
.appendseldb
= -1; /* Make sure the first time will not match */
1475 server
.pidfile
= "/var/run/redis.pid";
1476 server
.dbfilename
= "dump.rdb";
1477 server
.appendfilename
= "appendonly.aof";
1478 server
.requirepass
= NULL
;
1479 server
.shareobjects
= 0;
1480 server
.rdbcompression
= 1;
1481 server
.sharingpoolsize
= 1024;
1482 server
.maxclients
= 0;
1483 server
.blpop_blocked_clients
= 0;
1484 server
.maxmemory
= 0;
1485 server
.vm_enabled
= 0;
1486 server
.vm_swap_file
= zstrdup("/tmp/redis-%p.vm");
1487 server
.vm_page_size
= 256; /* 256 bytes per page */
1488 server
.vm_pages
= 1024*1024*100; /* 104 millions of pages */
1489 server
.vm_max_memory
= 1024LL*1024*1024*1; /* 1 GB of RAM */
1490 server
.vm_max_threads
= 4;
1491 server
.vm_blocked_clients
= 0;
1492 server
.hash_max_zipmap_entries
= REDIS_HASH_MAX_ZIPMAP_ENTRIES
;
1493 server
.hash_max_zipmap_value
= REDIS_HASH_MAX_ZIPMAP_VALUE
;
1495 resetServerSaveParams();
1497 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1498 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1499 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1500 /* Replication related */
1502 server
.masterauth
= NULL
;
1503 server
.masterhost
= NULL
;
1504 server
.masterport
= 6379;
1505 server
.master
= NULL
;
1506 server
.replstate
= REDIS_REPL_NONE
;
1508 /* Double constants initialization */
1510 R_PosInf
= 1.0/R_Zero
;
1511 R_NegInf
= -1.0/R_Zero
;
1512 R_Nan
= R_Zero
/R_Zero
;
1515 static void initServer() {
1518 signal(SIGHUP
, SIG_IGN
);
1519 signal(SIGPIPE
, SIG_IGN
);
1520 setupSigSegvAction();
1522 server
.devnull
= fopen("/dev/null","w");
1523 if (server
.devnull
== NULL
) {
1524 redisLog(REDIS_WARNING
, "Can't open /dev/null: %s", server
.neterr
);
1527 server
.clients
= listCreate();
1528 server
.slaves
= listCreate();
1529 server
.monitors
= listCreate();
1530 server
.objfreelist
= listCreate();
1531 createSharedObjects();
1532 server
.el
= aeCreateEventLoop();
1533 server
.db
= zmalloc(sizeof(redisDb
)*server
.dbnum
);
1534 server
.sharingpool
= dictCreate(&setDictType
,NULL
);
1535 server
.fd
= anetTcpServer(server
.neterr
, server
.port
, server
.bindaddr
);
1536 if (server
.fd
== -1) {
1537 redisLog(REDIS_WARNING
, "Opening TCP port: %s", server
.neterr
);
1540 for (j
= 0; j
< server
.dbnum
; j
++) {
1541 server
.db
[j
].dict
= dictCreate(&dbDictType
,NULL
);
1542 server
.db
[j
].expires
= dictCreate(&keyptrDictType
,NULL
);
1543 server
.db
[j
].blockingkeys
= dictCreate(&keylistDictType
,NULL
);
1544 if (server
.vm_enabled
)
1545 server
.db
[j
].io_keys
= dictCreate(&keylistDictType
,NULL
);
1546 server
.db
[j
].id
= j
;
1548 server
.cronloops
= 0;
1549 server
.bgsavechildpid
= -1;
1550 server
.bgrewritechildpid
= -1;
1551 server
.bgrewritebuf
= sdsempty();
1552 server
.lastsave
= time(NULL
);
1554 server
.stat_numcommands
= 0;
1555 server
.stat_numconnections
= 0;
1556 server
.stat_starttime
= time(NULL
);
1557 server
.unixtime
= time(NULL
);
1558 aeCreateTimeEvent(server
.el
, 1, serverCron
, NULL
, NULL
);
1559 if (aeCreateFileEvent(server
.el
, server
.fd
, AE_READABLE
,
1560 acceptHandler
, NULL
) == AE_ERR
) oom("creating file event");
1562 if (server
.appendonly
) {
1563 server
.appendfd
= open(server
.appendfilename
,O_WRONLY
|O_APPEND
|O_CREAT
,0644);
1564 if (server
.appendfd
== -1) {
1565 redisLog(REDIS_WARNING
, "Can't open the append-only file: %s",
1571 if (server
.vm_enabled
) vmInit();
1574 /* Empty the whole database */
1575 static long long emptyDb() {
1577 long long removed
= 0;
1579 for (j
= 0; j
< server
.dbnum
; j
++) {
1580 removed
+= dictSize(server
.db
[j
].dict
);
1581 dictEmpty(server
.db
[j
].dict
);
1582 dictEmpty(server
.db
[j
].expires
);
1587 static int yesnotoi(char *s
) {
1588 if (!strcasecmp(s
,"yes")) return 1;
1589 else if (!strcasecmp(s
,"no")) return 0;
1593 /* I agree, this is a very rudimental way to load a configuration...
1594 will improve later if the config gets more complex */
1595 static void loadServerConfig(char *filename
) {
1597 char buf
[REDIS_CONFIGLINE_MAX
+1], *err
= NULL
;
1601 if (filename
[0] == '-' && filename
[1] == '\0')
1604 if ((fp
= fopen(filename
,"r")) == NULL
) {
1605 redisLog(REDIS_WARNING
,"Fatal error, can't open config file");
1610 while(fgets(buf
,REDIS_CONFIGLINE_MAX
+1,fp
) != NULL
) {
1616 line
= sdstrim(line
," \t\r\n");
1618 /* Skip comments and blank lines*/
1619 if (line
[0] == '#' || line
[0] == '\0') {
1624 /* Split into arguments */
1625 argv
= sdssplitlen(line
,sdslen(line
)," ",1,&argc
);
1626 sdstolower(argv
[0]);
1628 /* Execute config directives */
1629 if (!strcasecmp(argv
[0],"timeout") && argc
== 2) {
1630 server
.maxidletime
= atoi(argv
[1]);
1631 if (server
.maxidletime
< 0) {
1632 err
= "Invalid timeout value"; goto loaderr
;
1634 } else if (!strcasecmp(argv
[0],"port") && argc
== 2) {
1635 server
.port
= atoi(argv
[1]);
1636 if (server
.port
< 1 || server
.port
> 65535) {
1637 err
= "Invalid port"; goto loaderr
;
1639 } else if (!strcasecmp(argv
[0],"bind") && argc
== 2) {
1640 server
.bindaddr
= zstrdup(argv
[1]);
1641 } else if (!strcasecmp(argv
[0],"save") && argc
== 3) {
1642 int seconds
= atoi(argv
[1]);
1643 int changes
= atoi(argv
[2]);
1644 if (seconds
< 1 || changes
< 0) {
1645 err
= "Invalid save parameters"; goto loaderr
;
1647 appendServerSaveParams(seconds
,changes
);
1648 } else if (!strcasecmp(argv
[0],"dir") && argc
== 2) {
1649 if (chdir(argv
[1]) == -1) {
1650 redisLog(REDIS_WARNING
,"Can't chdir to '%s': %s",
1651 argv
[1], strerror(errno
));
1654 } else if (!strcasecmp(argv
[0],"loglevel") && argc
== 2) {
1655 if (!strcasecmp(argv
[1],"debug")) server
.verbosity
= REDIS_DEBUG
;
1656 else if (!strcasecmp(argv
[1],"verbose")) server
.verbosity
= REDIS_VERBOSE
;
1657 else if (!strcasecmp(argv
[1],"notice")) server
.verbosity
= REDIS_NOTICE
;
1658 else if (!strcasecmp(argv
[1],"warning")) server
.verbosity
= REDIS_WARNING
;
1660 err
= "Invalid log level. Must be one of debug, notice, warning";
1663 } else if (!strcasecmp(argv
[0],"logfile") && argc
== 2) {
1666 server
.logfile
= zstrdup(argv
[1]);
1667 if (!strcasecmp(server
.logfile
,"stdout")) {
1668 zfree(server
.logfile
);
1669 server
.logfile
= NULL
;
1671 if (server
.logfile
) {
1672 /* Test if we are able to open the file. The server will not
1673 * be able to abort just for this problem later... */
1674 logfp
= fopen(server
.logfile
,"a");
1675 if (logfp
== NULL
) {
1676 err
= sdscatprintf(sdsempty(),
1677 "Can't open the log file: %s", strerror(errno
));
1682 } else if (!strcasecmp(argv
[0],"databases") && argc
== 2) {
1683 server
.dbnum
= atoi(argv
[1]);
1684 if (server
.dbnum
< 1) {
1685 err
= "Invalid number of databases"; goto loaderr
;
1687 } else if (!strcasecmp(argv
[0],"maxclients") && argc
== 2) {
1688 server
.maxclients
= atoi(argv
[1]);
1689 } else if (!strcasecmp(argv
[0],"maxmemory") && argc
== 2) {
1690 server
.maxmemory
= strtoll(argv
[1], NULL
, 10);
1691 } else if (!strcasecmp(argv
[0],"slaveof") && argc
== 3) {
1692 server
.masterhost
= sdsnew(argv
[1]);
1693 server
.masterport
= atoi(argv
[2]);
1694 server
.replstate
= REDIS_REPL_CONNECT
;
1695 } else if (!strcasecmp(argv
[0],"masterauth") && argc
== 2) {
1696 server
.masterauth
= zstrdup(argv
[1]);
1697 } else if (!strcasecmp(argv
[0],"glueoutputbuf") && argc
== 2) {
1698 if ((server
.glueoutputbuf
= yesnotoi(argv
[1])) == -1) {
1699 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1701 } else if (!strcasecmp(argv
[0],"shareobjects") && argc
== 2) {
1702 if ((server
.shareobjects
= yesnotoi(argv
[1])) == -1) {
1703 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1705 } else if (!strcasecmp(argv
[0],"rdbcompression") && argc
== 2) {
1706 if ((server
.rdbcompression
= yesnotoi(argv
[1])) == -1) {
1707 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1709 } else if (!strcasecmp(argv
[0],"shareobjectspoolsize") && argc
== 2) {
1710 server
.sharingpoolsize
= atoi(argv
[1]);
1711 if (server
.sharingpoolsize
< 1) {
1712 err
= "invalid object sharing pool size"; goto loaderr
;
1714 } else if (!strcasecmp(argv
[0],"daemonize") && argc
== 2) {
1715 if ((server
.daemonize
= yesnotoi(argv
[1])) == -1) {
1716 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1718 } else if (!strcasecmp(argv
[0],"appendonly") && argc
== 2) {
1719 if ((server
.appendonly
= yesnotoi(argv
[1])) == -1) {
1720 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1722 } else if (!strcasecmp(argv
[0],"appendfsync") && argc
== 2) {
1723 if (!strcasecmp(argv
[1],"no")) {
1724 server
.appendfsync
= APPENDFSYNC_NO
;
1725 } else if (!strcasecmp(argv
[1],"always")) {
1726 server
.appendfsync
= APPENDFSYNC_ALWAYS
;
1727 } else if (!strcasecmp(argv
[1],"everysec")) {
1728 server
.appendfsync
= APPENDFSYNC_EVERYSEC
;
1730 err
= "argument must be 'no', 'always' or 'everysec'";
1733 } else if (!strcasecmp(argv
[0],"requirepass") && argc
== 2) {
1734 server
.requirepass
= zstrdup(argv
[1]);
1735 } else if (!strcasecmp(argv
[0],"pidfile") && argc
== 2) {
1736 server
.pidfile
= zstrdup(argv
[1]);
1737 } else if (!strcasecmp(argv
[0],"dbfilename") && argc
== 2) {
1738 server
.dbfilename
= zstrdup(argv
[1]);
1739 } else if (!strcasecmp(argv
[0],"vm-enabled") && argc
== 2) {
1740 if ((server
.vm_enabled
= yesnotoi(argv
[1])) == -1) {
1741 err
= "argument must be 'yes' or 'no'"; goto loaderr
;
1743 } else if (!strcasecmp(argv
[0],"vm-swap-file") && argc
== 2) {
1744 zfree(server
.vm_swap_file
);
1745 server
.vm_swap_file
= zstrdup(argv
[1]);
1746 } else if (!strcasecmp(argv
[0],"vm-max-memory") && argc
== 2) {
1747 server
.vm_max_memory
= strtoll(argv
[1], NULL
, 10);
1748 } else if (!strcasecmp(argv
[0],"vm-page-size") && argc
== 2) {
1749 server
.vm_page_size
= strtoll(argv
[1], NULL
, 10);
1750 } else if (!strcasecmp(argv
[0],"vm-pages") && argc
== 2) {
1751 server
.vm_pages
= strtoll(argv
[1], NULL
, 10);
1752 } else if (!strcasecmp(argv
[0],"vm-max-threads") && argc
== 2) {
1753 server
.vm_max_threads
= strtoll(argv
[1], NULL
, 10);
1754 } else if (!strcasecmp(argv
[0],"hash-max-zipmap-entries") && argc
== 2){
1755 server
.hash_max_zipmap_entries
= strtol(argv
[1], NULL
, 10);
1756 } else if (!strcasecmp(argv
[0],"hash-max-zipmap-value") && argc
== 2){
1757 server
.hash_max_zipmap_value
= strtol(argv
[1], NULL
, 10);
1758 } else if (!strcasecmp(argv
[0],"vm-max-threads") && argc
== 2) {
1759 server
.vm_max_threads
= strtoll(argv
[1], NULL
, 10);
1761 err
= "Bad directive or wrong number of arguments"; goto loaderr
;
1763 for (j
= 0; j
< argc
; j
++)
1768 if (fp
!= stdin
) fclose(fp
);
1772 fprintf(stderr
, "\n*** FATAL CONFIG FILE ERROR ***\n");
1773 fprintf(stderr
, "Reading the configuration file, at line %d\n", linenum
);
1774 fprintf(stderr
, ">>> '%s'\n", line
);
1775 fprintf(stderr
, "%s\n", err
);
1779 static void freeClientArgv(redisClient
*c
) {
1782 for (j
= 0; j
< c
->argc
; j
++)
1783 decrRefCount(c
->argv
[j
]);
1784 for (j
= 0; j
< c
->mbargc
; j
++)
1785 decrRefCount(c
->mbargv
[j
]);
1790 static void freeClient(redisClient
*c
) {
1793 /* Note that if the client we are freeing is blocked into a blocking
1794 * call, we have to set querybuf to NULL *before* calling
1795 * unblockClientWaitingData() so that processInputBuffer() does not get
1796 * called. Also it is important to remove the file events after
1797 * this, because this call adds the READABLE event. */
1798 sdsfree(c
->querybuf
);
1800 if (c
->flags
& REDIS_BLOCKED
)
1801 unblockClientWaitingData(c
);
1803 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
1804 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1805 listRelease(c
->reply
);
1808 /* Remove from the list of clients */
1809 ln
= listSearchKey(server
.clients
,c
);
1810 redisAssert(ln
!= NULL
);
1811 listDelNode(server
.clients
,ln
);
1812 /* Remove from the list of clients waiting for swapped keys */
1813 if (c
->flags
& REDIS_IO_WAIT
&& listLength(c
->io_keys
) == 0) {
1814 ln
= listSearchKey(server
.io_ready_clients
,c
);
1816 listDelNode(server
.io_ready_clients
,ln
);
1817 server
.vm_blocked_clients
--;
1820 while (server
.vm_enabled
&& listLength(c
->io_keys
)) {
1821 ln
= listFirst(c
->io_keys
);
1822 dontWaitForSwappedKey(c
,ln
->value
);
1824 listRelease(c
->io_keys
);
1826 if (c
->flags
& REDIS_SLAVE
) {
1827 if (c
->replstate
== REDIS_REPL_SEND_BULK
&& c
->repldbfd
!= -1)
1829 list
*l
= (c
->flags
& REDIS_MONITOR
) ? server
.monitors
: server
.slaves
;
1830 ln
= listSearchKey(l
,c
);
1831 redisAssert(ln
!= NULL
);
1834 if (c
->flags
& REDIS_MASTER
) {
1835 server
.master
= NULL
;
1836 server
.replstate
= REDIS_REPL_CONNECT
;
1840 freeClientMultiState(c
);
1844 #define GLUEREPLY_UP_TO (1024)
1845 static void glueReplyBuffersIfNeeded(redisClient
*c
) {
1847 char buf
[GLUEREPLY_UP_TO
];
1852 listRewind(c
->reply
,&li
);
1853 while((ln
= listNext(&li
))) {
1857 objlen
= sdslen(o
->ptr
);
1858 if (copylen
+ objlen
<= GLUEREPLY_UP_TO
) {
1859 memcpy(buf
+copylen
,o
->ptr
,objlen
);
1861 listDelNode(c
->reply
,ln
);
1863 if (copylen
== 0) return;
1867 /* Now the output buffer is empty, add the new single element */
1868 o
= createObject(REDIS_STRING
,sdsnewlen(buf
,copylen
));
1869 listAddNodeHead(c
->reply
,o
);
1872 static void sendReplyToClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
1873 redisClient
*c
= privdata
;
1874 int nwritten
= 0, totwritten
= 0, objlen
;
1877 REDIS_NOTUSED(mask
);
1879 /* Use writev() if we have enough buffers to send */
1880 if (!server
.glueoutputbuf
&&
1881 listLength(c
->reply
) > REDIS_WRITEV_THRESHOLD
&&
1882 !(c
->flags
& REDIS_MASTER
))
1884 sendReplyToClientWritev(el
, fd
, privdata
, mask
);
1888 while(listLength(c
->reply
)) {
1889 if (server
.glueoutputbuf
&& listLength(c
->reply
) > 1)
1890 glueReplyBuffersIfNeeded(c
);
1892 o
= listNodeValue(listFirst(c
->reply
));
1893 objlen
= sdslen(o
->ptr
);
1896 listDelNode(c
->reply
,listFirst(c
->reply
));
1900 if (c
->flags
& REDIS_MASTER
) {
1901 /* Don't reply to a master */
1902 nwritten
= objlen
- c
->sentlen
;
1904 nwritten
= write(fd
, ((char*)o
->ptr
)+c
->sentlen
, objlen
- c
->sentlen
);
1905 if (nwritten
<= 0) break;
1907 c
->sentlen
+= nwritten
;
1908 totwritten
+= nwritten
;
1909 /* If we fully sent the object on head go to the next one */
1910 if (c
->sentlen
== objlen
) {
1911 listDelNode(c
->reply
,listFirst(c
->reply
));
1914 /* Note that we avoid sending more than REDIS_MAX_WRITE_PER_EVENT
1915 * bytes, in a single threaded server it's a good idea to serve
1916 * other clients as well, even if a very large request comes from
1917 * super fast link that is always able to accept data (in real world
1918 * scenario think about 'KEYS *' against the loopback interface) */
1919 if (totwritten
> REDIS_MAX_WRITE_PER_EVENT
) break;
1921 if (nwritten
== -1) {
1922 if (errno
== EAGAIN
) {
1925 redisLog(REDIS_VERBOSE
,
1926 "Error writing to client: %s", strerror(errno
));
1931 if (totwritten
> 0) c
->lastinteraction
= time(NULL
);
1932 if (listLength(c
->reply
) == 0) {
1934 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
1938 static void sendReplyToClientWritev(aeEventLoop
*el
, int fd
, void *privdata
, int mask
)
1940 redisClient
*c
= privdata
;
1941 int nwritten
= 0, totwritten
= 0, objlen
, willwrite
;
1943 struct iovec iov
[REDIS_WRITEV_IOVEC_COUNT
];
1944 int offset
, ion
= 0;
1946 REDIS_NOTUSED(mask
);
1949 while (listLength(c
->reply
)) {
1950 offset
= c
->sentlen
;
1954 /* fill-in the iov[] array */
1955 for(node
= listFirst(c
->reply
); node
; node
= listNextNode(node
)) {
1956 o
= listNodeValue(node
);
1957 objlen
= sdslen(o
->ptr
);
1959 if (totwritten
+ objlen
- offset
> REDIS_MAX_WRITE_PER_EVENT
)
1962 if(ion
== REDIS_WRITEV_IOVEC_COUNT
)
1963 break; /* no more iovecs */
1965 iov
[ion
].iov_base
= ((char*)o
->ptr
) + offset
;
1966 iov
[ion
].iov_len
= objlen
- offset
;
1967 willwrite
+= objlen
- offset
;
1968 offset
= 0; /* just for the first item */
1975 /* write all collected blocks at once */
1976 if((nwritten
= writev(fd
, iov
, ion
)) < 0) {
1977 if (errno
!= EAGAIN
) {
1978 redisLog(REDIS_VERBOSE
,
1979 "Error writing to client: %s", strerror(errno
));
1986 totwritten
+= nwritten
;
1987 offset
= c
->sentlen
;
1989 /* remove written robjs from c->reply */
1990 while (nwritten
&& listLength(c
->reply
)) {
1991 o
= listNodeValue(listFirst(c
->reply
));
1992 objlen
= sdslen(o
->ptr
);
1994 if(nwritten
>= objlen
- offset
) {
1995 listDelNode(c
->reply
, listFirst(c
->reply
));
1996 nwritten
-= objlen
- offset
;
2000 c
->sentlen
+= nwritten
;
2008 c
->lastinteraction
= time(NULL
);
2010 if (listLength(c
->reply
) == 0) {
2012 aeDeleteFileEvent(server
.el
,c
->fd
,AE_WRITABLE
);
2016 static struct redisCommand
*lookupCommand(char *name
) {
2018 while(cmdTable
[j
].name
!= NULL
) {
2019 if (!strcasecmp(name
,cmdTable
[j
].name
)) return &cmdTable
[j
];
2025 /* resetClient prepares the client to process the next command */
2026 static void resetClient(redisClient
*c
) {
2032 /* Call() is the core of Redis execution of a command */
2033 static void call(redisClient
*c
, struct redisCommand
*cmd
) {
2036 dirty
= server
.dirty
;
2038 if (server
.appendonly
&& server
.dirty
-dirty
)
2039 feedAppendOnlyFile(cmd
,c
->db
->id
,c
->argv
,c
->argc
);
2040 if (server
.dirty
-dirty
&& listLength(server
.slaves
))
2041 replicationFeedSlaves(server
.slaves
,cmd
,c
->db
->id
,c
->argv
,c
->argc
);
2042 if (listLength(server
.monitors
))
2043 replicationFeedSlaves(server
.monitors
,cmd
,c
->db
->id
,c
->argv
,c
->argc
);
2044 server
.stat_numcommands
++;
2047 /* If this function gets called we already read a whole
2048 * command, arguments are in the client argv/argc fields.
2049 * processCommand() executes the command or prepares the
2050 * server for a bulk read from the client.
2052 * If 1 is returned the client is still alive and valid
2053 * and other operations can be performed by the caller. Otherwise
2054 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2055 static int processCommand(redisClient
*c
) {
2056 struct redisCommand
*cmd
;
2058 /* Free some memory if needed (maxmemory setting) */
2059 if (server
.maxmemory
) freeMemoryIfNeeded();
2061 /* Handle the multi bulk command type. This is an alternative protocol
2062 * supported by Redis in order to receive commands that are composed of
2063 * multiple binary-safe "bulk" arguments. The latency of processing is
2064 * a bit higher but this allows things like multi-sets, so if this
2065 * protocol is used only for MSET and similar commands this is a big win. */
2066 if (c
->multibulk
== 0 && c
->argc
== 1 && ((char*)(c
->argv
[0]->ptr
))[0] == '*') {
2067 c
->multibulk
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2068 if (c
->multibulk
<= 0) {
2072 decrRefCount(c
->argv
[c
->argc
-1]);
2076 } else if (c
->multibulk
) {
2077 if (c
->bulklen
== -1) {
2078 if (((char*)c
->argv
[0]->ptr
)[0] != '$') {
2079 addReplySds(c
,sdsnew("-ERR multi bulk protocol error\r\n"));
2083 int bulklen
= atoi(((char*)c
->argv
[0]->ptr
)+1);
2084 decrRefCount(c
->argv
[0]);
2085 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2087 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2092 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2096 c
->mbargv
= zrealloc(c
->mbargv
,(sizeof(robj
*))*(c
->mbargc
+1));
2097 c
->mbargv
[c
->mbargc
] = c
->argv
[0];
2101 if (c
->multibulk
== 0) {
2105 /* Here we need to swap the multi-bulk argc/argv with the
2106 * normal argc/argv of the client structure. */
2108 c
->argv
= c
->mbargv
;
2109 c
->mbargv
= auxargv
;
2112 c
->argc
= c
->mbargc
;
2113 c
->mbargc
= auxargc
;
2115 /* We need to set bulklen to something different than -1
2116 * in order for the code below to process the command without
2117 * trying to read the last argument of a bulk command as
2118 * a special argument. */
2120 /* continue below and process the command */
2127 /* -- end of multi bulk commands processing -- */
2129 /* The QUIT command is handled as a special case. Normal command
2130 * procs are unable to close the client connection safely */
2131 if (!strcasecmp(c
->argv
[0]->ptr
,"quit")) {
2136 /* Now lookup the command and check ASAP about trivial error conditions
2137 * such as wrong arity, bad command name and so forth. */
2138 cmd
= lookupCommand(c
->argv
[0]->ptr
);
2141 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2142 (char*)c
->argv
[0]->ptr
));
2145 } else if ((cmd
->arity
> 0 && cmd
->arity
!= c
->argc
) ||
2146 (c
->argc
< -cmd
->arity
)) {
2148 sdscatprintf(sdsempty(),
2149 "-ERR wrong number of arguments for '%s' command\r\n",
2153 } else if (server
.maxmemory
&& cmd
->flags
& REDIS_CMD_DENYOOM
&& zmalloc_used_memory() > server
.maxmemory
) {
2154 addReplySds(c
,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2157 } else if (cmd
->flags
& REDIS_CMD_BULK
&& c
->bulklen
== -1) {
2158 /* This is a bulk command, we still have to read the last argument. */
2159 int bulklen
= atoi(c
->argv
[c
->argc
-1]->ptr
);
2161 decrRefCount(c
->argv
[c
->argc
-1]);
2162 if (bulklen
< 0 || bulklen
> 1024*1024*1024) {
2164 addReplySds(c
,sdsnew("-ERR invalid bulk write count\r\n"));
2169 c
->bulklen
= bulklen
+2; /* add two bytes for CR+LF */
2170 /* It is possible that the bulk read is already in the
2171 * buffer. Check this condition and handle it accordingly.
2172 * This is just a fast path, an alternative to calling processInputBuffer().
2173 * It's a good idea since the code is small and this condition
2174 * happens most of the time. */
2175 if ((signed)sdslen(c
->querybuf
) >= c
->bulklen
) {
2176 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2178 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2180 /* Otherwise return... we still have to read the last argument
2181 * from the socket. */
2185 /* Let's try to share objects on the command arguments vector */
2186 if (server
.shareobjects
) {
2188 for(j
= 1; j
< c
->argc
; j
++)
2189 c
->argv
[j
] = tryObjectSharing(c
->argv
[j
]);
2191 /* Let's try to encode the bulk object to save space. */
2192 if (cmd
->flags
& REDIS_CMD_BULK
)
2193 tryObjectEncoding(c
->argv
[c
->argc
-1]);
2195 /* Check if the user is authenticated */
2196 if (server
.requirepass
&& !c
->authenticated
&& cmd
->proc
!= authCommand
) {
2197 addReplySds(c
,sdsnew("-ERR operation not permitted\r\n"));
2202 /* Exec the command */
2203 if (c
->flags
& REDIS_MULTI
&& cmd
->proc
!= execCommand
&& cmd
->proc
!= discardCommand
) {
2204 queueMultiCommand(c
,cmd
);
2205 addReply(c
,shared
.queued
);
2207 if (server
.vm_enabled
&& server
.vm_max_threads
> 0 &&
2208 blockClientOnSwappedKeys(cmd
,c
)) return 1;
2212 /* Prepare the client for the next command */
2217 static void replicationFeedSlaves(list
*slaves
, struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
2222 /* (args*2)+1 is enough room for args, spaces, newlines */
2223 robj
*static_outv
[REDIS_STATIC_ARGS
*2+1];
2225 if (argc
<= REDIS_STATIC_ARGS
) {
2228 outv
= zmalloc(sizeof(robj
*)*(argc
*2+1));
2231 for (j
= 0; j
< argc
; j
++) {
2232 if (j
!= 0) outv
[outc
++] = shared
.space
;
2233 if ((cmd
->flags
& REDIS_CMD_BULK
) && j
== argc
-1) {
2236 lenobj
= createObject(REDIS_STRING
,
2237 sdscatprintf(sdsempty(),"%lu\r\n",
2238 (unsigned long) stringObjectLen(argv
[j
])));
2239 lenobj
->refcount
= 0;
2240 outv
[outc
++] = lenobj
;
2242 outv
[outc
++] = argv
[j
];
2244 outv
[outc
++] = shared
.crlf
;
2246 /* Increment all the refcounts at start and decrement at end in order to
2247 * be sure to free objects if there is no slave in a replication state
2248 * able to be fed with commands */
2249 for (j
= 0; j
< outc
; j
++) incrRefCount(outv
[j
]);
2250 listRewind(slaves
,&li
);
2251 while((ln
= listNext(&li
))) {
2252 redisClient
*slave
= ln
->value
;
2254 /* Don't feed slaves that are still waiting for BGSAVE to start */
2255 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) continue;
2257 /* Feed all the other slaves, MONITORs and so on */
2258 if (slave
->slaveseldb
!= dictid
) {
2262 case 0: selectcmd
= shared
.select0
; break;
2263 case 1: selectcmd
= shared
.select1
; break;
2264 case 2: selectcmd
= shared
.select2
; break;
2265 case 3: selectcmd
= shared
.select3
; break;
2266 case 4: selectcmd
= shared
.select4
; break;
2267 case 5: selectcmd
= shared
.select5
; break;
2268 case 6: selectcmd
= shared
.select6
; break;
2269 case 7: selectcmd
= shared
.select7
; break;
2270 case 8: selectcmd
= shared
.select8
; break;
2271 case 9: selectcmd
= shared
.select9
; break;
2273 selectcmd
= createObject(REDIS_STRING
,
2274 sdscatprintf(sdsempty(),"select %d\r\n",dictid
));
2275 selectcmd
->refcount
= 0;
2278 addReply(slave
,selectcmd
);
2279 slave
->slaveseldb
= dictid
;
2281 for (j
= 0; j
< outc
; j
++) addReply(slave
,outv
[j
]);
2283 for (j
= 0; j
< outc
; j
++) decrRefCount(outv
[j
]);
2284 if (outv
!= static_outv
) zfree(outv
);
2287 static void processInputBuffer(redisClient
*c
) {
2289 /* Before processing the input buffer, make sure the client is not
2290 * waiting for a blocking operation such as BLPOP. Note that on the first
2291 * iteration the client is never blocked, otherwise processInputBuffer
2292 * would not be called at all, but after the execution of the first commands
2293 * in the input buffer the client may be blocked, and the "goto again"
2294 * will try to reiterate. The following line will make it return asap. */
2295 if (c
->flags
& REDIS_BLOCKED
|| c
->flags
& REDIS_IO_WAIT
) return;
2296 if (c
->bulklen
== -1) {
2297 /* Read the first line of the query */
2298 char *p
= strchr(c
->querybuf
,'\n');
2305 query
= c
->querybuf
;
2306 c
->querybuf
= sdsempty();
2307 querylen
= 1+(p
-(query
));
2308 if (sdslen(query
) > querylen
) {
2309 /* leave data after the first line of the query in the buffer */
2310 c
->querybuf
= sdscatlen(c
->querybuf
,query
+querylen
,sdslen(query
)-querylen
);
2312 *p
= '\0'; /* remove "\n" */
2313 if (*(p
-1) == '\r') *(p
-1) = '\0'; /* and "\r" if any */
2314 sdsupdatelen(query
);
2316 /* Now we can split the query in arguments */
2317 argv
= sdssplitlen(query
,sdslen(query
)," ",1,&argc
);
2320 if (c
->argv
) zfree(c
->argv
);
2321 c
->argv
= zmalloc(sizeof(robj
*)*argc
);
2323 for (j
= 0; j
< argc
; j
++) {
2324 if (sdslen(argv
[j
])) {
2325 c
->argv
[c
->argc
] = createObject(REDIS_STRING
,argv
[j
]);
2333 /* Execute the command. If the client is still valid
2334 * after processCommand() returns and there is something
2335 * on the query buffer try to process the next command. */
2336 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2338 /* Nothing to process, argc == 0. Just process the query
2339 * buffer if it's not empty or return to the caller */
2340 if (sdslen(c
->querybuf
)) goto again
;
2343 } else if (sdslen(c
->querybuf
) >= REDIS_REQUEST_MAX_SIZE
) {
2344 redisLog(REDIS_VERBOSE
, "Client protocol error");
2349 /* Bulk read handling. Note that if we are at this point
2350 the client already sent a command terminated with a newline,
2351 we are reading the bulk data that is actually the last
2352 argument of the command. */
2353 int qbl
= sdslen(c
->querybuf
);
2355 if (c
->bulklen
<= qbl
) {
2356 /* Copy everything but the final CRLF as final argument */
2357 c
->argv
[c
->argc
] = createStringObject(c
->querybuf
,c
->bulklen
-2);
2359 c
->querybuf
= sdsrange(c
->querybuf
,c
->bulklen
,-1);
2360 /* Process the command. If the client is still valid after
2361 * the processing and there is more data in the buffer
2362 * try to parse it. */
2363 if (processCommand(c
) && sdslen(c
->querybuf
)) goto again
;
2369 static void readQueryFromClient(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2370 redisClient
*c
= (redisClient
*) privdata
;
2371 char buf
[REDIS_IOBUF_LEN
];
2374 REDIS_NOTUSED(mask
);
2376 nread
= read(fd
, buf
, REDIS_IOBUF_LEN
);
2378 if (errno
== EAGAIN
) {
2381 redisLog(REDIS_VERBOSE
, "Reading from client: %s",strerror(errno
));
2385 } else if (nread
== 0) {
2386 redisLog(REDIS_VERBOSE
, "Client closed connection");
2391 c
->querybuf
= sdscatlen(c
->querybuf
, buf
, nread
);
2392 c
->lastinteraction
= time(NULL
);
2396 if (!(c
->flags
& REDIS_BLOCKED
))
2397 processInputBuffer(c
);
2400 static int selectDb(redisClient
*c
, int id
) {
2401 if (id
< 0 || id
>= server
.dbnum
)
2403 c
->db
= &server
.db
[id
];
2407 static void *dupClientReplyValue(void *o
) {
2408 incrRefCount((robj
*)o
);
2412 static redisClient
*createClient(int fd
) {
2413 redisClient
*c
= zmalloc(sizeof(*c
));
2415 anetNonBlock(NULL
,fd
);
2416 anetTcpNoDelay(NULL
,fd
);
2417 if (!c
) return NULL
;
2420 c
->querybuf
= sdsempty();
2429 c
->lastinteraction
= time(NULL
);
2430 c
->authenticated
= 0;
2431 c
->replstate
= REDIS_REPL_NONE
;
2432 c
->reply
= listCreate();
2433 listSetFreeMethod(c
->reply
,decrRefCount
);
2434 listSetDupMethod(c
->reply
,dupClientReplyValue
);
2435 c
->blockingkeys
= NULL
;
2436 c
->blockingkeysnum
= 0;
2437 c
->io_keys
= listCreate();
2438 listSetFreeMethod(c
->io_keys
,decrRefCount
);
2439 if (aeCreateFileEvent(server
.el
, c
->fd
, AE_READABLE
,
2440 readQueryFromClient
, c
) == AE_ERR
) {
2444 listAddNodeTail(server
.clients
,c
);
2445 initClientMultiState(c
);
2449 static void addReply(redisClient
*c
, robj
*obj
) {
2450 if (listLength(c
->reply
) == 0 &&
2451 (c
->replstate
== REDIS_REPL_NONE
||
2452 c
->replstate
== REDIS_REPL_ONLINE
) &&
2453 aeCreateFileEvent(server
.el
, c
->fd
, AE_WRITABLE
,
2454 sendReplyToClient
, c
) == AE_ERR
) return;
2456 if (server
.vm_enabled
&& obj
->storage
!= REDIS_VM_MEMORY
) {
2457 obj
= dupStringObject(obj
);
2458 obj
->refcount
= 0; /* getDecodedObject() will increment the refcount */
2460 listAddNodeTail(c
->reply
,getDecodedObject(obj
));
2463 static void addReplySds(redisClient
*c
, sds s
) {
2464 robj
*o
= createObject(REDIS_STRING
,s
);
2469 static void addReplyDouble(redisClient
*c
, double d
) {
2472 snprintf(buf
,sizeof(buf
),"%.17g",d
);
2473 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2474 (unsigned long) strlen(buf
),buf
));
2477 static void addReplyLong(redisClient
*c
, long l
) {
2481 len
= snprintf(buf
,sizeof(buf
),":%ld\r\n",l
);
2482 addReplySds(c
,sdsnewlen(buf
,len
));
2485 static void addReplyUlong(redisClient
*c
, unsigned long ul
) {
2489 len
= snprintf(buf
,sizeof(buf
),":%lu\r\n",ul
);
2490 addReplySds(c
,sdsnewlen(buf
,len
));
2493 static void addReplyBulkLen(redisClient
*c
, robj
*obj
) {
2496 if (obj
->encoding
== REDIS_ENCODING_RAW
) {
2497 len
= sdslen(obj
->ptr
);
2499 long n
= (long)obj
->ptr
;
2501 /* Compute how many bytes will take this integer as a radix 10 string */
2507 while((n
= n
/10) != 0) {
2511 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len
));
2514 static void acceptHandler(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
2519 REDIS_NOTUSED(mask
);
2520 REDIS_NOTUSED(privdata
);
2522 cfd
= anetAccept(server
.neterr
, fd
, cip
, &cport
);
2523 if (cfd
== AE_ERR
) {
2524 redisLog(REDIS_VERBOSE
,"Accepting client connection: %s", server
.neterr
);
2527 redisLog(REDIS_VERBOSE
,"Accepted %s:%d", cip
, cport
);
2528 if ((c
= createClient(cfd
)) == NULL
) {
2529 redisLog(REDIS_WARNING
,"Error allocating resoures for the client");
2530 close(cfd
); /* May be already closed, just ingore errors */
2533 /* If maxclient directive is set and this is one client more... close the
2534 * connection. Note that we create the client instead of checking for this
2535 * condition beforehand, since now the socket is already set in nonblocking
2536 * mode and we can send an error for free using the Kernel I/O */
2537 if (server
.maxclients
&& listLength(server
.clients
) > server
.maxclients
) {
2538 char *err
= "-ERR max number of clients reached\r\n";
2540 /* That's a best effort error message, don't check write errors */
2541 if (write(c
->fd
,err
,strlen(err
)) == -1) {
2542 /* Nothing to do, Just to avoid the warning... */
2547 server
.stat_numconnections
++;
2550 /* ======================= Redis objects implementation ===================== */
2552 static robj
*createObject(int type
, void *ptr
) {
2555 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2556 if (listLength(server
.objfreelist
)) {
2557 listNode
*head
= listFirst(server
.objfreelist
);
2558 o
= listNodeValue(head
);
2559 listDelNode(server
.objfreelist
,head
);
2560 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2562 if (server
.vm_enabled
) {
2563 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2564 o
= zmalloc(sizeof(*o
));
2566 o
= zmalloc(sizeof(*o
)-sizeof(struct redisObjectVM
));
2570 o
->encoding
= REDIS_ENCODING_RAW
;
2573 if (server
.vm_enabled
) {
2574 /* Note that this code may run in the context of an I/O thread
2575 * and accessing server.unixtime in theory is an error
2576 * (no locks). But in practice this is safe, and even if we read
2577 * garbage Redis will not fail, as it's just a statistical info */
2578 o
->vm
.atime
= server
.unixtime
;
2579 o
->storage
= REDIS_VM_MEMORY
;
2584 static robj
*createStringObject(char *ptr
, size_t len
) {
2585 return createObject(REDIS_STRING
,sdsnewlen(ptr
,len
));
2588 static robj
*dupStringObject(robj
*o
) {
2589 assert(o
->encoding
== REDIS_ENCODING_RAW
);
2590 return createStringObject(o
->ptr
,sdslen(o
->ptr
));
2593 static robj
*createListObject(void) {
2594 list
*l
= listCreate();
2596 listSetFreeMethod(l
,decrRefCount
);
2597 return createObject(REDIS_LIST
,l
);
2600 static robj
*createSetObject(void) {
2601 dict
*d
= dictCreate(&setDictType
,NULL
);
2602 return createObject(REDIS_SET
,d
);
2605 static robj
*createHashObject(void) {
2606 /* All the Hashes start as zipmaps. Will be automatically converted
2607 * into hash tables if there are enough elements or big elements
2609 unsigned char *zm
= zipmapNew();
2610 robj
*o
= createObject(REDIS_HASH
,zm
);
2611 o
->encoding
= REDIS_ENCODING_ZIPMAP
;
2615 static robj
*createZsetObject(void) {
2616 zset
*zs
= zmalloc(sizeof(*zs
));
2618 zs
->dict
= dictCreate(&zsetDictType
,NULL
);
2619 zs
->zsl
= zslCreate();
2620 return createObject(REDIS_ZSET
,zs
);
2623 static void freeStringObject(robj
*o
) {
2624 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2629 static void freeListObject(robj
*o
) {
2630 listRelease((list
*) o
->ptr
);
2633 static void freeSetObject(robj
*o
) {
2634 dictRelease((dict
*) o
->ptr
);
2637 static void freeZsetObject(robj
*o
) {
2640 dictRelease(zs
->dict
);
2645 static void freeHashObject(robj
*o
) {
2646 switch (o
->encoding
) {
2647 case REDIS_ENCODING_HT
:
2648 dictRelease((dict
*) o
->ptr
);
2650 case REDIS_ENCODING_ZIPMAP
:
2659 static void incrRefCount(robj
*o
) {
2660 redisAssert(!server
.vm_enabled
|| o
->storage
== REDIS_VM_MEMORY
);
2664 static void decrRefCount(void *obj
) {
2667 /* Object is a key of a swapped out value, or in the process of being
2669 if (server
.vm_enabled
&&
2670 (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
))
2672 if (o
->storage
== REDIS_VM_SWAPPED
|| o
->storage
== REDIS_VM_LOADING
) {
2673 redisAssert(o
->refcount
== 1);
2675 if (o
->storage
== REDIS_VM_LOADING
) vmCancelThreadedIOJob(obj
);
2676 redisAssert(o
->type
== REDIS_STRING
);
2677 freeStringObject(o
);
2678 vmMarkPagesFree(o
->vm
.page
,o
->vm
.usedpages
);
2679 pthread_mutex_lock(&server
.obj_freelist_mutex
);
2680 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2681 !listAddNodeHead(server
.objfreelist
,o
))
2683 pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2684 server
.vm_stats_swapped_objects
--;
2687 /* Object is in memory, or in the process of being swapped out. */
2688 if (--(o
->refcount
) == 0) {
2689 if (server
.vm_enabled
&& o
->storage
== REDIS_VM_SWAPPING
)
2690 vmCancelThreadedIOJob(obj
);
2692 case REDIS_STRING
: freeStringObject(o
); break;
2693 case REDIS_LIST
: freeListObject(o
); break;
2694 case REDIS_SET
: freeSetObject(o
); break;
2695 case REDIS_ZSET
: freeZsetObject(o
); break;
2696 case REDIS_HASH
: freeHashObject(o
); break;
2697 default: redisAssert(0 != 0); break;
2699 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
2700 if (listLength(server
.objfreelist
) > REDIS_OBJFREELIST_MAX
||
2701 !listAddNodeHead(server
.objfreelist
,o
))
2703 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
2707 static robj
*lookupKey(redisDb
*db
, robj
*key
) {
2708 dictEntry
*de
= dictFind(db
->dict
,key
);
2710 robj
*key
= dictGetEntryKey(de
);
2711 robj
*val
= dictGetEntryVal(de
);
2713 if (server
.vm_enabled
) {
2714 if (key
->storage
== REDIS_VM_MEMORY
||
2715 key
->storage
== REDIS_VM_SWAPPING
)
2717 /* If we were swapping the object out, stop it, this key
2719 if (key
->storage
== REDIS_VM_SWAPPING
)
2720 vmCancelThreadedIOJob(key
);
2721 /* Update the access time of the key for the aging algorithm. */
2722 key
->vm
.atime
= server
.unixtime
;
2724 int notify
= (key
->storage
== REDIS_VM_LOADING
);
2726 /* Our value was swapped on disk. Bring it at home. */
2727 redisAssert(val
== NULL
);
2728 val
= vmLoadObject(key
);
2729 dictGetEntryVal(de
) = val
;
2731 /* Clients blocked by the VM subsystem may be waiting for
2733 if (notify
) handleClientsBlockedOnSwappedKey(db
,key
);
2742 static robj
*lookupKeyRead(redisDb
*db
, robj
*key
) {
2743 expireIfNeeded(db
,key
);
2744 return lookupKey(db
,key
);
2747 static robj
*lookupKeyWrite(redisDb
*db
, robj
*key
) {
2748 deleteIfVolatile(db
,key
);
2749 return lookupKey(db
,key
);
2752 static robj
*lookupKeyReadOrReply(redisClient
*c
, robj
*key
, robj
*reply
) {
2753 robj
*o
= lookupKeyRead(c
->db
, key
);
2754 if (!o
) addReply(c
,reply
);
2758 static robj
*lookupKeyWriteOrReply(redisClient
*c
, robj
*key
, robj
*reply
) {
2759 robj
*o
= lookupKeyWrite(c
->db
, key
);
2760 if (!o
) addReply(c
,reply
);
2764 static int checkType(redisClient
*c
, robj
*o
, int type
) {
2765 if (o
->type
!= type
) {
2766 addReply(c
,shared
.wrongtypeerr
);
2772 static int deleteKey(redisDb
*db
, robj
*key
) {
2775 /* We need to protect key from destruction: after the first dictDelete()
2776 * it may happen that 'key' is no longer valid if we don't increment
2777 * its count. This may happen when we get the object reference directly
2778 * from the hash table with dictRandomKey() or dict iterators */
2780 if (dictSize(db
->expires
)) dictDelete(db
->expires
,key
);
2781 retval
= dictDelete(db
->dict
,key
);
2784 return retval
== DICT_OK
;
2787 /* Try to share an object against the shared objects pool */
2788 static robj
*tryObjectSharing(robj
*o
) {
2789 struct dictEntry
*de
;
2792 if (o
== NULL
|| server
.shareobjects
== 0) return o
;
2794 redisAssert(o
->type
== REDIS_STRING
);
2795 de
= dictFind(server
.sharingpool
,o
);
2797 robj
*shared
= dictGetEntryKey(de
);
2799 c
= ((unsigned long) dictGetEntryVal(de
))+1;
2800 dictGetEntryVal(de
) = (void*) c
;
2801 incrRefCount(shared
);
2805 /* Here we are using a stream algorithm: Every time an object is
2806 * shared we increment its count, every time there is a miss we
2807 * decrement the counter of a random object. If this object reaches
2808 * zero we remove the object and put the current object instead. */
2809 if (dictSize(server
.sharingpool
) >=
2810 server
.sharingpoolsize
) {
2811 de
= dictGetRandomKey(server
.sharingpool
);
2812 redisAssert(de
!= NULL
);
2813 c
= ((unsigned long) dictGetEntryVal(de
))-1;
2814 dictGetEntryVal(de
) = (void*) c
;
2816 dictDelete(server
.sharingpool
,de
->key
);
2819 c
= 0; /* If the pool is empty we want to add this object */
2824 retval
= dictAdd(server
.sharingpool
,o
,(void*)1);
2825 redisAssert(retval
== DICT_OK
);
2832 /* Check if the nul-terminated string 's' can be represented by a long
2833 * (that is, is a number that fits into long without any other space or
2834 * character before or after the digits).
2836 * If so, the function returns REDIS_OK and *longval is set to the value
2837 * of the number. Otherwise REDIS_ERR is returned */
2838 static int isStringRepresentableAsLong(sds s
, long *longval
) {
2839 char buf
[32], *endptr
;
2843 value
= strtol(s
, &endptr
, 10);
2844 if (endptr
[0] != '\0') return REDIS_ERR
;
2845 slen
= snprintf(buf
,32,"%ld",value
);
2847 /* If the number converted back into a string is not identical
2848 * then it's not possible to encode the string as integer */
2849 if (sdslen(s
) != (unsigned)slen
|| memcmp(buf
,s
,slen
)) return REDIS_ERR
;
2850 if (longval
) *longval
= value
;
2854 /* Try to encode a string object in order to save space */
2855 static int tryObjectEncoding(robj
*o
) {
2859 if (o
->encoding
!= REDIS_ENCODING_RAW
)
2860 return REDIS_ERR
; /* Already encoded */
2862 /* It's not safe to encode shared objects: shared objects can be shared
2863 * everywhere in the "object space" of Redis. Encoded objects can only
2864 * appear as "values" (and not, for instance, as keys) */
2865 if (o
->refcount
> 1) return REDIS_ERR
;
2867 /* Currently we try to encode only strings */
2868 redisAssert(o
->type
== REDIS_STRING
);
2870 /* Check if we can represent this string as a long integer */
2871 if (isStringRepresentableAsLong(s
,&value
) == REDIS_ERR
) return REDIS_ERR
;
2873 /* Ok, this object can be encoded */
2874 o
->encoding
= REDIS_ENCODING_INT
;
2876 o
->ptr
= (void*) value
;
2880 /* Get a decoded version of an encoded object (returned as a new object).
2881 * If the object is already raw-encoded just increment the ref count. */
2882 static robj
*getDecodedObject(robj
*o
) {
2885 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2889 if (o
->type
== REDIS_STRING
&& o
->encoding
== REDIS_ENCODING_INT
) {
2892 snprintf(buf
,32,"%ld",(long)o
->ptr
);
2893 dec
= createStringObject(buf
,strlen(buf
));
2896 redisAssert(1 != 1);
2900 /* Compare two string objects via strcmp() or alike.
2901 * Note that the objects may be integer-encoded. In such a case we
2902 * use snprintf() to get a string representation of the numbers on the stack
2903 * and compare the strings, it's much faster than calling getDecodedObject().
2905 * Important note: if objects are not integer encoded, but binary-safe strings,
2906 * sdscmp() from sds.c will apply memcmp() so this function can be considered
2908 static int compareStringObjects(robj
*a
, robj
*b
) {
2909 redisAssert(a
->type
== REDIS_STRING
&& b
->type
== REDIS_STRING
);
2910 char bufa
[128], bufb
[128], *astr
, *bstr
;
2913 if (a
== b
) return 0;
2914 if (a
->encoding
!= REDIS_ENCODING_RAW
) {
2915 snprintf(bufa
,sizeof(bufa
),"%ld",(long) a
->ptr
);
2921 if (b
->encoding
!= REDIS_ENCODING_RAW
) {
2922 snprintf(bufb
,sizeof(bufb
),"%ld",(long) b
->ptr
);
2928 return bothsds
? sdscmp(astr
,bstr
) : strcmp(astr
,bstr
);
2931 static size_t stringObjectLen(robj
*o
) {
2932 redisAssert(o
->type
== REDIS_STRING
);
2933 if (o
->encoding
== REDIS_ENCODING_RAW
) {
2934 return sdslen(o
->ptr
);
2938 return snprintf(buf
,32,"%ld",(long)o
->ptr
);
2942 /*============================ RDB saving/loading =========================== */
2944 static int rdbSaveType(FILE *fp
, unsigned char type
) {
2945 if (fwrite(&type
,1,1,fp
) == 0) return -1;
2949 static int rdbSaveTime(FILE *fp
, time_t t
) {
2950 int32_t t32
= (int32_t) t
;
2951 if (fwrite(&t32
,4,1,fp
) == 0) return -1;
2955 /* check rdbLoadLen() comments for more info */
2956 static int rdbSaveLen(FILE *fp
, uint32_t len
) {
2957 unsigned char buf
[2];
2960 /* Save a 6 bit len */
2961 buf
[0] = (len
&0xFF)|(REDIS_RDB_6BITLEN
<<6);
2962 if (fwrite(buf
,1,1,fp
) == 0) return -1;
2963 } else if (len
< (1<<14)) {
2964 /* Save a 14 bit len */
2965 buf
[0] = ((len
>>8)&0xFF)|(REDIS_RDB_14BITLEN
<<6);
2967 if (fwrite(buf
,2,1,fp
) == 0) return -1;
2969 /* Save a 32 bit len */
2970 buf
[0] = (REDIS_RDB_32BITLEN
<<6);
2971 if (fwrite(buf
,1,1,fp
) == 0) return -1;
2973 if (fwrite(&len
,4,1,fp
) == 0) return -1;
2978 /* String objects in the form "2391" "-100" without any space and with a
2979 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2980 * encoded as integers to save space */
2981 static int rdbTryIntegerEncoding(char *s
, size_t len
, unsigned char *enc
) {
2983 char *endptr
, buf
[32];
2985 /* Check if it's possible to encode this value as a number */
2986 value
= strtoll(s
, &endptr
, 10);
2987 if (endptr
[0] != '\0') return 0;
2988 snprintf(buf
,32,"%lld",value
);
2990 /* If the number converted back into a string is not identical
2991 * then it's not possible to encode the string as integer */
2992 if (strlen(buf
) != len
|| memcmp(buf
,s
,len
)) return 0;
2994 /* Finally check if it fits in our ranges */
2995 if (value
>= -(1<<7) && value
<= (1<<7)-1) {
2996 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT8
;
2997 enc
[1] = value
&0xFF;
2999 } else if (value
>= -(1<<15) && value
<= (1<<15)-1) {
3000 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT16
;
3001 enc
[1] = value
&0xFF;
3002 enc
[2] = (value
>>8)&0xFF;
3004 } else if (value
>= -((long long)1<<31) && value
<= ((long long)1<<31)-1) {
3005 enc
[0] = (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_INT32
;
3006 enc
[1] = value
&0xFF;
3007 enc
[2] = (value
>>8)&0xFF;
3008 enc
[3] = (value
>>16)&0xFF;
3009 enc
[4] = (value
>>24)&0xFF;
/* Try to write the 'len' bytes at 's' to 'fp' in LZF-compressed form.
 * Inputs of four bytes or fewer, and a failed zmalloc(), return 0 before
 * anything is written. When the data compresses, the marker byte
 * (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF is written, followed by the
 * compressed length, the original length and the compressed bytes; a
 * failure in any of those four writes jumps to the writeerr label.
 * NOTE(review): the lines that assign 'outlen', handle comprlen == 0, free
 * 'out' and produce the final return value are not part of this listing --
 * confirm them against the complete file. */
3016 static int rdbSaveLzfStringObject(FILE *fp
, unsigned char *s
, size_t len
) {
3017 size_t comprlen
, outlen
;
3021 /* We require at least four bytes compression for this to be worth it */
3022 if (len
<= 4) return 0;
3024 if ((out
= zmalloc(outlen
+1)) == NULL
) return 0;
3025 comprlen
= lzf_compress(s
, len
, out
, outlen
);
3026 if (comprlen
== 0) {
3030 /* Data compressed! Let's save it on disk */
3031 byte
= (REDIS_RDB_ENCVAL
<<6)|REDIS_RDB_ENC_LZF
;
3032 if (fwrite(&byte
,1,1,fp
) == 0) goto writeerr
;
3033 if (rdbSaveLen(fp
,comprlen
) == -1) goto writeerr
;
3034 if (rdbSaveLen(fp
,len
) == -1) goto writeerr
;
3035 if (fwrite(out
,comprlen
,1,fp
) == 0) goto writeerr
;
3044 /* Save a string object as [len][data] on disk. If the object is a string
3045 * representation of an integer value we try to save it in a special form.
 * The attempts visible below run in this order: integer encoding through
 * rdbTryIntegerEncoding(), LZF compression (only when server.rdbcompression
 * is set and len > 20), and finally the verbatim form. A failed write
 * returns -1. NOTE(review): some lines of the body are missing from this
 * listing, including any guard around the integer attempt. */
3046 static int rdbSaveRawString(FILE *fp
, unsigned char *s
, size_t len
) {
3049 /* Try integer encoding */
3051 unsigned char buf
[5];
3052 if ((enclen
= rdbTryIntegerEncoding((char*)s
,len
,buf
)) > 0) {
3053 if (fwrite(buf
,enclen
,1,fp
) == 0) return -1;
3058 /* Try LZF compression - under 20 bytes it's unable to compress even
3059 * aaaaaaaaaaaaaaaaaa so skip it */
3060 if (server
.rdbcompression
&& len
> 20) {
3063 retval
= rdbSaveLzfStringObject(fp
,s
,len
);
3064 if (retval
== -1) return -1;
3065 if (retval
> 0) return 0;
3066 /* retval == 0 means data can't be compressed, save the old way */
3069 /* Store verbatim */
3070 if (rdbSaveLen(fp
,len
) == -1) return -1;
3071 if (len
&& fwrite(s
,len
,1,fp
) == 0) return -1;
3075 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3076 static int rdbSaveStringObject(FILE *fp
, robj
*obj
) {
3079 /* Avoid incr/decr ref count business when possible.
3080 * This plays well with copy-on-write given that we are probably
3081 * in a child process (BGSAVE). Also this makes sure key objects
3082 * of swapped objects are not incRefCount-ed (an assert does not allow
3083 * this in order to avoid bugs) */
3084 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
3085 obj
= getDecodedObject(obj
);
3086 retval
= rdbSaveRawString(fp
,obj
->ptr
,sdslen(obj
->ptr
));
3089 retval
= rdbSaveRawString(fp
,obj
->ptr
,sdslen(obj
->ptr
));
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: positive infinity
 * 255: negative infinity
 *
 * Returns 0 on success, -1 when fwrite() stored nothing. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    size_t total;

    if (isnan(val)) {
        buf[0] = 253;
        total = 1;
    } else if (isfinite(val)) {
        char *digits = (char*)buf+1;

        /* The text goes right after the length byte it is prefixed with */
        snprintf(digits,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(digits);
        total = (size_t)buf[0]+1;
    } else {
        buf[0] = (val < 0) ? 255 : 254;
        total = 1;
    }
    return (fwrite(buf,total,1,fp) == 0) ? -1 : 0;
}
3121 /* Save a Redis object. */
3122 static int rdbSaveObject(FILE *fp
, robj
*o
) {
3123 if (o
->type
== REDIS_STRING
) {
3124 /* Save a string value */
3125 if (rdbSaveStringObject(fp
,o
) == -1) return -1;
3126 } else if (o
->type
== REDIS_LIST
) {
3127 /* Save a list value */
3128 list
*list
= o
->ptr
;
3132 if (rdbSaveLen(fp
,listLength(list
)) == -1) return -1;
3133 listRewind(list
,&li
);
3134 while((ln
= listNext(&li
))) {
3135 robj
*eleobj
= listNodeValue(ln
);
3137 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3139 } else if (o
->type
== REDIS_SET
) {
3140 /* Save a set value */
3142 dictIterator
*di
= dictGetIterator(set
);
3145 if (rdbSaveLen(fp
,dictSize(set
)) == -1) return -1;
3146 while((de
= dictNext(di
)) != NULL
) {
3147 robj
*eleobj
= dictGetEntryKey(de
);
3149 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3151 dictReleaseIterator(di
);
3152 } else if (o
->type
== REDIS_ZSET
) {
3153 /* Save a set value */
3155 dictIterator
*di
= dictGetIterator(zs
->dict
);
3158 if (rdbSaveLen(fp
,dictSize(zs
->dict
)) == -1) return -1;
3159 while((de
= dictNext(di
)) != NULL
) {
3160 robj
*eleobj
= dictGetEntryKey(de
);
3161 double *score
= dictGetEntryVal(de
);
3163 if (rdbSaveStringObject(fp
,eleobj
) == -1) return -1;
3164 if (rdbSaveDoubleValue(fp
,*score
) == -1) return -1;
3166 dictReleaseIterator(di
);
3167 } else if (o
->type
== REDIS_HASH
) {
3168 /* Save a hash value */
3169 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
3170 unsigned char *p
= zipmapRewind(o
->ptr
);
3171 unsigned int count
= zipmapLen(o
->ptr
);
3172 unsigned char *key
, *val
;
3173 unsigned int klen
, vlen
;
3175 if (rdbSaveLen(fp
,count
) == -1) return -1;
3176 while((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) != NULL
) {
3177 if (rdbSaveRawString(fp
,key
,klen
) == -1) return -1;
3178 if (rdbSaveRawString(fp
,val
,vlen
) == -1) return -1;
3181 dictIterator
*di
= dictGetIterator(o
->ptr
);
3184 if (rdbSaveLen(fp
,dictSize((dict
*)o
->ptr
)) == -1) return -1;
3185 while((de
= dictNext(di
)) != NULL
) {
3186 robj
*key
= dictGetEntryKey(de
);
3187 robj
*val
= dictGetEntryVal(de
);
3189 if (rdbSaveStringObject(fp
,key
) == -1) return -1;
3190 if (rdbSaveStringObject(fp
,val
) == -1) return -1;
3192 dictReleaseIterator(di
);
3195 redisAssert(0 != 0);
3200 /* Return the length the object will have on disk if saved with
3201 * the rdbSaveObject() function. Currently we use a trick to get
3202 * this length with very little changes to the code. In the future
3203 * we could switch to a faster solution. */
3204 static off_t
rdbSavedObjectLen(robj
*o
, FILE *fp
) {
3205 if (fp
== NULL
) fp
= server
.devnull
;
3207 assert(rdbSaveObject(fp
,o
) != 1);
3211 /* Return the number of pages required to save this object in the swap file */
3212 static off_t
rdbSavedObjectPages(robj
*o
, FILE *fp
) {
3213 off_t bytes
= rdbSavedObjectLen(o
,fp
);
3215 return (bytes
+(server
.vm_page_size
-1))/server
.vm_page_size
;
/* The dump is first written to a temp-<pid>.rdb file and renamed to
 * 'filename' at the end. It starts with the "REDIS0001" signature; every
 * non-empty database contributes a SELECT DB opcode followed by its
 * entries, and keys whose expire time is already in the past are skipped.
 * Values that are not in memory (VM enabled) are obtained through
 * vmPreviewObject() before being written. The dump ends with REDIS_EOF.
 * NOTE(review): this listing omits some lines of the function
 * (declarations, branches, cleanup, returns); only the statements shown
 * are described. */
3218 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3219 static int rdbSave(char *filename
) {
3220 dictIterator
*di
= NULL
;
3225 time_t now
= time(NULL
);
3227 /* Wait for I/O threads to terminate, just in case this is a
3228 * foreground-saving, to avoid seeking the swap file descriptor at the
3230 if (server
.vm_enabled
)
3231 waitEmptyIOJobsQueue();
3233 snprintf(tmpfile
,256,"temp-%d.rdb", (int) getpid());
3234 fp
= fopen(tmpfile
,"w");
3236 redisLog(REDIS_WARNING
, "Failed saving the DB: %s", strerror(errno
));
3239 if (fwrite("REDIS0001",9,1,fp
) == 0) goto werr
;
3240 for (j
= 0; j
< server
.dbnum
; j
++) {
3241 redisDb
*db
= server
.db
+j
;
3243 if (dictSize(d
) == 0) continue;
3244 di
= dictGetIterator(d
);
3250 /* Write the SELECT DB opcode */
3251 if (rdbSaveType(fp
,REDIS_SELECTDB
) == -1) goto werr
;
3252 if (rdbSaveLen(fp
,j
) == -1) goto werr
;
3254 /* Iterate this DB writing every entry */
3255 while((de
= dictNext(di
)) != NULL
) {
3256 robj
*key
= dictGetEntryKey(de
);
3257 robj
*o
= dictGetEntryVal(de
);
3258 time_t expiretime
= getExpire(db
,key
);
3260 /* Save the expire time */
3261 if (expiretime
!= -1) {
3262 /* If this key is already expired skip it */
3263 if (expiretime
< now
) continue;
3264 if (rdbSaveType(fp
,REDIS_EXPIRETIME
) == -1) goto werr
;
3265 if (rdbSaveTime(fp
,expiretime
) == -1) goto werr
;
3267 /* Save the key and associated value. This requires special
3268 * handling if the value is swapped out. */
3269 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
3270 key
->storage
== REDIS_VM_SWAPPING
) {
3271 /* Save type, key, value */
3272 if (rdbSaveType(fp
,o
->type
) == -1) goto werr
;
3273 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3274 if (rdbSaveObject(fp
,o
) == -1) goto werr
;
3276 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3278 /* Get a preview of the object in memory */
3279 po
= vmPreviewObject(key
);
3280 /* Save type, key, value */
3281 if (rdbSaveType(fp
,key
->vtype
) == -1) goto werr
;
3282 if (rdbSaveStringObject(fp
,key
) == -1) goto werr
;
3283 if (rdbSaveObject(fp
,po
) == -1) goto werr
;
3284 /* Remove the loaded object from memory */
3288 dictReleaseIterator(di
);
3291 if (rdbSaveType(fp
,REDIS_EOF
) == -1) goto werr
;
3293 /* Make sure data will not remain on the OS's output buffers */
3298 /* Use RENAME to make sure the DB file is changed atomically only
3299 * if the generated DB file is ok. */
3300 if (rename(tmpfile
,filename
) == -1) {
3301 redisLog(REDIS_WARNING
,"Error moving temp DB file on the final destination: %s", strerror(errno
));
3305 redisLog(REDIS_NOTICE
,"DB saved on disk");
3307 server
.lastsave
= time(NULL
);
3313 redisLog(REDIS_WARNING
,"Write error saving DB on disk: %s", strerror(errno
));
3314 if (di
) dictReleaseIterator(di
);
/* Save the DB to 'filename' from a forked child process.
 * Returns REDIS_ERR right away when server.bgsavechildpid shows that a
 * child is already saving. The branch taken when fork() returns 0 reopens
 * the swap file if VM is enabled and calls rdbSave(). Otherwise a fork()
 * result of -1 is logged as a warning, and any other result is logged as
 * the pid of the saving child and stored in server.bgsavechildpid.
 * NOTE(review): this listing omits some lines of the function (how the
 * child exits, the error return); only the statements shown are
 * described. */
3318 static int rdbSaveBackground(char *filename
) {
3321 if (server
.bgsavechildpid
!= -1) return REDIS_ERR
;
3322 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
3323 if ((childpid
= fork()) == 0) {
3325 if (server
.vm_enabled
) vmReopenSwapFile();
3327 if (rdbSave(filename
) == REDIS_OK
) {
3334 if (childpid
== -1) {
3335 redisLog(REDIS_WARNING
,"Can't save in background: fork: %s",
3339 redisLog(REDIS_NOTICE
,"Background saving started by pid %d",childpid
);
3340 server
.bgsavechildpid
= childpid
;
3343 return REDIS_OK
; /* unreached */
/* Build the name temp-<childpid>.rdb, the same pattern rdbSave() uses for
 * the file it writes before renaming it.
 * NOTE(review): the buffer declaration and the statement that uses the
 * name are not part of this listing -- presumably the file is removed;
 * confirm against the complete file. */
3346 static void rdbRemoveTempFile(pid_t childpid
) {
3349 snprintf(tmpfile
,256,"temp-%d.rdb", (int) childpid
);
/* Read a one-byte type tag from 'fp'.
 * Returns the byte as a non-negative int, or -1 when nothing could be
 * read. */
static int rdbLoadType(FILE *fp) {
    unsigned char tag;

    return (fread(&tag,1,1,fp) == 0) ? -1 : tag;
}
/* Read the four bytes written by rdbSaveTime() from 'fp' and return them
 * as a time_t. Returns -1 when the read fails. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t seconds;

    return (fread(&seconds,4,1,fp) == 0) ? -1 : (time_t) seconds;
}
3365 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3366 * of this file for a description of how these are stored on disk.
3368 * isencoded is set to 1 if the value read is not actually a length but
3369 * an "encoding type", check the above comments for more info.
 *
 * The top two bits of the first byte select the format. A failed read
 * returns REDIS_RDB_LENERR. 'isencoded' may be NULL.
 * NOTE(review): this listing omits some lines of the function (the
 * declarations and several return statements); only the statements shown
 * are described. */
3370 static uint32_t rdbLoadLen(FILE *fp
, int *isencoded
) {
3371 unsigned char buf
[2];
3375 if (isencoded
) *isencoded
= 0;
3376 if (fread(buf
,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3377 type
= (buf
[0]&0xC0)>>6;
3378 if (type
== REDIS_RDB_6BITLEN
) {
3379 /* Read a 6 bit len */
3381 } else if (type
== REDIS_RDB_ENCVAL
) {
3382 /* Read a 6 bit len encoding type */
3383 if (isencoded
) *isencoded
= 1;
3385 } else if (type
== REDIS_RDB_14BITLEN
) {
3386 /* Read a 14 bit len */
3387 if (fread(buf
+1,1,1,fp
) == 0) return REDIS_RDB_LENERR
;
3388 return ((buf
[0]&0x3F)<<8)|buf
[1];
3390 /* Read a 32 bit len */
3391 if (fread(&len
,4,1,fp
) == 0) return REDIS_RDB_LENERR
;
3396 static robj
*rdbLoadIntegerObject(FILE *fp
, int enctype
) {
3397 unsigned char enc
[4];
3400 if (enctype
== REDIS_RDB_ENC_INT8
) {
3401 if (fread(enc
,1,1,fp
) == 0) return NULL
;
3402 val
= (signed char)enc
[0];
3403 } else if (enctype
== REDIS_RDB_ENC_INT16
) {
3405 if (fread(enc
,2,1,fp
) == 0) return NULL
;
3406 v
= enc
[0]|(enc
[1]<<8);
3408 } else if (enctype
== REDIS_RDB_ENC_INT32
) {
3410 if (fread(enc
,4,1,fp
) == 0) return NULL
;
3411 v
= enc
[0]|(enc
[1]<<8)|(enc
[2]<<16)|(enc
[3]<<24);
3414 val
= 0; /* anti-warning */
3417 return createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",val
));
/* Load an LZF-compressed string object from 'fp'.
 * Two lengths are read with rdbLoadLen(): first the compressed size, then
 * the size of the original data. The compressed bytes are read into a
 * zmalloc()ed buffer and decompressed into an sds string of the original
 * size, which becomes the value of the returned string object. A length
 * read error returns NULL; the later failures jump to the err label.
 * NOTE(review): this listing omits some lines of the function (the
 * declaration of 'val', the freeing of 'c', the err label); only the
 * statements shown are described. */
3420 static robj
*rdbLoadLzfStringObject(FILE*fp
) {
3421 unsigned int len
, clen
;
3422 unsigned char *c
= NULL
;
3425 if ((clen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3426 if ((len
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3427 if ((c
= zmalloc(clen
)) == NULL
) goto err
;
3428 if ((val
= sdsnewlen(NULL
,len
)) == NULL
) goto err
;
3429 if (fread(c
,clen
,1,fp
) == 0) goto err
;
3430 if (lzf_decompress(c
,clen
,val
,len
) == 0) goto err
;
3432 return createObject(REDIS_STRING
,val
);
/* Load a string object from 'fp'.
 * The prefix is read with rdbLoadLen(). The integer and LZF encoding types
 * are delegated to rdbLoadIntegerObject() and rdbLoadLzfStringObject();
 * otherwise 'len' bytes are read into a new sds string. Every object
 * returned here goes through tryObjectSharing(). A prefix equal to
 * REDIS_RDB_LENERR returns NULL.
 * NOTE(review): this listing omits some lines of the function, among them
 * the test of 'isencoded' that presumably guards the switch and the
 * handling of a failed fread(); confirm against the complete file. */
3439 static robj
*rdbLoadStringObject(FILE*fp
) {
3444 len
= rdbLoadLen(fp
,&isencoded
);
3447 case REDIS_RDB_ENC_INT8
:
3448 case REDIS_RDB_ENC_INT16
:
3449 case REDIS_RDB_ENC_INT32
:
3450 return tryObjectSharing(rdbLoadIntegerObject(fp
,len
));
3451 case REDIS_RDB_ENC_LZF
:
3452 return tryObjectSharing(rdbLoadLzfStringObject(fp
));
3458 if (len
== REDIS_RDB_LENERR
) return NULL
;
3459 val
= sdsnewlen(NULL
,len
);
3460 if (len
&& fread(val
,len
,1,fp
) == 0) {
3464 return tryObjectSharing(createObject(REDIS_STRING
,val
));
3467 /* For information about double serialization check rdbSaveDoubleValue().
 * A one-byte length is read first: 255, 254 and 253 stand for negative
 * infinity, positive infinity and not-a-number, and return 0 at once. For
 * any other length that many bytes are read and parsed with sscanf().
 * A failed read returns -1.
 * NOTE(review): the declarations of 'len' and 'buf' are not part of this
 * listing, so it cannot be checked here that 'buf' is large enough for the
 * biggest length byte; the result of sscanf() is not checked either. */
3468 static int rdbLoadDoubleValue(FILE *fp
, double *val
) {
3472 if (fread(&len
,1,1,fp
) == 0) return -1;
3474 case 255: *val
= R_NegInf
; return 0;
3475 case 254: *val
= R_PosInf
; return 0;
3476 case 253: *val
= R_Nan
; return 0;
3478 if (fread(buf
,len
,1,fp
) == 0) return -1;
3480 sscanf(buf
, "%lg", val
);
3485 /* Load a Redis object of the specified type from the specified file.
3486 * On success a newly allocated object is returned, otherwise NULL.
 *
 * Strings are read with rdbLoadStringObject(). Lists, sets, sorted sets
 * and hashes start with an element count read by rdbLoadLen(), followed by
 * the elements. A hash stays a zipmap unless it has more entries than
 * server.hash_max_zipmap_entries or a field/value longer than
 * server.hash_max_zipmap_value.
 * NOTE(review): the debug log prints ftell(), a long, with %d. The error
 * returns visible below leave the partially built object 'o' (and the
 * zmalloc()ed score) unreleased -- confirm that callers treat a NULL
 * result as fatal. This listing omits some lines of the function. */
3487 static robj
*rdbLoadObject(int type
, FILE *fp
) {
3490 redisLog(REDIS_DEBUG
,"LOADING OBJECT %d (at %d)\n",type
,ftell(fp
));
3491 if (type
== REDIS_STRING
) {
3492 /* Read string value */
3493 if ((o
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3494 tryObjectEncoding(o
);
3495 } else if (type
== REDIS_LIST
|| type
== REDIS_SET
) {
3496 /* Read list/set value */
3499 if ((listlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3500 o
= (type
== REDIS_LIST
) ? createListObject() : createSetObject();
3501 /* It's faster to expand the dict to the right size asap in order
3502 * to avoid rehashing */
3503 if (type
== REDIS_SET
&& listlen
> DICT_HT_INITIAL_SIZE
)
3504 dictExpand(o
->ptr
,listlen
);
3505 /* Load every single element of the list/set */
3509 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3510 tryObjectEncoding(ele
);
3511 if (type
== REDIS_LIST
) {
3512 listAddNodeTail((list
*)o
->ptr
,ele
);
3514 dictAdd((dict
*)o
->ptr
,ele
,NULL
);
3517 } else if (type
== REDIS_ZSET
) {
3518 /* Read sorted set value */
3522 if ((zsetlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3523 o
= createZsetObject();
3525 /* Load every single element of the sorted set */
3528 double *score
= zmalloc(sizeof(double));
3530 if ((ele
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3531 tryObjectEncoding(ele
);
3532 if (rdbLoadDoubleValue(fp
,score
) == -1) return NULL
;
3533 dictAdd(zs
->dict
,ele
,score
);
3534 zslInsert(zs
->zsl
,*score
,ele
);
3535 incrRefCount(ele
); /* added to skiplist */
3537 } else if (type
== REDIS_HASH
) {
3540 if ((hashlen
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
) return NULL
;
3541 o
= createHashObject();
3542 /* Too many entries? Use an hash table. */
3543 if (hashlen
> server
.hash_max_zipmap_entries
)
3544 convertToRealHash(o
);
3545 /* Load every key/value, then set it into the zipmap or hash
3546 * table, as needed. */
3550 if ((key
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3551 if ((val
= rdbLoadStringObject(fp
)) == NULL
) return NULL
;
3552 /* If we are using a zipmap and there are too big values
3553 * the object is converted to real hash table encoding. */
3554 if (o
->encoding
!= REDIS_ENCODING_HT
&&
3555 (sdslen(key
->ptr
) > server
.hash_max_zipmap_value
||
3556 sdslen(val
->ptr
) > server
.hash_max_zipmap_value
))
3558 convertToRealHash(o
);
3561 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
3562 unsigned char *zm
= o
->ptr
;
3564 zm
= zipmapSet(zm
,key
->ptr
,sdslen(key
->ptr
),
3565 val
->ptr
,sdslen(val
->ptr
),NULL
);
3570 tryObjectEncoding(key
);
3571 tryObjectEncoding(val
);
3572 dictAdd((dict
*)o
->ptr
,key
,val
);
3578 redisAssert(0 != 0);
/* Load the dataset stored in 'filename' into the server databases.
 * Returns REDIS_ERR when the file can't be opened. The first nine bytes
 * must start with "REDIS" followed by the format version. After that the
 * file is a sequence of opcodes and entries: an optional REDIS_EXPIRETIME
 * prefix, REDIS_SELECTDB to switch database, REDIS_EOF to stop, otherwise
 * a key followed by a value of the given type. A key that is already
 * present is reported as an unrecoverable error, and a key whose expire
 * time has passed is deleted again right after being added. With VM
 * enabled, objects are swapped out while memory use stays above
 * server.vm_max_memory, checked when 'loadedkeys' is a multiple of 5000.
 * NOTE(review): this listing omits some lines of the function
 * (declarations, the main loop header, the exits); only the statements
 * shown are described. */
3583 static int rdbLoad(char *filename
) {
3585 robj
*keyobj
= NULL
;
3587 int type
, retval
, rdbver
;
3588 dict
*d
= server
.db
[0].dict
;
3589 redisDb
*db
= server
.db
+0;
3591 time_t expiretime
= -1, now
= time(NULL
);
3592 long long loadedkeys
= 0;
3594 fp
= fopen(filename
,"r");
3595 if (!fp
) return REDIS_ERR
;
3596 if (fread(buf
,9,1,fp
) == 0) goto eoferr
;
3598 if (memcmp(buf
,"REDIS",5) != 0) {
3600 redisLog(REDIS_WARNING
,"Wrong signature trying to load DB from file");
3603 rdbver
= atoi(buf
+5);
3606 redisLog(REDIS_WARNING
,"Can't handle RDB format version %d",rdbver
);
3613 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3614 if (type
== REDIS_EXPIRETIME
) {
3615 if ((expiretime
= rdbLoadTime(fp
)) == -1) goto eoferr
;
3616 /* We read the time so we need to read the object type again */
3617 if ((type
= rdbLoadType(fp
)) == -1) goto eoferr
;
3619 if (type
== REDIS_EOF
) break;
3620 /* Handle SELECT DB opcode as a special case */
3621 if (type
== REDIS_SELECTDB
) {
3622 if ((dbid
= rdbLoadLen(fp
,NULL
)) == REDIS_RDB_LENERR
)
3624 if (dbid
>= (unsigned)server
.dbnum
) {
3625 redisLog(REDIS_WARNING
,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server
.dbnum
);
3628 db
= server
.db
+dbid
;
3633 if ((keyobj
= rdbLoadStringObject(fp
)) == NULL
) goto eoferr
;
3635 if ((o
= rdbLoadObject(type
,fp
)) == NULL
) goto eoferr
;
3636 /* Add the new object in the hash table */
3637 retval
= dictAdd(d
,keyobj
,o
);
3638 if (retval
== DICT_ERR
) {
3639 redisLog(REDIS_WARNING
,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj
->ptr
);
3642 /* Set the expire time if needed */
3643 if (expiretime
!= -1) {
3644 setExpire(db
,keyobj
,expiretime
);
3645 /* Delete this key if already expired */
3646 if (expiretime
< now
) deleteKey(db
,keyobj
);
3650 /* Handle swapping while loading big datasets when VM is on */
3652 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
3653 while (zmalloc_used_memory() > server
.vm_max_memory
) {
3654 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
3661 eoferr
: /* unexpected end of file is handled here with a fatal exit */
3662 if (keyobj
) decrRefCount(keyobj
);
3663 redisLog(REDIS_WARNING
,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3665 return REDIS_ERR
; /* Just to avoid warning */
3668 /*================================== Commands =============================== */
3670 static void authCommand(redisClient
*c
) {
3671 if (!server
.requirepass
|| !strcmp(c
->argv
[1]->ptr
, server
.requirepass
)) {
3672 c
->authenticated
= 1;
3673 addReply(c
,shared
.ok
);
3675 c
->authenticated
= 0;
3676 addReplySds(c
,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3680 static void pingCommand(redisClient
*c
) {
3681 addReply(c
,shared
.pong
);
3684 static void echoCommand(redisClient
*c
) {
3685 addReplyBulkLen(c
,c
->argv
[1]);
3686 addReply(c
,c
->argv
[1]);
3687 addReply(c
,shared
.crlf
);
3690 /*=================================== Strings =============================== */
/* Common implementation behind setCommand() (nx == 0) and setnxCommand()
 * (nx == 1): store argv[2] as the value of key argv[1].
 * With 'nx' set, a volatile key is first removed via deleteIfVolatile().
 * dictAdd() is tried first; when the key already exists the statements
 * below either replace the value with dictReplace() or reply with
 * shared.czero. The final statements remove any expire from the key and
 * reply with shared.cone when 'nx' is set, shared.ok otherwise.
 * NOTE(review): this listing omits some lines of the function
 * (declarations, branches, returns); only the statements shown are
 * described. */
3692 static void setGenericCommand(redisClient
*c
, int nx
) {
3695 if (nx
) deleteIfVolatile(c
->db
,c
->argv
[1]);
3696 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3697 if (retval
== DICT_ERR
) {
3699 /* If the key is about a swapped value, we want a new key object
3700 * to overwrite the old. So we delete the old key in the database.
3701 * This will also make sure that swap pages about the old object
3702 * will be marked as free. */
3703 if (server
.vm_enabled
&& deleteIfSwapped(c
->db
,c
->argv
[1]))
3704 incrRefCount(c
->argv
[1]);
3705 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3706 incrRefCount(c
->argv
[2]);
3708 addReply(c
,shared
.czero
);
3712 incrRefCount(c
->argv
[1]);
3713 incrRefCount(c
->argv
[2]);
3716 removeExpire(c
->db
,c
->argv
[1]);
3717 addReply(c
, nx
? shared
.cone
: shared
.ok
);
3720 static void setCommand(redisClient
*c
) {
3721 setGenericCommand(c
,0);
3724 static void setnxCommand(redisClient
*c
) {
3725 setGenericCommand(c
,1);
/* Reply with the string value stored at key argv[1].
 * The key is looked up for reading. The replies visible below are
 * shared.nullbulk, shared.wrongtypeerr for a value that is not a string,
 * and a bulk reply (length header ... CRLF) otherwise. getsetCommand()
 * stops when this function returns REDIS_ERR.
 * NOTE(review): this listing omits some lines of the function (the NULL
 * test, the value reply, the return statements); only the statements shown
 * are described. */
3728 static int getGenericCommand(redisClient
*c
) {
3729 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[1]);
3732 addReply(c
,shared
.nullbulk
);
3735 if (o
->type
!= REDIS_STRING
) {
3736 addReply(c
,shared
.wrongtypeerr
);
3739 addReplyBulkLen(c
,o
);
3741 addReply(c
,shared
.crlf
);
3747 static void getCommand(redisClient
*c
) {
3748 getGenericCommand(c
);
/* Reply with the current value of key argv[1] through getGenericCommand(),
 * then store argv[2] as its new value. Nothing is stored when
 * getGenericCommand() returns REDIS_ERR. The value is added with dictAdd(),
 * falling back to dictReplace() when the key exists, and any expire on the
 * key is removed.
 * NOTE(review): this listing omits some lines of the function (the else
 * branch, any dirty accounting); only the statements shown are
 * described. */
3751 static void getsetCommand(redisClient
*c
) {
3752 if (getGenericCommand(c
) == REDIS_ERR
) return;
3753 if (dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]) == DICT_ERR
) {
3754 dictReplace(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3756 incrRefCount(c
->argv
[1]);
3758 incrRefCount(c
->argv
[2]);
3760 removeExpire(c
->db
,c
->argv
[1]);
/* Reply with the values of all the keys argv[1..argc-1].
 * A "*<count>" header with argc-1 entries is sent first. Each key is then
 * looked up for reading: shared.nullbulk is sent both in the first branch
 * below and for a value that is not a string; a string value is sent as a
 * bulk reply.
 * NOTE(review): this listing omits some lines of the function (the NULL
 * test, the value reply); only the statements shown are described. */
3763 static void mgetCommand(redisClient
*c
) {
3766 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->argc
-1));
3767 for (j
= 1; j
< c
->argc
; j
++) {
3768 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[j
]);
3770 addReply(c
,shared
.nullbulk
);
3772 if (o
->type
!= REDIS_STRING
) {
3773 addReply(c
,shared
.nullbulk
);
3775 addReplyBulkLen(c
,o
);
3777 addReply(c
,shared
.crlf
);
/* Common implementation behind msetCommand() (nx == 0) and msetnxCommand()
 * (nx == 1): argv holds key/value pairs starting at index 1.
 * An even argc is rejected with a "wrong number of arguments" error. The
 * first loop looks every key up with lookupKeyWrite(); the second loop
 * stores each pair (dictAdd(), falling back to dictReplace()) and removes
 * the key's expire. server.dirty grows by the number of pairs and the reply
 * is shared.cone when 'nx' is set, shared.ok otherwise.
 * NOTE(review): this listing omits some lines of the function (the nx
 * test around the first loop, the busykeys handling, returns); only the
 * statements shown are described. */
3783 static void msetGenericCommand(redisClient
*c
, int nx
) {
3784 int j
, busykeys
= 0;
3786 if ((c
->argc
% 2) == 0) {
3787 addReplySds(c
,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3790 /* Handle the NX flag. The MSETNX semantic is to return zero and
3791 * set nothing at all if at least one key already exists. */
3793 for (j
= 1; j
< c
->argc
; j
+= 2) {
3794 if (lookupKeyWrite(c
->db
,c
->argv
[j
]) != NULL
) {
3800 addReply(c
, shared
.czero
);
3804 for (j
= 1; j
< c
->argc
; j
+= 2) {
3807 tryObjectEncoding(c
->argv
[j
+1]);
3808 retval
= dictAdd(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
3809 if (retval
== DICT_ERR
) {
3810 dictReplace(c
->db
->dict
,c
->argv
[j
],c
->argv
[j
+1]);
3811 incrRefCount(c
->argv
[j
+1]);
3813 incrRefCount(c
->argv
[j
]);
3814 incrRefCount(c
->argv
[j
+1]);
3816 removeExpire(c
->db
,c
->argv
[j
]);
3818 server
.dirty
+= (c
->argc
-1)/2;
3819 addReply(c
, nx
? shared
.cone
: shared
.ok
);
3822 static void msetCommand(redisClient
*c
) {
3823 msetGenericCommand(c
,0);
3826 static void msetnxCommand(redisClient
*c
) {
3827 msetGenericCommand(c
,1);
/* Common implementation of the increment/decrement commands: the callers
 * in this file pass 1, -1, or a value parsed from argv[2] as 'incr'.
 * The current value of key argv[1] is read with strtoll() when the object
 * is raw-encoded, or taken from the pointer field when it is
 * REDIS_ENCODING_INT. The new value is stored as a fresh string object
 * (dictAdd(), falling back to dictReplace() plus removeExpire()) and sent
 * back between shared.colon and shared.crlf.
 * NOTE(review): this listing omits some lines of the function (the
 * missing-key and wrong-type branches, the addition itself); only the
 * statements shown are described. */
3830 static void incrDecrCommand(redisClient
*c
, long long incr
) {
3835 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3839 if (o
->type
!= REDIS_STRING
) {
3844 if (o
->encoding
== REDIS_ENCODING_RAW
)
3845 value
= strtoll(o
->ptr
, &eptr
, 10);
3846 else if (o
->encoding
== REDIS_ENCODING_INT
)
3847 value
= (long)o
->ptr
;
3849 redisAssert(1 != 1);
3854 o
= createObject(REDIS_STRING
,sdscatprintf(sdsempty(),"%lld",value
));
3855 tryObjectEncoding(o
);
3856 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],o
);
3857 if (retval
== DICT_ERR
) {
3858 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
3859 removeExpire(c
->db
,c
->argv
[1]);
3861 incrRefCount(c
->argv
[1]);
3864 addReply(c
,shared
.colon
);
3866 addReply(c
,shared
.crlf
);
3869 static void incrCommand(redisClient
*c
) {
3870 incrDecrCommand(c
,1);
3873 static void decrCommand(redisClient
*c
) {
3874 incrDecrCommand(c
,-1);
3877 static void incrbyCommand(redisClient
*c
) {
3878 long long incr
= strtoll(c
->argv
[2]->ptr
, NULL
, 10);
3879 incrDecrCommand(c
,incr
);
3882 static void decrbyCommand(redisClient
*c
) {
3883 long long incr
= strtoll(c
->argv
[2]->ptr
, NULL
, 10);
3884 incrDecrCommand(c
,-incr
);
/* Append argv[2] to the string stored at key argv[1] and reply with the
 * resulting length as ":<len>".
 * In the "create the key" part argv[2] itself becomes the value. Otherwise
 * the existing value must be a string (shared.wrongtypeerr if not); a
 * value that is shared or not raw-encoded is first replaced by a private
 * raw copy, then argv[2] is appended with sdscatlen(), or formatted with
 * sdscatprintf() when argv[2] is not raw-encoded.
 * NOTE(review): the sdscatprintf() call pairs "%ld" with an
 * (unsigned long) cast, while other code in this file casts the pointer
 * field to (long). This listing omits some lines of the function; only the
 * statements shown are described. */
3887 static void appendCommand(redisClient
*c
) {
3892 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
3894 /* Create the key */
3895 retval
= dictAdd(c
->db
->dict
,c
->argv
[1],c
->argv
[2]);
3896 incrRefCount(c
->argv
[1]);
3897 incrRefCount(c
->argv
[2]);
3898 totlen
= stringObjectLen(c
->argv
[2]);
3902 de
= dictFind(c
->db
->dict
,c
->argv
[1]);
3905 o
= dictGetEntryVal(de
);
3906 if (o
->type
!= REDIS_STRING
) {
3907 addReply(c
,shared
.wrongtypeerr
);
3910 /* If the object is specially encoded or shared we have to make
3912 if (o
->refcount
!= 1 || o
->encoding
!= REDIS_ENCODING_RAW
) {
3913 robj
*decoded
= getDecodedObject(o
);
3915 o
= createStringObject(decoded
->ptr
, sdslen(decoded
->ptr
));
3916 decrRefCount(decoded
);
3917 dictReplace(c
->db
->dict
,c
->argv
[1],o
);
3920 if (c
->argv
[2]->encoding
== REDIS_ENCODING_RAW
) {
3921 o
->ptr
= sdscatlen(o
->ptr
,
3922 c
->argv
[2]->ptr
, sdslen(c
->argv
[2]->ptr
));
3924 o
->ptr
= sdscatprintf(o
->ptr
, "%ld",
3925 (unsigned long) c
->argv
[2]->ptr
);
3927 totlen
= sdslen(o
->ptr
);
3930 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen
));
/* Reply with the part of the string at key argv[1] between the indexes
 * argv[2] and argv[3], both inclusive and both parsed with atoi().
 * Negative indexes count from the end of the string and are clamped to 0;
 * an end index past the last character is clamped to it. shared.nullbulk
 * is sent when start > end or start is past the end of the string (and in
 * the first branch below); a value that is not a string gets
 * shared.wrongtypeerr. Otherwise the range is sent as a bulk reply.
 * NOTE(review): this listing omits some lines of the function (the NULL
 * test, the release of the decoded object); only the statements shown are
 * described. */
3933 static void substrCommand(redisClient
*c
) {
3935 long start
= atoi(c
->argv
[2]->ptr
);
3936 long end
= atoi(c
->argv
[3]->ptr
);
3938 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
3940 addReply(c
,shared
.nullbulk
);
3942 if (o
->type
!= REDIS_STRING
) {
3943 addReply(c
,shared
.wrongtypeerr
);
3945 size_t rangelen
, strlen
;
3948 o
= getDecodedObject(o
);
3949 strlen
= sdslen(o
->ptr
);
3951 /* convert negative indexes */
3952 if (start
< 0) start
= strlen
+start
;
3953 if (end
< 0) end
= strlen
+end
;
3954 if (start
< 0) start
= 0;
3955 if (end
< 0) end
= 0;
3957 /* indexes sanity checks */
3958 if (start
> end
|| (size_t)start
>= strlen
) {
3959 /* Out of range start or start > end result in null reply */
3960 addReply(c
,shared
.nullbulk
);
3964 if ((size_t)end
>= strlen
) end
= strlen
-1;
3965 rangelen
= (end
-start
)+1;
3967 /* Return the result */
3968 addReplySds(c
,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen
));
3969 range
= sdsnewlen((char*)o
->ptr
+start
,rangelen
);
3970 addReplySds(c
,range
);
3971 addReply(c
,shared
.crlf
);
3977 /* ========================= Type agnostic commands ========================= */
/* Delete every key named in argv[1..argc-1] with deleteKey().
 * The replies visible below are shared.czero, shared.cone and, in the last
 * statement, the value of 'deleted' formatted as ":<n>".
 * NOTE(review): this listing omits some lines of the function (the
 * counting of deleted keys and the choice between the three replies); only
 * the statements shown are described. */
3979 static void delCommand(redisClient
*c
) {
3982 for (j
= 1; j
< c
->argc
; j
++) {
3983 if (deleteKey(c
->db
,c
->argv
[j
])) {
3990 addReply(c
,shared
.czero
);
3993 addReply(c
,shared
.cone
);
3996 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",deleted
));
4001 static void existsCommand(redisClient
*c
) {
4002 addReply(c
,lookupKeyRead(c
->db
,c
->argv
[1]) ? shared
.cone
: shared
.czero
);
4005 static void selectCommand(redisClient
*c
) {
4006 int id
= atoi(c
->argv
[1]->ptr
);
4008 if (selectDb(c
,id
) == REDIS_ERR
) {
4009 addReplySds(c
,sdsnew("-ERR invalid DB index\r\n"));
4011 addReply(c
,shared
.ok
);
/* Reply with a key picked by dictGetRandomKey() from the client's database.
 * The pick is repeated until the dictionary yields no entry or the picked
 * key is not removed by expireIfNeeded(). The replies visible below are
 * shared.plus followed by shared.crlf, and shared.plus, the key, then
 * shared.crlf.
 * NOTE(review): this listing omits some lines of the function (the loop
 * header, the test selecting between the two replies); only the statements
 * shown are described. */
4015 static void randomkeyCommand(redisClient
*c
) {
4019 de
= dictGetRandomKey(c
->db
->dict
);
4020 if (!de
|| expireIfNeeded(c
->db
,dictGetEntryKey(de
)) == 0) break;
4023 addReply(c
,shared
.plus
);
4024 addReply(c
,shared
.crlf
);
4026 addReply(c
,shared
.plus
);
4027 addReply(c
,dictGetEntryKey(de
));
4028 addReply(c
,shared
.crlf
);
/* Reply with every key of the client's database matching the pattern in
 * argv[1]. The pattern "*" matches without calling stringmatchlen(). Keys
 * removed by expireIfNeeded() are left out. Each key is sent as a bulk
 * reply. 'lenobj' is created empty before the scan and receives the
 * "*<numkeys>" header text only after the scan has counted the matches.
 * NOTE(review): this listing omits some lines of the function (where
 * 'lenobj' is queued as a reply, the increment of 'numkeys'); only the
 * statements shown are described. */
4032 static void keysCommand(redisClient
*c
) {
4035 sds pattern
= c
->argv
[1]->ptr
;
4036 int plen
= sdslen(pattern
);
4037 unsigned long numkeys
= 0;
4038 robj
*lenobj
= createObject(REDIS_STRING
,NULL
);
4040 di
= dictGetIterator(c
->db
->dict
);
4042 decrRefCount(lenobj
);
4043 while((de
= dictNext(di
)) != NULL
) {
4044 robj
*keyobj
= dictGetEntryKey(de
);
4046 sds key
= keyobj
->ptr
;
4047 if ((pattern
[0] == '*' && pattern
[1] == '\0') ||
4048 stringmatchlen(pattern
,plen
,key
,sdslen(key
),0)) {
4049 if (expireIfNeeded(c
->db
,keyobj
) == 0) {
4050 addReplyBulkLen(c
,keyobj
);
4052 addReply(c
,shared
.crlf
);
4057 dictReleaseIterator(di
);
4058 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",numkeys
);
4061 static void dbsizeCommand(redisClient
*c
) {
4063 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c
->db
->dict
)));
4066 static void lastsaveCommand(redisClient
*c
) {
4068 sdscatprintf(sdsempty(),":%lu\r\n",server
.lastsave
));
/* Reply with the type of the value stored at key argv[1] as a status line:
 * "+string", "+list", "+set", "+zset", "+hash", or "+unknown" for any
 * other type code, followed by CRLF.
 * NOTE(review): this listing omits some lines of the function (the
 * missing-key branch and the switch header); only the statements shown are
 * described. */
4071 static void typeCommand(redisClient
*c
) {
4075 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4080 case REDIS_STRING
: type
= "+string"; break;
4081 case REDIS_LIST
: type
= "+list"; break;
4082 case REDIS_SET
: type
= "+set"; break;
4083 case REDIS_ZSET
: type
= "+zset"; break;
4084 case REDIS_HASH
: type
= "+hash"; break;
4085 default: type
= "+unknown"; break;
4088 addReplySds(c
,sdsnew(type
));
4089 addReply(c
,shared
.crlf
);
4092 static void saveCommand(redisClient
*c
) {
4093 if (server
.bgsavechildpid
!= -1) {
4094 addReplySds(c
,sdsnew("-ERR background save in progress\r\n"));
4097 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
4098 addReply(c
,shared
.ok
);
4100 addReply(c
,shared
.err
);
4104 static void bgsaveCommand(redisClient
*c
) {
4105 if (server
.bgsavechildpid
!= -1) {
4106 addReplySds(c
,sdsnew("-ERR background save already in progress\r\n"));
4109 if (rdbSaveBackground(server
.dbfilename
) == REDIS_OK
) {
4110 char *status
= "+Background saving started\r\n";
4111 addReplySds(c
,sdsnew(status
));
4113 addReply(c
,shared
.err
);
/* Handle a shutdown request from a client.
 * A running background save child is killed with SIGKILL and its temp file
 * handled by rdbRemoveTempFile(). With the append only file enabled the
 * AOF descriptor is fsync()ed; otherwise a synchronous rdbSave() is done
 * and, when it succeeds, the pid file is removed if the server is
 * daemonized. The VM swap file is unlinked in both cases when VM is
 * enabled. If the save fails, the server keeps running and the client gets
 * an error reply.
 * NOTE(review): this listing omits some lines of the function (the exit
 * calls, the else branches); only the statements shown are described. */
4117 static void shutdownCommand(redisClient
*c
) {
4118 redisLog(REDIS_WARNING
,"User requested shutdown, saving DB...");
4119 /* Kill the saving child if there is a background saving in progress.
4120 We want to avoid race conditions, for instance our saving child may
4121 overwrite the synchronous saving did by SHUTDOWN. */
4122 if (server
.bgsavechildpid
!= -1) {
4123 redisLog(REDIS_WARNING
,"There is a live saving child. Killing it!");
4124 kill(server
.bgsavechildpid
,SIGKILL
);
4125 rdbRemoveTempFile(server
.bgsavechildpid
);
4127 if (server
.appendonly
) {
4128 /* Append only file: fsync() the AOF and exit */
4129 fsync(server
.appendfd
);
4130 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
4133 /* Snapshotting. Perform a SYNC SAVE and exit */
4134 if (rdbSave(server
.dbfilename
) == REDIS_OK
) {
4135 if (server
.daemonize
)
4136 unlink(server
.pidfile
);
4137 redisLog(REDIS_WARNING
,"%zu bytes used at exit",zmalloc_used_memory());
4138 redisLog(REDIS_WARNING
,"Server exit now, bye bye...");
4139 if (server
.vm_enabled
) unlink(server
.vm_swap_file
);
4142 /* Ooops.. error saving! The best we can do is to continue operating.
4143 * Note that if there was a background saving process, in the next
4144 * cron() Redis will be notified that the background saving aborted,
4145 * handling special stuff like slaves pending for synchronization... */
4146 redisLog(REDIS_WARNING
,"Error trying to save the DB, can't exit");
4147 addReplySds(c
,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
/* Common implementation behind renameCommand() (nx == 0) and
 * renamenxCommand() (nx == 1): move the value of key argv[1] to key
 * argv[2].
 * Identical source and destination names get shared.sameobjecterr, and the
 * shared.nokeyerr reply follows the lookup of the source key. The value is
 * added under the new name with dictAdd(); when the destination exists the
 * statements below either reply shared.czero or overwrite it with
 * dictReplace(). The source key is then deleted and the reply is
 * shared.cone when 'nx' is set, shared.ok otherwise.
 * NOTE(review): this listing omits some lines of the function (the tests
 * on 'o' and 'nx', reference counting of the value, returns); only the
 * statements shown are described. */
4152 static void renameGenericCommand(redisClient
*c
, int nx
) {
4155 /* To use the same key as src and dst is probably an error */
4156 if (sdscmp(c
->argv
[1]->ptr
,c
->argv
[2]->ptr
) == 0) {
4157 addReply(c
,shared
.sameobjecterr
);
4161 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4163 addReply(c
,shared
.nokeyerr
);
4167 deleteIfVolatile(c
->db
,c
->argv
[2]);
4168 if (dictAdd(c
->db
->dict
,c
->argv
[2],o
) == DICT_ERR
) {
4171 addReply(c
,shared
.czero
);
4174 dictReplace(c
->db
->dict
,c
->argv
[2],o
);
4176 incrRefCount(c
->argv
[2]);
4178 deleteKey(c
->db
,c
->argv
[1]);
4180 addReply(c
,nx
? shared
.cone
: shared
.ok
);
4183 static void renameCommand(redisClient
*c
) {
4184 renameGenericCommand(c
,0);
4187 static void renamenxCommand(redisClient
*c
) {
4188 renameGenericCommand(c
,1);
/* Move key argv[1] from the client's current database to the database
 * whose index is given in argv[2].
 * The target is resolved by temporarily selecting it with selectDb(); an
 * invalid index gets shared.outofrangeerr, after which the source database
 * is selected again. shared.sameobjecterr is the reply of the same-database
 * check. shared.czero follows the lookup of the key and is also sent when
 * dictAdd() into the target fails. On success the key is deleted from the
 * source database and the reply is shared.cone.
 * NOTE(review): this listing omits some lines of the function (the
 * src/dst assignments, the tests, returns); only the statements shown are
 * described. */
4191 static void moveCommand(redisClient
*c
) {
4196 /* Obtain source and target DB pointers */
4199 if (selectDb(c
,atoi(c
->argv
[2]->ptr
)) == REDIS_ERR
) {
4200 addReply(c
,shared
.outofrangeerr
);
4204 selectDb(c
,srcid
); /* Back to the source DB */
4206 /* If the user is moving using as target the same
4207 * DB as the source DB it is probably an error. */
4209 addReply(c
,shared
.sameobjecterr
);
4213 /* Check if the element exists and get a reference */
4214 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4216 addReply(c
,shared
.czero
);
4220 /* Try to add the element to the target DB */
4221 deleteIfVolatile(dst
,c
->argv
[1]);
4222 if (dictAdd(dst
->dict
,c
->argv
[1],o
) == DICT_ERR
) {
4223 addReply(c
,shared
.czero
);
4226 incrRefCount(c
->argv
[1]);
4229 /* OK! key moved, free the entry in the source DB */
4230 deleteKey(src
,c
->argv
[1]);
4232 addReply(c
,shared
.cone
);
4235 /* =================================== Lists ================================ */
4236 static void pushGenericCommand(redisClient
*c
, int where
) {
4240 lobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4242 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4243 addReply(c
,shared
.cone
);
4246 lobj
= createListObject();
4248 if (where
== REDIS_HEAD
) {
4249 listAddNodeHead(list
,c
->argv
[2]);
4251 listAddNodeTail(list
,c
->argv
[2]);
4253 dictAdd(c
->db
->dict
,c
->argv
[1],lobj
);
4254 incrRefCount(c
->argv
[1]);
4255 incrRefCount(c
->argv
[2]);
4257 if (lobj
->type
!= REDIS_LIST
) {
4258 addReply(c
,shared
.wrongtypeerr
);
4261 if (handleClientsWaitingListPush(c
,c
->argv
[1],c
->argv
[2])) {
4262 addReply(c
,shared
.cone
);
4266 if (where
== REDIS_HEAD
) {
4267 listAddNodeHead(list
,c
->argv
[2]);
4269 listAddNodeTail(list
,c
->argv
[2]);
4271 incrRefCount(c
->argv
[2]);
4274 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",listLength(list
)));
4277 static void lpushCommand(redisClient
*c
) {
4278 pushGenericCommand(c
,REDIS_HEAD
);
4281 static void rpushCommand(redisClient
*c
) {
4282 pushGenericCommand(c
,REDIS_TAIL
);
4285 static void llenCommand(redisClient
*c
) {
4289 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4291 addReply(c
,shared
.czero
);
4294 if (o
->type
!= REDIS_LIST
) {
4295 addReply(c
,shared
.wrongtypeerr
);
4298 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",listLength(l
)));
4303 static void lindexCommand(redisClient
*c
) {
4305 int index
= atoi(c
->argv
[2]->ptr
);
4307 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4309 addReply(c
,shared
.nullbulk
);
4311 if (o
->type
!= REDIS_LIST
) {
4312 addReply(c
,shared
.wrongtypeerr
);
4314 list
*list
= o
->ptr
;
4317 ln
= listIndex(list
, index
);
4319 addReply(c
,shared
.nullbulk
);
4321 robj
*ele
= listNodeValue(ln
);
4322 addReplyBulkLen(c
,ele
);
4324 addReply(c
,shared
.crlf
);
4330 static void lsetCommand(redisClient
*c
) {
4332 int index
= atoi(c
->argv
[2]->ptr
);
4334 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4336 addReply(c
,shared
.nokeyerr
);
4338 if (o
->type
!= REDIS_LIST
) {
4339 addReply(c
,shared
.wrongtypeerr
);
4341 list
*list
= o
->ptr
;
4344 ln
= listIndex(list
, index
);
4346 addReply(c
,shared
.outofrangeerr
);
4348 robj
*ele
= listNodeValue(ln
);
4351 listNodeValue(ln
) = c
->argv
[3];
4352 incrRefCount(c
->argv
[3]);
4353 addReply(c
,shared
.ok
);
4360 static void popGenericCommand(redisClient
*c
, int where
) {
4363 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4365 addReply(c
,shared
.nullbulk
);
4367 if (o
->type
!= REDIS_LIST
) {
4368 addReply(c
,shared
.wrongtypeerr
);
4370 list
*list
= o
->ptr
;
4373 if (where
== REDIS_HEAD
)
4374 ln
= listFirst(list
);
4376 ln
= listLast(list
);
4379 addReply(c
,shared
.nullbulk
);
4381 robj
*ele
= listNodeValue(ln
);
4382 addReplyBulkLen(c
,ele
);
4384 addReply(c
,shared
.crlf
);
4385 listDelNode(list
,ln
);
4392 static void lpopCommand(redisClient
*c
) {
4393 popGenericCommand(c
,REDIS_HEAD
);
4396 static void rpopCommand(redisClient
*c
) {
4397 popGenericCommand(c
,REDIS_TAIL
);
4400 static void lrangeCommand(redisClient
*c
) {
4402 int start
= atoi(c
->argv
[2]->ptr
);
4403 int end
= atoi(c
->argv
[3]->ptr
);
4405 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4407 addReply(c
,shared
.nullmultibulk
);
4409 if (o
->type
!= REDIS_LIST
) {
4410 addReply(c
,shared
.wrongtypeerr
);
4412 list
*list
= o
->ptr
;
4414 int llen
= listLength(list
);
4418 /* convert negative indexes */
4419 if (start
< 0) start
= llen
+start
;
4420 if (end
< 0) end
= llen
+end
;
4421 if (start
< 0) start
= 0;
4422 if (end
< 0) end
= 0;
4424 /* indexes sanity checks */
4425 if (start
> end
|| start
>= llen
) {
4426 /* Out of range start or start > end result in empty list */
4427 addReply(c
,shared
.emptymultibulk
);
4430 if (end
>= llen
) end
= llen
-1;
4431 rangelen
= (end
-start
)+1;
4433 /* Return the result in form of a multi-bulk reply */
4434 ln
= listIndex(list
, start
);
4435 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",rangelen
));
4436 for (j
= 0; j
< rangelen
; j
++) {
4437 ele
= listNodeValue(ln
);
4438 addReplyBulkLen(c
,ele
);
4440 addReply(c
,shared
.crlf
);
4447 static void ltrimCommand(redisClient
*c
) {
4449 int start
= atoi(c
->argv
[2]->ptr
);
4450 int end
= atoi(c
->argv
[3]->ptr
);
4452 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4454 addReply(c
,shared
.ok
);
4456 if (o
->type
!= REDIS_LIST
) {
4457 addReply(c
,shared
.wrongtypeerr
);
4459 list
*list
= o
->ptr
;
4461 int llen
= listLength(list
);
4462 int j
, ltrim
, rtrim
;
4464 /* convert negative indexes */
4465 if (start
< 0) start
= llen
+start
;
4466 if (end
< 0) end
= llen
+end
;
4467 if (start
< 0) start
= 0;
4468 if (end
< 0) end
= 0;
4470 /* indexes sanity checks */
4471 if (start
> end
|| start
>= llen
) {
4472 /* Out of range start or start > end result in empty list */
4476 if (end
>= llen
) end
= llen
-1;
4481 /* Remove list elements to perform the trim */
4482 for (j
= 0; j
< ltrim
; j
++) {
4483 ln
= listFirst(list
);
4484 listDelNode(list
,ln
);
4486 for (j
= 0; j
< rtrim
; j
++) {
4487 ln
= listLast(list
);
4488 listDelNode(list
,ln
);
4491 addReply(c
,shared
.ok
);
4496 static void lremCommand(redisClient
*c
) {
4499 o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4501 addReply(c
,shared
.czero
);
4503 if (o
->type
!= REDIS_LIST
) {
4504 addReply(c
,shared
.wrongtypeerr
);
4506 list
*list
= o
->ptr
;
4507 listNode
*ln
, *next
;
4508 int toremove
= atoi(c
->argv
[2]->ptr
);
4513 toremove
= -toremove
;
4516 ln
= fromtail
? list
->tail
: list
->head
;
4518 robj
*ele
= listNodeValue(ln
);
4520 next
= fromtail
? ln
->prev
: ln
->next
;
4521 if (compareStringObjects(ele
,c
->argv
[3]) == 0) {
4522 listDelNode(list
,ln
);
4525 if (toremove
&& removed
== toremove
) break;
4529 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",removed
));
4534 /* These are the semantics of this command:
4535 * RPOPLPUSH srclist dstlist:
4536 * IF LLEN(srclist) > 0
4537 * element = RPOP srclist
4538 * LPUSH dstlist element
4545 * The idea is to be able to get an element from a list in a reliable way
4546 * since the element is not just returned but pushed onto another list
4547 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4549 static void rpoplpushcommand(redisClient
*c
) {
4552 sobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4554 addReply(c
,shared
.nullbulk
);
4556 if (sobj
->type
!= REDIS_LIST
) {
4557 addReply(c
,shared
.wrongtypeerr
);
4559 list
*srclist
= sobj
->ptr
;
4560 listNode
*ln
= listLast(srclist
);
4563 addReply(c
,shared
.nullbulk
);
4565 robj
*dobj
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4566 robj
*ele
= listNodeValue(ln
);
4569 if (dobj
&& dobj
->type
!= REDIS_LIST
) {
4570 addReply(c
,shared
.wrongtypeerr
);
4574 /* Add the element to the target list (unless it's directly
4575 * passed to some BLPOP-ing client) */
4576 if (!handleClientsWaitingListPush(c
,c
->argv
[2],ele
)) {
4578 /* Create the list if the key does not exist */
4579 dobj
= createListObject();
4580 dictAdd(c
->db
->dict
,c
->argv
[2],dobj
);
4581 incrRefCount(c
->argv
[2]);
4583 dstlist
= dobj
->ptr
;
4584 listAddNodeHead(dstlist
,ele
);
4588 /* Send the element to the client as reply as well */
4589 addReplyBulkLen(c
,ele
);
4591 addReply(c
,shared
.crlf
);
4593 /* Finally remove the element from the source list */
4594 listDelNode(srclist
,ln
);
4602 /* ==================================== Sets ================================ */
4604 static void saddCommand(redisClient
*c
) {
4607 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4609 set
= createSetObject();
4610 dictAdd(c
->db
->dict
,c
->argv
[1],set
);
4611 incrRefCount(c
->argv
[1]);
4613 if (set
->type
!= REDIS_SET
) {
4614 addReply(c
,shared
.wrongtypeerr
);
4618 if (dictAdd(set
->ptr
,c
->argv
[2],NULL
) == DICT_OK
) {
4619 incrRefCount(c
->argv
[2]);
4621 addReply(c
,shared
.cone
);
4623 addReply(c
,shared
.czero
);
4627 static void sremCommand(redisClient
*c
) {
4630 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4632 addReply(c
,shared
.czero
);
4634 if (set
->type
!= REDIS_SET
) {
4635 addReply(c
,shared
.wrongtypeerr
);
4638 if (dictDelete(set
->ptr
,c
->argv
[2]) == DICT_OK
) {
4640 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4641 addReply(c
,shared
.cone
);
4643 addReply(c
,shared
.czero
);
4648 static void smoveCommand(redisClient
*c
) {
4649 robj
*srcset
, *dstset
;
4651 srcset
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4652 dstset
= lookupKeyWrite(c
->db
,c
->argv
[2]);
4654 /* If the source key does not exist return 0, if it's of the wrong type
4656 if (srcset
== NULL
|| srcset
->type
!= REDIS_SET
) {
4657 addReply(c
, srcset
? shared
.wrongtypeerr
: shared
.czero
);
4660 /* Error if the destination key is not a set as well */
4661 if (dstset
&& dstset
->type
!= REDIS_SET
) {
4662 addReply(c
,shared
.wrongtypeerr
);
4665 /* Remove the element from the source set */
4666 if (dictDelete(srcset
->ptr
,c
->argv
[3]) == DICT_ERR
) {
4667 /* Key not found in the src set! return zero */
4668 addReply(c
,shared
.czero
);
4672 /* Add the element to the destination set */
4674 dstset
= createSetObject();
4675 dictAdd(c
->db
->dict
,c
->argv
[2],dstset
);
4676 incrRefCount(c
->argv
[2]);
4678 if (dictAdd(dstset
->ptr
,c
->argv
[3],NULL
) == DICT_OK
)
4679 incrRefCount(c
->argv
[3]);
4680 addReply(c
,shared
.cone
);
4683 static void sismemberCommand(redisClient
*c
) {
4686 set
= lookupKeyRead(c
->db
,c
->argv
[1]);
4688 addReply(c
,shared
.czero
);
4690 if (set
->type
!= REDIS_SET
) {
4691 addReply(c
,shared
.wrongtypeerr
);
4694 if (dictFind(set
->ptr
,c
->argv
[2]))
4695 addReply(c
,shared
.cone
);
4697 addReply(c
,shared
.czero
);
4701 static void scardCommand(redisClient
*c
) {
4705 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
4707 addReply(c
,shared
.czero
);
4710 if (o
->type
!= REDIS_SET
) {
4711 addReply(c
,shared
.wrongtypeerr
);
4714 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4720 static void spopCommand(redisClient
*c
) {
4724 set
= lookupKeyWrite(c
->db
,c
->argv
[1]);
4726 addReply(c
,shared
.nullbulk
);
4728 if (set
->type
!= REDIS_SET
) {
4729 addReply(c
,shared
.wrongtypeerr
);
4732 de
= dictGetRandomKey(set
->ptr
);
4734 addReply(c
,shared
.nullbulk
);
4736 robj
*ele
= dictGetEntryKey(de
);
4738 addReplyBulkLen(c
,ele
);
4740 addReply(c
,shared
.crlf
);
4741 dictDelete(set
->ptr
,ele
);
4742 if (htNeedsResize(set
->ptr
)) dictResize(set
->ptr
);
4748 static void srandmemberCommand(redisClient
*c
) {
4752 set
= lookupKeyRead(c
->db
,c
->argv
[1]);
4754 addReply(c
,shared
.nullbulk
);
4756 if (set
->type
!= REDIS_SET
) {
4757 addReply(c
,shared
.wrongtypeerr
);
4760 de
= dictGetRandomKey(set
->ptr
);
4762 addReply(c
,shared
.nullbulk
);
4764 robj
*ele
= dictGetEntryKey(de
);
4766 addReplyBulkLen(c
,ele
);
4768 addReply(c
,shared
.crlf
);
4773 static int qsortCompareSetsByCardinality(const void *s1
, const void *s2
) {
4774 dict
**d1
= (void*) s1
, **d2
= (void*) s2
;
4776 return dictSize(*d1
)-dictSize(*d2
);
4779 static void sinterGenericCommand(redisClient
*c
, robj
**setskeys
, unsigned long setsnum
, robj
*dstkey
) {
4780 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
4783 robj
*lenobj
= NULL
, *dstset
= NULL
;
4784 unsigned long j
, cardinality
= 0;
4786 for (j
= 0; j
< setsnum
; j
++) {
4790 lookupKeyWrite(c
->db
,setskeys
[j
]) :
4791 lookupKeyRead(c
->db
,setskeys
[j
]);
4795 if (deleteKey(c
->db
,dstkey
))
4797 addReply(c
,shared
.czero
);
4799 addReply(c
,shared
.nullmultibulk
);
4803 if (setobj
->type
!= REDIS_SET
) {
4805 addReply(c
,shared
.wrongtypeerr
);
4808 dv
[j
] = setobj
->ptr
;
4810 /* Sort sets from the smallest to largest, this will improve our
4811 * algorithm's performance */
4812 qsort(dv
,setsnum
,sizeof(dict
*),qsortCompareSetsByCardinality
);
4814 /* The first thing we should output is the total number of elements...
4815 * since this is a multi-bulk write, but at this stage we don't know
4816 * the intersection set size, so we use a trick, append an empty object
4817 * to the output list and save the pointer to later modify it with the
4820 lenobj
= createObject(REDIS_STRING
,NULL
);
4822 decrRefCount(lenobj
);
4824 /* If we have a target key where to store the resulting set
4825 * create this key with an empty set inside */
4826 dstset
= createSetObject();
4829 /* Iterate all the elements of the first (smallest) set, and test
4830 * the element against all the other sets, if at least one set does
4831 * not include the element it is discarded */
4832 di
= dictGetIterator(dv
[0]);
4834 while((de
= dictNext(di
)) != NULL
) {
4837 for (j
= 1; j
< setsnum
; j
++)
4838 if (dictFind(dv
[j
],dictGetEntryKey(de
)) == NULL
) break;
4840 continue; /* at least one set does not contain the member */
4841 ele
= dictGetEntryKey(de
);
4843 addReplyBulkLen(c
,ele
);
4845 addReply(c
,shared
.crlf
);
4848 dictAdd(dstset
->ptr
,ele
,NULL
);
4852 dictReleaseIterator(di
);
4855 /* Store the resulting set into the target */
4856 deleteKey(c
->db
,dstkey
);
4857 dictAdd(c
->db
->dict
,dstkey
,dstset
);
4858 incrRefCount(dstkey
);
4862 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",cardinality
);
4864 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4865 dictSize((dict
*)dstset
->ptr
)));
4871 static void sinterCommand(redisClient
*c
) {
4872 sinterGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
);
4875 static void sinterstoreCommand(redisClient
*c
) {
4876 sinterGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1]);
4879 #define REDIS_OP_UNION 0
4880 #define REDIS_OP_DIFF 1
4881 #define REDIS_OP_INTER 2
4883 static void sunionDiffGenericCommand(redisClient
*c
, robj
**setskeys
, int setsnum
, robj
*dstkey
, int op
) {
4884 dict
**dv
= zmalloc(sizeof(dict
*)*setsnum
);
4887 robj
*dstset
= NULL
;
4888 int j
, cardinality
= 0;
4890 for (j
= 0; j
< setsnum
; j
++) {
4894 lookupKeyWrite(c
->db
,setskeys
[j
]) :
4895 lookupKeyRead(c
->db
,setskeys
[j
]);
4900 if (setobj
->type
!= REDIS_SET
) {
4902 addReply(c
,shared
.wrongtypeerr
);
4905 dv
[j
] = setobj
->ptr
;
4908 /* We need a temp set object to store our union. If the dstkey
4909 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4910 * this set object will be the resulting object to set into the target key. */
4911 dstset
= createSetObject();
4913 /* Iterate all the elements of all the sets, add every element a single
4914 * time to the result set */
4915 for (j
= 0; j
< setsnum
; j
++) {
4916 if (op
== REDIS_OP_DIFF
&& j
== 0 && !dv
[j
]) break; /* result set is empty */
4917 if (!dv
[j
]) continue; /* non existing keys are like empty sets */
4919 di
= dictGetIterator(dv
[j
]);
4921 while((de
= dictNext(di
)) != NULL
) {
4924 /* dictAdd will not add the same element multiple times */
4925 ele
= dictGetEntryKey(de
);
4926 if (op
== REDIS_OP_UNION
|| j
== 0) {
4927 if (dictAdd(dstset
->ptr
,ele
,NULL
) == DICT_OK
) {
4931 } else if (op
== REDIS_OP_DIFF
) {
4932 if (dictDelete(dstset
->ptr
,ele
) == DICT_OK
) {
4937 dictReleaseIterator(di
);
4939 if (op
== REDIS_OP_DIFF
&& cardinality
== 0) break; /* result set is empty */
4942 /* Output the content of the resulting set, if not in STORE mode */
4944 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",cardinality
));
4945 di
= dictGetIterator(dstset
->ptr
);
4946 while((de
= dictNext(di
)) != NULL
) {
4949 ele
= dictGetEntryKey(de
);
4950 addReplyBulkLen(c
,ele
);
4952 addReply(c
,shared
.crlf
);
4954 dictReleaseIterator(di
);
4956 /* If we have a target key where to store the resulting set
4957 * create this key with the result set inside */
4958 deleteKey(c
->db
,dstkey
);
4959 dictAdd(c
->db
->dict
,dstkey
,dstset
);
4960 incrRefCount(dstkey
);
4965 decrRefCount(dstset
);
4967 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",
4968 dictSize((dict
*)dstset
->ptr
)));
4974 static void sunionCommand(redisClient
*c
) {
4975 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_UNION
);
4978 static void sunionstoreCommand(redisClient
*c
) {
4979 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_UNION
);
4982 static void sdiffCommand(redisClient
*c
) {
4983 sunionDiffGenericCommand(c
,c
->argv
+1,c
->argc
-1,NULL
,REDIS_OP_DIFF
);
4986 static void sdiffstoreCommand(redisClient
*c
) {
4987 sunionDiffGenericCommand(c
,c
->argv
+2,c
->argc
-2,c
->argv
[1],REDIS_OP_DIFF
);
4990 /* ==================================== ZSets =============================== */
4992 /* ZSETs are ordered sets using two data structures to hold the same elements
4993 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4996 * The elements are added to a hash table mapping Redis objects to scores.
4997 * At the same time the elements are added to a skip list mapping scores
4998 * to Redis objects (so objects are sorted by scores in this "view"). */
5000 /* This skiplist implementation is almost a C translation of the original
5001 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5002 * Alternative to Balanced Trees", modified in three ways:
5003 * a) this implementation allows for repeated values.
5004 * b) the comparison is not just by key (our 'score') but by satellite data.
5005 * c) there is a back pointer, so it's a doubly linked list with the back
5006 * pointers being only at "level 1". This allows traversing the list
5007 * from tail to head, useful for ZREVRANGE. */
5009 static zskiplistNode
*zslCreateNode(int level
, double score
, robj
*obj
) {
5010 zskiplistNode
*zn
= zmalloc(sizeof(*zn
));
5012 zn
->forward
= zmalloc(sizeof(zskiplistNode
*) * level
);
5014 zn
->span
= zmalloc(sizeof(unsigned int) * (level
- 1));
5020 static zskiplist
*zslCreate(void) {
5024 zsl
= zmalloc(sizeof(*zsl
));
5027 zsl
->header
= zslCreateNode(ZSKIPLIST_MAXLEVEL
,0,NULL
);
5028 for (j
= 0; j
< ZSKIPLIST_MAXLEVEL
; j
++) {
5029 zsl
->header
->forward
[j
] = NULL
;
5031 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5032 if (j
< ZSKIPLIST_MAXLEVEL
-1)
5033 zsl
->header
->span
[j
] = 0;
5035 zsl
->header
->backward
= NULL
;
5040 static void zslFreeNode(zskiplistNode
*node
) {
5041 decrRefCount(node
->obj
);
5042 zfree(node
->forward
);
5047 static void zslFree(zskiplist
*zsl
) {
5048 zskiplistNode
*node
= zsl
->header
->forward
[0], *next
;
5050 zfree(zsl
->header
->forward
);
5051 zfree(zsl
->header
->span
);
5054 next
= node
->forward
[0];
5061 static int zslRandomLevel(void) {
5063 while ((random()&0xFFFF) < (ZSKIPLIST_P
* 0xFFFF))
5068 static void zslInsert(zskiplist
*zsl
, double score
, robj
*obj
) {
5069 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5070 unsigned int rank
[ZSKIPLIST_MAXLEVEL
];
5074 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5075 /* store rank that is crossed to reach the insert position */
5076 rank
[i
] = i
== (zsl
->level
-1) ? 0 : rank
[i
+1];
5078 while (x
->forward
[i
] &&
5079 (x
->forward
[i
]->score
< score
||
5080 (x
->forward
[i
]->score
== score
&&
5081 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0))) {
5082 rank
[i
] += i
> 0 ? x
->span
[i
-1] : 1;
5087 /* we assume the key is not already inside, since we allow duplicated
5088 * scores, and the re-insertion of score and redis object should never
5089 * happen since the caller of zslInsert() should test in the hash table
5090 * if the element is already inside or not. */
5091 level
= zslRandomLevel();
5092 if (level
> zsl
->level
) {
5093 for (i
= zsl
->level
; i
< level
; i
++) {
5095 update
[i
] = zsl
->header
;
5096 update
[i
]->span
[i
-1] = zsl
->length
;
5100 x
= zslCreateNode(level
,score
,obj
);
5101 for (i
= 0; i
< level
; i
++) {
5102 x
->forward
[i
] = update
[i
]->forward
[i
];
5103 update
[i
]->forward
[i
] = x
;
5105 /* update span covered by update[i] as x is inserted here */
5107 x
->span
[i
-1] = update
[i
]->span
[i
-1] - (rank
[0] - rank
[i
]);
5108 update
[i
]->span
[i
-1] = (rank
[0] - rank
[i
]) + 1;
5112 /* increment span for untouched levels */
5113 for (i
= level
; i
< zsl
->level
; i
++) {
5114 update
[i
]->span
[i
-1]++;
5117 x
->backward
= (update
[0] == zsl
->header
) ? NULL
: update
[0];
5119 x
->forward
[0]->backward
= x
;
5125 /* Internal function used by zslDelete, zslDeleteRangeByScore and zslDeleteRangeByRank */
5126 void zslDeleteNode(zskiplist
*zsl
, zskiplistNode
*x
, zskiplistNode
**update
) {
5128 for (i
= 0; i
< zsl
->level
; i
++) {
5129 if (update
[i
]->forward
[i
] == x
) {
5131 update
[i
]->span
[i
-1] += x
->span
[i
-1] - 1;
5133 update
[i
]->forward
[i
] = x
->forward
[i
];
5135 /* invariant: i > 0, because update[0]->forward[0]
5136 * is always equal to x */
5137 update
[i
]->span
[i
-1] -= 1;
5140 if (x
->forward
[0]) {
5141 x
->forward
[0]->backward
= x
->backward
;
5143 zsl
->tail
= x
->backward
;
5145 while(zsl
->level
> 1 && zsl
->header
->forward
[zsl
->level
-1] == NULL
)
5150 /* Delete an element with matching score/object from the skiplist. */
5151 static int zslDelete(zskiplist
*zsl
, double score
, robj
*obj
) {
5152 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5156 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5157 while (x
->forward
[i
] &&
5158 (x
->forward
[i
]->score
< score
||
5159 (x
->forward
[i
]->score
== score
&&
5160 compareStringObjects(x
->forward
[i
]->obj
,obj
) < 0)))
5164 /* We may have multiple elements with the same score, what we need
5165 * is to find the element with both the right score and object. */
5167 if (x
&& score
== x
->score
&& compareStringObjects(x
->obj
,obj
) == 0) {
5168 zslDeleteNode(zsl
, x
, update
);
5172 return 0; /* not found */
5174 return 0; /* not found */
5177 /* Delete all the elements with score between min and max from the skiplist.
5178 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5179 * Note that this function takes the reference to the hash table view of the
5180 * sorted set, in order to remove the elements from the hash table too. */
5181 static unsigned long zslDeleteRangeByScore(zskiplist
*zsl
, double min
, double max
, dict
*dict
) {
5182 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5183 unsigned long removed
= 0;
5187 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5188 while (x
->forward
[i
] && x
->forward
[i
]->score
< min
)
5192 /* Delete every node from this point on whose score is still <= max,
5193 * removing the same element from the hash table as well. */
5195 while (x
&& x
->score
<= max
) {
5196 zskiplistNode
*next
= x
->forward
[0];
5197 zslDeleteNode(zsl
, x
, update
);
5198 dictDelete(dict
,x
->obj
);
5203 return removed
; /* not found */
5206 /* Delete all the elements with rank between start and end from the skiplist.
5207 * Start and end are inclusive. Note that start and end need to be 1-based */
5208 static unsigned long zslDeleteRangeByRank(zskiplist
*zsl
, unsigned int start
, unsigned int end
, dict
*dict
) {
5209 zskiplistNode
*update
[ZSKIPLIST_MAXLEVEL
], *x
;
5210 unsigned long traversed
= 0, removed
= 0;
5214 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5215 while (x
->forward
[i
] && (traversed
+ (i
> 0 ? x
->span
[i
-1] : 1)) < start
) {
5216 traversed
+= i
> 0 ? x
->span
[i
-1] : 1;
5224 while (x
&& traversed
<= end
) {
5225 zskiplistNode
*next
= x
->forward
[0];
5226 zslDeleteNode(zsl
, x
, update
);
5227 dictDelete(dict
,x
->obj
);
5236 /* Find the first node having a score equal or greater than the specified one.
5237 * Returns NULL if there is no match. */
5238 static zskiplistNode
*zslFirstWithScore(zskiplist
*zsl
, double score
) {
5243 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5244 while (x
->forward
[i
] && x
->forward
[i
]->score
< score
)
5247 /* The node following x at level 0 is the first one with a score equal
5248 * or greater than the requested score, or NULL if there is none. */
5249 return x
->forward
[0];
5252 /* Find the rank for an element by both score and key.
5253 * Returns 0 when the element cannot be found, rank otherwise.
5254 * Note that the rank is 1-based due to the span of zsl->header to the
5256 static unsigned long zslGetRank(zskiplist
*zsl
, double score
, robj
*o
) {
5258 unsigned long rank
= 0;
5262 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5263 while (x
->forward
[i
] &&
5264 (x
->forward
[i
]->score
< score
||
5265 (x
->forward
[i
]->score
== score
&&
5266 compareStringObjects(x
->forward
[i
]->obj
,o
) <= 0))) {
5267 rank
+= i
> 0 ? x
->span
[i
-1] : 1;
5271 /* x might be equal to zsl->header, so test if obj is non-NULL */
5272 if (x
->obj
&& compareStringObjects(x
->obj
,o
) == 0) {
5279 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5280 zskiplistNode
* zslGetElementByRank(zskiplist
*zsl
, unsigned long rank
) {
5282 unsigned long traversed
= 0;
5286 for (i
= zsl
->level
-1; i
>= 0; i
--) {
5287 while (x
->forward
[i
] && (traversed
+ (i
> 0 ? x
->span
[i
-1] : 1)) <= rank
) {
5288 traversed
+= i
> 0 ? x
->span
[i
-1] : 1;
5292 if (traversed
== rank
) {
5299 /* The actual Z-commands implementations */
5301 /* This generic command implements both ZADD and ZINCRBY.
5302 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5303 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5304 static void zaddGenericCommand(redisClient
*c
, robj
*key
, robj
*ele
, double scoreval
, int doincrement
) {
5309 zsetobj
= lookupKeyWrite(c
->db
,key
);
5310 if (zsetobj
== NULL
) {
5311 zsetobj
= createZsetObject();
5312 dictAdd(c
->db
->dict
,key
,zsetobj
);
5315 if (zsetobj
->type
!= REDIS_ZSET
) {
5316 addReply(c
,shared
.wrongtypeerr
);
5322 /* Ok now since we implement both ZADD and ZINCRBY here the code
5323 * needs to handle the two different conditions. It's all about setting
5324 * '*score', that is, the new score to set, to the right value. */
5325 score
= zmalloc(sizeof(double));
5329 /* Read the old score. If the element was not present, start from 0 */
5330 de
= dictFind(zs
->dict
,ele
);
5332 double *oldscore
= dictGetEntryVal(de
);
5333 *score
= *oldscore
+ scoreval
;
5341 /* What follows is a simple remove and re-insert operation that is common
5342 * to both ZADD and ZINCRBY... */
5343 if (dictAdd(zs
->dict
,ele
,score
) == DICT_OK
) {
5344 /* case 1: New element */
5345 incrRefCount(ele
); /* added to hash */
5346 zslInsert(zs
->zsl
,*score
,ele
);
5347 incrRefCount(ele
); /* added to skiplist */
5350 addReplyDouble(c
,*score
);
5352 addReply(c
,shared
.cone
);
5357 /* case 2: Score update operation */
5358 de
= dictFind(zs
->dict
,ele
);
5359 redisAssert(de
!= NULL
);
5360 oldscore
= dictGetEntryVal(de
);
5361 if (*score
!= *oldscore
) {
5364 /* Remove and insert the element in the skip list with new score */
5365 deleted
= zslDelete(zs
->zsl
,*oldscore
,ele
);
5366 redisAssert(deleted
!= 0);
5367 zslInsert(zs
->zsl
,*score
,ele
);
5369 /* Update the score in the hash table */
5370 dictReplace(zs
->dict
,ele
,score
);
5376 addReplyDouble(c
,*score
);
5378 addReply(c
,shared
.czero
);
5382 static void zaddCommand(redisClient
*c
) {
5385 scoreval
= strtod(c
->argv
[2]->ptr
,NULL
);
5386 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,0);
5389 static void zincrbyCommand(redisClient
*c
) {
5392 scoreval
= strtod(c
->argv
[2]->ptr
,NULL
);
5393 zaddGenericCommand(c
,c
->argv
[1],c
->argv
[3],scoreval
,1);
5396 static void zremCommand(redisClient
*c
) {
5400 zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5401 if (zsetobj
== NULL
) {
5402 addReply(c
,shared
.czero
);
5408 if (zsetobj
->type
!= REDIS_ZSET
) {
5409 addReply(c
,shared
.wrongtypeerr
);
5413 de
= dictFind(zs
->dict
,c
->argv
[2]);
5415 addReply(c
,shared
.czero
);
5418 /* Delete from the skiplist */
5419 oldscore
= dictGetEntryVal(de
);
5420 deleted
= zslDelete(zs
->zsl
,*oldscore
,c
->argv
[2]);
5421 redisAssert(deleted
!= 0);
5423 /* Delete from the hash table */
5424 dictDelete(zs
->dict
,c
->argv
[2]);
5425 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5427 addReply(c
,shared
.cone
);
5431 static void zremrangebyscoreCommand(redisClient
*c
) {
5432 double min
= strtod(c
->argv
[2]->ptr
,NULL
);
5433 double max
= strtod(c
->argv
[3]->ptr
,NULL
);
5437 zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5438 if (zsetobj
== NULL
) {
5439 addReply(c
,shared
.czero
);
5443 if (zsetobj
->type
!= REDIS_ZSET
) {
5444 addReply(c
,shared
.wrongtypeerr
);
5448 deleted
= zslDeleteRangeByScore(zs
->zsl
,min
,max
,zs
->dict
);
5449 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5450 server
.dirty
+= deleted
;
5451 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",deleted
));
5455 static void zremrangebyrankCommand(redisClient
*c
) {
5456 int start
= atoi(c
->argv
[2]->ptr
);
5457 int end
= atoi(c
->argv
[3]->ptr
);
5461 zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5462 if (zsetobj
== NULL
) {
5463 addReply(c
,shared
.czero
);
5465 if (zsetobj
->type
!= REDIS_ZSET
) {
5466 addReply(c
,shared
.wrongtypeerr
);
5471 int llen
= zs
->zsl
->length
;
5474 /* convert negative indexes */
5475 if (start
< 0) start
= llen
+start
;
5476 if (end
< 0) end
= llen
+end
;
5477 if (start
< 0) start
= 0;
5478 if (end
< 0) end
= 0;
5480 /* indexes sanity checks */
5481 if (start
> end
|| start
>= llen
) {
5482 addReply(c
,shared
.czero
);
5485 if (end
>= llen
) end
= llen
-1;
5487 /* increment start and end because zsl*Rank functions
5488 * use 1-based rank */
5489 deleted
= zslDeleteRangeByRank(zs
->zsl
,start
+1,end
+1,zs
->dict
);
5490 if (htNeedsResize(zs
->dict
)) dictResize(zs
->dict
);
5491 server
.dirty
+= deleted
;
5492 addReplyLong(c
, deleted
);
5501 static int qsortCompareZsetopsrcByCardinality(const void *s1
, const void *s2
) {
5502 zsetopsrc
*d1
= (void*) s1
, *d2
= (void*) s2
;
5503 unsigned long size1
, size2
;
5504 size1
= d1
->dict
? dictSize(d1
->dict
) : 0;
5505 size2
= d2
->dict
? dictSize(d2
->dict
) : 0;
5506 return size1
- size2
;
5509 static void zunionInterGenericCommand(redisClient
*c
, robj
*dstkey
, int op
) {
5517 /* expect zsetnum input keys to be given */
5518 zsetnum
= atoi(c
->argv
[2]->ptr
);
5520 addReplySds(c
,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5524 /* test if the expected number of keys would overflow */
5525 if (3+zsetnum
> c
->argc
) {
5526 addReply(c
,shared
.syntaxerr
);
5530 /* read keys to be used for input */
5531 src
= zmalloc(sizeof(zsetopsrc
) * zsetnum
);
5532 for (i
= 0, j
= 3; i
< zsetnum
; i
++, j
++) {
5533 robj
*zsetobj
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
5537 if (zsetobj
->type
!= REDIS_ZSET
) {
5539 addReply(c
,shared
.wrongtypeerr
);
5542 src
[i
].dict
= ((zset
*)zsetobj
->ptr
)->dict
;
5545 /* default all weights to 1 */
5546 src
[i
].weight
= 1.0;
5549 /* parse optional extra arguments */
5551 int remaining
= c
->argc
-j
;
5554 if (!strcasecmp(c
->argv
[j
]->ptr
,"weights")) {
5556 if (remaining
< zsetnum
) {
5558 addReplySds(c
,sdsnew("-ERR not enough weights for ZUNION/ZINTER\r\n"));
5561 for (i
= 0; i
< zsetnum
; i
++, j
++, remaining
--) {
5562 src
[i
].weight
= strtod(c
->argv
[j
]->ptr
, NULL
);
5566 addReply(c
,shared
.syntaxerr
);
5572 dstobj
= createZsetObject();
5573 dstzset
= dstobj
->ptr
;
5575 if (op
== REDIS_OP_INTER
) {
5576 /* sort sets from the smallest to largest, this will improve our
5577 * algorithm's performance */
5578 qsort(src
,zsetnum
,sizeof(zsetopsrc
), qsortCompareZsetopsrcByCardinality
);
5580 /* skip going over all entries if the smallest zset is NULL or empty */
5581 if (src
[0].dict
&& dictSize(src
[0].dict
) > 0) {
5582 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5583 * from small to large, all src[i > 0].dict are non-empty too */
5584 di
= dictGetIterator(src
[0].dict
);
5585 while((de
= dictNext(di
)) != NULL
) {
5586 double *score
= zmalloc(sizeof(double));
5589 for (j
= 0; j
< zsetnum
; j
++) {
5590 dictEntry
*other
= (j
== 0) ? de
: dictFind(src
[j
].dict
,dictGetEntryKey(de
));
5592 *score
= *score
+ src
[j
].weight
* (*(double*)dictGetEntryVal(other
));
5598 /* skip entry when not present in every source dict */
5602 robj
*o
= dictGetEntryKey(de
);
5603 dictAdd(dstzset
->dict
,o
,score
);
5604 incrRefCount(o
); /* added to dictionary */
5605 zslInsert(dstzset
->zsl
,*score
,o
);
5606 incrRefCount(o
); /* added to skiplist */
5609 dictReleaseIterator(di
);
5611 } else if (op
== REDIS_OP_UNION
) {
5612 for (i
= 0; i
< zsetnum
; i
++) {
5613 if (!src
[i
].dict
) continue;
5615 di
= dictGetIterator(src
[i
].dict
);
5616 while((de
= dictNext(di
)) != NULL
) {
5617 /* skip key when already processed */
5618 if (dictFind(dstzset
->dict
,dictGetEntryKey(de
)) != NULL
) continue;
5620 double *score
= zmalloc(sizeof(double));
5622 for (j
= 0; j
< zsetnum
; j
++) {
5623 if (!src
[j
].dict
) continue;
5625 dictEntry
*other
= (i
== j
) ? de
: dictFind(src
[j
].dict
,dictGetEntryKey(de
));
5627 *score
= *score
+ src
[j
].weight
* (*(double*)dictGetEntryVal(other
));
5631 robj
*o
= dictGetEntryKey(de
);
5632 dictAdd(dstzset
->dict
,o
,score
);
5633 incrRefCount(o
); /* added to dictionary */
5634 zslInsert(dstzset
->zsl
,*score
,o
);
5635 incrRefCount(o
); /* added to skiplist */
5637 dictReleaseIterator(di
);
5640 /* unknown operator */
5641 redisAssert(op
== REDIS_OP_INTER
|| op
== REDIS_OP_UNION
);
5644 deleteKey(c
->db
,dstkey
);
5645 dictAdd(c
->db
->dict
,dstkey
,dstobj
);
5646 incrRefCount(dstkey
);
5648 addReplyLong(c
, dstzset
->zsl
->length
);
5653 static void zunionCommand(redisClient
*c
) {
5654 zunionInterGenericCommand(c
,c
->argv
[1], REDIS_OP_UNION
);
5657 static void zinterCommand(redisClient
*c
) {
5658 zunionInterGenericCommand(c
,c
->argv
[1], REDIS_OP_INTER
);
5661 static void zrangeGenericCommand(redisClient
*c
, int reverse
) {
5663 int start
= atoi(c
->argv
[2]->ptr
);
5664 int end
= atoi(c
->argv
[3]->ptr
);
5667 if (c
->argc
== 5 && !strcasecmp(c
->argv
[4]->ptr
,"withscores")) {
5669 } else if (c
->argc
>= 5) {
5670 addReply(c
,shared
.syntaxerr
);
5674 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5676 addReply(c
,shared
.nullmultibulk
);
5678 if (o
->type
!= REDIS_ZSET
) {
5679 addReply(c
,shared
.wrongtypeerr
);
5681 zset
*zsetobj
= o
->ptr
;
5682 zskiplist
*zsl
= zsetobj
->zsl
;
5685 int llen
= zsl
->length
;
5689 /* convert negative indexes */
5690 if (start
< 0) start
= llen
+start
;
5691 if (end
< 0) end
= llen
+end
;
5692 if (start
< 0) start
= 0;
5693 if (end
< 0) end
= 0;
5695 /* indexes sanity checks */
5696 if (start
> end
|| start
>= llen
) {
5697 /* Out of range start or start > end result in empty list */
5698 addReply(c
,shared
.emptymultibulk
);
5701 if (end
>= llen
) end
= llen
-1;
5702 rangelen
= (end
-start
)+1;
5704 /* check if starting point is trivial, before searching
5705 * the element in log(N) time */
5707 ln
= start
== 0 ? zsl
->tail
: zslGetElementByRank(zsl
, llen
-start
);
5709 ln
= start
== 0 ? zsl
->header
->forward
[0] : zslGetElementByRank(zsl
, start
+1);
5712 /* Return the result in form of a multi-bulk reply */
5713 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",
5714 withscores
? (rangelen
*2) : rangelen
));
5715 for (j
= 0; j
< rangelen
; j
++) {
5717 addReplyBulkLen(c
,ele
);
5719 addReply(c
,shared
.crlf
);
5721 addReplyDouble(c
,ln
->score
);
5722 ln
= reverse
? ln
->backward
: ln
->forward
[0];
5728 static void zrangeCommand(redisClient
*c
) {
5729 zrangeGenericCommand(c
,0);
5732 static void zrevrangeCommand(redisClient
*c
) {
5733 zrangeGenericCommand(c
,1);
5736 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5737 * If justcount is non-zero, just the count is returned. */
5738 static void genericZrangebyscoreCommand(redisClient
*c
, int justcount
) {
5741 int minex
= 0, maxex
= 0; /* are min or max exclusive? */
5742 int offset
= 0, limit
= -1;
5746 /* Parse the min-max interval. If one of the values is prefixed
5747 * by the "(" character, it's considered "open". For instance
5748 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5749 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5750 if (((char*)c
->argv
[2]->ptr
)[0] == '(') {
5751 min
= strtod((char*)c
->argv
[2]->ptr
+1,NULL
);
5754 min
= strtod(c
->argv
[2]->ptr
,NULL
);
5756 if (((char*)c
->argv
[3]->ptr
)[0] == '(') {
5757 max
= strtod((char*)c
->argv
[3]->ptr
+1,NULL
);
5760 max
= strtod(c
->argv
[3]->ptr
,NULL
);
5763 /* Parse "WITHSCORES": note that if the command was called with
5764 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5765 * enter the following paths to parse WITHSCORES and LIMIT. */
5766 if (c
->argc
== 5 || c
->argc
== 8) {
5767 if (strcasecmp(c
->argv
[c
->argc
-1]->ptr
,"withscores") == 0)
5772 if (c
->argc
!= (4 + withscores
) && c
->argc
!= (7 + withscores
))
5776 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5781 if (c
->argc
== (7 + withscores
) && strcasecmp(c
->argv
[4]->ptr
,"limit")) {
5782 addReply(c
,shared
.syntaxerr
);
5784 } else if (c
->argc
== (7 + withscores
)) {
5785 offset
= atoi(c
->argv
[5]->ptr
);
5786 limit
= atoi(c
->argv
[6]->ptr
);
5787 if (offset
< 0) offset
= 0;
5790 /* Ok, lookup the key and get the range */
5791 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5793 addReply(c
,justcount
? shared
.czero
: shared
.nullmultibulk
);
5795 if (o
->type
!= REDIS_ZSET
) {
5796 addReply(c
,shared
.wrongtypeerr
);
5798 zset
*zsetobj
= o
->ptr
;
5799 zskiplist
*zsl
= zsetobj
->zsl
;
5801 robj
*ele
, *lenobj
= NULL
;
5802 unsigned long rangelen
= 0;
5804 /* Get the first node with the score >= min, or with
5805 * score > min if 'minex' is true. */
5806 ln
= zslFirstWithScore(zsl
,min
);
5807 while (minex
&& ln
&& ln
->score
== min
) ln
= ln
->forward
[0];
5810 /* No element matching the specified interval */
5811 addReply(c
,justcount
? shared
.czero
: shared
.emptymultibulk
);
5815 /* We don't know in advance how many matching elements there
5816 * are in the list, so we push this object that will represent
5817 * the multi-bulk length in the output buffer, and will "fix"
5820 lenobj
= createObject(REDIS_STRING
,NULL
);
5822 decrRefCount(lenobj
);
5825 while(ln
&& (maxex
? (ln
->score
< max
) : (ln
->score
<= max
))) {
5828 ln
= ln
->forward
[0];
5831 if (limit
== 0) break;
5834 addReplyBulkLen(c
,ele
);
5836 addReply(c
,shared
.crlf
);
5838 addReplyDouble(c
,ln
->score
);
5840 ln
= ln
->forward
[0];
5842 if (limit
> 0) limit
--;
5845 addReplyLong(c
,(long)rangelen
);
5847 lenobj
->ptr
= sdscatprintf(sdsempty(),"*%lu\r\n",
5848 withscores
? (rangelen
*2) : rangelen
);
5854 static void zrangebyscoreCommand(redisClient
*c
) {
5855 genericZrangebyscoreCommand(c
,0);
5858 static void zcountCommand(redisClient
*c
) {
5859 genericZrangebyscoreCommand(c
,1);
5862 static void zcardCommand(redisClient
*c
) {
5866 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5868 addReply(c
,shared
.czero
);
5871 if (o
->type
!= REDIS_ZSET
) {
5872 addReply(c
,shared
.wrongtypeerr
);
5875 addReplySds(c
,sdscatprintf(sdsempty(),":%lu\r\n",zs
->zsl
->length
));
5880 static void zscoreCommand(redisClient
*c
) {
5884 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5886 addReply(c
,shared
.nullbulk
);
5889 if (o
->type
!= REDIS_ZSET
) {
5890 addReply(c
,shared
.wrongtypeerr
);
5895 de
= dictFind(zs
->dict
,c
->argv
[2]);
5897 addReply(c
,shared
.nullbulk
);
5899 double *score
= dictGetEntryVal(de
);
5901 addReplyDouble(c
,*score
);
5907 static void zrankGenericCommand(redisClient
*c
, int reverse
) {
5909 o
= lookupKeyRead(c
->db
,c
->argv
[1]);
5911 addReply(c
,shared
.nullbulk
);
5914 if (o
->type
!= REDIS_ZSET
) {
5915 addReply(c
,shared
.wrongtypeerr
);
5918 zskiplist
*zsl
= zs
->zsl
;
5922 de
= dictFind(zs
->dict
,c
->argv
[2]);
5924 addReply(c
,shared
.nullbulk
);
5928 double *score
= dictGetEntryVal(de
);
5929 rank
= zslGetRank(zsl
, *score
, c
->argv
[2]);
5932 addReplyLong(c
, zsl
->length
- rank
);
5934 addReplyLong(c
, rank
-1);
5937 addReply(c
,shared
.nullbulk
);
5942 static void zrankCommand(redisClient
*c
) {
5943 zrankGenericCommand(c
, 0);
5946 static void zrevrankCommand(redisClient
*c
) {
5947 zrankGenericCommand(c
, 1);
5950 /* =================================== Hashes =============================== */
5951 static void hsetCommand(redisClient
*c
) {
5953 robj
*o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
5956 o
= createHashObject();
5957 dictAdd(c
->db
->dict
,c
->argv
[1],o
);
5958 incrRefCount(c
->argv
[1]);
5960 if (o
->type
!= REDIS_HASH
) {
5961 addReply(c
,shared
.wrongtypeerr
);
5965 /* We want to convert the zipmap into an hash table right now if the
5966 * entry to be added is too big. Note that we check if the object
5967 * is integer encoded before to try fetching the length in the test below.
5968 * This is because integers are small, but currently stringObjectLen()
5969 * performs a slow conversion: not worth it. */
5970 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
&&
5971 ((c
->argv
[2]->encoding
== REDIS_ENCODING_RAW
&&
5972 sdslen(c
->argv
[2]->ptr
) > server
.hash_max_zipmap_value
) ||
5973 (c
->argv
[3]->encoding
== REDIS_ENCODING_RAW
&&
5974 sdslen(c
->argv
[3]->ptr
) > server
.hash_max_zipmap_value
)))
5976 convertToRealHash(o
);
5979 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
5980 unsigned char *zm
= o
->ptr
;
5981 robj
*valobj
= getDecodedObject(c
->argv
[3]);
5983 zm
= zipmapSet(zm
,c
->argv
[2]->ptr
,sdslen(c
->argv
[2]->ptr
),
5984 valobj
->ptr
,sdslen(valobj
->ptr
),&update
);
5985 decrRefCount(valobj
);
5988 /* And here there is the second check for hash conversion...
5989 * we want to do it only if the operation was not just an update as
5990 * zipmapLen() is O(N). */
5991 if (!update
&& zipmapLen(zm
) > server
.hash_max_zipmap_entries
)
5992 convertToRealHash(o
);
5994 tryObjectEncoding(c
->argv
[2]);
5995 /* note that c->argv[3] is already encoded, as the latest arg
5996 * of a bulk command is always integer encoded if possible. */
5997 if (dictAdd(o
->ptr
,c
->argv
[2],c
->argv
[3]) == DICT_OK
) {
5998 incrRefCount(c
->argv
[2]);
6002 incrRefCount(c
->argv
[3]);
6005 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",update
== 0));
6008 static void hgetCommand(redisClient
*c
) {
6009 robj
*o
= lookupKeyRead(c
->db
,c
->argv
[1]);
6012 addReply(c
,shared
.nullbulk
);
6015 if (o
->type
!= REDIS_HASH
) {
6016 addReply(c
,shared
.wrongtypeerr
);
6020 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6021 unsigned char *zm
= o
->ptr
;
6025 if (zipmapGet(zm
,c
->argv
[2]->ptr
,sdslen(c
->argv
[2]->ptr
), &val
,&vlen
)) {
6026 addReplySds(c
,sdscatprintf(sdsempty(),"$%u\r\n", vlen
));
6027 addReplySds(c
,sdsnewlen(val
,vlen
));
6028 addReply(c
,shared
.crlf
);
6031 addReply(c
,shared
.nullbulk
);
6035 struct dictEntry
*de
;
6037 de
= dictFind(o
->ptr
,c
->argv
[2]);
6039 addReply(c
,shared
.nullbulk
);
6041 robj
*e
= dictGetEntryVal(de
);
6043 addReplyBulkLen(c
,e
);
6045 addReply(c
,shared
.crlf
);
6051 static void hdelCommand(redisClient
*c
) {
6052 robj
*o
= lookupKeyWrite(c
->db
,c
->argv
[1]);
6055 addReply(c
,shared
.czero
);
6060 if (o
->type
!= REDIS_HASH
) {
6061 addReply(c
,shared
.wrongtypeerr
);
6065 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
6066 o
->ptr
= zipmapDel((unsigned char*) o
->ptr
,
6067 (unsigned char*) c
->argv
[2]->ptr
,
6068 sdslen(c
->argv
[2]->ptr
), &deleted
);
6070 deleted
= dictDelete((dict
*)o
->ptr
,c
->argv
[2]) == DICT_OK
;
6072 addReply(c
,deleted
? shared
.cone
: shared
.czero
);
6076 static void hlenCommand(redisClient
*c
) {
6080 if ((o
= lookupKeyWriteOrReply(c
,c
->argv
[1],shared
.czero
)) == NULL
||
6081 checkType(c
,o
,REDIS_HASH
)) return;
6083 len
= (o
->encoding
== REDIS_ENCODING_ZIPMAP
) ?
6084 zipmapLen((unsigned char*)o
->ptr
) : dictSize((dict
*)o
->ptr
);
6085 addReplyUlong(c
,len
);
6088 static void convertToRealHash(robj
*o
) {
6089 unsigned char *key
, *val
, *p
, *zm
= o
->ptr
;
6090 unsigned int klen
, vlen
;
6091 dict
*dict
= dictCreate(&hashDictType
,NULL
);
6093 assert(o
->type
== REDIS_HASH
&& o
->encoding
!= REDIS_ENCODING_HT
);
6094 p
= zipmapRewind(zm
);
6095 while((p
= zipmapNext(p
,&key
,&klen
,&val
,&vlen
)) != NULL
) {
6096 robj
*keyobj
, *valobj
;
6098 keyobj
= createStringObject((char*)key
,klen
);
6099 valobj
= createStringObject((char*)val
,vlen
);
6100 tryObjectEncoding(keyobj
);
6101 tryObjectEncoding(valobj
);
6102 dictAdd(dict
,keyobj
,valobj
);
6104 o
->encoding
= REDIS_ENCODING_HT
;
6109 /* ========================= Non type-specific commands ==================== */
6111 static void flushdbCommand(redisClient
*c
) {
6112 server
.dirty
+= dictSize(c
->db
->dict
);
6113 dictEmpty(c
->db
->dict
);
6114 dictEmpty(c
->db
->expires
);
6115 addReply(c
,shared
.ok
);
6118 static void flushallCommand(redisClient
*c
) {
6119 server
.dirty
+= emptyDb();
6120 addReply(c
,shared
.ok
);
6121 rdbSave(server
.dbfilename
);
6125 static redisSortOperation
*createSortOperation(int type
, robj
*pattern
) {
6126 redisSortOperation
*so
= zmalloc(sizeof(*so
));
6128 so
->pattern
= pattern
;
6132 /* Return the value associated to the key with a name obtained
6133 * substituting the first occurrence of '*' in 'pattern' with 'subst' */
6134 static robj
*lookupKeyByPattern(redisDb
*db
, robj
*pattern
, robj
*subst
) {
6138 int prefixlen
, sublen
, postfixlen
;
6139 /* Exploit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6143 char buf
[REDIS_SORTKEY_MAX
+1];
6146 /* If the pattern is "#" return the substitution object itself in order
6147 * to implement the "SORT ... GET #" feature. */
6148 spat
= pattern
->ptr
;
6149 if (spat
[0] == '#' && spat
[1] == '\0') {
6153 /* The substitution object may be specially encoded. If so we create
6154 * a decoded object on the fly. Otherwise getDecodedObject will just
6155 * increment the ref count, that we'll decrement later. */
6156 subst
= getDecodedObject(subst
);
6159 if (sdslen(spat
)+sdslen(ssub
)-1 > REDIS_SORTKEY_MAX
) return NULL
;
6160 p
= strchr(spat
,'*');
6162 decrRefCount(subst
);
6167 sublen
= sdslen(ssub
);
6168 postfixlen
= sdslen(spat
)-(prefixlen
+1);
6169 memcpy(keyname
.buf
,spat
,prefixlen
);
6170 memcpy(keyname
.buf
+prefixlen
,ssub
,sublen
);
6171 memcpy(keyname
.buf
+prefixlen
+sublen
,p
+1,postfixlen
);
6172 keyname
.buf
[prefixlen
+sublen
+postfixlen
] = '\0';
6173 keyname
.len
= prefixlen
+sublen
+postfixlen
;
6175 initStaticStringObject(keyobj
,((char*)&keyname
)+(sizeof(long)*2))
6176 decrRefCount(subst
);
6178 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6179 return lookupKeyRead(db
,&keyobj
);
6182 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6183 * the additional parameter is not standard but BSD-specific, we have to
6184 * pass sorting parameters via the global 'server' structure */
6185 static int sortCompare(const void *s1
, const void *s2
) {
6186 const redisSortObject
*so1
= s1
, *so2
= s2
;
6189 if (!server
.sort_alpha
) {
6190 /* Numeric sorting. Here it's trivial as we precomputed scores */
6191 if (so1
->u
.score
> so2
->u
.score
) {
6193 } else if (so1
->u
.score
< so2
->u
.score
) {
6199 /* Alphanumeric sorting */
6200 if (server
.sort_bypattern
) {
6201 if (!so1
->u
.cmpobj
|| !so2
->u
.cmpobj
) {
6202 /* At least one compare object is NULL */
6203 if (so1
->u
.cmpobj
== so2
->u
.cmpobj
)
6205 else if (so1
->u
.cmpobj
== NULL
)
6210 /* We have both the objects, use strcoll */
6211 cmp
= strcoll(so1
->u
.cmpobj
->ptr
,so2
->u
.cmpobj
->ptr
);
6214 /* Compare elements directly */
6217 dec1
= getDecodedObject(so1
->obj
);
6218 dec2
= getDecodedObject(so2
->obj
);
6219 cmp
= strcoll(dec1
->ptr
,dec2
->ptr
);
6224 return server
.sort_desc
? -cmp
: cmp
;
6227 /* The SORT command is the most complex command in Redis. Warning: this code
6228 * is optimized for speed and a bit less for readability */
6229 static void sortCommand(redisClient
*c
) {
6232 int desc
= 0, alpha
= 0;
6233 int limit_start
= 0, limit_count
= -1, start
, end
;
6234 int j
, dontsort
= 0, vectorlen
;
6235 int getop
= 0; /* GET operation counter */
6236 robj
*sortval
, *sortby
= NULL
, *storekey
= NULL
;
6237 redisSortObject
*vector
; /* Resulting vector to sort */
6239 /* Lookup the key to sort. It must be of the right types */
6240 sortval
= lookupKeyRead(c
->db
,c
->argv
[1]);
6241 if (sortval
== NULL
) {
6242 addReply(c
,shared
.nullmultibulk
);
6245 if (sortval
->type
!= REDIS_SET
&& sortval
->type
!= REDIS_LIST
&&
6246 sortval
->type
!= REDIS_ZSET
)
6248 addReply(c
,shared
.wrongtypeerr
);
6252 /* Create a list of operations to perform for every sorted element.
6253 * Operations can be GET/DEL/INCR/DECR */
6254 operations
= listCreate();
6255 listSetFreeMethod(operations
,zfree
);
6258 /* Now we need to protect sortval incrementing its count, in the future
6259 * SORT may have options able to overwrite/delete keys during the sorting
6260 * and the sorted key itself may get destroyed */
6261 incrRefCount(sortval
);
6263 /* The SORT command has an SQL-alike syntax, parse it */
6264 while(j
< c
->argc
) {
6265 int leftargs
= c
->argc
-j
-1;
6266 if (!strcasecmp(c
->argv
[j
]->ptr
,"asc")) {
6268 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"desc")) {
6270 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"alpha")) {
6272 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"limit") && leftargs
>= 2) {
6273 limit_start
= atoi(c
->argv
[j
+1]->ptr
);
6274 limit_count
= atoi(c
->argv
[j
+2]->ptr
);
6276 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"store") && leftargs
>= 1) {
6277 storekey
= c
->argv
[j
+1];
6279 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"by") && leftargs
>= 1) {
6280 sortby
= c
->argv
[j
+1];
6281 /* If the BY pattern does not contain '*', i.e. it is constant,
6282 * we don't need to sort nor to lookup the weight keys. */
6283 if (strchr(c
->argv
[j
+1]->ptr
,'*') == NULL
) dontsort
= 1;
6285 } else if (!strcasecmp(c
->argv
[j
]->ptr
,"get") && leftargs
>= 1) {
6286 listAddNodeTail(operations
,createSortOperation(
6287 REDIS_SORT_GET
,c
->argv
[j
+1]));
6291 decrRefCount(sortval
);
6292 listRelease(operations
);
6293 addReply(c
,shared
.syntaxerr
);
6299 /* Load the sorting vector with all the objects to sort */
6300 switch(sortval
->type
) {
6301 case REDIS_LIST
: vectorlen
= listLength((list
*)sortval
->ptr
); break;
6302 case REDIS_SET
: vectorlen
= dictSize((dict
*)sortval
->ptr
); break;
6303 case REDIS_ZSET
: vectorlen
= dictSize(((zset
*)sortval
->ptr
)->dict
); break;
6304 default: vectorlen
= 0; redisAssert(0); /* Avoid GCC warning */
6306 vector
= zmalloc(sizeof(redisSortObject
)*vectorlen
);
6309 if (sortval
->type
== REDIS_LIST
) {
6310 list
*list
= sortval
->ptr
;
6314 listRewind(list
,&li
);
6315 while((ln
= listNext(&li
))) {
6316 robj
*ele
= ln
->value
;
6317 vector
[j
].obj
= ele
;
6318 vector
[j
].u
.score
= 0;
6319 vector
[j
].u
.cmpobj
= NULL
;
6327 if (sortval
->type
== REDIS_SET
) {
6330 zset
*zs
= sortval
->ptr
;
6334 di
= dictGetIterator(set
);
6335 while((setele
= dictNext(di
)) != NULL
) {
6336 vector
[j
].obj
= dictGetEntryKey(setele
);
6337 vector
[j
].u
.score
= 0;
6338 vector
[j
].u
.cmpobj
= NULL
;
6341 dictReleaseIterator(di
);
6343 redisAssert(j
== vectorlen
);
6345 /* Now it's time to load the right scores in the sorting vector */
6346 if (dontsort
== 0) {
6347 for (j
= 0; j
< vectorlen
; j
++) {
6351 byval
= lookupKeyByPattern(c
->db
,sortby
,vector
[j
].obj
);
6352 if (!byval
|| byval
->type
!= REDIS_STRING
) continue;
6354 vector
[j
].u
.cmpobj
= getDecodedObject(byval
);
6356 if (byval
->encoding
== REDIS_ENCODING_RAW
) {
6357 vector
[j
].u
.score
= strtod(byval
->ptr
,NULL
);
6359 /* Don't need to decode the object if it's
6360 * integer-encoded (the only encoding supported) so
6361 * far. We can just cast it */
6362 if (byval
->encoding
== REDIS_ENCODING_INT
) {
6363 vector
[j
].u
.score
= (long)byval
->ptr
;
6365 redisAssert(1 != 1);
6370 if (vector
[j
].obj
->encoding
== REDIS_ENCODING_RAW
)
6371 vector
[j
].u
.score
= strtod(vector
[j
].obj
->ptr
,NULL
);
6373 if (vector
[j
].obj
->encoding
== REDIS_ENCODING_INT
)
6374 vector
[j
].u
.score
= (long) vector
[j
].obj
->ptr
;
6376 redisAssert(1 != 1);
6383 /* We are ready to sort the vector... perform a bit of sanity check
6384 * on the LIMIT option too. We'll use a partial version of quicksort. */
6385 start
= (limit_start
< 0) ? 0 : limit_start
;
6386 end
= (limit_count
< 0) ? vectorlen
-1 : start
+limit_count
-1;
6387 if (start
>= vectorlen
) {
6388 start
= vectorlen
-1;
6391 if (end
>= vectorlen
) end
= vectorlen
-1;
6393 if (dontsort
== 0) {
6394 server
.sort_desc
= desc
;
6395 server
.sort_alpha
= alpha
;
6396 server
.sort_bypattern
= sortby
? 1 : 0;
6397 if (sortby
&& (start
!= 0 || end
!= vectorlen
-1))
6398 pqsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
, start
,end
);
6400 qsort(vector
,vectorlen
,sizeof(redisSortObject
),sortCompare
);
6403 /* Send command output to the output buffer, performing the specified
6404 * GET/DEL/INCR/DECR operations if any. */
6405 outputlen
= getop
? getop
*(end
-start
+1) : end
-start
+1;
6406 if (storekey
== NULL
) {
6407 /* STORE option not specified, sent the sorting result to client */
6408 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",outputlen
));
6409 for (j
= start
; j
<= end
; j
++) {
6414 addReplyBulkLen(c
,vector
[j
].obj
);
6415 addReply(c
,vector
[j
].obj
);
6416 addReply(c
,shared
.crlf
);
6418 listRewind(operations
,&li
);
6419 while((ln
= listNext(&li
))) {
6420 redisSortOperation
*sop
= ln
->value
;
6421 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
6424 if (sop
->type
== REDIS_SORT_GET
) {
6425 if (!val
|| val
->type
!= REDIS_STRING
) {
6426 addReply(c
,shared
.nullbulk
);
6428 addReplyBulkLen(c
,val
);
6430 addReply(c
,shared
.crlf
);
6433 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
6438 robj
*listObject
= createListObject();
6439 list
*listPtr
= (list
*) listObject
->ptr
;
6441 /* STORE option specified, set the sorting result as a List object */
6442 for (j
= start
; j
<= end
; j
++) {
6447 listAddNodeTail(listPtr
,vector
[j
].obj
);
6448 incrRefCount(vector
[j
].obj
);
6450 listRewind(operations
,&li
);
6451 while((ln
= listNext(&li
))) {
6452 redisSortOperation
*sop
= ln
->value
;
6453 robj
*val
= lookupKeyByPattern(c
->db
,sop
->pattern
,
6456 if (sop
->type
== REDIS_SORT_GET
) {
6457 if (!val
|| val
->type
!= REDIS_STRING
) {
6458 listAddNodeTail(listPtr
,createStringObject("",0));
6460 listAddNodeTail(listPtr
,val
);
6464 redisAssert(sop
->type
== REDIS_SORT_GET
); /* always fails */
6468 if (dictReplace(c
->db
->dict
,storekey
,listObject
)) {
6469 incrRefCount(storekey
);
6471 /* Note: we add 1 because the DB is dirty anyway since even if the
6472 * SORT result is empty a new key is set and maybe the old content
6474 server
.dirty
+= 1+outputlen
;
6475 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",outputlen
));
6479 decrRefCount(sortval
);
6480 listRelease(operations
);
6481 for (j
= 0; j
< vectorlen
; j
++) {
6482 if (sortby
&& alpha
&& vector
[j
].u
.cmpobj
)
6483 decrRefCount(vector
[j
].u
.cmpobj
);
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and 'n' the amount of bytes. Amounts below
 * 1024 are written as a plain integer followed by 'B'. Larger amounts are
 * divided by the matching power of 1024 and written with two decimals and
 * a K, M, G, T or P suffix. Amounts that exceed the largest unit fall back
 * to the plain byte count, so 's' is written on every path (the previous
 * implementation left the buffer untouched for anything >= 1TB).
 *
 * The longest possible output is 20 digits plus the unit letter, so the
 * caller has to provide at least 22 bytes including the terminating NUL. */
static void bytesToHuman(char *s, unsigned long long n) {
    const char *suffix;
    unsigned long long unit = 1024ULL;

    if (n < unit) {
        /* Bytes */
        sprintf(s,"%lluB",n);
        return;
    }

    /* 'unit' is always an exact power of 1024 (at most 2^50 when it is used
     * as a divisor), so the conversion to double below is exact. The test
     * is written as n/unit < 1024 instead of n < unit*1024 so it cannot
     * overflow. */
    for (suffix = "KMGTP"; *suffix != '\0'; suffix++, unit *= 1024ULL) {
        if (n/unit < 1024ULL) {
            sprintf(s,"%.2f%c",(double)n/(double)unit,*suffix);
            return;
        }
    }

    /* Larger than the biggest unit we know: report the raw byte count. */
    sprintf(s,"%lluB",n);
}
6509 /* Create the string returned by the INFO command. This is decoupled
6510 * from the INFO command itself as we need to report the same information
6511 * on memory corruption problems. */
6512 static sds
genRedisInfoString(void) {
6514 time_t uptime
= time(NULL
)-server
.stat_starttime
;
6518 server
.hash_max_zipmap_entries
= REDIS_HASH_MAX_ZIPMAP_ENTRIES
;
6519 server
.hash_max_zipmap_value
= REDIS_HASH_MAX_ZIPMAP_VALUE
;
6521 bytesToHuman(hmem
,zmalloc_used_memory());
6522 info
= sdscatprintf(sdsempty(),
6523 "redis_version:%s\r\n"
6525 "multiplexing_api:%s\r\n"
6526 "process_id:%ld\r\n"
6527 "uptime_in_seconds:%ld\r\n"
6528 "uptime_in_days:%ld\r\n"
6529 "connected_clients:%d\r\n"
6530 "connected_slaves:%d\r\n"
6531 "blocked_clients:%d\r\n"
6532 "used_memory:%zu\r\n"
6533 "used_memory_human:%s\r\n"
6534 "changes_since_last_save:%lld\r\n"
6535 "bgsave_in_progress:%d\r\n"
6536 "last_save_time:%ld\r\n"
6537 "bgrewriteaof_in_progress:%d\r\n"
6538 "total_connections_received:%lld\r\n"
6539 "total_commands_processed:%lld\r\n"
6540 "hash_max_zipmap_entries:%ld\r\n"
6541 "hash_max_zipmap_value:%ld\r\n"
6545 (sizeof(long) == 8) ? "64" : "32",
6550 listLength(server
.clients
)-listLength(server
.slaves
),
6551 listLength(server
.slaves
),
6552 server
.blpop_blocked_clients
,
6553 zmalloc_used_memory(),
6556 server
.bgsavechildpid
!= -1,
6558 server
.bgrewritechildpid
!= -1,
6559 server
.stat_numconnections
,
6560 server
.stat_numcommands
,
6561 server
.hash_max_zipmap_entries
,
6562 server
.hash_max_zipmap_value
,
6563 server
.vm_enabled
!= 0,
6564 server
.masterhost
== NULL
? "master" : "slave"
6566 if (server
.masterhost
) {
6567 info
= sdscatprintf(info
,
6568 "master_host:%s\r\n"
6569 "master_port:%d\r\n"
6570 "master_link_status:%s\r\n"
6571 "master_last_io_seconds_ago:%d\r\n"
6574 (server
.replstate
== REDIS_REPL_CONNECTED
) ?
6576 server
.master
? ((int)(time(NULL
)-server
.master
->lastinteraction
)) : -1
6579 if (server
.vm_enabled
) {
6581 info
= sdscatprintf(info
,
6582 "vm_conf_max_memory:%llu\r\n"
6583 "vm_conf_page_size:%llu\r\n"
6584 "vm_conf_pages:%llu\r\n"
6585 "vm_stats_used_pages:%llu\r\n"
6586 "vm_stats_swapped_objects:%llu\r\n"
6587 "vm_stats_swappin_count:%llu\r\n"
6588 "vm_stats_swappout_count:%llu\r\n"
6589 "vm_stats_io_newjobs_len:%lu\r\n"
6590 "vm_stats_io_processing_len:%lu\r\n"
6591 "vm_stats_io_processed_len:%lu\r\n"
6592 "vm_stats_io_active_threads:%lu\r\n"
6593 "vm_stats_blocked_clients:%lu\r\n"
6594 ,(unsigned long long) server
.vm_max_memory
,
6595 (unsigned long long) server
.vm_page_size
,
6596 (unsigned long long) server
.vm_pages
,
6597 (unsigned long long) server
.vm_stats_used_pages
,
6598 (unsigned long long) server
.vm_stats_swapped_objects
,
6599 (unsigned long long) server
.vm_stats_swapins
,
6600 (unsigned long long) server
.vm_stats_swapouts
,
6601 (unsigned long) listLength(server
.io_newjobs
),
6602 (unsigned long) listLength(server
.io_processing
),
6603 (unsigned long) listLength(server
.io_processed
),
6604 (unsigned long) server
.io_active_threads
,
6605 (unsigned long) server
.vm_blocked_clients
6609 for (j
= 0; j
< server
.dbnum
; j
++) {
6610 long long keys
, vkeys
;
6612 keys
= dictSize(server
.db
[j
].dict
);
6613 vkeys
= dictSize(server
.db
[j
].expires
);
6614 if (keys
|| vkeys
) {
6615 info
= sdscatprintf(info
, "db%d:keys=%lld,expires=%lld\r\n",
6622 static void infoCommand(redisClient
*c
) {
6623 sds info
= genRedisInfoString();
6624 addReplySds(c
,sdscatprintf(sdsempty(),"$%lu\r\n",
6625 (unsigned long)sdslen(info
)));
6626 addReplySds(c
,info
);
6627 addReply(c
,shared
.crlf
);
6630 static void monitorCommand(redisClient
*c
) {
6631 /* ignore MONITOR if already a slave or in monitor mode */
6632 if (c
->flags
& REDIS_SLAVE
) return;
6634 c
->flags
|= (REDIS_SLAVE
|REDIS_MONITOR
);
6636 listAddNodeTail(server
.monitors
,c
);
6637 addReply(c
,shared
.ok
);
6640 /* ================================= Expire ================================= */
6641 static int removeExpire(redisDb
*db
, robj
*key
) {
6642 if (dictDelete(db
->expires
,key
) == DICT_OK
) {
6649 static int setExpire(redisDb
*db
, robj
*key
, time_t when
) {
6650 if (dictAdd(db
->expires
,key
,(void*)when
) == DICT_ERR
) {
6658 /* Return the expire time of the specified key, or -1 if no expire
6659 * is associated with this key (i.e. the key is non volatile) */
6660 static time_t getExpire(redisDb
*db
, robj
*key
) {
6663 /* No expire? return ASAP */
6664 if (dictSize(db
->expires
) == 0 ||
6665 (de
= dictFind(db
->expires
,key
)) == NULL
) return -1;
6667 return (time_t) dictGetEntryVal(de
);
6670 static int expireIfNeeded(redisDb
*db
, robj
*key
) {
6674 /* No expire? return ASAP */
6675 if (dictSize(db
->expires
) == 0 ||
6676 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
6678 /* Lookup the expire */
6679 when
= (time_t) dictGetEntryVal(de
);
6680 if (time(NULL
) <= when
) return 0;
6682 /* Delete the key */
6683 dictDelete(db
->expires
,key
);
6684 return dictDelete(db
->dict
,key
) == DICT_OK
;
6687 static int deleteIfVolatile(redisDb
*db
, robj
*key
) {
6690 /* No expire? return ASAP */
6691 if (dictSize(db
->expires
) == 0 ||
6692 (de
= dictFind(db
->expires
,key
)) == NULL
) return 0;
6694 /* Delete the key */
6696 dictDelete(db
->expires
,key
);
6697 return dictDelete(db
->dict
,key
) == DICT_OK
;
6700 static void expireGenericCommand(redisClient
*c
, robj
*key
, time_t seconds
) {
6703 de
= dictFind(c
->db
->dict
,key
);
6705 addReply(c
,shared
.czero
);
6709 if (deleteKey(c
->db
,key
)) server
.dirty
++;
6710 addReply(c
, shared
.cone
);
6713 time_t when
= time(NULL
)+seconds
;
6714 if (setExpire(c
->db
,key
,when
)) {
6715 addReply(c
,shared
.cone
);
6718 addReply(c
,shared
.czero
);
6724 static void expireCommand(redisClient
*c
) {
6725 expireGenericCommand(c
,c
->argv
[1],strtol(c
->argv
[2]->ptr
,NULL
,10));
6728 static void expireatCommand(redisClient
*c
) {
6729 expireGenericCommand(c
,c
->argv
[1],strtol(c
->argv
[2]->ptr
,NULL
,10)-time(NULL
));
6732 static void ttlCommand(redisClient
*c
) {
6736 expire
= getExpire(c
->db
,c
->argv
[1]);
6738 ttl
= (int) (expire
-time(NULL
));
6739 if (ttl
< 0) ttl
= -1;
6741 addReplySds(c
,sdscatprintf(sdsempty(),":%d\r\n",ttl
));
6744 /* ================================ MULTI/EXEC ============================== */
6746 /* Client state initialization for MULTI/EXEC */
6747 static void initClientMultiState(redisClient
*c
) {
6748 c
->mstate
.commands
= NULL
;
6749 c
->mstate
.count
= 0;
6752 /* Release all the resources associated with MULTI/EXEC state */
6753 static void freeClientMultiState(redisClient
*c
) {
6756 for (j
= 0; j
< c
->mstate
.count
; j
++) {
6758 multiCmd
*mc
= c
->mstate
.commands
+j
;
6760 for (i
= 0; i
< mc
->argc
; i
++)
6761 decrRefCount(mc
->argv
[i
]);
6764 zfree(c
->mstate
.commands
);
6767 /* Add a new command into the MULTI commands queue */
6768 static void queueMultiCommand(redisClient
*c
, struct redisCommand
*cmd
) {
6772 c
->mstate
.commands
= zrealloc(c
->mstate
.commands
,
6773 sizeof(multiCmd
)*(c
->mstate
.count
+1));
6774 mc
= c
->mstate
.commands
+c
->mstate
.count
;
6777 mc
->argv
= zmalloc(sizeof(robj
*)*c
->argc
);
6778 memcpy(mc
->argv
,c
->argv
,sizeof(robj
*)*c
->argc
);
6779 for (j
= 0; j
< c
->argc
; j
++)
6780 incrRefCount(mc
->argv
[j
]);
6784 static void multiCommand(redisClient
*c
) {
6785 c
->flags
|= REDIS_MULTI
;
6786 addReply(c
,shared
.ok
);
6789 static void discardCommand(redisClient
*c
) {
6790 if (!(c
->flags
& REDIS_MULTI
)) {
6791 addReplySds(c
,sdsnew("-ERR DISCARD without MULTI\r\n"));
6795 freeClientMultiState(c
);
6796 initClientMultiState(c
);
6797 c
->flags
&= (~REDIS_MULTI
);
6798 addReply(c
,shared
.ok
);
6801 static void execCommand(redisClient
*c
) {
6806 if (!(c
->flags
& REDIS_MULTI
)) {
6807 addReplySds(c
,sdsnew("-ERR EXEC without MULTI\r\n"));
6811 orig_argv
= c
->argv
;
6812 orig_argc
= c
->argc
;
6813 addReplySds(c
,sdscatprintf(sdsempty(),"*%d\r\n",c
->mstate
.count
));
6814 for (j
= 0; j
< c
->mstate
.count
; j
++) {
6815 c
->argc
= c
->mstate
.commands
[j
].argc
;
6816 c
->argv
= c
->mstate
.commands
[j
].argv
;
6817 call(c
,c
->mstate
.commands
[j
].cmd
);
6819 c
->argv
= orig_argv
;
6820 c
->argc
= orig_argc
;
6821 freeClientMultiState(c
);
6822 initClientMultiState(c
);
6823 c
->flags
&= (~REDIS_MULTI
);
/* =========================== Blocking Operations ========================= */

/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
6857 /* Set a client in blocking mode for the specified key, with the specified
6859 static void blockForKeys(redisClient
*c
, robj
**keys
, int numkeys
, time_t timeout
) {
6864 c
->blockingkeys
= zmalloc(sizeof(robj
*)*numkeys
);
6865 c
->blockingkeysnum
= numkeys
;
6866 c
->blockingto
= timeout
;
6867 for (j
= 0; j
< numkeys
; j
++) {
6868 /* Add the key in the client structure, to map clients -> keys */
6869 c
->blockingkeys
[j
] = keys
[j
];
6870 incrRefCount(keys
[j
]);
6872 /* And in the other "side", to map keys -> clients */
6873 de
= dictFind(c
->db
->blockingkeys
,keys
[j
]);
6877 /* For every key we take a list of clients blocked for it */
6879 retval
= dictAdd(c
->db
->blockingkeys
,keys
[j
],l
);
6880 incrRefCount(keys
[j
]);
6881 assert(retval
== DICT_OK
);
6883 l
= dictGetEntryVal(de
);
6885 listAddNodeTail(l
,c
);
6887 /* Mark the client as a blocked client */
6888 c
->flags
|= REDIS_BLOCKED
;
6889 server
.blpop_blocked_clients
++;
6892 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6893 static void unblockClientWaitingData(redisClient
*c
) {
6898 assert(c
->blockingkeys
!= NULL
);
6899 /* The client may wait for multiple keys, so unblock it for every key. */
6900 for (j
= 0; j
< c
->blockingkeysnum
; j
++) {
6901 /* Remove this client from the list of clients waiting for this key. */
6902 de
= dictFind(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
6904 l
= dictGetEntryVal(de
);
6905 listDelNode(l
,listSearchKey(l
,c
));
6906 /* If the list is empty we need to remove it to avoid wasting memory */
6907 if (listLength(l
) == 0)
6908 dictDelete(c
->db
->blockingkeys
,c
->blockingkeys
[j
]);
6909 decrRefCount(c
->blockingkeys
[j
]);
6911 /* Cleanup the client structure */
6912 zfree(c
->blockingkeys
);
6913 c
->blockingkeys
= NULL
;
6914 c
->flags
&= (~REDIS_BLOCKED
);
6915 server
.blpop_blocked_clients
--;
6916 /* We want to process data if there is some command waiting
6917 * in the input buffer. Note that this is safe even if
6918 * unblockClientWaitingData() gets called from freeClient() because
6919 * freeClient() will be smart enough to call this function
6920 * *after* c->querybuf was set to NULL. */
6921 if (c
->querybuf
&& sdslen(c
->querybuf
) > 0) processInputBuffer(c
);
6924 /* This should be called from any function PUSHing into lists.
6925 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6926 * 'ele' is the element pushed.
6928 * If the function returns 0 there was no client waiting for a list push
6931 * If the function returns 1 there was a client waiting for a list push
6932 * against this key, the element was passed to this client thus it's not
6933 * needed to actually add it to the list and the caller should return asap. */
6934 static int handleClientsWaitingListPush(redisClient
*c
, robj
*key
, robj
*ele
) {
6935 struct dictEntry
*de
;
6936 redisClient
*receiver
;
6940 de
= dictFind(c
->db
->blockingkeys
,key
);
6941 if (de
== NULL
) return 0;
6942 l
= dictGetEntryVal(de
);
6945 receiver
= ln
->value
;
6947 addReplySds(receiver
,sdsnew("*2\r\n"));
6948 addReplyBulkLen(receiver
,key
);
6949 addReply(receiver
,key
);
6950 addReply(receiver
,shared
.crlf
);
6951 addReplyBulkLen(receiver
,ele
);
6952 addReply(receiver
,ele
);
6953 addReply(receiver
,shared
.crlf
);
6954 unblockClientWaitingData(receiver
);
6958 /* Blocking RPOP/LPOP */
6959 static void blockingPopGenericCommand(redisClient
*c
, int where
) {
6964 for (j
= 1; j
< c
->argc
-1; j
++) {
6965 o
= lookupKeyWrite(c
->db
,c
->argv
[j
]);
6967 if (o
->type
!= REDIS_LIST
) {
6968 addReply(c
,shared
.wrongtypeerr
);
6971 list
*list
= o
->ptr
;
6972 if (listLength(list
) != 0) {
6973 /* If the list contains elements fall back to the usual
6974 * non-blocking POP operation */
6975 robj
*argv
[2], **orig_argv
;
6978 /* We need to alter the command arguments before to call
6979 * popGenericCommand() as the command takes a single key. */
6980 orig_argv
= c
->argv
;
6981 orig_argc
= c
->argc
;
6982 argv
[1] = c
->argv
[j
];
6986 /* Also the return value is different, we need to output
6987 * the multi bulk reply header and the key name. The
6988 * "real" command will add the last element (the value)
6989 * for us. If this souds like an hack to you it's just
6990 * because it is... */
6991 addReplySds(c
,sdsnew("*2\r\n"));
6992 addReplyBulkLen(c
,argv
[1]);
6993 addReply(c
,argv
[1]);
6994 addReply(c
,shared
.crlf
);
6995 popGenericCommand(c
,where
);
6997 /* Fix the client structure with the original stuff */
6998 c
->argv
= orig_argv
;
6999 c
->argc
= orig_argc
;
7005 /* If the list is empty or the key does not exists we must block */
7006 timeout
= strtol(c
->argv
[c
->argc
-1]->ptr
,NULL
,10);
7007 if (timeout
> 0) timeout
+= time(NULL
);
7008 blockForKeys(c
,c
->argv
+1,c
->argc
-2,timeout
);
7011 static void blpopCommand(redisClient
*c
) {
7012 blockingPopGenericCommand(c
,REDIS_HEAD
);
7015 static void brpopCommand(redisClient
*c
) {
7016 blockingPopGenericCommand(c
,REDIS_TAIL
);
7019 /* =============================== Replication ============================= */
7021 static int syncWrite(int fd
, char *ptr
, ssize_t size
, int timeout
) {
7022 ssize_t nwritten
, ret
= size
;
7023 time_t start
= time(NULL
);
7027 if (aeWait(fd
,AE_WRITABLE
,1000) & AE_WRITABLE
) {
7028 nwritten
= write(fd
,ptr
,size
);
7029 if (nwritten
== -1) return -1;
7033 if ((time(NULL
)-start
) > timeout
) {
7041 static int syncRead(int fd
, char *ptr
, ssize_t size
, int timeout
) {
7042 ssize_t nread
, totread
= 0;
7043 time_t start
= time(NULL
);
7047 if (aeWait(fd
,AE_READABLE
,1000) & AE_READABLE
) {
7048 nread
= read(fd
,ptr
,size
);
7049 if (nread
== -1) return -1;
7054 if ((time(NULL
)-start
) > timeout
) {
/* Read a line from 'fd' into 'ptr', one byte at a time via syncRead().
 *
 * 'size' is the capacity of 'ptr' including the terminating nul byte: at
 * most size-1 characters are stored and the buffer is always nul
 * terminated. Reading stops at the first '\n', which is not stored; a '\r'
 * immediately before it is replaced by the terminator.
 *
 * Returns the number of characters stored before the newline (a stripped
 * '\r' is still counted), or -1 if syncRead() fails. When the buffer fills
 * up before a newline is seen the characters read so far are returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0; /* no room, not even for the terminator */
    *ptr = '\0';
    size--; /* reserve the last byte for the terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        /* Every stored character consumes capacity: this is what stops an
         * over-long line from writing past the end of the buffer. */
        size--;
    }
    return nread;
}
7083 static void syncCommand(redisClient
*c
) {
7084 /* ignore SYNC if aleady slave or in monitor mode */
7085 if (c
->flags
& REDIS_SLAVE
) return;
7087 /* SYNC can't be issued when the server has pending data to send to
7088 * the client about already issued commands. We need a fresh reply
7089 * buffer registering the differences between the BGSAVE and the current
7090 * dataset, so that we can copy to other slaves if needed. */
7091 if (listLength(c
->reply
) != 0) {
7092 addReplySds(c
,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7096 redisLog(REDIS_NOTICE
,"Slave ask for synchronization");
7097 /* Here we need to check if there is a background saving operation
7098 * in progress, or if it is required to start one */
7099 if (server
.bgsavechildpid
!= -1) {
7100 /* Ok a background save is in progress. Let's check if it is a good
7101 * one for replication, i.e. if there is another slave that is
7102 * registering differences since the server forked to save */
7107 listRewind(server
.slaves
,&li
);
7108 while((ln
= listNext(&li
))) {
7110 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) break;
7113 /* Perfect, the server is already registering differences for
7114 * another slave. Set the right state, and copy the buffer. */
7115 listRelease(c
->reply
);
7116 c
->reply
= listDup(slave
->reply
);
7117 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
7118 redisLog(REDIS_NOTICE
,"Waiting for end of BGSAVE for SYNC");
7120 /* No way, we need to wait for the next BGSAVE in order to
7121 * register differences */
7122 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
7123 redisLog(REDIS_NOTICE
,"Waiting for next BGSAVE for SYNC");
7126 /* Ok we don't have a BGSAVE in progress, let's start one */
7127 redisLog(REDIS_NOTICE
,"Starting BGSAVE for SYNC");
7128 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
7129 redisLog(REDIS_NOTICE
,"Replication failed, can't BGSAVE");
7130 addReplySds(c
,sdsnew("-ERR Unalbe to perform background save\r\n"));
7133 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
7136 c
->flags
|= REDIS_SLAVE
;
7138 listAddNodeTail(server
.slaves
,c
);
7142 static void sendBulkToSlave(aeEventLoop
*el
, int fd
, void *privdata
, int mask
) {
7143 redisClient
*slave
= privdata
;
7145 REDIS_NOTUSED(mask
);
7146 char buf
[REDIS_IOBUF_LEN
];
7147 ssize_t nwritten
, buflen
;
7149 if (slave
->repldboff
== 0) {
7150 /* Write the bulk write count before to transfer the DB. In theory here
7151 * we don't know how much room there is in the output buffer of the
7152 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7153 * operations) will never be smaller than the few bytes we need. */
7156 bulkcount
= sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7158 if (write(fd
,bulkcount
,sdslen(bulkcount
)) != (signed)sdslen(bulkcount
))
7166 lseek(slave
->repldbfd
,slave
->repldboff
,SEEK_SET
);
7167 buflen
= read(slave
->repldbfd
,buf
,REDIS_IOBUF_LEN
);
7169 redisLog(REDIS_WARNING
,"Read error sending DB to slave: %s",
7170 (buflen
== 0) ? "premature EOF" : strerror(errno
));
7174 if ((nwritten
= write(fd
,buf
,buflen
)) == -1) {
7175 redisLog(REDIS_VERBOSE
,"Write error sending DB to slave: %s",
7180 slave
->repldboff
+= nwritten
;
7181 if (slave
->repldboff
== slave
->repldbsize
) {
7182 close(slave
->repldbfd
);
7183 slave
->repldbfd
= -1;
7184 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
7185 slave
->replstate
= REDIS_REPL_ONLINE
;
7186 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
,
7187 sendReplyToClient
, slave
) == AE_ERR
) {
7191 addReplySds(slave
,sdsempty());
7192 redisLog(REDIS_NOTICE
,"Synchronization with slave succeeded");
7196 /* This function is called at the end of every backgrond saving.
7197 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7198 * otherwise REDIS_ERR is passed to the function.
7200 * The goal of this function is to handle slaves waiting for a successful
7201 * background saving in order to perform non-blocking synchronization. */
7202 static void updateSlavesWaitingBgsave(int bgsaveerr
) {
7204 int startbgsave
= 0;
7207 listRewind(server
.slaves
,&li
);
7208 while((ln
= listNext(&li
))) {
7209 redisClient
*slave
= ln
->value
;
7211 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
) {
7213 slave
->replstate
= REDIS_REPL_WAIT_BGSAVE_END
;
7214 } else if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_END
) {
7215 struct redis_stat buf
;
7217 if (bgsaveerr
!= REDIS_OK
) {
7219 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE child returned an error");
7222 if ((slave
->repldbfd
= open(server
.dbfilename
,O_RDONLY
)) == -1 ||
7223 redis_fstat(slave
->repldbfd
,&buf
) == -1) {
7225 redisLog(REDIS_WARNING
,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno
));
7228 slave
->repldboff
= 0;
7229 slave
->repldbsize
= buf
.st_size
;
7230 slave
->replstate
= REDIS_REPL_SEND_BULK
;
7231 aeDeleteFileEvent(server
.el
,slave
->fd
,AE_WRITABLE
);
7232 if (aeCreateFileEvent(server
.el
, slave
->fd
, AE_WRITABLE
, sendBulkToSlave
, slave
) == AE_ERR
) {
7239 if (rdbSaveBackground(server
.dbfilename
) != REDIS_OK
) {
7242 listRewind(server
.slaves
,&li
);
7243 redisLog(REDIS_WARNING
,"SYNC failed. BGSAVE failed");
7244 while((ln
= listNext(&li
))) {
7245 redisClient
*slave
= ln
->value
;
7247 if (slave
->replstate
== REDIS_REPL_WAIT_BGSAVE_START
)
7254 static int syncWithMaster(void) {
7255 char buf
[1024], tmpfile
[256], authcmd
[1024];
7257 int fd
= anetTcpConnect(NULL
,server
.masterhost
,server
.masterport
);
7258 int dfd
, maxtries
= 5;
7261 redisLog(REDIS_WARNING
,"Unable to connect to MASTER: %s",
7266 /* AUTH with the master if required. */
7267 if(server
.masterauth
) {
7268 snprintf(authcmd
, 1024, "AUTH %s\r\n", server
.masterauth
);
7269 if (syncWrite(fd
, authcmd
, strlen(server
.masterauth
)+7, 5) == -1) {
7271 redisLog(REDIS_WARNING
,"Unable to AUTH to MASTER: %s",
7275 /* Read the AUTH result. */
7276 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
7278 redisLog(REDIS_WARNING
,"I/O error reading auth result from MASTER: %s",
7282 if (buf
[0] != '+') {
7284 redisLog(REDIS_WARNING
,"Cannot AUTH to MASTER, is the masterauth password correct?");
7289 /* Issue the SYNC command */
7290 if (syncWrite(fd
,"SYNC \r\n",7,5) == -1) {
7292 redisLog(REDIS_WARNING
,"I/O error writing to MASTER: %s",
7296 /* Read the bulk write count */
7297 if (syncReadLine(fd
,buf
,1024,3600) == -1) {
7299 redisLog(REDIS_WARNING
,"I/O error reading bulk count from MASTER: %s",
7303 if (buf
[0] != '$') {
7305 redisLog(REDIS_WARNING
,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7308 dumpsize
= strtol(buf
+1,NULL
,10);
7309 redisLog(REDIS_NOTICE
,"Receiving %ld bytes data dump from MASTER",dumpsize
);
7310 /* Read the bulk write data on a temp file */
7312 snprintf(tmpfile
,256,
7313 "temp-%d.%ld.rdb",(int)time(NULL
),(long int)getpid());
7314 dfd
= open(tmpfile
,O_CREAT
|O_WRONLY
|O_EXCL
,0644);
7315 if (dfd
!= -1) break;
7320 redisLog(REDIS_WARNING
,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno
));
7324 int nread
, nwritten
;
7326 nread
= read(fd
,buf
,(dumpsize
< 1024)?dumpsize
:1024);
7328 redisLog(REDIS_WARNING
,"I/O error trying to sync with MASTER: %s",
7334 nwritten
= write(dfd
,buf
,nread
);
7335 if (nwritten
== -1) {
7336 redisLog(REDIS_WARNING
,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno
));
7344 if (rename(tmpfile
,server
.dbfilename
) == -1) {
7345 redisLog(REDIS_WARNING
,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno
));
7351 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
7352 redisLog(REDIS_WARNING
,"Failed trying to load the MASTER synchronization DB from disk");
7356 server
.master
= createClient(fd
);
7357 server
.master
->flags
|= REDIS_MASTER
;
7358 server
.master
->authenticated
= 1;
7359 server
.replstate
= REDIS_REPL_CONNECTED
;
7363 static void slaveofCommand(redisClient
*c
) {
7364 if (!strcasecmp(c
->argv
[1]->ptr
,"no") &&
7365 !strcasecmp(c
->argv
[2]->ptr
,"one")) {
7366 if (server
.masterhost
) {
7367 sdsfree(server
.masterhost
);
7368 server
.masterhost
= NULL
;
7369 if (server
.master
) freeClient(server
.master
);
7370 server
.replstate
= REDIS_REPL_NONE
;
7371 redisLog(REDIS_NOTICE
,"MASTER MODE enabled (user request)");
7374 sdsfree(server
.masterhost
);
7375 server
.masterhost
= sdsdup(c
->argv
[1]->ptr
);
7376 server
.masterport
= atoi(c
->argv
[2]->ptr
);
7377 if (server
.master
) freeClient(server
.master
);
7378 server
.replstate
= REDIS_REPL_CONNECT
;
7379 redisLog(REDIS_NOTICE
,"SLAVE OF %s:%d enabled (user request)",
7380 server
.masterhost
, server
.masterport
);
7382 addReply(c
,shared
.ok
);
7385 /* ============================ Maxmemory directive ======================== */
7387 /* Try to free one object form the pre-allocated objects free list.
7388 * This is useful under low mem conditions as by default we take 1 million
7389 * free objects allocated. On success REDIS_OK is returned, otherwise
7391 static int tryFreeOneObjectFromFreelist(void) {
7394 if (server
.vm_enabled
) pthread_mutex_lock(&server
.obj_freelist_mutex
);
7395 if (listLength(server
.objfreelist
)) {
7396 listNode
*head
= listFirst(server
.objfreelist
);
7397 o
= listNodeValue(head
);
7398 listDelNode(server
.objfreelist
,head
);
7399 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
7403 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.obj_freelist_mutex
);
7408 /* This function gets called when 'maxmemory' is set on the config file to limit
7409 * the max memory used by the server, and we are out of memory.
7410 * This function will try to, in order:
7412 * - Free objects from the free list
7413 * - Try to remove keys with an EXPIRE set
7415 * It is not possible to free enough memory to reach used-memory < maxmemory
7416 * the server will start refusing commands that will enlarge even more the
7419 static void freeMemoryIfNeeded(void) {
7420 while (server
.maxmemory
&& zmalloc_used_memory() > server
.maxmemory
) {
7421 int j
, k
, freed
= 0;
7423 if (tryFreeOneObjectFromFreelist() == REDIS_OK
) continue;
7424 for (j
= 0; j
< server
.dbnum
; j
++) {
7426 robj
*minkey
= NULL
;
7427 struct dictEntry
*de
;
7429 if (dictSize(server
.db
[j
].expires
)) {
7431 /* From a sample of three keys drop the one nearest to
7432 * the natural expire */
7433 for (k
= 0; k
< 3; k
++) {
7436 de
= dictGetRandomKey(server
.db
[j
].expires
);
7437 t
= (time_t) dictGetEntryVal(de
);
7438 if (minttl
== -1 || t
< minttl
) {
7439 minkey
= dictGetEntryKey(de
);
7443 deleteKey(server
.db
+j
,minkey
);
7446 if (!freed
) return; /* nothing to free... */
7450 /* ============================== Append Only file ========================== */
7452 static void feedAppendOnlyFile(struct redisCommand
*cmd
, int dictid
, robj
**argv
, int argc
) {
7453 sds buf
= sdsempty();
7459 /* The DB this command was targetting is not the same as the last command
7460 * we appendend. To issue a SELECT command is needed. */
7461 if (dictid
!= server
.appendseldb
) {
7464 snprintf(seldb
,sizeof(seldb
),"%d",dictid
);
7465 buf
= sdscatprintf(buf
,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7466 (unsigned long)strlen(seldb
),seldb
);
7467 server
.appendseldb
= dictid
;
7470 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7471 * EXPIREs into EXPIREATs calls */
7472 if (cmd
->proc
== expireCommand
) {
7475 tmpargv
[0] = createStringObject("EXPIREAT",8);
7476 tmpargv
[1] = argv
[1];
7477 incrRefCount(argv
[1]);
7478 when
= time(NULL
)+strtol(argv
[2]->ptr
,NULL
,10);
7479 tmpargv
[2] = createObject(REDIS_STRING
,
7480 sdscatprintf(sdsempty(),"%ld",when
));
7484 /* Append the actual command */
7485 buf
= sdscatprintf(buf
,"*%d\r\n",argc
);
7486 for (j
= 0; j
< argc
; j
++) {
7489 o
= getDecodedObject(o
);
7490 buf
= sdscatprintf(buf
,"$%lu\r\n",(unsigned long)sdslen(o
->ptr
));
7491 buf
= sdscatlen(buf
,o
->ptr
,sdslen(o
->ptr
));
7492 buf
= sdscatlen(buf
,"\r\n",2);
7496 /* Free the objects from the modified argv for EXPIREAT */
7497 if (cmd
->proc
== expireCommand
) {
7498 for (j
= 0; j
< 3; j
++)
7499 decrRefCount(argv
[j
]);
7502 /* We want to perform a single write. This should be guaranteed atomic
7503 * at least if the filesystem we are writing is a real physical one.
7504 * While this will save us against the server being killed I don't think
7505 * there is much to do about the whole server stopping for power problems
7507 nwritten
= write(server
.appendfd
,buf
,sdslen(buf
));
7508 if (nwritten
!= (signed)sdslen(buf
)) {
7509 /* Ooops, we are in troubles. The best thing to do for now is
7510 * to simply exit instead to give the illusion that everything is
7511 * working as expected. */
7512 if (nwritten
== -1) {
7513 redisLog(REDIS_WARNING
,"Exiting on error writing to the append-only file: %s",strerror(errno
));
7515 redisLog(REDIS_WARNING
,"Exiting on short write while writing to the append-only file: %s",strerror(errno
));
7519 /* If a background append only file rewriting is in progress we want to
7520 * accumulate the differences between the child DB and the current one
7521 * in a buffer, so that when the child process will do its work we
7522 * can append the differences to the new append only file. */
7523 if (server
.bgrewritechildpid
!= -1)
7524 server
.bgrewritebuf
= sdscatlen(server
.bgrewritebuf
,buf
,sdslen(buf
));
7528 if (server
.appendfsync
== APPENDFSYNC_ALWAYS
||
7529 (server
.appendfsync
== APPENDFSYNC_EVERYSEC
&&
7530 now
-server
.lastfsync
> 1))
7532 fsync(server
.appendfd
); /* Let's try to get this data on the disk */
7533 server
.lastfsync
= now
;
7537 /* In Redis commands are always executed in the context of a client, so in
7538 * order to load the append only file we need to create a fake client. */
7539 static struct redisClient
*createFakeClient(void) {
7540 struct redisClient
*c
= zmalloc(sizeof(*c
));
7544 c
->querybuf
= sdsempty();
7548 /* We set the fake client as a slave waiting for the synchronization
7549 * so that Redis will not try to send replies to this client. */
7550 c
->replstate
= REDIS_REPL_WAIT_BGSAVE_START
;
7551 c
->reply
= listCreate();
7552 listSetFreeMethod(c
->reply
,decrRefCount
);
7553 listSetDupMethod(c
->reply
,dupClientReplyValue
);
7557 static void freeFakeClient(struct redisClient
*c
) {
7558 sdsfree(c
->querybuf
);
7559 listRelease(c
->reply
);
7563 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7564 * error (the append only file is zero-length) REDIS_ERR is returned. On
7565 * fatal error an error message is logged and the program exists. */
7566 int loadAppendOnlyFile(char *filename
) {
7567 struct redisClient
*fakeClient
;
7568 FILE *fp
= fopen(filename
,"r");
7569 struct redis_stat sb
;
7570 unsigned long long loadedkeys
= 0;
7572 if (redis_fstat(fileno(fp
),&sb
) != -1 && sb
.st_size
== 0)
7576 redisLog(REDIS_WARNING
,"Fatal error: can't open the append log file for reading: %s",strerror(errno
));
7580 fakeClient
= createFakeClient();
7587 struct redisCommand
*cmd
;
7589 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) {
7595 if (buf
[0] != '*') goto fmterr
;
7597 argv
= zmalloc(sizeof(robj
*)*argc
);
7598 for (j
= 0; j
< argc
; j
++) {
7599 if (fgets(buf
,sizeof(buf
),fp
) == NULL
) goto readerr
;
7600 if (buf
[0] != '$') goto fmterr
;
7601 len
= strtol(buf
+1,NULL
,10);
7602 argsds
= sdsnewlen(NULL
,len
);
7603 if (len
&& fread(argsds
,len
,1,fp
) == 0) goto fmterr
;
7604 argv
[j
] = createObject(REDIS_STRING
,argsds
);
7605 if (fread(buf
,2,1,fp
) == 0) goto fmterr
; /* discard CRLF */
7608 /* Command lookup */
7609 cmd
= lookupCommand(argv
[0]->ptr
);
7611 redisLog(REDIS_WARNING
,"Unknown command '%s' reading the append only file", argv
[0]->ptr
);
7614 /* Try object sharing and encoding */
7615 if (server
.shareobjects
) {
7617 for(j
= 1; j
< argc
; j
++)
7618 argv
[j
] = tryObjectSharing(argv
[j
]);
7620 if (cmd
->flags
& REDIS_CMD_BULK
)
7621 tryObjectEncoding(argv
[argc
-1]);
7622 /* Run the command in the context of a fake client */
7623 fakeClient
->argc
= argc
;
7624 fakeClient
->argv
= argv
;
7625 cmd
->proc(fakeClient
);
7626 /* Discard the reply objects list from the fake client */
7627 while(listLength(fakeClient
->reply
))
7628 listDelNode(fakeClient
->reply
,listFirst(fakeClient
->reply
));
7629 /* Clean up, ready for the next command */
7630 for (j
= 0; j
< argc
; j
++) decrRefCount(argv
[j
]);
7632 /* Handle swapping while loading big datasets when VM is on */
7634 if (server
.vm_enabled
&& (loadedkeys
% 5000) == 0) {
7635 while (zmalloc_used_memory() > server
.vm_max_memory
) {
7636 if (vmSwapOneObjectBlocking() == REDIS_ERR
) break;
7641 freeFakeClient(fakeClient
);
7646 redisLog(REDIS_WARNING
,"Unexpected end of file reading the append only file");
7648 redisLog(REDIS_WARNING
,"Unrecoverable error reading the append only file: %s", strerror(errno
));
7652 redisLog(REDIS_WARNING
,"Bad file format reading the append only file");
7656 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7657 static int fwriteBulkObject(FILE *fp
, robj
*obj
) {
7661 /* Avoid the incr/decr ref count business if possible to help
7662 * copy-on-write (we are often in a child process when this function
7664 * Also makes sure that key objects don't get incrRefCount-ed when VM
7666 if (obj
->encoding
!= REDIS_ENCODING_RAW
) {
7667 obj
= getDecodedObject(obj
);
7670 snprintf(buf
,sizeof(buf
),"$%ld\r\n",(long)sdslen(obj
->ptr
));
7671 if (fwrite(buf
,strlen(buf
),1,fp
) == 0) goto err
;
7672 if (sdslen(obj
->ptr
) && fwrite(obj
->ptr
,sdslen(obj
->ptr
),1,fp
) == 0)
7674 if (fwrite("\r\n",2,1,fp
) == 0) goto err
;
7675 if (decrrc
) decrRefCount(obj
);
7678 if (decrrc
) decrRefCount(obj
);
/* Write binary-safe string into a file in the bulkformat
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes that are written verbatim (they may contain nul
 * bytes); nothing is read from 's' when 'len' is 0.
 *
 * Returns 1 on success, 0 if any of the writes fails. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];
    int hlen;

    /* 'len' is unsigned: print it with the matching unsigned conversion. */
    hlen = snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (hlen < 0 || (size_t)hlen >= sizeof(buf)) return 0;
    if (fwrite(buf,(size_t)hlen,1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the value formatted with "%.17g". The count in the header
 * is taken from the result of that formatting, so header and payload always
 * agree.
 *
 * Returns 1 on success, 0 if formatting or any of the writes fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char buf[128], dbuf[128];
    int dlen, hlen;

    /* dbuf holds the payload followed by its CRLF terminator. */
    dlen = snprintf(dbuf,sizeof(dbuf),"%.17g\r\n",d);
    if (dlen < 2 || (size_t)dlen >= sizeof(dbuf)) return 0;
    hlen = snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)(dlen-2));
    if (hlen < 0 || (size_t)hlen >= sizeof(buf)) return 0;
    if (fwrite(buf,(size_t)hlen,1,fp) == 0) return 0;
    if (fwrite(dbuf,(size_t)dlen,1,fp) == 0) return 0;
    return 1;
}
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the value in decimal. The count in the header is taken
 * from the result of that formatting, so header and payload always agree.
 *
 * Returns 1 on success, 0 if formatting or any of the writes fails. */
static int fwriteBulkLong(FILE *fp, long l) {
    char buf[128], lbuf[128];
    int llen, hlen;

    /* lbuf holds the payload followed by its CRLF terminator. */
    llen = snprintf(lbuf,sizeof(lbuf),"%ld\r\n",l);
    if (llen < 2 || (size_t)llen >= sizeof(lbuf)) return 0;
    hlen = snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)(llen-2));
    if (hlen < 0 || (size_t)hlen >= sizeof(buf)) return 0;
    if (fwrite(buf,(size_t)hlen,1,fp) == 0) return 0;
    if (fwrite(lbuf,(size_t)llen,1,fp) == 0) return 0;
    return 1;
}
7716 /* Write a sequence of commands able to fully rebuild the dataset into
7717 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7718 static int rewriteAppendOnlyFile(char *filename
) {
7719 dictIterator
*di
= NULL
;
7724 time_t now
= time(NULL
);
7726 /* Note that we have to use a different temp name here compared to the
7727 * one used by rewriteAppendOnlyFileBackground() function. */
7728 snprintf(tmpfile
,256,"temp-rewriteaof-%d.aof", (int) getpid());
7729 fp
= fopen(tmpfile
,"w");
7731 redisLog(REDIS_WARNING
, "Failed rewriting the append only file: %s", strerror(errno
));
7734 for (j
= 0; j
< server
.dbnum
; j
++) {
7735 char selectcmd
[] = "*2\r\n$6\r\nSELECT\r\n";
7736 redisDb
*db
= server
.db
+j
;
7738 if (dictSize(d
) == 0) continue;
7739 di
= dictGetIterator(d
);
7745 /* SELECT the new DB */
7746 if (fwrite(selectcmd
,sizeof(selectcmd
)-1,1,fp
) == 0) goto werr
;
7747 if (fwriteBulkLong(fp
,j
) == 0) goto werr
;
7749 /* Iterate this DB writing every entry */
7750 while((de
= dictNext(di
)) != NULL
) {
7755 key
= dictGetEntryKey(de
);
7756 /* If the value for this key is swapped, load a preview in memory.
7757 * We use a "swapped" flag to remember if we need to free the
7758 * value object instead to just increment the ref count anyway
7759 * in order to avoid copy-on-write of pages if we are forked() */
7760 if (!server
.vm_enabled
|| key
->storage
== REDIS_VM_MEMORY
||
7761 key
->storage
== REDIS_VM_SWAPPING
) {
7762 o
= dictGetEntryVal(de
);
7765 o
= vmPreviewObject(key
);
7768 expiretime
= getExpire(db
,key
);
7770 /* Save the key and associated value */
7771 if (o
->type
== REDIS_STRING
) {
7772 /* Emit a SET command */
7773 char cmd
[]="*3\r\n$3\r\nSET\r\n";
7774 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7776 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
7777 if (fwriteBulkObject(fp
,o
) == 0) goto werr
;
7778 } else if (o
->type
== REDIS_LIST
) {
7779 /* Emit the RPUSHes needed to rebuild the list */
7780 list
*list
= o
->ptr
;
7784 listRewind(list
,&li
);
7785 while((ln
= listNext(&li
))) {
7786 char cmd
[]="*3\r\n$5\r\nRPUSH\r\n";
7787 robj
*eleobj
= listNodeValue(ln
);
7789 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7790 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
7791 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
7793 } else if (o
->type
== REDIS_SET
) {
7794 /* Emit the SADDs needed to rebuild the set */
7796 dictIterator
*di
= dictGetIterator(set
);
7799 while((de
= dictNext(di
)) != NULL
) {
7800 char cmd
[]="*3\r\n$4\r\nSADD\r\n";
7801 robj
*eleobj
= dictGetEntryKey(de
);
7803 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7804 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
7805 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
7807 dictReleaseIterator(di
);
7808 } else if (o
->type
== REDIS_ZSET
) {
7809 /* Emit the ZADDs needed to rebuild the sorted set */
7811 dictIterator
*di
= dictGetIterator(zs
->dict
);
7814 while((de
= dictNext(di
)) != NULL
) {
7815 char cmd
[]="*4\r\n$4\r\nZADD\r\n";
7816 robj
*eleobj
= dictGetEntryKey(de
);
7817 double *score
= dictGetEntryVal(de
);
7819 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7820 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
7821 if (fwriteBulkDouble(fp
,*score
) == 0) goto werr
;
7822 if (fwriteBulkObject(fp
,eleobj
) == 0) goto werr
;
7824 dictReleaseIterator(di
);
7825 } else if (o
->type
== REDIS_HASH
) {
7826 char cmd
[]="*4\r\n$4\r\nHSET\r\n";
7828 /* Emit the HSETs needed to rebuild the hash */
7829 if (o
->encoding
== REDIS_ENCODING_ZIPMAP
) {
7830 unsigned char *p
= zipmapRewind(o
->ptr
);
7831 unsigned char *field
, *val
;
7832 unsigned int flen
, vlen
;
7834 while((p
= zipmapNext(p
,&field
,&flen
,&val
,&vlen
)) != NULL
) {
7835 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7836 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
7837 if (fwriteBulkString(fp
,(char*)field
,flen
) == -1)
7839 if (fwriteBulkString(fp
,(char*)val
,vlen
) == -1)
7843 dictIterator
*di
= dictGetIterator(o
->ptr
);
7846 while((de
= dictNext(di
)) != NULL
) {
7847 robj
*field
= dictGetEntryKey(de
);
7848 robj
*val
= dictGetEntryVal(de
);
7850 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7851 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
7852 if (fwriteBulkObject(fp
,field
) == -1) return -1;
7853 if (fwriteBulkObject(fp
,val
) == -1) return -1;
7855 dictReleaseIterator(di
);
7858 redisAssert(0 != 0);
7860 /* Save the expire time */
7861 if (expiretime
!= -1) {
7862 char cmd
[]="*3\r\n$8\r\nEXPIREAT\r\n";
7863 /* If this key is already expired skip it */
7864 if (expiretime
< now
) continue;
7865 if (fwrite(cmd
,sizeof(cmd
)-1,1,fp
) == 0) goto werr
;
7866 if (fwriteBulkObject(fp
,key
) == 0) goto werr
;
7867 if (fwriteBulkLong(fp
,expiretime
) == 0) goto werr
;
7869 if (swapped
) decrRefCount(o
);
7871 dictReleaseIterator(di
);
7874 /* Make sure data will not remain on the OS's output buffers */
7879 /* Use RENAME to make sure the DB file is changed atomically only
7880 * if the generate DB file is ok. */
7881 if (rename(tmpfile
,filename
) == -1) {
7882 redisLog(REDIS_WARNING
,"Error moving temp append only file on the final destination: %s", strerror(errno
));
7886 redisLog(REDIS_NOTICE
,"SYNC append only file rewrite performed");
7892 redisLog(REDIS_WARNING
,"Write error writing append only file on disk: %s", strerror(errno
));
7893 if (di
) dictReleaseIterator(di
);
/* This is how rewriting of the append only file in background works:
 *
 * 1) The user calls BGREWRITEAOF
 * 2) Redis calls this function, that forks():
 *    2a) the child rewrites the append only file in a temp file.
 *    2b) the parent accumulates differences in server.bgrewritebuf.
 * 3) When the child finishes '2a' it exits.
 * 4) The parent will trap the exit code, if it's OK, will append the
 *    data accumulated into server.bgrewritebuf into the temp file, and
 *    finally will rename(2) the temp file in the actual file name.
 *    Then the new file is reopened as the new append only file. Profit!
 */
7909 static int rewriteAppendOnlyFileBackground(void) {
7912 if (server
.bgrewritechildpid
!= -1) return REDIS_ERR
;
7913 if (server
.vm_enabled
) waitEmptyIOJobsQueue();
7914 if ((childpid
= fork()) == 0) {
7918 if (server
.vm_enabled
) vmReopenSwapFile();
7920 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7921 if (rewriteAppendOnlyFile(tmpfile
) == REDIS_OK
) {
7928 if (childpid
== -1) {
7929 redisLog(REDIS_WARNING
,
7930 "Can't rewrite append only file in background: fork: %s",
7934 redisLog(REDIS_NOTICE
,
7935 "Background append only file rewriting started by pid %d",childpid
);
7936 server
.bgrewritechildpid
= childpid
;
7937 /* We set appendseldb to -1 in order to force the next call to the
7938 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7939 * accumulated by the parent into server.bgrewritebuf will start
7940 * with a SELECT statement and it will be safe to merge. */
7941 server
.appendseldb
= -1;
7944 return REDIS_OK
; /* unreached */
7947 static void bgrewriteaofCommand(redisClient
*c
) {
7948 if (server
.bgrewritechildpid
!= -1) {
7949 addReplySds(c
,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7952 if (rewriteAppendOnlyFileBackground() == REDIS_OK
) {
7953 char *status
= "+Background append only file rewriting started\r\n";
7954 addReplySds(c
,sdsnew(status
));
7956 addReply(c
,shared
.err
);
7960 static void aofRemoveTempFile(pid_t childpid
) {
7963 snprintf(tmpfile
,256,"temp-rewriteaof-bg-%d.aof", (int) childpid
);
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
7988 /* =================== Virtual Memory - Blocking Side ====================== */
7990 /* substitute the first occurrence of '%p' with the process pid in the
7991 * swap file name. */
7992 static void expandVmSwapFilename(void) {
7993 char *p
= strstr(server
.vm_swap_file
,"%p");
7999 new = sdscat(new,server
.vm_swap_file
);
8000 new = sdscatprintf(new,"%ld",(long) getpid());
8001 new = sdscat(new,p
+2);
8002 zfree(server
.vm_swap_file
);
8003 server
.vm_swap_file
= new;
8006 static void vmInit(void) {
8011 if (server
.vm_max_threads
!= 0)
8012 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8014 expandVmSwapFilename();
8015 redisLog(REDIS_NOTICE
,"Using '%s' as swap file",server
.vm_swap_file
);
8016 if ((server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b")) == NULL
) {
8017 server
.vm_fp
= fopen(server
.vm_swap_file
,"w+b");
8019 if (server
.vm_fp
== NULL
) {
8020 redisLog(REDIS_WARNING
,
8021 "Impossible to open the swap file: %s. Exiting.",
8025 server
.vm_fd
= fileno(server
.vm_fp
);
8026 server
.vm_next_page
= 0;
8027 server
.vm_near_pages
= 0;
8028 server
.vm_stats_used_pages
= 0;
8029 server
.vm_stats_swapped_objects
= 0;
8030 server
.vm_stats_swapouts
= 0;
8031 server
.vm_stats_swapins
= 0;
8032 totsize
= server
.vm_pages
*server
.vm_page_size
;
8033 redisLog(REDIS_NOTICE
,"Allocating %lld bytes of swap file",totsize
);
8034 if (ftruncate(server
.vm_fd
,totsize
) == -1) {
8035 redisLog(REDIS_WARNING
,"Can't ftruncate swap file: %s. Exiting.",
8039 redisLog(REDIS_NOTICE
,"Swap file allocated with success");
8041 server
.vm_bitmap
= zmalloc((server
.vm_pages
+7)/8);
8042 redisLog(REDIS_VERBOSE
,"Allocated %lld bytes page table for %lld pages",
8043 (long long) (server
.vm_pages
+7)/8, server
.vm_pages
);
8044 memset(server
.vm_bitmap
,0,(server
.vm_pages
+7)/8);
8046 /* Initialize threaded I/O (used by Virtual Memory) */
8047 server
.io_newjobs
= listCreate();
8048 server
.io_processing
= listCreate();
8049 server
.io_processed
= listCreate();
8050 server
.io_ready_clients
= listCreate();
8051 pthread_mutex_init(&server
.io_mutex
,NULL
);
8052 pthread_mutex_init(&server
.obj_freelist_mutex
,NULL
);
8053 pthread_mutex_init(&server
.io_swapfile_mutex
,NULL
);
8054 server
.io_active_threads
= 0;
8055 if (pipe(pipefds
) == -1) {
8056 redisLog(REDIS_WARNING
,"Unable to intialized VM: pipe(2): %s. Exiting."
8060 server
.io_ready_pipe_read
= pipefds
[0];
8061 server
.io_ready_pipe_write
= pipefds
[1];
8062 redisAssert(anetNonBlock(NULL
,server
.io_ready_pipe_read
) != ANET_ERR
);
8063 /* LZF requires a lot of stack */
8064 pthread_attr_init(&server
.io_threads_attr
);
8065 pthread_attr_getstacksize(&server
.io_threads_attr
, &stacksize
);
8066 while (stacksize
< REDIS_THREAD_STACK_SIZE
) stacksize
*= 2;
8067 pthread_attr_setstacksize(&server
.io_threads_attr
, stacksize
);
8068 /* Listen for events in the threaded I/O pipe */
8069 if (aeCreateFileEvent(server
.el
, server
.io_ready_pipe_read
, AE_READABLE
,
8070 vmThreadedIOCompletedJob
, NULL
) == AE_ERR
)
8071 oom("creating file event");
8074 /* Mark the page as used */
8075 static void vmMarkPageUsed(off_t page
) {
8076 off_t byte
= page
/8;
8078 redisAssert(vmFreePage(page
) == 1);
8079 server
.vm_bitmap
[byte
] |= 1<<bit
;
8082 /* Mark N contiguous pages as used, with 'page' being the first. */
8083 static void vmMarkPagesUsed(off_t page
, off_t count
) {
8086 for (j
= 0; j
< count
; j
++)
8087 vmMarkPageUsed(page
+j
);
8088 server
.vm_stats_used_pages
+= count
;
8089 redisLog(REDIS_DEBUG
,"Mark USED pages: %lld pages at %lld\n",
8090 (long long)count
, (long long)page
);
8093 /* Mark the page as free */
8094 static void vmMarkPageFree(off_t page
) {
8095 off_t byte
= page
/8;
8097 redisAssert(vmFreePage(page
) == 0);
8098 server
.vm_bitmap
[byte
] &= ~(1<<bit
);
8101 /* Mark N contiguous pages as free, with 'page' being the first. */
8102 static void vmMarkPagesFree(off_t page
, off_t count
) {
8105 for (j
= 0; j
< count
; j
++)
8106 vmMarkPageFree(page
+j
);
8107 server
.vm_stats_used_pages
-= count
;
8108 redisLog(REDIS_DEBUG
,"Mark FREE pages: %lld pages at %lld\n",
8109 (long long)count
, (long long)page
);
8112 /* Test if the page is free */
8113 static int vmFreePage(off_t page
) {
8114 off_t byte
= page
/8;
8116 return (server
.vm_bitmap
[byte
] & (1<<bit
)) == 0;
8119 /* Find N contiguous free pages storing the first page of the cluster in *first.
8120 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8121 * REDIS_ERR is returned.
8123 * This function uses a simple algorithm: we try to allocate
8124 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8125 * again from the start of the swap file searching for free spaces.
8127 * If it looks pretty clear that there are no free pages near our offset
8128 * we try to find less populated places doing a forward jump of
8129 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8130 * without hurry, and then we jump again and so forth...
8132 * This function can be improved using a free list to avoid to guess
8133 * too much, since we could collect data about freed pages.
8135 * note: I implemented this function just after watching an episode of
8136 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8138 static int vmFindContiguousPages(off_t
*first
, off_t n
) {
8139 off_t base
, offset
= 0, since_jump
= 0, numfree
= 0;
8141 if (server
.vm_near_pages
== REDIS_VM_MAX_NEAR_PAGES
) {
8142 server
.vm_near_pages
= 0;
8143 server
.vm_next_page
= 0;
8145 server
.vm_near_pages
++; /* Yet another try for pages near to the old ones */
8146 base
= server
.vm_next_page
;
8148 while(offset
< server
.vm_pages
) {
8149 off_t
this = base
+offset
;
8151 /* If we overflow, restart from page zero */
8152 if (this >= server
.vm_pages
) {
8153 this -= server
.vm_pages
;
8155 /* Just overflowed, what we found on tail is no longer
8156 * interesting, as it's no longer contiguous. */
8160 if (vmFreePage(this)) {
8161 /* This is a free page */
8163 /* Already got N free pages? Return to the caller, with success */
8165 *first
= this-(n
-1);
8166 server
.vm_next_page
= this+1;
8167 redisLog(REDIS_DEBUG
, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n
, (long long) *first
);
8171 /* The current one is not a free page */
8175 /* Fast-forward if the current page is not free and we already
8176 * searched enough near this place. */
8178 if (!numfree
&& since_jump
>= REDIS_VM_MAX_RANDOM_JUMP
/4) {
8179 offset
+= random() % REDIS_VM_MAX_RANDOM_JUMP
;
/* Note that even if we rewind after the jump, we don't need
 * to make sure numfree is set to zero as we only jump *if* it
 * is set to zero. */
8185 /* Otherwise just check the next page */
8192 /* Write the specified object at the specified page of the swap file */
8193 static int vmWriteObjectOnSwap(robj
*o
, off_t page
) {
8194 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
8195 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
8196 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
8197 redisLog(REDIS_WARNING
,
8198 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8202 rdbSaveObject(server
.vm_fp
,o
);
8203 fflush(server
.vm_fp
);
8204 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
8208 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8209 * needed to later retrieve the object into the key object.
8210 * If we can't find enough contiguous empty pages to swap the object on disk
8211 * REDIS_ERR is returned. */
8212 static int vmSwapObjectBlocking(robj
*key
, robj
*val
) {
8213 off_t pages
= rdbSavedObjectPages(val
,NULL
);
8216 assert(key
->storage
== REDIS_VM_MEMORY
);
8217 assert(key
->refcount
== 1);
8218 if (vmFindContiguousPages(&page
,pages
) == REDIS_ERR
) return REDIS_ERR
;
8219 if (vmWriteObjectOnSwap(val
,page
) == REDIS_ERR
) return REDIS_ERR
;
8220 key
->vm
.page
= page
;
8221 key
->vm
.usedpages
= pages
;
8222 key
->storage
= REDIS_VM_SWAPPED
;
8223 key
->vtype
= val
->type
;
8224 decrRefCount(val
); /* Deallocate the object from memory. */
8225 vmMarkPagesUsed(page
,pages
);
8226 redisLog(REDIS_DEBUG
,"VM: object %s swapped out at %lld (%lld pages)",
8227 (unsigned char*) key
->ptr
,
8228 (unsigned long long) page
, (unsigned long long) pages
);
8229 server
.vm_stats_swapped_objects
++;
8230 server
.vm_stats_swapouts
++;
8234 static robj
*vmReadObjectFromSwap(off_t page
, int type
) {
8237 if (server
.vm_enabled
) pthread_mutex_lock(&server
.io_swapfile_mutex
);
8238 if (fseeko(server
.vm_fp
,page
*server
.vm_page_size
,SEEK_SET
) == -1) {
8239 redisLog(REDIS_WARNING
,
8240 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8244 o
= rdbLoadObject(type
,server
.vm_fp
);
8246 redisLog(REDIS_WARNING
, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno
));
8249 if (server
.vm_enabled
) pthread_mutex_unlock(&server
.io_swapfile_mutex
);
8253 /* Load the value object relative to the 'key' object from swap to memory.
8254 * The newly allocated object is returned.
8256 * If preview is true the unserialized object is returned to the caller but
8257 * no changes are made to the key object, nor the pages are marked as freed */
8258 static robj
*vmGenericLoadObject(robj
*key
, int preview
) {
8261 redisAssert(key
->storage
== REDIS_VM_SWAPPED
|| key
->storage
== REDIS_VM_LOADING
);
8262 val
= vmReadObjectFromSwap(key
->vm
.page
,key
->vtype
);
8264 key
->storage
= REDIS_VM_MEMORY
;
8265 key
->vm
.atime
= server
.unixtime
;
8266 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
8267 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk",
8268 (unsigned char*) key
->ptr
);
8269 server
.vm_stats_swapped_objects
--;
8271 redisLog(REDIS_DEBUG
, "VM: object %s previewed from disk",
8272 (unsigned char*) key
->ptr
);
8274 server
.vm_stats_swapins
++;
8278 /* Plain object loading, from swap to memory */
8279 static robj
*vmLoadObject(robj
*key
) {
8280 /* If we are loading the object in background, stop it, we
8281 * need to load this object synchronously ASAP. */
8282 if (key
->storage
== REDIS_VM_LOADING
)
8283 vmCancelThreadedIOJob(key
);
8284 return vmGenericLoadObject(key
,0);
8287 /* Just load the value on disk, without to modify the key.
8288 * This is useful when we want to perform some operation on the value
8289 * without to really bring it from swap to memory, like while saving the
8290 * dataset or rewriting the append only log. */
8291 static robj
*vmPreviewObject(robj
*key
) {
8292 return vmGenericLoadObject(key
,1);
/* How good a candidate is this object for swapping?
 * The better candidate it is, the greater the returned value.
 *
 * Currently we try to perform a fast estimation of the object size in
 * memory, and combine it with aging information.
 *
 * Basically swappability = idle-time * log(estimated size)
 *
 * Bigger objects are preferred over smaller objects, but not
 * proportionally, this is why we use the logarithm. This algorithm is
 * just a first try and will probably be tuned later. */
8306 static double computeObjectSwappability(robj
*o
) {
8307 time_t age
= server
.unixtime
- o
->vm
.atime
;
8311 struct dictEntry
*de
;
8314 if (age
<= 0) return 0;
8317 if (o
->encoding
!= REDIS_ENCODING_RAW
) {
8320 asize
= sdslen(o
->ptr
)+sizeof(*o
)+sizeof(long)*2;
8325 listNode
*ln
= listFirst(l
);
8327 asize
= sizeof(list
);
8329 robj
*ele
= ln
->value
;
8332 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8333 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8335 asize
+= (sizeof(listNode
)+elesize
)*listLength(l
);
8340 z
= (o
->type
== REDIS_ZSET
);
8341 d
= z
? ((zset
*)o
->ptr
)->dict
: o
->ptr
;
8343 asize
= sizeof(dict
)+(sizeof(struct dictEntry
*)*dictSlots(d
));
8344 if (z
) asize
+= sizeof(zset
)-sizeof(dict
);
8349 de
= dictGetRandomKey(d
);
8350 ele
= dictGetEntryKey(de
);
8351 elesize
= (ele
->encoding
== REDIS_ENCODING_RAW
) ?
8352 (sizeof(*o
)+sdslen(ele
->ptr
)) :
8354 asize
+= (sizeof(struct dictEntry
)+elesize
)*dictSize(d
);
8355 if (z
) asize
+= sizeof(zskiplistNode
)*dictSize(d
);
8359 return (double)age
*log(1+asize
);
/* Try to swap an object that's a good candidate for swapping.
 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
 * to swap any object at all.
 *
 * If 'usethreads' is true, Redis will try to swap the object in background
 * using I/O threads. */
8368 static int vmSwapOneObject(int usethreads
) {
8370 struct dictEntry
*best
= NULL
;
8371 double best_swappability
= 0;
8372 redisDb
*best_db
= NULL
;
8375 for (j
= 0; j
< server
.dbnum
; j
++) {
8376 redisDb
*db
= server
.db
+j
;
8377 /* Why maxtries is set to 100?
8378 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8379 * are swappable objects */
8382 if (dictSize(db
->dict
) == 0) continue;
8383 for (i
= 0; i
< 5; i
++) {
8385 double swappability
;
8387 if (maxtries
) maxtries
--;
8388 de
= dictGetRandomKey(db
->dict
);
8389 key
= dictGetEntryKey(de
);
8390 val
= dictGetEntryVal(de
);
8391 /* Only swap objects that are currently in memory.
8393 * Also don't swap shared objects if threaded VM is on, as we
8394 * try to ensure that the main thread does not touch the
8395 * object while the I/O thread is using it, but we can't
8396 * control other keys without adding additional mutex. */
8397 if (key
->storage
!= REDIS_VM_MEMORY
||
8398 (server
.vm_max_threads
!= 0 && val
->refcount
!= 1)) {
8399 if (maxtries
) i
--; /* don't count this try */
8402 swappability
= computeObjectSwappability(val
);
8403 if (!best
|| swappability
> best_swappability
) {
8405 best_swappability
= swappability
;
8410 if (best
== NULL
) return REDIS_ERR
;
8411 key
= dictGetEntryKey(best
);
8412 val
= dictGetEntryVal(best
);
8414 redisLog(REDIS_DEBUG
,"Key with best swappability: %s, %f",
8415 key
->ptr
, best_swappability
);
8417 /* Unshare the key if needed */
8418 if (key
->refcount
> 1) {
8419 robj
*newkey
= dupStringObject(key
);
8421 key
= dictGetEntryKey(best
) = newkey
;
8425 vmSwapObjectThreaded(key
,val
,best_db
);
8428 if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
8429 dictGetEntryVal(best
) = NULL
;
/* Try to swap out one object without using the I/O threads.
 * This is vmSwapOneObject() called with 'usethreads' set to 0, and
 * returns whatever that call returns.
 *
 * The parameter list is spelled (void): an empty list "()" in a definition
 * is not a prototype, so calls with stray arguments would go unchecked. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
/* Try to swap out one object using the I/O threads.
 * This is vmSwapOneObject() called with 'usethreads' set to 1, and
 * returns whatever that call returns.
 *
 * The parameter list is spelled (void): an empty list "()" in a definition
 * is not a prototype, so calls with stray arguments would go unchecked. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8445 /* Return true if it's safe to swap out objects in a given moment.
8446 * Basically we don't want to swap objects out while there is a BGSAVE
8447 * or a BGAEOREWRITE running in backgroud. */
8448 static int vmCanSwapOut(void) {
8449 return (server
.bgsavechildpid
== -1 && server
.bgrewritechildpid
== -1);
8452 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8453 * and was deleted. Otherwise 0 is returned. */
8454 static int deleteIfSwapped(redisDb
*db
, robj
*key
) {
8458 if ((de
= dictFind(db
->dict
,key
)) == NULL
) return 0;
8459 foundkey
= dictGetEntryKey(de
);
8460 if (foundkey
->storage
== REDIS_VM_MEMORY
) return 0;
8465 /* =================== Virtual Memory - Threaded I/O ======================= */
8467 static void freeIOJob(iojob
*j
) {
8468 if ((j
->type
== REDIS_IOJOB_PREPARE_SWAP
||
8469 j
->type
== REDIS_IOJOB_DO_SWAP
||
8470 j
->type
== REDIS_IOJOB_LOAD
) && j
->val
!= NULL
)
8471 decrRefCount(j
->val
);
8472 decrRefCount(j
->key
);
8476 /* Every time a thread finished a Job, it writes a byte into the write side
8477 * of an unix pipe in order to "awake" the main thread, and this function
8479 static void vmThreadedIOCompletedJob(aeEventLoop
*el
, int fd
, void *privdata
,
8483 int retval
, processed
= 0, toprocess
= -1, trytoswap
= 1;
8485 REDIS_NOTUSED(mask
);
8486 REDIS_NOTUSED(privdata
);
8488 /* For every byte we read in the read side of the pipe, there is one
8489 * I/O job completed to process. */
8490 while((retval
= read(fd
,buf
,1)) == 1) {
8494 struct dictEntry
*de
;
8496 redisLog(REDIS_DEBUG
,"Processing I/O completed job");
8498 /* Get the processed element (the oldest one) */
8500 assert(listLength(server
.io_processed
) != 0);
8501 if (toprocess
== -1) {
8502 toprocess
= (listLength(server
.io_processed
)*REDIS_MAX_COMPLETED_JOBS_PROCESSED
)/100;
8503 if (toprocess
<= 0) toprocess
= 1;
8505 ln
= listFirst(server
.io_processed
);
8507 listDelNode(server
.io_processed
,ln
);
8509 /* If this job is marked as canceled, just ignore it */
8514 /* Post process it in the main thread, as there are things we
8515 * can do just here to avoid race conditions and/or invasive locks */
8516 redisLog(REDIS_DEBUG
,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j
, j
->type
, (void*)j
->key
, (char*)j
->key
->ptr
, j
->key
->refcount
);
8517 de
= dictFind(j
->db
->dict
,j
->key
);
8519 key
= dictGetEntryKey(de
);
8520 if (j
->type
== REDIS_IOJOB_LOAD
) {
8523 /* Key loaded, bring it at home */
8524 key
->storage
= REDIS_VM_MEMORY
;
8525 key
->vm
.atime
= server
.unixtime
;
8526 vmMarkPagesFree(key
->vm
.page
,key
->vm
.usedpages
);
8527 redisLog(REDIS_DEBUG
, "VM: object %s loaded from disk (threaded)",
8528 (unsigned char*) key
->ptr
);
8529 server
.vm_stats_swapped_objects
--;
8530 server
.vm_stats_swapins
++;
8531 dictGetEntryVal(de
) = j
->val
;
8532 incrRefCount(j
->val
);
8535 /* Handle clients waiting for this key to be loaded. */
8536 handleClientsBlockedOnSwappedKey(db
,key
);
8537 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
8538 /* Now we know the amount of pages required to swap this object.
8539 * Let's find some space for it, and queue this task again
8540 * rebranded as REDIS_IOJOB_DO_SWAP. */
8541 if (!vmCanSwapOut() ||
8542 vmFindContiguousPages(&j
->page
,j
->pages
) == REDIS_ERR
)
8544 /* Ooops... no space or we can't swap as there is
8545 * a fork()ed Redis trying to save stuff on disk. */
8547 key
->storage
= REDIS_VM_MEMORY
; /* undo operation */
8549 /* Note that we need to mark this pages as used now,
8550 * if the job will be canceled, we'll mark them as freed
8552 vmMarkPagesUsed(j
->page
,j
->pages
);
8553 j
->type
= REDIS_IOJOB_DO_SWAP
;
8558 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
8561 /* Key swapped. We can finally free some memory. */
8562 if (key
->storage
!= REDIS_VM_SWAPPING
) {
8563 printf("key->storage: %d\n",key
->storage
);
8564 printf("key->name: %s\n",(char*)key
->ptr
);
8565 printf("key->refcount: %d\n",key
->refcount
);
8566 printf("val: %p\n",(void*)j
->val
);
8567 printf("val->type: %d\n",j
->val
->type
);
8568 printf("val->ptr: %s\n",(char*)j
->val
->ptr
);
8570 redisAssert(key
->storage
== REDIS_VM_SWAPPING
);
8571 val
= dictGetEntryVal(de
);
8572 key
->vm
.page
= j
->page
;
8573 key
->vm
.usedpages
= j
->pages
;
8574 key
->storage
= REDIS_VM_SWAPPED
;
8575 key
->vtype
= j
->val
->type
;
8576 decrRefCount(val
); /* Deallocate the object from memory. */
8577 dictGetEntryVal(de
) = NULL
;
8578 redisLog(REDIS_DEBUG
,
8579 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8580 (unsigned char*) key
->ptr
,
8581 (unsigned long long) j
->page
, (unsigned long long) j
->pages
);
8582 server
.vm_stats_swapped_objects
++;
8583 server
.vm_stats_swapouts
++;
8585 /* Put a few more swap requests in queue if we are still
8587 if (trytoswap
&& vmCanSwapOut() &&
8588 zmalloc_used_memory() > server
.vm_max_memory
)
8593 more
= listLength(server
.io_newjobs
) <
8594 (unsigned) server
.vm_max_threads
;
8596 /* Don't waste CPU time if swappable objects are rare. */
8597 if (vmSwapOneObjectThreaded() == REDIS_ERR
) {
8605 if (processed
== toprocess
) return;
8607 if (retval
< 0 && errno
!= EAGAIN
) {
8608 redisLog(REDIS_WARNING
,
8609 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8614 static void lockThreadedIO(void) {
8615 pthread_mutex_lock(&server
.io_mutex
);
8618 static void unlockThreadedIO(void) {
8619 pthread_mutex_unlock(&server
.io_mutex
);
8622 /* Remove the specified object from the threaded I/O queue if still not
8623 * processed, otherwise make sure to flag it as canceled. */
8624 static void vmCancelThreadedIOJob(robj
*o
) {
8626 server
.io_newjobs
, /* 0 */
8627 server
.io_processing
, /* 1 */
8628 server
.io_processed
/* 2 */
8632 assert(o
->storage
== REDIS_VM_LOADING
|| o
->storage
== REDIS_VM_SWAPPING
);
8635 /* Search for a matching key in one of the queues */
8636 for (i
= 0; i
< 3; i
++) {
8640 listRewind(lists
[i
],&li
);
8641 while ((ln
= listNext(&li
)) != NULL
) {
8642 iojob
*job
= ln
->value
;
8644 if (job
->canceled
) continue; /* Skip this, already canceled. */
8645 if (compareStringObjects(job
->key
,o
) == 0) {
8646 redisLog(REDIS_DEBUG
,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8647 (void*)job
, (char*)o
->ptr
, job
->type
, i
);
/* Mark the pages as free since the swap didn't happen
 * or happened but is now discarded. */
8650 if (i
!= 1 && job
->type
== REDIS_IOJOB_DO_SWAP
)
8651 vmMarkPagesFree(job
->page
,job
->pages
);
8652 /* Cancel the job. It depends on the list the job is
8655 case 0: /* io_newjobs */
8656 /* If the job was yet not processed the best thing to do
8657 * is to remove it from the queue at all */
8659 listDelNode(lists
[i
],ln
);
8661 case 1: /* io_processing */
8662 /* Oh Shi- the thread is messing with the Job:
8664 * Probably it's accessing the object if this is a
8665 * PREPARE_SWAP or DO_SWAP job.
8666 * If it's a LOAD job it may be reading from disk and
8667 * if we don't wait for the job to terminate before to
8668 * cancel it, maybe in a few microseconds data can be
8669 * corrupted in this pages. So the short story is:
8671 * Better to wait for the job to move into the
8672 * next queue (processed)... */
8674 /* We try again and again until the job is completed. */
8676 /* But let's wait some time for the I/O thread
8677 * to finish with this job. After all this condition
8678 * should be very rare. */
8681 case 2: /* io_processed */
8682 /* The job was already processed, that's easy...
8683 * just mark it as canceled so that we'll ignore it
8684 * when processing completed jobs. */
/* Finally we have to adjust the storage type of the object
 * in order to "UNDO" the operation. */
8690 if (o
->storage
== REDIS_VM_LOADING
)
8691 o
->storage
= REDIS_VM_SWAPPED
;
8692 else if (o
->storage
== REDIS_VM_SWAPPING
)
8693 o
->storage
= REDIS_VM_MEMORY
;
8700 assert(1 != 1); /* We should never reach this */
8703 static void *IOThreadEntryPoint(void *arg
) {
8708 pthread_detach(pthread_self());
8710 /* Get a new job to process */
8712 if (listLength(server
.io_newjobs
) == 0) {
8713 /* No new jobs in queue, exit. */
8714 redisLog(REDIS_DEBUG
,"Thread %ld exiting, nothing to do",
8715 (long) pthread_self());
8716 server
.io_active_threads
--;
8720 ln
= listFirst(server
.io_newjobs
);
8722 listDelNode(server
.io_newjobs
,ln
);
8723 /* Add the job in the processing queue */
8724 j
->thread
= pthread_self();
8725 listAddNodeTail(server
.io_processing
,j
);
8726 ln
= listLast(server
.io_processing
); /* We use ln later to remove it */
8728 redisLog(REDIS_DEBUG
,"Thread %ld got a new job (type %d): %p about key '%s'",
8729 (long) pthread_self(), j
->type
, (void*)j
, (char*)j
->key
->ptr
);
8731 /* Process the Job */
8732 if (j
->type
== REDIS_IOJOB_LOAD
) {
8733 j
->val
= vmReadObjectFromSwap(j
->page
,j
->key
->vtype
);
8734 } else if (j
->type
== REDIS_IOJOB_PREPARE_SWAP
) {
8735 FILE *fp
= fopen("/dev/null","w+");
8736 j
->pages
= rdbSavedObjectPages(j
->val
,fp
);
8738 } else if (j
->type
== REDIS_IOJOB_DO_SWAP
) {
8739 if (vmWriteObjectOnSwap(j
->val
,j
->page
) == REDIS_ERR
)
8743 /* Done: insert the job into the processed queue */
8744 redisLog(REDIS_DEBUG
,"Thread %ld completed the job: %p (key %s)",
8745 (long) pthread_self(), (void*)j
, (char*)j
->key
->ptr
);
8747 listDelNode(server
.io_processing
,ln
);
8748 listAddNodeTail(server
.io_processed
,j
);
8751 /* Signal the main thread there is new stuff to process */
8752 assert(write(server
.io_ready_pipe_write
,"x",1) == 1);
8754 return NULL
; /* never reached */
8757 static void spawnIOThread(void) {
8759 sigset_t mask
, omask
;
8762 sigaddset(&mask
,SIGCHLD
);
8763 sigaddset(&mask
,SIGHUP
);
8764 sigaddset(&mask
,SIGPIPE
);
8765 pthread_sigmask(SIG_SETMASK
, &mask
, &omask
);
8766 pthread_create(&thread
,&server
.io_threads_attr
,IOThreadEntryPoint
,NULL
);
8767 pthread_sigmask(SIG_SETMASK
, &omask
, NULL
);
8768 server
.io_active_threads
++;
8771 /* We need to wait for the last thread to exit before we are able to
8772 * fork() in order to BGSAVE or BGREWRITEAOF. */
8773 static void waitEmptyIOJobsQueue(void) {
8775 int io_processed_len
;
8778 if (listLength(server
.io_newjobs
) == 0 &&
8779 listLength(server
.io_processing
) == 0 &&
8780 server
.io_active_threads
== 0)
8785 /* While waiting for empty jobs queue condition we post-process some
8786 * finshed job, as I/O threads may be hanging trying to write against
8787 * the io_ready_pipe_write FD but there are so much pending jobs that
8789 io_processed_len
= listLength(server
.io_processed
);
8791 if (io_processed_len
) {
8792 vmThreadedIOCompletedJob(NULL
,server
.io_ready_pipe_read
,NULL
,0);
8793 usleep(1000); /* 1 millisecond */
8795 usleep(10000); /* 10 milliseconds */
8800 static void vmReopenSwapFile(void) {
8801 /* Note: we don't close the old one as we are in the child process
8802 * and don't want to mess at all with the original file object. */
8803 server
.vm_fp
= fopen(server
.vm_swap_file
,"r+b");
8804 if (server
.vm_fp
== NULL
) {
8805 redisLog(REDIS_WARNING
,"Can't re-open the VM swap file: %s. Exiting.",
8806 server
.vm_swap_file
);
8809 server
.vm_fd
= fileno(server
.vm_fp
);
8812 /* This function must be called while with threaded IO locked */
8813 static void queueIOJob(iojob
*j
) {
8814 redisLog(REDIS_DEBUG
,"Queued IO Job %p type %d about key '%s'\n",
8815 (void*)j
, j
->type
, (char*)j
->key
->ptr
);
8816 listAddNodeTail(server
.io_newjobs
,j
);
8817 if (server
.io_active_threads
< server
.vm_max_threads
)
8821 static int vmSwapObjectThreaded(robj
*key
, robj
*val
, redisDb
*db
) {
8824 assert(key
->storage
== REDIS_VM_MEMORY
);
8825 assert(key
->refcount
== 1);
8827 j
= zmalloc(sizeof(*j
));
8828 j
->type
= REDIS_IOJOB_PREPARE_SWAP
;
8830 j
->key
= dupStringObject(key
);
8834 j
->thread
= (pthread_t
) -1;
8835 key
->storage
= REDIS_VM_SWAPPING
;
8843 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
/* This function makes the client 'c' wait for the key 'key' to be loaded.
 * If there is not already a job loading the key, it is created.
 * The key is added to the io_keys list in the client structure, and also
 * in the hash table mapping swapped keys to waiting clients, that is,
 * server.io_waited_keys. */
8850 static int waitForSwappedKey(redisClient
*c
, robj
*key
) {
8851 struct dictEntry
*de
;
8855 /* If the key does not exist or is already in RAM we don't need to
8856 * block the client at all. */
8857 de
= dictFind(c
->db
->dict
,key
);
8858 if (de
== NULL
) return 0;
8859 o
= dictGetEntryKey(de
);
8860 if (o
->storage
== REDIS_VM_MEMORY
) {
8862 } else if (o
->storage
== REDIS_VM_SWAPPING
) {
8863 /* We were swapping the key, undo it! */
8864 vmCancelThreadedIOJob(o
);
8868 /* OK: the key is either swapped, or being loaded just now. */
8870 /* Add the key to the list of keys this client is waiting for.
8871 * This maps clients to keys they are waiting for. */
8872 listAddNodeTail(c
->io_keys
,key
);
8875 /* Add the client to the swapped keys => clients waiting map. */
8876 de
= dictFind(c
->db
->io_keys
,key
);
8880 /* For every key we take a list of clients blocked for it */
8882 retval
= dictAdd(c
->db
->io_keys
,key
,l
);
8884 assert(retval
== DICT_OK
);
8886 l
= dictGetEntryVal(de
);
8888 listAddNodeTail(l
,c
);
8890 /* Are we already loading the key from disk? If not create a job */
8891 if (o
->storage
== REDIS_VM_SWAPPED
) {
8894 o
->storage
= REDIS_VM_LOADING
;
8895 j
= zmalloc(sizeof(*j
));
8896 j
->type
= REDIS_IOJOB_LOAD
;
8898 j
->key
= dupStringObject(key
);
8899 j
->key
->vtype
= o
->vtype
;
8900 j
->page
= o
->vm
.page
;
8903 j
->thread
= (pthread_t
) -1;
8911 /* Is this client attempting to run a command against swapped keys?
8912 * If so, block it ASAP, load the keys in background, then resume it.
8914 * The important idea about this function is that it can fail! If keys are
8915 * still swapped when the client is resumed, the key lookups will
8916 * just block loading keys from disk. In practical terms this should only
8917 * happen with the SORT BY command or if there is a bug in this function.
 *
 * 'cmd' supplies the vm_firstkey / vm_lastkey / vm_keystep range of
 * arguments that is scanned; 'c' is the client whose argv holds the keys.
 *
8919 * Return 1 if the client is marked as blocked, 0 if the client can
8920 * continue as the keys it is going to access appear to be in memory. */
8921 static int blockClientOnSwappedKeys(struct redisCommand
*cmd
, redisClient
*c
) {
/* A vm_firstkey of 0 means there is nothing to scan: report "not blocked"
 * (presumably the command takes no key arguments -- confirm against the
 * command table). */
8924 if (cmd
->vm_firstkey
== 0) return 0;
8925 last
= cmd
->vm_lastkey
;
/* A negative vm_lastkey is an offset from the end of the argument vector. */
8926 if (last
< 0) last
= c
->argc
+last
;
/* Visit every argument in [vm_firstkey, last] with stride vm_keystep. */
8927 for (j
= cmd
->vm_firstkey
; j
<= last
; j
+= cmd
->vm_keystep
)
8928 waitForSwappedKey(c
,c
->argv
[j
]);
8929 /* If the client was blocked for at least one key, mark it as blocked. */
8930 if (listLength(c
->io_keys
)) {
8931 c
->flags
|= REDIS_IO_WAIT
;
/* Unregister the readable event on the client's descriptor while it waits. */
8932 aeDeleteFileEvent(server
.el
,c
->fd
,AE_READABLE
);
8933 server
.vm_blocked_clients
++;
8940 /* Remove the 'key' from the list of blocked keys for a given client.
 *
 * Two structures are updated: the client's own c->io_keys list, and the
 * per-database c->db->io_keys dictionary that maps a key to the list of
 * clients waiting for it.
 *
8942 * The function returns 1 when there are no longer blocking keys after
8943 * the current one was removed (and the client can be unblocked). */
8944 static int dontWaitForSwappedKey(redisClient
*c
, robj
*key
) {
8948 struct dictEntry
*de
;
8950 /* Remove the key from the list of keys this client is waiting for. */
8951 listRewind(c
->io_keys
,&li
);
8952 while ((ln
= listNext(&li
)) != NULL
) {
/* compareStringObjects() == 0 means the list element matches 'key'. */
8953 if (compareStringObjects(ln
->value
,key
) == 0) {
8954 listDelNode(c
->io_keys
,ln
);
8960 /* Remove the client from the key => waiting clients map. */
8961 de
= dictFind(c
->db
->io_keys
,key
);
8963 l
= dictGetEntryVal(de
);
8964 ln
= listSearchKey(l
,c
);
/* Once no client is left waiting on this key, drop its dictionary entry. */
8967 if (listLength(l
) == 0)
8968 dictDelete(c
->db
->io_keys
,key
);
/* Non-zero when the client is not waiting on any other key. */
8970 return listLength(c
->io_keys
) == 0;
/* Process the clients recorded in db->io_keys as waiting for 'key': each
 * one stops waiting for this key, and every client that is then no longer
 * waiting on any key is appended to server.io_ready_clients. */
8973 static void handleClientsBlockedOnSwappedKey(redisDb
*db
, robj
*key
) {
8974 struct dictEntry
*de
;
8979 de
= dictFind(db
->io_keys
,key
);
8982 l
= dictGetEntryVal(de
);
/* The length is sampled up front; see the note below for the reason. */
8983 len
= listLength(l
);
8984 /* Note: we can't use something like while(listLength(l)) as the list
8985 * can be freed by the called function when we remove the last element
 * (dontWaitForSwappedKey() deletes the db->io_keys entry once the list
 * becomes empty). */
8988 redisClient
*c
= ln
->value
;
8990 if (dontWaitForSwappedKey(c
,key
)) {
8991 /* Put the client in the list of clients ready to go as we
8992 * loaded all the keys about it. */
8993 listAddNodeTail(server
.io_ready_clients
,c
);
8998 /* ================================= Debugging ============================== */
/* Implementation of the DEBUG command. The subcommand is taken from
 * c->argv[1] and compared case-insensitively against "segfault", "reload",
 * "loadaof", "object" and "swapout"; anything else gets a syntax error
 * reply.
 *
 * NOTE(review): c->argv[1] is read without an argc check in this function;
 * presumably the command table arity guarantees it exists -- confirm. */
9000 static void debugCommand(redisClient
*c
) {
9001 if (!strcasecmp(c
->argv
[1]->ptr
,"segfault")) {
9003 } else if (!strcasecmp(c
->argv
[1]->ptr
,"reload")) {
/* RELOAD: save the dataset to server.dbfilename, then load it back.
 * A failure of either step is answered with shared.err. */
9004 if (rdbSave(server
.dbfilename
) != REDIS_OK
) {
9005 addReply(c
,shared
.err
);
9009 if (rdbLoad(server
.dbfilename
) != REDIS_OK
) {
9010 addReply(c
,shared
.err
);
9013 redisLog(REDIS_WARNING
,"DB reloaded by DEBUG RELOAD");
9014 addReply(c
,shared
.ok
);
9015 } else if (!strcasecmp(c
->argv
[1]->ptr
,"loadaof")) {
/* LOADAOF: load the dataset from server.appendfilename. */
9017 if (loadAppendOnlyFile(server
.appendfilename
) != REDIS_OK
) {
9018 addReply(c
,shared
.err
);
9021 redisLog(REDIS_WARNING
,"Append Only File loaded by DEBUG LOADAOF");
9022 addReply(c
,shared
.ok
);
9023 } else if (!strcasecmp(c
->argv
[1]->ptr
,"object") && c
->argc
== 3) {
/* OBJECT <key>: report addresses, refcounts and storage details of the
 * key/value pair found in the client's current database. */
9024 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
9028 addReply(c
,shared
.nokeyerr
);
9031 key
= dictGetEntryKey(de
);
9032 val
= dictGetEntryVal(de
);
/* The value object is only inspected when VM is disabled or the key's
 * storage is REDIS_VM_MEMORY / REDIS_VM_SWAPPING; otherwise the swap
 * location stored in the key object is reported instead. */
9033 if (!server
.vm_enabled
|| (key
->storage
== REDIS_VM_MEMORY
||
9034 key
->storage
== REDIS_VM_SWAPPING
)) {
/* Bounds-check the encoding before indexing the strencoding name table. */
9038 if (val
->encoding
< (sizeof(strencoding
)/sizeof(char*))) {
9039 strenc
= strencoding
[val
->encoding
];
9041 snprintf(buf
,64,"unknown encoding %d\n", val
->encoding
);
9044 addReplySds(c
,sdscatprintf(sdsempty(),
9045 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9046 "encoding:%s serializedlength:%lld\r\n",
9047 (void*)key
, key
->refcount
, (void*)val
, val
->refcount
,
9048 strenc
, (long long) rdbSavedObjectLen(val
,NULL
)));
9050 addReplySds(c
,sdscatprintf(sdsempty(),
9051 "+Key at:%p refcount:%d, value swapped at: page %llu "
9052 "using %llu pages\r\n",
9053 (void*)key
, key
->refcount
, (unsigned long long) key
->vm
.page
,
9054 (unsigned long long) key
->vm
.usedpages
));
9056 } else if (!strcasecmp(c
->argv
[1]->ptr
,"swapout") && c
->argc
== 3) {
/* SWAPOUT <key>: synchronously swap the value of <key> out of memory. */
9057 dictEntry
*de
= dictFind(c
->db
->dict
,c
->argv
[2]);
9060 if (!server
.vm_enabled
) {
9061 addReplySds(c
,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9065 addReply(c
,shared
.nokeyerr
);
9068 key
= dictGetEntryKey(de
);
9069 val
= dictGetEntryVal(de
);
9070 /* If the key is shared we want to create a copy */
9071 if (key
->refcount
> 1) {
9072 robj
*newkey
= dupStringObject(key
);
/* Install the private copy as the key of the dictionary entry. */
9074 key
= dictGetEntryKey(de
) = newkey
;
9077 if (key
->storage
!= REDIS_VM_MEMORY
) {
9078 addReplySds(c
,sdsnew("-ERR This key is not in memory\r\n"));
9079 } else if (vmSwapObjectBlocking(key
,val
) == REDIS_OK
) {
/* After a successful swap the entry no longer references an in-memory
 * value. */
9080 dictGetEntryVal(de
) = NULL
;
9081 addReply(c
,shared
.ok
);
9083 addReply(c
,shared
.err
);
/* NOTE(review): the usage text below does not list LOADAOF although the
 * subcommand is handled above. */
9086 addReplySds(c
,sdsnew(
9087 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
/* Assertion failure reporter: logs the failed expression text 'estr'
 * together with the source 'file' and 'line' at REDIS_WARNING level.
 * When HAVE_BACKTRACE is defined an additional message announces that a
 * SIGSEGV is being forced so that the stack trace gets printed. */
9091 static void _redisAssert(char *estr
, char *file
, int line
) {
9092 redisLog(REDIS_WARNING
,"=== ASSERTION FAILED ===");
9093 redisLog(REDIS_WARNING
,"==> %s:%d '%s' is not true\n",file
,line
,estr
);
9094 #ifdef HAVE_BACKTRACE
9095 redisLog(REDIS_WARNING
,"(forcing SIGSEGV in order to print the stack trace)");
9100 /* =================================== Main! ================================ */
/* Read the kernel overcommit setting from /proc/sys/vm/overcommit_memory.
 * The caller linuxOvercommitMemoryWarning() compares the result with 0. */
9103 int linuxOvercommitMemoryValue(void) {
9104 FILE *fp
= fopen("/proc/sys/vm/overcommit_memory","r");
/* Read at most 63 characters of the first line into buf. */
9108 if (fgets(buf
,64,fp
) == NULL
) {
/* Log a REDIS_WARNING message explaining how to set vm.overcommit_memory
 * to 1 when linuxOvercommitMemoryValue() reports 0.
 *
 * NOTE(review): the message reads "low condition memory"; "low memory
 * condition" was probably intended. */
9117 void linuxOvercommitMemoryWarning(void) {
9118 if (linuxOvercommitMemoryValue() == 0) {
9119 redisLog(REDIS_WARNING
,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9122 #endif /* __linux__ */
/* Detach the server from the launching terminal: fork and let the parent
 * exit, start a new session, redirect the standard descriptors to
 * /dev/null and write the process id to server.pidfile. */
9124 static void daemonize(void) {
/* NOTE(review): fork() returns -1 on failure, which also satisfies
 * '!= 0', so a failed fork makes the process exit with status 0 without
 * ever daemonizing. */
9128 if (fork() != 0) exit(0); /* parent exits */
9129 setsid(); /* create a new session */
9131 /* Every output goes to /dev/null. If Redis is daemonized but
9132 * the 'logfile' is set to 'stdout' in the configuration file
9133 * it will not log at all. */
9134 if ((fd
= open("/dev/null", O_RDWR
, 0)) != -1) {
9135 dup2(fd
, STDIN_FILENO
);
9136 dup2(fd
, STDOUT_FILENO
);
9137 dup2(fd
, STDERR_FILENO
);
/* Close the temporary descriptor unless it is one of the standard three. */
9138 if (fd
> STDERR_FILENO
) close(fd
);
9140 /* Try to write the pid file */
9141 fp
= fopen(server
.pidfile
,"w");
9143 fprintf(fp
,"%d\n",getpid());
/* Program entry point. Accepts at most one argument, the path of the
 * configuration file, then starts the server: optional daemonization,
 * dataset loading and event loop setup/teardown. */
9148 int main(int argc
, char **argv
) {
/* Drop the built-in save parameters before the config file is parsed. */
9153 resetServerSaveParams();
9154 loadServerConfig(argv
[1]);
9155 } else if (argc
> 2) {
9156 fprintf(stderr
,"Usage: ./redis-server [/path/to/redis.conf]\n");
9159 redisLog(REDIS_WARNING
,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9161 if (server
.daemonize
) daemonize();
9163 redisLog(REDIS_NOTICE
,"Server started, Redis version " REDIS_VERSION
);
9165 linuxOvercommitMemoryWarning();
/* Dataset loading: the append only file is used when server.appendonly is
 * set; the RDB load that follows is presumably the alternative branch --
 * confirm.
 * NOTE(review): "%ld" assumes the time difference has type long. */
9168 if (server
.appendonly
) {
9169 if (loadAppendOnlyFile(server
.appendfilename
) == REDIS_OK
)
9170 redisLog(REDIS_NOTICE
,"DB loaded from append only file: %ld seconds",time(NULL
)-start
);
9172 if (rdbLoad(server
.dbfilename
) == REDIS_OK
)
9173 redisLog(REDIS_NOTICE
,"DB loaded from disk: %ld seconds",time(NULL
)-start
);
9175 redisLog(REDIS_NOTICE
,"The server is now ready to accept connections on port %d", server
.port
);
/* Register beforeSleep as the event loop's before-sleep callback. */
9176 aeSetBeforeSleepProc(server
.el
,beforeSleep
);
9178 aeDeleteEventLoop(server
.el
);
9182 /* ============================= Backtrace support ========================= */
9184 #ifdef HAVE_BACKTRACE
9185 static char *findFuncName(void *pointer
, unsigned long *offset
);
/* Return, as a void pointer, the instruction-pointer field stored in the
 * machine context of 'uc'. The field name differs per platform, so the
 * right accessor is selected at compile time. segvHandler() uses the
 * result as a backtrace address. */
9187 static void *getMcontextEip(ucontext_t
*uc
) {
9188 #if defined(__FreeBSD__)
9189 return (void*) uc
->uc_mcontext
.mc_eip
;
9190 #elif defined(__dietlibc__)
9191 return (void*) uc
->uc_mcontext
.eip
;
9192 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
/* Apple before 10.6: two variants, __rip and __eip. */
9194 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
9196 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
9198 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9199 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9200 return (void*) uc
->uc_mcontext
->__ss
.__rip
;
9202 return (void*) uc
->uc_mcontext
->__ss
.__eip
;
9204 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
/* NOTE(review): REG_EIP is used for the x86_64 case as well; confirm the
 * constant is available on 64 bit targets. */
9205 return (void*) uc
->uc_mcontext
.gregs
[REG_EIP
]; /* Linux 32/64 bit */
9206 #elif defined(__ia64__) /* Linux IA64 */
9207 return (void*) uc
->uc_mcontext
.sc_ip
;
/* Signal handler installed by setupSigSegvAction(). Logs the signal
 * number, the output of genRedisInfoString() and a stack trace in which
 * frames are resolved through findFuncName() when possible.
 *
 * sig:    the signal number, printed in the banner line.
 * info:   unused.
 * secret: the context pointer handed to the handler, used as ucontext_t*. */
9213 static void segvHandler(int sig
, siginfo_t
*info
, void *secret
) {
9215 char **messages
= NULL
;
9216 int i
, trace_size
= 0;
9217 unsigned long offset
=0;
9218 ucontext_t
*uc
= (ucontext_t
*) secret
;
9220 REDIS_NOTUSED(info
);
9222 redisLog(REDIS_WARNING
,
9223 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION
, sig
);
9224 infostring
= genRedisInfoString();
9225 redisLog(REDIS_WARNING
, "%s",infostring
);
9226 /* It's not safe to sdsfree() the returned string under memory
9227 * corruption conditions. Let it leak as we are going to abort */
/* Collect at most 100 return addresses. */
9229 trace_size
= backtrace(trace
, 100);
9230 /* overwrite sigaction with caller's address */
9231 if (getMcontextEip(uc
) != NULL
) {
9232 trace
[1] = getMcontextEip(uc
);
/* NOTE(review): backtrace_symbols() can return NULL; messages[i] is
 * dereferenced below with no check visible here. */
9234 messages
= backtrace_symbols(trace
, trace_size
);
/* Frame 0 is skipped. */
9236 for (i
=1; i
<trace_size
; ++i
) {
9237 char *fn
= findFuncName(trace
[i
], &offset
), *p
;
/* Use the backtrace_symbols() text when findFuncName() found nothing,
 * or when the number after '+' in that text is smaller than the offset
 * findFuncName() reported. */
9239 p
= strchr(messages
[i
],'+');
9240 if (!fn
|| (p
&& ((unsigned long)strtol(p
+1,NULL
,10)) < offset
)) {
9241 redisLog(REDIS_WARNING
,"%s", messages
[i
]);
/* NOTE(review): the last conversion is "%d" but the argument is cast to
 * unsigned int. */
9243 redisLog(REDIS_WARNING
,"%d redis-server %p %s + %d", i
, trace
[i
], fn
, (unsigned int)offset
);
9246 /* free(messages); Don't call free() with possibly corrupted memory. */
/* Install segvHandler() as the handler for SIGSEGV, SIGBUS, SIGFPE and
 * SIGILL, using the three-argument sa_sigaction form (SA_SIGINFO). */
9250 static void setupSigSegvAction(void) {
9251 struct sigaction act
;
/* Start from an empty mask of signals blocked during the handler. */
9253 sigemptyset (&act
.sa_mask
);
9254 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9255 * is used. Otherwise, sa_handler is used */
9256 act
.sa_flags
= SA_NODEFER
| SA_ONSTACK
| SA_RESETHAND
| SA_SIGINFO
;
9257 act
.sa_sigaction
= segvHandler
;
9258 sigaction (SIGSEGV
, &act
, NULL
);
9259 sigaction (SIGBUS
, &act
, NULL
);
9260 sigaction (SIGFPE
, &act
, NULL
);
9261 sigaction (SIGILL
, &act
, NULL
);
/* NOTE(review): SIGBUS was already registered above; this second call is
 * redundant. */
9262 sigaction (SIGBUS
, &act
, NULL
);
9266 #include "staticsymbols.h"
9267 /* This function tries to convert a pointer into a function name. It's used in
9268 * order to provide a backtrace under segmentation fault that's able to
9269 * display functions declared as static (otherwise the backtrace is useless).
 *
 * The lookup scans symsTable, which ends at the first entry whose pointer
 * field is zero, and picks the entry at or below 'pointer' with the
 * smallest distance to it. NULL is returned when no entry qualifies.
 * 'offset' is an out-parameter: segvHandler() passes the address of a local
 * and reads it after the call (presumably it receives the winning
 * distance -- confirm). */
9270 static char *findFuncName(void *pointer
, unsigned long *offset
){
9272 unsigned long off
, minoff
= 0;
9274 /* Try to match against the Symbol with the smallest offset */
9275 for (i
=0; symsTable
[i
].pointer
; i
++) {
9276 unsigned long lp
= (unsigned long) pointer
;
/* Ignore the all-ones address and symbols located above the pointer. */
9278 if (lp
!= (unsigned long)-1 && lp
>= symsTable
[i
].pointer
) {
9279 off
=lp
-symsTable
[i
].pointer
;
/* Take the first candidate, or a strictly closer one. */
9280 if (ret
< 0 || off
< minoff
) {
9286 if (ret
== -1) return NULL
;
9288 return symsTable
[ret
].name
;
9290 #else /* HAVE_BACKTRACE */
/* Variant of setupSigSegvAction() built when HAVE_BACKTRACE is not defined. */
9291 static void setupSigSegvAction(void) {
9293 #endif /* HAVE_BACKTRACE */