]> git.saurik.com Git - redis.git/blame_incremental - redis.c
Merge branch 'master' of github.com:antirez/redis
[redis.git] / redis.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#define REDIS_VERSION "1.3.8"
31
32#include "fmacros.h"
33#include "config.h"
34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#define __USE_POSIX199309
41#define __USE_UNIX98
42#include <signal.h>
43
44#ifdef HAVE_BACKTRACE
45#include <execinfo.h>
46#include <ucontext.h>
47#endif /* HAVE_BACKTRACE */
48
49#include <sys/wait.h>
50#include <errno.h>
51#include <assert.h>
52#include <ctype.h>
53#include <stdarg.h>
54#include <inttypes.h>
55#include <arpa/inet.h>
56#include <sys/stat.h>
57#define __USE_GNU
58#include <fcntl.h>
59#include <sys/time.h>
60#include <sys/resource.h>
61#include <sys/uio.h>
62#include <limits.h>
63#include <math.h>
64#include <pthread.h>
65
66#if defined(__sun)
67#include "solarisfixes.h"
68#endif
69
70#include "redis.h"
71#include "ae.h" /* Event driven programming library */
72#include "sds.h" /* Dynamic safe strings */
73#include "anet.h" /* Networking the easy way */
74#include "dict.h" /* Hash tables */
75#include "adlist.h" /* Linked lists */
76#include "zmalloc.h" /* total memory usage aware version of malloc/free */
77#include "lzf.h" /* LZF compression library */
78#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
79#include "zipmap.h"
80
81/* Error codes */
82#define REDIS_OK 0
83#define REDIS_ERR -1
84
85/* Static server configuration */
86#define REDIS_SERVERPORT 6379 /* TCP port */
87#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
88#define REDIS_IOBUF_LEN 1024
89#define REDIS_LOADBUF_LEN 1024
90#define REDIS_STATIC_ARGS 8
91#define REDIS_DEFAULT_DBNUM 16
92#define REDIS_CONFIGLINE_MAX 1024
93#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
94#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
95#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
96#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
97#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98
99/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
100#define REDIS_WRITEV_THRESHOLD 3
101/* Max number of iovecs used for each writev call */
102#define REDIS_WRITEV_IOVEC_COUNT 256
103
104/* Hash table parameters */
105#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
106
107/* Command flags. Each flag is a distinct bit; cmdTable ORs them together in
 * the 'flags' member of struct redisCommand. */
108#define REDIS_CMD_BULK 1 /* Bulk write command */
109#define REDIS_CMD_INLINE 2 /* Inline command */
110/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
111 this flag will return an error when the 'maxmemory' option is set in the
112 config file and the server is using more than maxmemory bytes of memory.
113 In short these commands are denied on low memory conditions. */
114#define REDIS_CMD_DENYOOM 4
115#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
116
117/* Object types */
118#define REDIS_STRING 0
119#define REDIS_LIST 1
120#define REDIS_SET 2
121#define REDIS_ZSET 3
122#define REDIS_HASH 4
123
124/* Objects encoding. Some kinds of objects like Strings and Hashes can be
125 * internally represented in multiple ways. The 'encoding' field of the object
126 * is set to one of these values for this object. */
127#define REDIS_ENCODING_RAW 0 /* Raw representation */
128#define REDIS_ENCODING_INT 1 /* Encoded as integer */
129#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
130#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
131
/* Printable name of each encoding. The entries are listed in the same order
 * as the REDIS_ENCODING_* values above (0..3), so a new encoding needs a new
 * entry at the matching position. */
132static char* strencoding[] = {
133 "raw", "int", "zipmap", "hashtable"
134};
135
136/* Object types only used for dumping to disk */
137#define REDIS_EXPIRETIME 253
138#define REDIS_SELECTDB 254
139#define REDIS_EOF 255
140
141/* Defines related to the dump file format. To store 32 bits lengths for short
142 * keys requires a lot of space, so we check the most significant 2 bits of
143 * the first byte to interpret the length:
144 *
145 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
146 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
147 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
148 * 11|000000 this means: specially encoded object will follow. The six bits
149 * number specify the kind of object that follows.
150 * See the REDIS_RDB_ENC_* defines.
151 *
152 * Lengths up to 63 are stored using a single byte, most DB keys, and many
153 * values, will fit inside. */
154#define REDIS_RDB_6BITLEN 0
155#define REDIS_RDB_14BITLEN 1
156#define REDIS_RDB_32BITLEN 2
157#define REDIS_RDB_ENCVAL 3
158#define REDIS_RDB_LENERR UINT_MAX
159
160/* When a length of a string object stored on disk has the first two bits
161 * set, the remaining six bits specify a special encoding for the object
162 * accordingly to the following defines: */
163#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
164#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
165#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
166#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
167
168/* Virtual memory object->where field. */
169#define REDIS_VM_MEMORY 0 /* The object is on memory */
170#define REDIS_VM_SWAPPED 1 /* The object is on disk */
171#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
172#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
173
174/* Virtual memory static configuration stuff.
175 * Check vmFindContiguousPages() to know more about this magic numbers. */
176#define REDIS_VM_MAX_NEAR_PAGES 65536
177#define REDIS_VM_MAX_RANDOM_JUMP 4096
178#define REDIS_VM_MAX_THREADS 32
179#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
180/* The following is the *percentage* of completed I/O jobs to process when the
181 * handler is called. While Virtual Memory I/O operations are performed by
182 * threads, these operations must be processed by the main thread when completed
183 * in order to take effect. */
184#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185
186/* Client flags */
187#define REDIS_SLAVE 1 /* This client is a slave server */
188#define REDIS_MASTER 2 /* This client is a master server */
189#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
190#define REDIS_MULTI 8 /* This client is in a MULTI context */
191#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
192#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
193
194/* Slave replication state - slave side */
195#define REDIS_REPL_NONE 0 /* No active replication */
196#define REDIS_REPL_CONNECT 1 /* Must connect to master */
197#define REDIS_REPL_CONNECTED 2 /* Connected to master */
198
199/* Slave replication state - from the point of view of master
200 * Note that in SEND_BULK and ONLINE state the slave receives new updates
201 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
202 * to start the next background saving in order to send updates to it. */
203#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
204#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
205#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
206#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
207
208/* List related stuff */
209#define REDIS_HEAD 0
210#define REDIS_TAIL 1
211
212/* Sort operations */
213#define REDIS_SORT_GET 0
214#define REDIS_SORT_ASC 1
215#define REDIS_SORT_DESC 2
216#define REDIS_SORTKEY_MAX 1024
217
218/* Log levels */
219#define REDIS_DEBUG 0
220#define REDIS_VERBOSE 1
221#define REDIS_NOTICE 2
222#define REDIS_WARNING 3
223
224/* Anti-warning macro... */
225#define REDIS_NOTUSED(V) ((void) V)
226
227#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
228#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
229
230/* Append only defines */
231#define APPENDFSYNC_NO 0
232#define APPENDFSYNC_ALWAYS 1
233#define APPENDFSYNC_EVERYSEC 2
234
235/* Hashes related defaults */
236#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
237#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
238
239/* We can print the stacktrace, so our assert is defined this way: */
/* Assertion and panic helpers. Both hand the stringified argument together
 * with __FILE__ and __LINE__ to the matching _redis* reporting function
 * declared below, and then terminate the process with _exit(1).
 *
 * redisAssert(_e): evaluates _e once; nothing happens when it is true.
 * redisPanic(_e):  unconditional. Note that _e is stringified with '#', so a
 *                  string literal argument is passed on including its quotes.
 *
 * Both expansions are fully parenthesized so the comma expression remains a
 * single operand wherever the macro is used (e.g. as an arm of ?: or next to
 * another operator). */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
244
245/*================================= Data types ============================== */
246
247/* A redis object, that is a type able to hold a string / list / set */
248
249/* The VM object structure: swap-file bookkeeping for one object. It is
 * embedded as the last member ('vm') of struct redisObject.
 * NOTE(review): the 'vm' declarator after the closing brace also defines a
 * file-scope variable named 'vm' of this type. That looks unintentional --
 * confirm nothing references it before removing it. */
250struct redisObjectVM {
251 off_t page; /* the page at which the object is stored on disk */
252 off_t usedpages; /* number of pages used on disk */
253 time_t atime; /* Last access time */
254} vm;
255
256/* The actual Redis Object */
257typedef struct redisObject {
258 void *ptr; /* payload; presumably interpreted according to type/encoding -- confirm */
259 unsigned char type; /* presumably one of the "Object types" constants (REDIS_STRING, ...) -- confirm */
260 unsigned char encoding; /* one of the REDIS_ENCODING_* values */
261 unsigned char storage; /* If this object is a key, where is the value?
262 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
263 unsigned char vtype; /* If this object is a key, and value is swapped out,
264 * this is the type of the swapped out object. */
265 int refcount; /* reference count; see incrRefCount()/decrRefCount() */
266 /* VM fields, these are only allocated if VM is active, otherwise the
267 * object allocation function will just allocate
268 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
269 * Redis without VM active will not have any overhead.
 * Because of that truncated allocation 'vm' must remain the last member. */
270 struct redisObjectVM vm;
271} robj;
272
/* Initialize a Redis string object that lives in caller-owned storage
 * (typically on the stack) rather than one obtained from createObject().
 *
 *   _var  an lvalue expression of type robj (not a pointer to one)
 *   _ptr  the value stored into the object's 'ptr' field
 *
 * refcount is set to 1, type to REDIS_STRING and encoding to
 * REDIS_ENCODING_RAW; 'storage' is set to REDIS_VM_MEMORY only when
 * server.vm_enabled is true and is otherwise left untouched.
 *
 * The macro is kept next to the structure definition so that it gets updated
 * whenever the structure changes (this is how bug #85 was introduced).
 *
 * Both arguments are parenthesized in the expansion, so expressions such as
 * '*objp' or 'arr[i]' can be passed as _var without precedence surprises.
 *
 * NOTE(review): the ';' after 'while(0)' defeats the do/while(0) idiom (the
 * macro cannot be used as the body of an 'if' that has an 'else'). It is
 * kept here because callers outside this view may rely on it -- confirm and
 * drop it once every call site is known to supply its own semicolon. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0);
284
/* One database of the server. struct redisServer reaches these through its
 * 'db' pointer and every redisClient points at one through its 'db' field. */
285typedef struct redisDb {
286 dict *dict; /* The keyspace for this DB */
287 dict *expires; /* Timeout of keys with a timeout set */
288 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
289 dict *io_keys; /* Keys with clients waiting for VM I/O */
290 int id; /* presumably the index of this DB -- confirm where it is assigned */
291} redisDb;
292
293/* Client MULTI/EXEC state */
/* A single command recorded while the client is in a MULTI context:
 * its argument vector and the command table entry it resolved to
 * (see queueMultiCommand()). */
294typedef struct multiCmd {
295 robj **argv;
296 int argc;
297 struct redisCommand *cmd;
298} multiCmd;
299
/* The per-client list of recorded commands; embedded in redisClient as
 * 'mstate'. */
300typedef struct multiState {
301 multiCmd *commands; /* Array of MULTI commands */
302 int count; /* Total number of MULTI commands */
303} multiState;
304
305/* With multiplexing we need to take per-client state.
306 * Clients are taken in a linked list. */
307typedef struct redisClient {
308 int fd;
309 redisDb *db;
310 int dictid;
311 sds querybuf;
312 robj **argv, **mbargv;
313 int argc, mbargc;
314 int bulklen; /* bulk read len. -1 if not in bulk read mode */
315 int multibulk; /* multi bulk command format active */
316 list *reply;
317 int sentlen;
318 time_t lastinteraction; /* time of the last interaction, used for timeout */
319 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
320 int slaveseldb; /* slave selected db, if this client is a slave */
321 int authenticated; /* when requirepass is non-NULL */
322 int replstate; /* replication state if this is a slave */
323 int repldbfd; /* replication DB file descriptor */
324 long repldboff; /* replication DB file offset */
325 off_t repldbsize; /* replication DB file size */
326 multiState mstate; /* MULTI/EXEC state */
327 robj **blockingkeys; /* The key we are waiting to terminate a blocking
328 * operation such as BLPOP. Otherwise NULL. */
329 int blockingkeysnum; /* Number of blocking keys */
330 time_t blockingto; /* Blocking operation timeout. If UNIX current time
331 * is >= blockingto then the operation timed out. */
332 list *io_keys; /* Keys this client is waiting to be loaded from the
333 * swap file in order to continue. */
334 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
335 list *pubsub_patterns; /* patterns a client is interested in (presumably
 * via PSUBSCRIBE -- confirm) */
336} redisClient;
337
/* Element type of the redisServer.saveparams array.
 * NOTE(review): presumably one "save after <seconds> if at least <changes>
 * changes happened" rule -- confirm against the code that reads the array. */
338struct saveparam {
339 time_t seconds;
340 int changes;
341};
342
343/* Global server state structure */
344struct redisServer {
345 int port;
346 int fd;
347 redisDb *db;
348 long long dirty; /* changes to DB from the last save */
349 list *clients;
350 list *slaves, *monitors;
351 char neterr[ANET_ERR_LEN];
352 aeEventLoop *el;
353 int cronloops; /* number of times the cron function run */
354 list *objfreelist; /* A list of freed objects to avoid malloc() */
355 time_t lastsave; /* Unix time of last successful save */
356 /* Fields used only for stats */
357 time_t stat_starttime; /* server start time */
358 long long stat_numcommands; /* number of processed commands */
359 long long stat_numconnections; /* number of connections received */
360 long long stat_expiredkeys; /* number of expired keys */
361 /* Configuration */
362 int verbosity; /* presumably one of the "Log levels" constants -- confirm */
363 int glueoutputbuf;
364 int maxidletime;
365 int dbnum;
366 int daemonize;
367 int appendonly;
368 int appendfsync; /* presumably one of the APPENDFSYNC_* values -- confirm */
369 time_t lastfsync;
370 int appendfd;
371 int appendseldb;
372 char *pidfile;
373 pid_t bgsavechildpid;
374 pid_t bgrewritechildpid;
375 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
376 struct saveparam *saveparams; /* presumably an array of saveparamslen entries -- confirm */
377 int saveparamslen;
378 char *logfile;
379 char *bindaddr;
380 char *dbfilename;
381 char *appendfilename;
382 char *requirepass;
383 int shareobjects;
384 int rdbcompression;
385 int activerehashing;
386 /* Replication related */
387 int isslave;
388 char *masterauth;
389 char *masterhost;
390 int masterport;
391 redisClient *master; /* client that is master for this slave */
392 int replstate;
393 unsigned int maxclients;
394 unsigned long long maxmemory;
395 unsigned int blpop_blocked_clients;
396 unsigned int vm_blocked_clients;
397 /* Sort parameters - qsort_r() is only available under BSD so we
398 * have to take this state global, in order to pass it to sortCompare() */
399 int sort_desc;
400 int sort_alpha;
401 int sort_bypattern;
402 /* Virtual memory configuration */
403 int vm_enabled;
404 char *vm_swap_file;
405 off_t vm_page_size;
406 off_t vm_pages;
407 unsigned long long vm_max_memory;
408 /* Hashes config */
409 size_t hash_max_zipmap_entries;
410 size_t hash_max_zipmap_value;
411 /* Virtual memory state */
412 FILE *vm_fp;
413 int vm_fd;
414 off_t vm_next_page; /* Next probably empty page */
415 off_t vm_near_pages; /* Number of pages allocated sequentially */
416 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
417 time_t unixtime; /* Unix time sampled every second. */
418 /* Virtual memory I/O threads stuff */
419 /* An I/O thread processes an element taken from the io_newjobs queue and
420 * puts the result of the operation in the io_processed list. While the
421 * job is being processed, it's put on the io_processing queue. */
422 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
423 list *io_processing; /* List of VM I/O jobs being processed */
424 list *io_processed; /* List of VM I/O jobs already processed */
425 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
426 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
427 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
428 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
429 pthread_attr_t io_threads_attr; /* attributes for threads creation */
430 int io_active_threads; /* Number of running I/O threads */
431 int vm_max_threads; /* Max number of I/O threads running at the same time */
432 /* Our main thread is blocked on the event loop, looking for sockets ready
433 * to be read or written, so when a threaded I/O operation is ready to be
434 * processed by the main thread, the I/O thread will use a unix pipe to
435 * awake the main thread. The following are the two pipe FDs. */
436 int io_ready_pipe_read;
437 int io_ready_pipe_write;
438 /* Virtual memory stats */
439 unsigned long long vm_stats_used_pages;
440 unsigned long long vm_stats_swapped_objects;
441 unsigned long long vm_stats_swapouts;
442 unsigned long long vm_stats_swapins;
443 /* Pubsub */
444 dict *pubsub_channels; /* Map channels to list of subscribed clients */
445 list *pubsub_patterns; /* A list of pubsub_patterns */
446 /* Misc */
447 FILE *devnull;
448};
449
/* Pairs a client with a pattern object.
 * NOTE(review): presumably the element type stored in the server-wide
 * pubsub_patterns list (see freePubsubPattern()/listMatchPubsubPattern())
 * -- confirm. */
450typedef struct pubsubPattern {
451 redisClient *client;
452 robj *pattern;
453} pubsubPattern;
454
/* Signature shared by every command implementation and by the optional
 * VM preload hook: the only argument is the calling client. */
455typedef void redisCommandProc(redisClient *c);
/* One entry of the command table. cmdTable initializes these members
 * positionally, so their order must not be changed. */
456struct redisCommand {
457 char *name; /* command name as written in cmdTable, e.g. "get" */
458 redisCommandProc *proc; /* function implementing the command */
459 int arity; /* argument count; cmdTable also uses negative values,
 * presumably meaning "at least -arity" -- confirm */
460 int flags; /* OR of REDIS_CMD_* flags */
461 /* Use a function to determine which keys need to be loaded
462 * in the background prior to executing this command. Takes precedence
463 * over vm_firstkey and others, ignored when NULL */
464 redisCommandProc *vm_preload_proc;
465 /* What keys should be loaded in background when calling this command? */
466 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
467 int vm_lastkey; /* The last argument that's a key */
468 int vm_keystep; /* The step between first and last key */
469};
470
/* A function name paired with an address stored as an unsigned long.
 * NOTE(review): presumably used to map addresses back to function names
 * when printing stack traces -- confirm where the table is defined. */
471struct redisFunctionSym {
472 char *name;
473 unsigned long pointer;
474};
475
/* One element being sorted: the object itself plus a sort key that is either
 * a numeric score or another object to compare by (only one of the two union
 * members is meaningful at a time; which one presumably depends on the sort
 * mode -- confirm in the SORT implementation). */
476typedef struct _redisSortObject {
477 robj *obj;
478 union {
479 double score;
480 robj *cmpobj;
481 } u;
482} redisSortObject;
483
/* An operation attached to a sort, with an associated pattern object.
 * 'type' is presumably one of the REDIS_SORT_* constants -- confirm. */
484typedef struct _redisSortOperation {
485 int type;
486 robj *pattern;
487} redisSortOperation;
488
489/* ZSETs use a specialized version of Skiplists */
490
/* A skiplist node carrying a (score, obj) pair. 'forward' and 'span' are
 * pointers to separately sized storage, presumably one slot per level of the
 * node -- confirm where nodes are allocated. 'backward' is a single pointer
 * to another node. */
491typedef struct zskiplistNode {
492 struct zskiplistNode **forward;
493 struct zskiplistNode *backward;
494 unsigned int *span;
495 double score;
496 robj *obj;
497} zskiplistNode;
498
/* The skiplist itself: end nodes, element count and level (see zslCreate(),
 * zslInsert() and zslFree()). */
499typedef struct zskiplist {
500 struct zskiplistNode *header, *tail;
501 unsigned long length;
502 int level;
503} zskiplist;
504
/* A sorted set is represented by a hash table together with a skiplist. */
505typedef struct zset {
506 dict *dict;
507 zskiplist *zsl;
508} zset;
509
510/* Our shared "common" objects */
511
/* Number of entries in shared.integers[]. */
512#define REDIS_SHARED_INTEGERS 10000
/* Pointers to the shared objects, gathered in the single file-scope instance
 * 'shared'. NOTE(review): the members look like pre-built protocol replies
 * plus a table of small integer objects -- confirm where 'shared' is
 * populated. */
513struct sharedObjectsStruct {
514 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
515 *colon, *nullbulk, *nullmultibulk, *queued,
516 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
517 *outofrangeerr, *plus,
518 *select0, *select1, *select2, *select3, *select4,
519 *select5, *select6, *select7, *select8, *select9,
520 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
521 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
522} shared;
523
524/* Global vars that are actually used as constants. The following double
525 * values are used for double on-disk serialization, and are initialized
526 * at runtime to avoid strange compiler optimizations. */
527
528static double R_Zero, R_PosInf, R_NegInf, R_Nan;
529
530/* VM threaded I/O request message */
531#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
532#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
533#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
534typedef struct iojob {
535 int type; /* Request type, REDIS_IOJOB_* */
536 redisDb *db;/* Redis database */
537 robj *key; /* This I/O request is about swapping this key */
538 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
539 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
540 off_t page; /* Swap page where to read/write the object */
541 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
542 int canceled; /* True if this command was canceled by blocking side of VM */
543 pthread_t thread; /* ID of the thread processing this entry */
544} iojob;
545
546/*================================ Prototypes =============================== */
547
548static void freeStringObject(robj *o);
549static void freeListObject(robj *o);
550static void freeSetObject(robj *o);
551static void decrRefCount(void *o);
552static robj *createObject(int type, void *ptr);
553static void freeClient(redisClient *c);
554static int rdbLoad(char *filename);
555static void addReply(redisClient *c, robj *obj);
556static void addReplySds(redisClient *c, sds s);
557static void incrRefCount(robj *o);
558static int rdbSaveBackground(char *filename);
559static robj *createStringObject(char *ptr, size_t len);
560static robj *dupStringObject(robj *o);
561static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
562static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
563static int syncWithMaster(void);
564static robj *tryObjectEncoding(robj *o);
565static robj *getDecodedObject(robj *o);
566static int removeExpire(redisDb *db, robj *key);
567static int expireIfNeeded(redisDb *db, robj *key);
568static int deleteIfVolatile(redisDb *db, robj *key);
569static int deleteIfSwapped(redisDb *db, robj *key);
570static int deleteKey(redisDb *db, robj *key);
571static time_t getExpire(redisDb *db, robj *key);
572static int setExpire(redisDb *db, robj *key, time_t when);
573static void updateSlavesWaitingBgsave(int bgsaveerr);
574static void freeMemoryIfNeeded(void);
575static int processCommand(redisClient *c);
576static void setupSigSegvAction(void);
577static void rdbRemoveTempFile(pid_t childpid);
578static void aofRemoveTempFile(pid_t childpid);
579static size_t stringObjectLen(robj *o);
580static void processInputBuffer(redisClient *c);
581static zskiplist *zslCreate(void);
582static void zslFree(zskiplist *zsl);
583static void zslInsert(zskiplist *zsl, double score, robj *obj);
584static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
585static void initClientMultiState(redisClient *c);
586static void freeClientMultiState(redisClient *c);
587static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
588static void unblockClientWaitingData(redisClient *c);
589static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
590static void vmInit(void);
591static void vmMarkPagesFree(off_t page, off_t count);
592static robj *vmLoadObject(robj *key);
593static robj *vmPreviewObject(robj *key);
594static int vmSwapOneObjectBlocking(void);
595static int vmSwapOneObjectThreaded(void);
596static int vmCanSwapOut(void);
597static int tryFreeOneObjectFromFreelist(void);
598static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
599static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
600static void vmCancelThreadedIOJob(robj *o);
601static void lockThreadedIO(void);
602static void unlockThreadedIO(void);
603static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
604static void freeIOJob(iojob *j);
605static void queueIOJob(iojob *j);
606static int vmWriteObjectOnSwap(robj *o, off_t page);
607static robj *vmReadObjectFromSwap(off_t page, int type);
608static void waitEmptyIOJobsQueue(void);
609static void vmReopenSwapFile(void);
610static int vmFreePage(off_t page);
611static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
612static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
613static int dontWaitForSwappedKey(redisClient *c, robj *key);
614static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
615static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
616static struct redisCommand *lookupCommand(char *name);
617static void call(redisClient *c, struct redisCommand *cmd);
618static void resetClient(redisClient *c);
619static void convertToRealHash(robj *o);
620static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
621static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
622static void freePubsubPattern(void *p);
623static int listMatchPubsubPattern(void *a, void *b);
624static int compareStringObjects(robj *a, robj *b);
/* Declared with an explicit (void): an empty parameter list is not a
 * prototype in C, so calls passing arguments would go unchecked. */
static void usage(void);
626
627static void authCommand(redisClient *c);
628static void pingCommand(redisClient *c);
629static void echoCommand(redisClient *c);
630static void setCommand(redisClient *c);
631static void setnxCommand(redisClient *c);
632static void getCommand(redisClient *c);
633static void delCommand(redisClient *c);
634static void existsCommand(redisClient *c);
635static void incrCommand(redisClient *c);
636static void decrCommand(redisClient *c);
637static void incrbyCommand(redisClient *c);
638static void decrbyCommand(redisClient *c);
639static void selectCommand(redisClient *c);
640static void randomkeyCommand(redisClient *c);
641static void keysCommand(redisClient *c);
642static void dbsizeCommand(redisClient *c);
643static void lastsaveCommand(redisClient *c);
644static void saveCommand(redisClient *c);
645static void bgsaveCommand(redisClient *c);
646static void bgrewriteaofCommand(redisClient *c);
647static void shutdownCommand(redisClient *c);
648static void moveCommand(redisClient *c);
649static void renameCommand(redisClient *c);
650static void renamenxCommand(redisClient *c);
651static void lpushCommand(redisClient *c);
652static void rpushCommand(redisClient *c);
653static void lpopCommand(redisClient *c);
654static void rpopCommand(redisClient *c);
655static void llenCommand(redisClient *c);
656static void lindexCommand(redisClient *c);
657static void lrangeCommand(redisClient *c);
658static void ltrimCommand(redisClient *c);
659static void typeCommand(redisClient *c);
660static void lsetCommand(redisClient *c);
661static void saddCommand(redisClient *c);
662static void sremCommand(redisClient *c);
663static void smoveCommand(redisClient *c);
664static void sismemberCommand(redisClient *c);
665static void scardCommand(redisClient *c);
666static void spopCommand(redisClient *c);
667static void srandmemberCommand(redisClient *c);
668static void sinterCommand(redisClient *c);
669static void sinterstoreCommand(redisClient *c);
670static void sunionCommand(redisClient *c);
671static void sunionstoreCommand(redisClient *c);
672static void sdiffCommand(redisClient *c);
673static void sdiffstoreCommand(redisClient *c);
674static void syncCommand(redisClient *c);
675static void flushdbCommand(redisClient *c);
676static void flushallCommand(redisClient *c);
677static void sortCommand(redisClient *c);
678static void lremCommand(redisClient *c);
679static void rpoplpushcommand(redisClient *c);
680static void infoCommand(redisClient *c);
681static void mgetCommand(redisClient *c);
682static void monitorCommand(redisClient *c);
683static void expireCommand(redisClient *c);
684static void expireatCommand(redisClient *c);
685static void getsetCommand(redisClient *c);
686static void ttlCommand(redisClient *c);
687static void slaveofCommand(redisClient *c);
688static void debugCommand(redisClient *c);
689static void msetCommand(redisClient *c);
690static void msetnxCommand(redisClient *c);
691static void zaddCommand(redisClient *c);
692static void zincrbyCommand(redisClient *c);
693static void zrangeCommand(redisClient *c);
694static void zrangebyscoreCommand(redisClient *c);
695static void zcountCommand(redisClient *c);
696static void zrevrangeCommand(redisClient *c);
697static void zcardCommand(redisClient *c);
698static void zremCommand(redisClient *c);
699static void zscoreCommand(redisClient *c);
700static void zremrangebyscoreCommand(redisClient *c);
701static void multiCommand(redisClient *c);
702static void execCommand(redisClient *c);
703static void discardCommand(redisClient *c);
704static void blpopCommand(redisClient *c);
705static void brpopCommand(redisClient *c);
706static void appendCommand(redisClient *c);
707static void substrCommand(redisClient *c);
708static void zrankCommand(redisClient *c);
709static void zrevrankCommand(redisClient *c);
710static void hsetCommand(redisClient *c);
711static void hsetnxCommand(redisClient *c);
712static void hgetCommand(redisClient *c);
713static void hmsetCommand(redisClient *c);
714static void hmgetCommand(redisClient *c);
715static void hdelCommand(redisClient *c);
716static void hlenCommand(redisClient *c);
717static void zremrangebyrankCommand(redisClient *c);
718static void zunionCommand(redisClient *c);
719static void zinterCommand(redisClient *c);
720static void hkeysCommand(redisClient *c);
721static void hvalsCommand(redisClient *c);
722static void hgetallCommand(redisClient *c);
723static void hexistsCommand(redisClient *c);
724static void configCommand(redisClient *c);
725static void hincrbyCommand(redisClient *c);
726static void subscribeCommand(redisClient *c);
727static void unsubscribeCommand(redisClient *c);
728static void psubscribeCommand(redisClient *c);
729static void punsubscribeCommand(redisClient *c);
730static void publishCommand(redisClient *c);
731
732/*================================= Globals ================================= */
733
734/* Global vars */
735static struct redisServer server; /* server global state */
736static struct redisCommand cmdTable[] = {
737 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
738 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
739 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
740 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
742 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
743 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
744 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
745 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
747 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
748 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
749 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
759 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
760 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
761 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
762 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
763 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
764 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
768 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
769 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
770 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
771 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
772 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
773 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
775 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
776 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
777 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
780 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
781 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
782 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
788 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
789 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
790 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
791 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
792 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
793 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
794 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
796 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
797 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
801 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
802 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
803 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
804 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
805 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
806 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
810 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
818 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
824 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
826 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
831 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
834 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
837 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
842 {NULL,NULL,0,0,NULL,0,0,0}
843};
844
845/*============================ Utility functions ============================ */
846
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of bytes (including none)
 *   ?        matches exactly one byte
 *   [abc]    matches one byte out of the set; [^abc] negates the set,
 *            a-z inside the brackets is a range, \x escapes x
 *   \x       matches the byte x literally
 *
 * pattern/patternLen  the pattern and its length in bytes
 * string/stringLen    the subject and its length in bytes
 * nocase              when non-zero, literals and ranges compare
 *                     case-insensitively (escaped bytes inside [...] are
 *                     always compared exactly)
 *
 * Returns 1 on match, 0 otherwise. Neither buffer is ever read beyond its
 * stated length, so the inputs do not need to be NUL terminated. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without peeking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A character class always consumes one subject byte. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands exactly on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    /* Escape only when there is a byte left to escape. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a subject byte to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
969
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are taken with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
973
974static void redisLog(int level, const char *fmt, ...) {
975 va_list ap;
976 FILE *fp;
977
978 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
979 if (!fp) return;
980
981 va_start(ap, fmt);
982 if (level >= server.verbosity) {
983 char *c = ".-*#";
984 char buf[64];
985 time_t now;
986
987 now = time(NULL);
988 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
989 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
990 vfprintf(fp, fmt, ap);
991 fprintf(fp,"\n");
992 fflush(fp);
993 }
994 va_end(ap);
995
996 if (server.logfile) fclose(fp);
997}
998
999/*====================== Hash table type implementation ==================== */
1000
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1004
/* Dict value destructor: releases val with zfree(). The private data
 * pointer is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
1010
1011static void dictListDestructor(void *privdata, void *val)
1012{
1013 DICT_NOTUSED(privdata);
1014 listRelease((list*)val);
1015}
1016
1017static int sdsDictKeyCompare(void *privdata, const void *key1,
1018 const void *key2)
1019{
1020 int l1,l2;
1021 DICT_NOTUSED(privdata);
1022
1023 l1 = sdslen((sds)key1);
1024 l2 = sdslen((sds)key2);
1025 if (l1 != l2) return 0;
1026 return memcmp(key1, key2, l1) == 0;
1027}
1028
/* Dict destructor for redis objects: drops one reference from val.
 * A NULL val is ignored, since the values of swapped out keys are set to
 * NULL. The private data pointer is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1036
1037static int dictObjKeyCompare(void *privdata, const void *key1,
1038 const void *key2)
1039{
1040 const robj *o1 = key1, *o2 = key2;
1041 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1042}
1043
1044static unsigned int dictObjHash(const void *key) {
1045 const robj *o = key;
1046 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1047}
1048
1049static int dictEncObjKeyCompare(void *privdata, const void *key1,
1050 const void *key2)
1051{
1052 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1053 int cmp;
1054
1055 if (o1->encoding == REDIS_ENCODING_INT &&
1056 o2->encoding == REDIS_ENCODING_INT &&
1057 o1->ptr == o2->ptr) return 1;
1058
1059 o1 = getDecodedObject(o1);
1060 o2 = getDecodedObject(o2);
1061 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1062 decrRefCount(o1);
1063 decrRefCount(o2);
1064 return cmp;
1065}
1066
1067static unsigned int dictEncObjHash(const void *key) {
1068 robj *o = (robj*) key;
1069
1070 if (o->encoding == REDIS_ENCODING_RAW) {
1071 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1072 } else {
1073 if (o->encoding == REDIS_ENCODING_INT) {
1074 char buf[32];
1075 int len;
1076
1077 len = snprintf(buf,32,"%ld",(long)o->ptr);
1078 return dictGenHashFunction((unsigned char*)buf, len);
1079 } else {
1080 unsigned int hash;
1081
1082 o = getDecodedObject(o);
1083 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1084 decrRefCount(o);
1085 return hash;
1086 }
1087 }
1088}
1089
/* Sets type and expires.
 * Keys are redis objects that may be encoded (hashed and compared with the
 * dictEncObj* helpers) and are released through dictRedisObjectDestructor.
 * No value destructor is installed. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1099
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are possibly encoded redis objects; values are released with
 * zfree() through dictVanillaFree. */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1109
/* Db->dict
 * Keys are hashed and compared through their sds string (dictObj* helpers);
 * both keys and values are released through dictRedisObjectDestructor,
 * which tolerates NULL values. */
static dictType dbDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1119
/* Db->expires
 * Same key handling as dbDictType, but no value destructor: serverCron()
 * reads the entry value by casting it directly to time_t. */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1129
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Keys are possibly encoded redis objects; both keys and values are released
 * through dictRedisObjectDestructor. */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1139
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Values are released with listRelease() through dictListDestructor. */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};
1151
1152static void version();
1153
1154/* ========================= Random utility functions ======================= */
1155
1156/* Redis generally does not try to recover from out of memory conditions
1157 * when allocating objects or strings, it is not clear if it will be possible
1158 * to report this condition to the client since the networking layer itself
1159 * is based on heap allocation for send buffers, so we simply abort.
1160 * At least the code will be simpler to read... */
1161static void oom(const char *msg) {
1162 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1163 sleep(1);
1164 abort();
1165}
1166
1167/* ====================== Redis server networking stuff ===================== */
1168static void closeTimedoutClients(void) {
1169 redisClient *c;
1170 listNode *ln;
1171 time_t now = time(NULL);
1172 listIter li;
1173
1174 listRewind(server.clients,&li);
1175 while ((ln = listNext(&li)) != NULL) {
1176 c = listNodeValue(ln);
1177 if (server.maxidletime &&
1178 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1179 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1180 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1181 listLength(c->pubsub_patterns) == 0 &&
1182 (now - c->lastinteraction > server.maxidletime))
1183 {
1184 redisLog(REDIS_VERBOSE,"Closing idle client");
1185 freeClient(c);
1186 } else if (c->flags & REDIS_BLOCKED) {
1187 if (c->blockingto != 0 && c->blockingto < now) {
1188 addReply(c,shared.nullmultibulk);
1189 unblockClientWaitingData(c);
1190 }
1191 }
1192 }
1193}
1194
1195static int htNeedsResize(dict *dict) {
1196 long long size, used;
1197
1198 size = dictSlots(dict);
1199 used = dictSize(dict);
1200 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1201 (used*100/size < REDIS_HT_MINFILL));
1202}
1203
1204/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1205 * we resize the hash table to save memory */
1206static void tryResizeHashTables(void) {
1207 int j;
1208
1209 for (j = 0; j < server.dbnum; j++) {
1210 if (htNeedsResize(server.db[j].dict))
1211 dictResize(server.db[j].dict);
1212 if (htNeedsResize(server.db[j].expires))
1213 dictResize(server.db[j].expires);
1214 }
1215}
1216
1217/* Our hash table implementation performs rehashing incrementally while
1218 * we write/read from the hash table. Still if the server is idle, the hash
1219 * table will use two tables for a long time. So we try to use 1 millisecond
1220 * of CPU time at every serverCron() loop in order to rehash some key. */
1221static void incrementallyRehash(void) {
1222 int j;
1223
1224 for (j = 0; j < server.dbnum; j++) {
1225 if (dictIsRehashing(server.db[j].dict)) {
1226 dictRehashMilliseconds(server.db[j].dict,1);
1227 break; /* already used our millisecond for this loop... */
1228 }
1229 }
1230}
1231
1232/* A background saving child (BGSAVE) terminated its work. Handle this. */
1233void backgroundSaveDoneHandler(int statloc) {
1234 int exitcode = WEXITSTATUS(statloc);
1235 int bysignal = WIFSIGNALED(statloc);
1236
1237 if (!bysignal && exitcode == 0) {
1238 redisLog(REDIS_NOTICE,
1239 "Background saving terminated with success");
1240 server.dirty = 0;
1241 server.lastsave = time(NULL);
1242 } else if (!bysignal && exitcode != 0) {
1243 redisLog(REDIS_WARNING, "Background saving error");
1244 } else {
1245 redisLog(REDIS_WARNING,
1246 "Background saving terminated by signal %d", WTERMSIG(statloc));
1247 rdbRemoveTempFile(server.bgsavechildpid);
1248 }
1249 server.bgsavechildpid = -1;
1250 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1251 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1252 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1253}
1254
1255/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1256 * Handle this. */
1257void backgroundRewriteDoneHandler(int statloc) {
1258 int exitcode = WEXITSTATUS(statloc);
1259 int bysignal = WIFSIGNALED(statloc);
1260
1261 if (!bysignal && exitcode == 0) {
1262 int fd;
1263 char tmpfile[256];
1264
1265 redisLog(REDIS_NOTICE,
1266 "Background append only file rewriting terminated with success");
1267 /* Now it's time to flush the differences accumulated by the parent */
1268 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1269 fd = open(tmpfile,O_WRONLY|O_APPEND);
1270 if (fd == -1) {
1271 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1272 goto cleanup;
1273 }
1274 /* Flush our data... */
1275 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1276 (signed) sdslen(server.bgrewritebuf)) {
1277 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1278 close(fd);
1279 goto cleanup;
1280 }
1281 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1282 /* Now our work is to rename the temp file into the stable file. And
1283 * switch the file descriptor used by the server for append only. */
1284 if (rename(tmpfile,server.appendfilename) == -1) {
1285 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1286 close(fd);
1287 goto cleanup;
1288 }
1289 /* Mission completed... almost */
1290 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1291 if (server.appendfd != -1) {
1292 /* If append only is actually enabled... */
1293 close(server.appendfd);
1294 server.appendfd = fd;
1295 fsync(fd);
1296 server.appendseldb = -1; /* Make sure it will issue SELECT */
1297 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1298 } else {
1299 /* If append only is disabled we just generate a dump in this
1300 * format. Why not? */
1301 close(fd);
1302 }
1303 } else if (!bysignal && exitcode != 0) {
1304 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1305 } else {
1306 redisLog(REDIS_WARNING,
1307 "Background append only file rewriting terminated by signal %d",
1308 WTERMSIG(statloc));
1309 }
1310cleanup:
1311 sdsfree(server.bgrewritebuf);
1312 server.bgrewritebuf = sdsempty();
1313 aofRemoveTempFile(server.bgrewritechildpid);
1314 server.bgrewritechildpid = -1;
1315}
1316
1317/* This function is called once a background process of some kind terminates,
1318 * as we want to avoid resizing the hash tables when there is a child in order
1319 * to play well with copy-on-write (otherwise when a resize happens lots of
1320 * memory pages are copied). The goal of this function is to update the ability
1321 * for dict.c to resize the hash tables accordingly to the fact we have o not
1322 * running childs. */
1323static void updateDictResizePolicy(void) {
1324 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1325 dictEnableResize();
1326 else
1327 dictDisableResize();
1328}
1329
1330static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1331 int j, loops = server.cronloops++;
1332 REDIS_NOTUSED(eventLoop);
1333 REDIS_NOTUSED(id);
1334 REDIS_NOTUSED(clientData);
1335
1336 /* We take a cached value of the unix time in the global state because
1337 * with virtual memory and aging there is to store the current time
1338 * in objects at every object access, and accuracy is not needed.
1339 * To access a global var is faster than calling time(NULL) */
1340 server.unixtime = time(NULL);
1341
1342 /* Show some info about non-empty databases */
1343 for (j = 0; j < server.dbnum; j++) {
1344 long long size, used, vkeys;
1345
1346 size = dictSlots(server.db[j].dict);
1347 used = dictSize(server.db[j].dict);
1348 vkeys = dictSize(server.db[j].expires);
1349 if (!(loops % 50) && (used || vkeys)) {
1350 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1351 /* dictPrintStats(server.dict); */
1352 }
1353 }
1354
1355 /* We don't want to resize the hash tables while a bacground saving
1356 * is in progress: the saving child is created using fork() that is
1357 * implemented with a copy-on-write semantic in most modern systems, so
1358 * if we resize the HT while there is the saving child at work actually
1359 * a lot of memory movements in the parent will cause a lot of pages
1360 * copied. */
1361 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1362 if (!(loops % 10)) tryResizeHashTables();
1363 if (server.activerehashing) incrementallyRehash();
1364 }
1365
1366 /* Show information about connected clients */
1367 if (!(loops % 50)) {
1368 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1369 listLength(server.clients)-listLength(server.slaves),
1370 listLength(server.slaves),
1371 zmalloc_used_memory());
1372 }
1373
1374 /* Close connections of timedout clients */
1375 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1376 closeTimedoutClients();
1377
1378 /* Check if a background saving or AOF rewrite in progress terminated */
1379 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1380 int statloc;
1381 pid_t pid;
1382
1383 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1384 if (pid == server.bgsavechildpid) {
1385 backgroundSaveDoneHandler(statloc);
1386 } else {
1387 backgroundRewriteDoneHandler(statloc);
1388 }
1389 updateDictResizePolicy();
1390 }
1391 } else {
1392 /* If there is not a background saving in progress check if
1393 * we have to save now */
1394 time_t now = time(NULL);
1395 for (j = 0; j < server.saveparamslen; j++) {
1396 struct saveparam *sp = server.saveparams+j;
1397
1398 if (server.dirty >= sp->changes &&
1399 now-server.lastsave > sp->seconds) {
1400 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1401 sp->changes, sp->seconds);
1402 rdbSaveBackground(server.dbfilename);
1403 break;
1404 }
1405 }
1406 }
1407
1408 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1409 * will use few CPU cycles if there are few expiring keys, otherwise
1410 * it will get more aggressive to avoid that too much memory is used by
1411 * keys that can be removed from the keyspace. */
1412 for (j = 0; j < server.dbnum; j++) {
1413 int expired;
1414 redisDb *db = server.db+j;
1415
1416 /* Continue to expire if at the end of the cycle more than 25%
1417 * of the keys were expired. */
1418 do {
1419 long num = dictSize(db->expires);
1420 time_t now = time(NULL);
1421
1422 expired = 0;
1423 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1424 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1425 while (num--) {
1426 dictEntry *de;
1427 time_t t;
1428
1429 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1430 t = (time_t) dictGetEntryVal(de);
1431 if (now > t) {
1432 deleteKey(db,dictGetEntryKey(de));
1433 expired++;
1434 server.stat_expiredkeys++;
1435 }
1436 }
1437 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1438 }
1439
1440 /* Swap a few keys on disk if we are over the memory limit and VM
1441 * is enbled. Try to free objects from the free list first. */
1442 if (vmCanSwapOut()) {
1443 while (server.vm_enabled && zmalloc_used_memory() >
1444 server.vm_max_memory)
1445 {
1446 int retval;
1447
1448 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1449 retval = (server.vm_max_threads == 0) ?
1450 vmSwapOneObjectBlocking() :
1451 vmSwapOneObjectThreaded();
1452 if (retval == REDIS_ERR && !(loops % 300) &&
1453 zmalloc_used_memory() >
1454 (server.vm_max_memory+server.vm_max_memory/10))
1455 {
1456 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1457 }
1458 /* Note that when using threade I/O we free just one object,
1459 * because anyway when the I/O thread in charge to swap this
1460 * object out will finish, the handler of completed jobs
1461 * will try to swap more objects if we are still out of memory. */
1462 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1463 }
1464 }
1465
1466 /* Check if we should connect to a MASTER */
1467 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1468 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1469 if (syncWithMaster() == REDIS_OK) {
1470 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1471 }
1472 }
1473 return 100;
1474}
1475
1476/* This function gets called every time Redis is entering the
1477 * main loop of the event driven library, that is, before to sleep
1478 * for ready file descriptors. */
1479static void beforeSleep(struct aeEventLoop *eventLoop) {
1480 REDIS_NOTUSED(eventLoop);
1481
1482 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1483 listIter li;
1484 listNode *ln;
1485
1486 listRewind(server.io_ready_clients,&li);
1487 while((ln = listNext(&li))) {
1488 redisClient *c = ln->value;
1489 struct redisCommand *cmd;
1490
1491 /* Resume the client. */
1492 listDelNode(server.io_ready_clients,ln);
1493 c->flags &= (~REDIS_IO_WAIT);
1494 server.vm_blocked_clients--;
1495 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1496 readQueryFromClient, c);
1497 cmd = lookupCommand(c->argv[0]->ptr);
1498 assert(cmd != NULL);
1499 call(c,cmd);
1500 resetClient(c);
1501 /* There may be more data to process in the input buffer. */
1502 if (c->querybuf && sdslen(c->querybuf) > 0)
1503 processInputBuffer(c);
1504 }
1505 }
1506}
1507
1508static void createSharedObjects(void) {
1509 int j;
1510
1511 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1512 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1513 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1514 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1515 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1516 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1517 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1518 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1519 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1520 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1521 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1522 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1523 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1524 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1525 "-ERR no such key\r\n"));
1526 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1527 "-ERR syntax error\r\n"));
1528 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1529 "-ERR source and destination objects are the same\r\n"));
1530 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1531 "-ERR index out of range\r\n"));
1532 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1533 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1534 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1535 shared.select0 = createStringObject("select 0\r\n",10);
1536 shared.select1 = createStringObject("select 1\r\n",10);
1537 shared.select2 = createStringObject("select 2\r\n",10);
1538 shared.select3 = createStringObject("select 3\r\n",10);
1539 shared.select4 = createStringObject("select 4\r\n",10);
1540 shared.select5 = createStringObject("select 5\r\n",10);
1541 shared.select6 = createStringObject("select 6\r\n",10);
1542 shared.select7 = createStringObject("select 7\r\n",10);
1543 shared.select8 = createStringObject("select 8\r\n",10);
1544 shared.select9 = createStringObject("select 9\r\n",10);
1545 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1546 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1547 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1548 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1549 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1550 shared.mbulk3 = createStringObject("*3\r\n",4);
1551 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1552 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1553 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1554 }
1555}
1556
1557static void appendServerSaveParams(time_t seconds, int changes) {
1558 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1559 server.saveparams[server.saveparamslen].seconds = seconds;
1560 server.saveparams[server.saveparamslen].changes = changes;
1561 server.saveparamslen++;
1562}
1563
1564static void resetServerSaveParams() {
1565 zfree(server.saveparams);
1566 server.saveparams = NULL;
1567 server.saveparamslen = 0;
1568}
1569
1570static void initServerConfig() {
1571 server.dbnum = REDIS_DEFAULT_DBNUM;
1572 server.port = REDIS_SERVERPORT;
1573 server.verbosity = REDIS_VERBOSE;
1574 server.maxidletime = REDIS_MAXIDLETIME;
1575 server.saveparams = NULL;
1576 server.logfile = NULL; /* NULL = log on standard output */
1577 server.bindaddr = NULL;
1578 server.glueoutputbuf = 1;
1579 server.daemonize = 0;
1580 server.appendonly = 0;
1581 server.appendfsync = APPENDFSYNC_ALWAYS;
1582 server.lastfsync = time(NULL);
1583 server.appendfd = -1;
1584 server.appendseldb = -1; /* Make sure the first time will not match */
1585 server.pidfile = zstrdup("/var/run/redis.pid");
1586 server.dbfilename = zstrdup("dump.rdb");
1587 server.appendfilename = zstrdup("appendonly.aof");
1588 server.requirepass = NULL;
1589 server.shareobjects = 0;
1590 server.rdbcompression = 1;
1591 server.activerehashing = 1;
1592 server.maxclients = 0;
1593 server.blpop_blocked_clients = 0;
1594 server.maxmemory = 0;
1595 server.vm_enabled = 0;
1596 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1597 server.vm_page_size = 256; /* 256 bytes per page */
1598 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1599 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1600 server.vm_max_threads = 4;
1601 server.vm_blocked_clients = 0;
1602 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1603 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1604
1605 resetServerSaveParams();
1606
1607 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1608 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1609 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1610 /* Replication related */
1611 server.isslave = 0;
1612 server.masterauth = NULL;
1613 server.masterhost = NULL;
1614 server.masterport = 6379;
1615 server.master = NULL;
1616 server.replstate = REDIS_REPL_NONE;
1617
1618 /* Double constants initialization */
1619 R_Zero = 0.0;
1620 R_PosInf = 1.0/R_Zero;
1621 R_NegInf = -1.0/R_Zero;
1622 R_Nan = R_Zero/R_Zero;
1623}
1624
1625static void initServer() {
1626 int j;
1627
1628 signal(SIGHUP, SIG_IGN);
1629 signal(SIGPIPE, SIG_IGN);
1630 setupSigSegvAction();
1631
1632 server.devnull = fopen("/dev/null","w");
1633 if (server.devnull == NULL) {
1634 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1635 exit(1);
1636 }
1637 server.clients = listCreate();
1638 server.slaves = listCreate();
1639 server.monitors = listCreate();
1640 server.objfreelist = listCreate();
1641 createSharedObjects();
1642 server.el = aeCreateEventLoop();
1643 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1644 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1645 if (server.fd == -1) {
1646 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1647 exit(1);
1648 }
1649 for (j = 0; j < server.dbnum; j++) {
1650 server.db[j].dict = dictCreate(&dbDictType,NULL);
1651 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1652 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1653 if (server.vm_enabled)
1654 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1655 server.db[j].id = j;
1656 }
1657 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1658 server.pubsub_patterns = listCreate();
1659 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1660 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1661 server.cronloops = 0;
1662 server.bgsavechildpid = -1;
1663 server.bgrewritechildpid = -1;
1664 server.bgrewritebuf = sdsempty();
1665 server.lastsave = time(NULL);
1666 server.dirty = 0;
1667 server.stat_numcommands = 0;
1668 server.stat_numconnections = 0;
1669 server.stat_expiredkeys = 0;
1670 server.stat_starttime = time(NULL);
1671 server.unixtime = time(NULL);
1672 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1673 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1674 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1675
1676 if (server.appendonly) {
1677 int flags = O_WRONLY|O_APPEND|O_CREAT;
1678
1679#ifdef HAVE_O_DIRECT
1680 if (server.appendfsync == APPENDFSYNC_ALWAYS) {
1681 flags |= O_DIRECT;
1682 server.appendfsync = APPENDFSYNC_NO;
1683 }
1684#endif
1685
1686 server.appendfd = open(server.appendfilename,flags,0644);
1687 if (server.appendfd == -1) {
1688 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1689 strerror(errno));
1690 exit(1);
1691 }
1692 }
1693
1694 if (server.vm_enabled) vmInit();
1695}
1696
1697/* Empty the whole database */
1698static long long emptyDb() {
1699 int j;
1700 long long removed = 0;
1701
1702 for (j = 0; j < server.dbnum; j++) {
1703 removed += dictSize(server.db[j].dict);
1704 dictEmpty(server.db[j].dict);
1705 dictEmpty(server.db[j].expires);
1706 }
1707 return removed;
1708}
1709
/* Convert a "yes" / "no" string, compared ignoring the case, into 1 / 0.
 * Any other string returns -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1715
1716/* I agree, this is a very rudimental way to load a configuration...
1717 will improve later if the config gets more complex */
1718static void loadServerConfig(char *filename) {
1719 FILE *fp;
1720 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1721 int linenum = 0;
1722 sds line = NULL;
1723
1724 if (filename[0] == '-' && filename[1] == '\0')
1725 fp = stdin;
1726 else {
1727 if ((fp = fopen(filename,"r")) == NULL) {
1728 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1729 exit(1);
1730 }
1731 }
1732
1733 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1734 sds *argv;
1735 int argc, j;
1736
1737 linenum++;
1738 line = sdsnew(buf);
1739 line = sdstrim(line," \t\r\n");
1740
1741 /* Skip comments and blank lines*/
1742 if (line[0] == '#' || line[0] == '\0') {
1743 sdsfree(line);
1744 continue;
1745 }
1746
1747 /* Split into arguments */
1748 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1749 sdstolower(argv[0]);
1750
1751 /* Execute config directives */
1752 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1753 server.maxidletime = atoi(argv[1]);
1754 if (server.maxidletime < 0) {
1755 err = "Invalid timeout value"; goto loaderr;
1756 }
1757 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1758 server.port = atoi(argv[1]);
1759 if (server.port < 1 || server.port > 65535) {
1760 err = "Invalid port"; goto loaderr;
1761 }
1762 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1763 server.bindaddr = zstrdup(argv[1]);
1764 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1765 int seconds = atoi(argv[1]);
1766 int changes = atoi(argv[2]);
1767 if (seconds < 1 || changes < 0) {
1768 err = "Invalid save parameters"; goto loaderr;
1769 }
1770 appendServerSaveParams(seconds,changes);
1771 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1772 if (chdir(argv[1]) == -1) {
1773 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1774 argv[1], strerror(errno));
1775 exit(1);
1776 }
1777 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1778 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1779 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1780 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1781 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1782 else {
1783 err = "Invalid log level. Must be one of debug, notice, warning";
1784 goto loaderr;
1785 }
1786 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1787 FILE *logfp;
1788
1789 server.logfile = zstrdup(argv[1]);
1790 if (!strcasecmp(server.logfile,"stdout")) {
1791 zfree(server.logfile);
1792 server.logfile = NULL;
1793 }
1794 if (server.logfile) {
1795 /* Test if we are able to open the file. The server will not
1796 * be able to abort just for this problem later... */
1797 logfp = fopen(server.logfile,"a");
1798 if (logfp == NULL) {
1799 err = sdscatprintf(sdsempty(),
1800 "Can't open the log file: %s", strerror(errno));
1801 goto loaderr;
1802 }
1803 fclose(logfp);
1804 }
1805 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1806 server.dbnum = atoi(argv[1]);
1807 if (server.dbnum < 1) {
1808 err = "Invalid number of databases"; goto loaderr;
1809 }
1810 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1811 loadServerConfig(argv[1]);
1812 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1813 server.maxclients = atoi(argv[1]);
1814 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1815 server.maxmemory = strtoll(argv[1], NULL, 10);
1816 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1817 server.masterhost = sdsnew(argv[1]);
1818 server.masterport = atoi(argv[2]);
1819 server.replstate = REDIS_REPL_CONNECT;
1820 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1821 server.masterauth = zstrdup(argv[1]);
1822 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1823 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1824 err = "argument must be 'yes' or 'no'"; goto loaderr;
1825 }
1826 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1827 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1828 err = "argument must be 'yes' or 'no'"; goto loaderr;
1829 }
1830 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1831 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1832 err = "argument must be 'yes' or 'no'"; goto loaderr;
1833 }
1834 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1835 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1836 err = "argument must be 'yes' or 'no'"; goto loaderr;
1837 }
1838 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1839 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1840 err = "argument must be 'yes' or 'no'"; goto loaderr;
1841 }
1842 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1843 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1844 err = "argument must be 'yes' or 'no'"; goto loaderr;
1845 }
1846 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1847 if (!strcasecmp(argv[1],"no")) {
1848 server.appendfsync = APPENDFSYNC_NO;
1849 } else if (!strcasecmp(argv[1],"always")) {
1850 server.appendfsync = APPENDFSYNC_ALWAYS;
1851 } else if (!strcasecmp(argv[1],"everysec")) {
1852 server.appendfsync = APPENDFSYNC_EVERYSEC;
1853 } else {
1854 err = "argument must be 'no', 'always' or 'everysec'";
1855 goto loaderr;
1856 }
1857 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1858 server.requirepass = zstrdup(argv[1]);
1859 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1860 zfree(server.pidfile);
1861 server.pidfile = zstrdup(argv[1]);
1862 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1863 zfree(server.dbfilename);
1864 server.dbfilename = zstrdup(argv[1]);
1865 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1866 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1867 err = "argument must be 'yes' or 'no'"; goto loaderr;
1868 }
1869 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1870 zfree(server.vm_swap_file);
1871 server.vm_swap_file = zstrdup(argv[1]);
1872 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1873 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1874 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1875 server.vm_page_size = strtoll(argv[1], NULL, 10);
1876 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1877 server.vm_pages = strtoll(argv[1], NULL, 10);
1878 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1879 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1880 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1881 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1882 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1883 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1884 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1885 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1886 } else {
1887 err = "Bad directive or wrong number of arguments"; goto loaderr;
1888 }
1889 for (j = 0; j < argc; j++)
1890 sdsfree(argv[j]);
1891 zfree(argv);
1892 sdsfree(line);
1893 }
1894 if (fp != stdin) fclose(fp);
1895 return;
1896
1897loaderr:
1898 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1899 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1900 fprintf(stderr, ">>> '%s'\n", line);
1901 fprintf(stderr, "%s\n", err);
1902 exit(1);
1903}
1904
1905static void freeClientArgv(redisClient *c) {
1906 int j;
1907
1908 for (j = 0; j < c->argc; j++)
1909 decrRefCount(c->argv[j]);
1910 for (j = 0; j < c->mbargc; j++)
1911 decrRefCount(c->mbargv[j]);
1912 c->argc = 0;
1913 c->mbargc = 0;
1914}
1915
1916static void freeClient(redisClient *c) {
1917 listNode *ln;
1918
1919 /* Note that if the client we are freeing is blocked into a blocking
1920 * call, we have to set querybuf to NULL *before* to call
1921 * unblockClientWaitingData() to avoid processInputBuffer() will get
1922 * called. Also it is important to remove the file events after
1923 * this, because this call adds the READABLE event. */
1924 sdsfree(c->querybuf);
1925 c->querybuf = NULL;
1926 if (c->flags & REDIS_BLOCKED)
1927 unblockClientWaitingData(c);
1928
1929 /* Unsubscribe from all the pubsub channels */
1930 pubsubUnsubscribeAllChannels(c,0);
1931 pubsubUnsubscribeAllPatterns(c,0);
1932 dictRelease(c->pubsub_channels);
1933 listRelease(c->pubsub_patterns);
1934 /* Obvious cleanup */
1935 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1936 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1937 listRelease(c->reply);
1938 freeClientArgv(c);
1939 close(c->fd);
1940 /* Remove from the list of clients */
1941 ln = listSearchKey(server.clients,c);
1942 redisAssert(ln != NULL);
1943 listDelNode(server.clients,ln);
1944 /* Remove from the list of clients waiting for swapped keys */
1945 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1946 ln = listSearchKey(server.io_ready_clients,c);
1947 if (ln) {
1948 listDelNode(server.io_ready_clients,ln);
1949 server.vm_blocked_clients--;
1950 }
1951 }
1952 while (server.vm_enabled && listLength(c->io_keys)) {
1953 ln = listFirst(c->io_keys);
1954 dontWaitForSwappedKey(c,ln->value);
1955 }
1956 listRelease(c->io_keys);
1957 /* Master/slave cleanup */
1958 if (c->flags & REDIS_SLAVE) {
1959 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1960 close(c->repldbfd);
1961 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1962 ln = listSearchKey(l,c);
1963 redisAssert(ln != NULL);
1964 listDelNode(l,ln);
1965 }
1966 if (c->flags & REDIS_MASTER) {
1967 server.master = NULL;
1968 server.replstate = REDIS_REPL_CONNECT;
1969 }
1970 /* Release memory */
1971 zfree(c->argv);
1972 zfree(c->mbargv);
1973 freeClientMultiState(c);
1974 zfree(c);
1975}
1976
1977#define GLUEREPLY_UP_TO (1024)
1978static void glueReplyBuffersIfNeeded(redisClient *c) {
1979 int copylen = 0;
1980 char buf[GLUEREPLY_UP_TO];
1981 listNode *ln;
1982 listIter li;
1983 robj *o;
1984
1985 listRewind(c->reply,&li);
1986 while((ln = listNext(&li))) {
1987 int objlen;
1988
1989 o = ln->value;
1990 objlen = sdslen(o->ptr);
1991 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1992 memcpy(buf+copylen,o->ptr,objlen);
1993 copylen += objlen;
1994 listDelNode(c->reply,ln);
1995 } else {
1996 if (copylen == 0) return;
1997 break;
1998 }
1999 }
2000 /* Now the output buffer is empty, add the new single element */
2001 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2002 listAddNodeHead(c->reply,o);
2003}
2004
2005static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2006 redisClient *c = privdata;
2007 int nwritten = 0, totwritten = 0, objlen;
2008 robj *o;
2009 REDIS_NOTUSED(el);
2010 REDIS_NOTUSED(mask);
2011
2012 /* Use writev() if we have enough buffers to send */
2013 if (!server.glueoutputbuf &&
2014 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2015 !(c->flags & REDIS_MASTER))
2016 {
2017 sendReplyToClientWritev(el, fd, privdata, mask);
2018 return;
2019 }
2020
2021 while(listLength(c->reply)) {
2022 if (server.glueoutputbuf && listLength(c->reply) > 1)
2023 glueReplyBuffersIfNeeded(c);
2024
2025 o = listNodeValue(listFirst(c->reply));
2026 objlen = sdslen(o->ptr);
2027
2028 if (objlen == 0) {
2029 listDelNode(c->reply,listFirst(c->reply));
2030 continue;
2031 }
2032
2033 if (c->flags & REDIS_MASTER) {
2034 /* Don't reply to a master */
2035 nwritten = objlen - c->sentlen;
2036 } else {
2037 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2038 if (nwritten <= 0) break;
2039 }
2040 c->sentlen += nwritten;
2041 totwritten += nwritten;
2042 /* If we fully sent the object on head go to the next one */
2043 if (c->sentlen == objlen) {
2044 listDelNode(c->reply,listFirst(c->reply));
2045 c->sentlen = 0;
2046 }
2047 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2048 * bytes, in a single threaded server it's a good idea to serve
2049 * other clients as well, even if a very large request comes from
2050 * super fast link that is always able to accept data (in real world
2051 * scenario think about 'KEYS *' against the loopback interfae) */
2052 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2053 }
2054 if (nwritten == -1) {
2055 if (errno == EAGAIN) {
2056 nwritten = 0;
2057 } else {
2058 redisLog(REDIS_VERBOSE,
2059 "Error writing to client: %s", strerror(errno));
2060 freeClient(c);
2061 return;
2062 }
2063 }
2064 if (totwritten > 0) c->lastinteraction = time(NULL);
2065 if (listLength(c->reply) == 0) {
2066 c->sentlen = 0;
2067 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2068 }
2069}
2070
2071static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2072{
2073 redisClient *c = privdata;
2074 int nwritten = 0, totwritten = 0, objlen, willwrite;
2075 robj *o;
2076 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2077 int offset, ion = 0;
2078 REDIS_NOTUSED(el);
2079 REDIS_NOTUSED(mask);
2080
2081 listNode *node;
2082 while (listLength(c->reply)) {
2083 offset = c->sentlen;
2084 ion = 0;
2085 willwrite = 0;
2086
2087 /* fill-in the iov[] array */
2088 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2089 o = listNodeValue(node);
2090 objlen = sdslen(o->ptr);
2091
2092 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2093 break;
2094
2095 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2096 break; /* no more iovecs */
2097
2098 iov[ion].iov_base = ((char*)o->ptr) + offset;
2099 iov[ion].iov_len = objlen - offset;
2100 willwrite += objlen - offset;
2101 offset = 0; /* just for the first item */
2102 ion++;
2103 }
2104
2105 if(willwrite == 0)
2106 break;
2107
2108 /* write all collected blocks at once */
2109 if((nwritten = writev(fd, iov, ion)) < 0) {
2110 if (errno != EAGAIN) {
2111 redisLog(REDIS_VERBOSE,
2112 "Error writing to client: %s", strerror(errno));
2113 freeClient(c);
2114 return;
2115 }
2116 break;
2117 }
2118
2119 totwritten += nwritten;
2120 offset = c->sentlen;
2121
2122 /* remove written robjs from c->reply */
2123 while (nwritten && listLength(c->reply)) {
2124 o = listNodeValue(listFirst(c->reply));
2125 objlen = sdslen(o->ptr);
2126
2127 if(nwritten >= objlen - offset) {
2128 listDelNode(c->reply, listFirst(c->reply));
2129 nwritten -= objlen - offset;
2130 c->sentlen = 0;
2131 } else {
2132 /* partial write */
2133 c->sentlen += nwritten;
2134 break;
2135 }
2136 offset = 0;
2137 }
2138 }
2139
2140 if (totwritten > 0)
2141 c->lastinteraction = time(NULL);
2142
2143 if (listLength(c->reply) == 0) {
2144 c->sentlen = 0;
2145 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2146 }
2147}
2148
2149static struct redisCommand *lookupCommand(char *name) {
2150 int j = 0;
2151 while(cmdTable[j].name != NULL) {
2152 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2153 j++;
2154 }
2155 return NULL;
2156}
2157
2158/* resetClient prepare the client to process the next command */
2159static void resetClient(redisClient *c) {
2160 freeClientArgv(c);
2161 c->bulklen = -1;
2162 c->multibulk = 0;
2163}
2164
2165/* Call() is the core of Redis execution of a command */
2166static void call(redisClient *c, struct redisCommand *cmd) {
2167 long long dirty;
2168
2169 dirty = server.dirty;
2170 cmd->proc(c);
2171 dirty = server.dirty-dirty;
2172
2173 if (server.appendonly && dirty)
2174 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2175 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2176 listLength(server.slaves))
2177 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2178 if (listLength(server.monitors))
2179 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2180 server.stat_numcommands++;
2181}
2182
2183/* If this function gets called we already read a whole
2184 * command, argments are in the client argv/argc fields.
2185 * processCommand() execute the command or prepare the
2186 * server for a bulk read from the client.
2187 *
2188 * If 1 is returned the client is still alive and valid and
2189 * and other operations can be performed by the caller. Otherwise
2190 * if 0 is returned the client was destroied (i.e. after QUIT). */
2191static int processCommand(redisClient *c) {
2192 struct redisCommand *cmd;
2193
2194 /* Free some memory if needed (maxmemory setting) */
2195 if (server.maxmemory) freeMemoryIfNeeded();
2196
2197 /* Handle the multi bulk command type. This is an alternative protocol
2198 * supported by Redis in order to receive commands that are composed of
2199 * multiple binary-safe "bulk" arguments. The latency of processing is
2200 * a bit higher but this allows things like multi-sets, so if this
2201 * protocol is used only for MSET and similar commands this is a big win. */
2202 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2203 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2204 if (c->multibulk <= 0) {
2205 resetClient(c);
2206 return 1;
2207 } else {
2208 decrRefCount(c->argv[c->argc-1]);
2209 c->argc--;
2210 return 1;
2211 }
2212 } else if (c->multibulk) {
2213 if (c->bulklen == -1) {
2214 if (((char*)c->argv[0]->ptr)[0] != '$') {
2215 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2216 resetClient(c);
2217 return 1;
2218 } else {
2219 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2220 decrRefCount(c->argv[0]);
2221 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2222 c->argc--;
2223 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2224 resetClient(c);
2225 return 1;
2226 }
2227 c->argc--;
2228 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2229 return 1;
2230 }
2231 } else {
2232 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2233 c->mbargv[c->mbargc] = c->argv[0];
2234 c->mbargc++;
2235 c->argc--;
2236 c->multibulk--;
2237 if (c->multibulk == 0) {
2238 robj **auxargv;
2239 int auxargc;
2240
2241 /* Here we need to swap the multi-bulk argc/argv with the
2242 * normal argc/argv of the client structure. */
2243 auxargv = c->argv;
2244 c->argv = c->mbargv;
2245 c->mbargv = auxargv;
2246
2247 auxargc = c->argc;
2248 c->argc = c->mbargc;
2249 c->mbargc = auxargc;
2250
2251 /* We need to set bulklen to something different than -1
2252 * in order for the code below to process the command without
2253 * to try to read the last argument of a bulk command as
2254 * a special argument. */
2255 c->bulklen = 0;
2256 /* continue below and process the command */
2257 } else {
2258 c->bulklen = -1;
2259 return 1;
2260 }
2261 }
2262 }
2263 /* -- end of multi bulk commands processing -- */
2264
2265 /* The QUIT command is handled as a special case. Normal command
2266 * procs are unable to close the client connection safely */
2267 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2268 freeClient(c);
2269 return 0;
2270 }
2271
2272 /* Now lookup the command and check ASAP about trivial error conditions
2273 * such wrong arity, bad command name and so forth. */
2274 cmd = lookupCommand(c->argv[0]->ptr);
2275 if (!cmd) {
2276 addReplySds(c,
2277 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2278 (char*)c->argv[0]->ptr));
2279 resetClient(c);
2280 return 1;
2281 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2282 (c->argc < -cmd->arity)) {
2283 addReplySds(c,
2284 sdscatprintf(sdsempty(),
2285 "-ERR wrong number of arguments for '%s' command\r\n",
2286 cmd->name));
2287 resetClient(c);
2288 return 1;
2289 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2290 /* This is a bulk command, we have to read the last argument yet. */
2291 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2292
2293 decrRefCount(c->argv[c->argc-1]);
2294 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2295 c->argc--;
2296 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2297 resetClient(c);
2298 return 1;
2299 }
2300 c->argc--;
2301 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2302 /* It is possible that the bulk read is already in the
2303 * buffer. Check this condition and handle it accordingly.
2304 * This is just a fast path, alternative to call processInputBuffer().
2305 * It's a good idea since the code is small and this condition
2306 * happens most of the times. */
2307 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2308 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2309 c->argc++;
2310 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2311 } else {
2312 /* Otherwise return... there is to read the last argument
2313 * from the socket. */
2314 return 1;
2315 }
2316 }
2317 /* Let's try to encode the bulk object to save space. */
2318 if (cmd->flags & REDIS_CMD_BULK)
2319 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2320
2321 /* Check if the user is authenticated */
2322 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2323 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2324 resetClient(c);
2325 return 1;
2326 }
2327
2328 /* Handle the maxmemory directive */
2329 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2330 zmalloc_used_memory() > server.maxmemory)
2331 {
2332 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2333 resetClient(c);
2334 return 1;
2335 }
2336
2337 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2338 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2339 &&
2340 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2341 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2342 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2343 resetClient(c);
2344 return 1;
2345 }
2346
2347 /* Exec the command */
2348 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2349 queueMultiCommand(c,cmd);
2350 addReply(c,shared.queued);
2351 } else {
2352 if (server.vm_enabled && server.vm_max_threads > 0 &&
2353 blockClientOnSwappedKeys(cmd,c)) return 1;
2354 call(c,cmd);
2355 }
2356
2357 /* Prepare the client for the next command */
2358 resetClient(c);
2359 return 1;
2360}
2361
2362static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2363 listNode *ln;
2364 listIter li;
2365 int outc = 0, j;
2366 robj **outv;
2367 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2368 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2369 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2370 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2371 robj *lenobj;
2372
2373 if (argc <= REDIS_STATIC_ARGS) {
2374 outv = static_outv;
2375 } else {
2376 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2377 }
2378
2379 lenobj = createObject(REDIS_STRING,
2380 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2381 lenobj->refcount = 0;
2382 outv[outc++] = lenobj;
2383 for (j = 0; j < argc; j++) {
2384 lenobj = createObject(REDIS_STRING,
2385 sdscatprintf(sdsempty(),"$%lu\r\n",
2386 (unsigned long) stringObjectLen(argv[j])));
2387 lenobj->refcount = 0;
2388 outv[outc++] = lenobj;
2389 outv[outc++] = argv[j];
2390 outv[outc++] = shared.crlf;
2391 }
2392
2393 /* Increment all the refcounts at start and decrement at end in order to
2394 * be sure to free objects if there is no slave in a replication state
2395 * able to be feed with commands */
2396 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2397 listRewind(slaves,&li);
2398 while((ln = listNext(&li))) {
2399 redisClient *slave = ln->value;
2400
2401 /* Don't feed slaves that are still waiting for BGSAVE to start */
2402 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2403
2404 /* Feed all the other slaves, MONITORs and so on */
2405 if (slave->slaveseldb != dictid) {
2406 robj *selectcmd;
2407
2408 switch(dictid) {
2409 case 0: selectcmd = shared.select0; break;
2410 case 1: selectcmd = shared.select1; break;
2411 case 2: selectcmd = shared.select2; break;
2412 case 3: selectcmd = shared.select3; break;
2413 case 4: selectcmd = shared.select4; break;
2414 case 5: selectcmd = shared.select5; break;
2415 case 6: selectcmd = shared.select6; break;
2416 case 7: selectcmd = shared.select7; break;
2417 case 8: selectcmd = shared.select8; break;
2418 case 9: selectcmd = shared.select9; break;
2419 default:
2420 selectcmd = createObject(REDIS_STRING,
2421 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2422 selectcmd->refcount = 0;
2423 break;
2424 }
2425 addReply(slave,selectcmd);
2426 slave->slaveseldb = dictid;
2427 }
2428 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2429 }
2430 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2431 if (outv != static_outv) zfree(outv);
2432}
2433
2434static void processInputBuffer(redisClient *c) {
2435again:
2436 /* Before to process the input buffer, make sure the client is not
2437 * waitig for a blocking operation such as BLPOP. Note that the first
2438 * iteration the client is never blocked, otherwise the processInputBuffer
2439 * would not be called at all, but after the execution of the first commands
2440 * in the input buffer the client may be blocked, and the "goto again"
2441 * will try to reiterate. The following line will make it return asap. */
2442 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2443 if (c->bulklen == -1) {
2444 /* Read the first line of the query */
2445 char *p = strchr(c->querybuf,'\n');
2446 size_t querylen;
2447
2448 if (p) {
2449 sds query, *argv;
2450 int argc, j;
2451
2452 query = c->querybuf;
2453 c->querybuf = sdsempty();
2454 querylen = 1+(p-(query));
2455 if (sdslen(query) > querylen) {
2456 /* leave data after the first line of the query in the buffer */
2457 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2458 }
2459 *p = '\0'; /* remove "\n" */
2460 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2461 sdsupdatelen(query);
2462
2463 /* Now we can split the query in arguments */
2464 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2465 sdsfree(query);
2466
2467 if (c->argv) zfree(c->argv);
2468 c->argv = zmalloc(sizeof(robj*)*argc);
2469
2470 for (j = 0; j < argc; j++) {
2471 if (sdslen(argv[j])) {
2472 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2473 c->argc++;
2474 } else {
2475 sdsfree(argv[j]);
2476 }
2477 }
2478 zfree(argv);
2479 if (c->argc) {
2480 /* Execute the command. If the client is still valid
2481 * after processCommand() return and there is something
2482 * on the query buffer try to process the next command. */
2483 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2484 } else {
2485 /* Nothing to process, argc == 0. Just process the query
2486 * buffer if it's not empty or return to the caller */
2487 if (sdslen(c->querybuf)) goto again;
2488 }
2489 return;
2490 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2491 redisLog(REDIS_VERBOSE, "Client protocol error");
2492 freeClient(c);
2493 return;
2494 }
2495 } else {
2496 /* Bulk read handling. Note that if we are at this point
2497 the client already sent a command terminated with a newline,
2498 we are reading the bulk data that is actually the last
2499 argument of the command. */
2500 int qbl = sdslen(c->querybuf);
2501
2502 if (c->bulklen <= qbl) {
2503 /* Copy everything but the final CRLF as final argument */
2504 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2505 c->argc++;
2506 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2507 /* Process the command. If the client is still valid after
2508 * the processing and there is more data in the buffer
2509 * try to parse it. */
2510 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2511 return;
2512 }
2513 }
2514}
2515
2516static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2517 redisClient *c = (redisClient*) privdata;
2518 char buf[REDIS_IOBUF_LEN];
2519 int nread;
2520 REDIS_NOTUSED(el);
2521 REDIS_NOTUSED(mask);
2522
2523 nread = read(fd, buf, REDIS_IOBUF_LEN);
2524 if (nread == -1) {
2525 if (errno == EAGAIN) {
2526 nread = 0;
2527 } else {
2528 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2529 freeClient(c);
2530 return;
2531 }
2532 } else if (nread == 0) {
2533 redisLog(REDIS_VERBOSE, "Client closed connection");
2534 freeClient(c);
2535 return;
2536 }
2537 if (nread) {
2538 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2539 c->lastinteraction = time(NULL);
2540 } else {
2541 return;
2542 }
2543 processInputBuffer(c);
2544}
2545
2546static int selectDb(redisClient *c, int id) {
2547 if (id < 0 || id >= server.dbnum)
2548 return REDIS_ERR;
2549 c->db = &server.db[id];
2550 return REDIS_OK;
2551}
2552
2553static void *dupClientReplyValue(void *o) {
2554 incrRefCount((robj*)o);
2555 return o;
2556}
2557
/* List match method: returns non-zero when compareStringObjects() reports
 * the two objects as equal (result 0), zero otherwise. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2561
2562static redisClient *createClient(int fd) {
2563 redisClient *c = zmalloc(sizeof(*c));
2564
2565 anetNonBlock(NULL,fd);
2566 anetTcpNoDelay(NULL,fd);
2567 if (!c) return NULL;
2568 selectDb(c,0);
2569 c->fd = fd;
2570 c->querybuf = sdsempty();
2571 c->argc = 0;
2572 c->argv = NULL;
2573 c->bulklen = -1;
2574 c->multibulk = 0;
2575 c->mbargc = 0;
2576 c->mbargv = NULL;
2577 c->sentlen = 0;
2578 c->flags = 0;
2579 c->lastinteraction = time(NULL);
2580 c->authenticated = 0;
2581 c->replstate = REDIS_REPL_NONE;
2582 c->reply = listCreate();
2583 listSetFreeMethod(c->reply,decrRefCount);
2584 listSetDupMethod(c->reply,dupClientReplyValue);
2585 c->blockingkeys = NULL;
2586 c->blockingkeysnum = 0;
2587 c->io_keys = listCreate();
2588 listSetFreeMethod(c->io_keys,decrRefCount);
2589 c->pubsub_channels = dictCreate(&setDictType,NULL);
2590 c->pubsub_patterns = listCreate();
2591 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2592 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2593 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2594 readQueryFromClient, c) == AE_ERR) {
2595 freeClient(c);
2596 return NULL;
2597 }
2598 listAddNodeTail(server.clients,c);
2599 initClientMultiState(c);
2600 return c;
2601}
2602
2603static void addReply(redisClient *c, robj *obj) {
2604 if (listLength(c->reply) == 0 &&
2605 (c->replstate == REDIS_REPL_NONE ||
2606 c->replstate == REDIS_REPL_ONLINE) &&
2607 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2608 sendReplyToClient, c) == AE_ERR) return;
2609
2610 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2611 obj = dupStringObject(obj);
2612 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2613 }
2614 listAddNodeTail(c->reply,getDecodedObject(obj));
2615}
2616
2617static void addReplySds(redisClient *c, sds s) {
2618 robj *o = createObject(REDIS_STRING,s);
2619 addReply(c,o);
2620 decrRefCount(o);
2621}
2622
2623static void addReplyDouble(redisClient *c, double d) {
2624 char buf[128];
2625
2626 snprintf(buf,sizeof(buf),"%.17g",d);
2627 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2628 (unsigned long) strlen(buf),buf));
2629}
2630
2631static void addReplyLong(redisClient *c, long l) {
2632 char buf[128];
2633 size_t len;
2634
2635 if (l == 0) {
2636 addReply(c,shared.czero);
2637 return;
2638 } else if (l == 1) {
2639 addReply(c,shared.cone);
2640 return;
2641 }
2642 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2643 addReplySds(c,sdsnewlen(buf,len));
2644}
2645
2646static void addReplyLongLong(redisClient *c, long long ll) {
2647 char buf[128];
2648 size_t len;
2649
2650 if (ll == 0) {
2651 addReply(c,shared.czero);
2652 return;
2653 } else if (ll == 1) {
2654 addReply(c,shared.cone);
2655 return;
2656 }
2657 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2658 addReplySds(c,sdsnewlen(buf,len));
2659}
2660
2661static void addReplyUlong(redisClient *c, unsigned long ul) {
2662 char buf[128];
2663 size_t len;
2664
2665 if (ul == 0) {
2666 addReply(c,shared.czero);
2667 return;
2668 } else if (ul == 1) {
2669 addReply(c,shared.cone);
2670 return;
2671 }
2672 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2673 addReplySds(c,sdsnewlen(buf,len));
2674}
2675
2676static void addReplyBulkLen(redisClient *c, robj *obj) {
2677 size_t len;
2678
2679 if (obj->encoding == REDIS_ENCODING_RAW) {
2680 len = sdslen(obj->ptr);
2681 } else {
2682 long n = (long)obj->ptr;
2683
2684 /* Compute how many bytes will take this integer as a radix 10 string */
2685 len = 1;
2686 if (n < 0) {
2687 len++;
2688 n = -n;
2689 }
2690 while((n = n/10) != 0) {
2691 len++;
2692 }
2693 }
2694 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2695}
2696
2697static void addReplyBulk(redisClient *c, robj *obj) {
2698 addReplyBulkLen(c,obj);
2699 addReply(c,obj);
2700 addReply(c,shared.crlf);
2701}
2702
2703/* In the CONFIG command we need to add vanilla C string as bulk replies */
2704static void addReplyBulkCString(redisClient *c, char *s) {
2705 if (s == NULL) {
2706 addReply(c,shared.nullbulk);
2707 } else {
2708 robj *o = createStringObject(s,strlen(s));
2709 addReplyBulk(c,o);
2710 decrRefCount(o);
2711 }
2712}
2713
2714static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2715 int cport, cfd;
2716 char cip[128];
2717 redisClient *c;
2718 REDIS_NOTUSED(el);
2719 REDIS_NOTUSED(mask);
2720 REDIS_NOTUSED(privdata);
2721
2722 cfd = anetAccept(server.neterr, fd, cip, &cport);
2723 if (cfd == AE_ERR) {
2724 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2725 return;
2726 }
2727 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2728 if ((c = createClient(cfd)) == NULL) {
2729 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2730 close(cfd); /* May be already closed, just ingore errors */
2731 return;
2732 }
2733 /* If maxclient directive is set and this is one client more... close the
2734 * connection. Note that we create the client instead to check before
2735 * for this condition, since now the socket is already set in nonblocking
2736 * mode and we can send an error for free using the Kernel I/O */
2737 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2738 char *err = "-ERR max number of clients reached\r\n";
2739
2740 /* That's a best effort error message, don't check write errors */
2741 if (write(c->fd,err,strlen(err)) == -1) {
2742 /* Nothing to do, Just to avoid the warning... */
2743 }
2744 freeClient(c);
2745 return;
2746 }
2747 server.stat_numconnections++;
2748}
2749
2750/* ======================= Redis objects implementation ===================== */
2751
2752static robj *createObject(int type, void *ptr) {
2753 robj *o;
2754
2755 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2756 if (listLength(server.objfreelist)) {
2757 listNode *head = listFirst(server.objfreelist);
2758 o = listNodeValue(head);
2759 listDelNode(server.objfreelist,head);
2760 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2761 } else {
2762 if (server.vm_enabled) {
2763 pthread_mutex_unlock(&server.obj_freelist_mutex);
2764 o = zmalloc(sizeof(*o));
2765 } else {
2766 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2767 }
2768 }
2769 o->type = type;
2770 o->encoding = REDIS_ENCODING_RAW;
2771 o->ptr = ptr;
2772 o->refcount = 1;
2773 if (server.vm_enabled) {
2774 /* Note that this code may run in the context of an I/O thread
2775 * and accessing to server.unixtime in theory is an error
2776 * (no locks). But in practice this is safe, and even if we read
2777 * garbage Redis will not fail, as it's just a statistical info */
2778 o->vm.atime = server.unixtime;
2779 o->storage = REDIS_VM_MEMORY;
2780 }
2781 return o;
2782}
2783
2784static robj *createStringObject(char *ptr, size_t len) {
2785 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2786}
2787
2788static robj *createStringObjectFromLongLong(long long value) {
2789 robj *o;
2790 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2791 incrRefCount(shared.integers[value]);
2792 o = shared.integers[value];
2793 } else {
2794 o = createObject(REDIS_STRING, NULL);
2795 if (value >= LONG_MIN && value <= LONG_MAX) {
2796 o->encoding = REDIS_ENCODING_INT;
2797 o->ptr = (void*)((long)value);
2798 } else {
2799 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2800 }
2801 }
2802 return o;
2803}
2804
2805static robj *dupStringObject(robj *o) {
2806 assert(o->encoding == REDIS_ENCODING_RAW);
2807 return createStringObject(o->ptr,sdslen(o->ptr));
2808}
2809
2810static robj *createListObject(void) {
2811 list *l = listCreate();
2812
2813 listSetFreeMethod(l,decrRefCount);
2814 return createObject(REDIS_LIST,l);
2815}
2816
2817static robj *createSetObject(void) {
2818 dict *d = dictCreate(&setDictType,NULL);
2819 return createObject(REDIS_SET,d);
2820}
2821
2822static robj *createHashObject(void) {
2823 /* All the Hashes start as zipmaps. Will be automatically converted
2824 * into hash tables if there are enough elements or big elements
2825 * inside. */
2826 unsigned char *zm = zipmapNew();
2827 robj *o = createObject(REDIS_HASH,zm);
2828 o->encoding = REDIS_ENCODING_ZIPMAP;
2829 return o;
2830}
2831
2832static robj *createZsetObject(void) {
2833 zset *zs = zmalloc(sizeof(*zs));
2834
2835 zs->dict = dictCreate(&zsetDictType,NULL);
2836 zs->zsl = zslCreate();
2837 return createObject(REDIS_ZSET,zs);
2838}
2839
2840static void freeStringObject(robj *o) {
2841 if (o->encoding == REDIS_ENCODING_RAW) {
2842 sdsfree(o->ptr);
2843 }
2844}
2845
2846static void freeListObject(robj *o) {
2847 listRelease((list*) o->ptr);
2848}
2849
2850static void freeSetObject(robj *o) {
2851 dictRelease((dict*) o->ptr);
2852}
2853
2854static void freeZsetObject(robj *o) {
2855 zset *zs = o->ptr;
2856
2857 dictRelease(zs->dict);
2858 zslFree(zs->zsl);
2859 zfree(zs);
2860}
2861
2862static void freeHashObject(robj *o) {
2863 switch (o->encoding) {
2864 case REDIS_ENCODING_HT:
2865 dictRelease((dict*) o->ptr);
2866 break;
2867 case REDIS_ENCODING_ZIPMAP:
2868 zfree(o->ptr);
2869 break;
2870 default:
2871 redisPanic("Unknown hash encoding type");
2872 break;
2873 }
2874}
2875
2876static void incrRefCount(robj *o) {
2877 o->refcount++;
2878}
2879
2880static void decrRefCount(void *obj) {
2881 robj *o = obj;
2882
2883 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
2884 /* Object is a key of a swapped out value, or in the process of being
2885 * loaded. */
2886 if (server.vm_enabled &&
2887 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2888 {
2889 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2890 redisAssert(o->type == REDIS_STRING);
2891 freeStringObject(o);
2892 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2893 pthread_mutex_lock(&server.obj_freelist_mutex);
2894 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2895 !listAddNodeHead(server.objfreelist,o))
2896 zfree(o);
2897 pthread_mutex_unlock(&server.obj_freelist_mutex);
2898 server.vm_stats_swapped_objects--;
2899 return;
2900 }
2901 /* Object is in memory, or in the process of being swapped out. */
2902 if (--(o->refcount) == 0) {
2903 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2904 vmCancelThreadedIOJob(obj);
2905 switch(o->type) {
2906 case REDIS_STRING: freeStringObject(o); break;
2907 case REDIS_LIST: freeListObject(o); break;
2908 case REDIS_SET: freeSetObject(o); break;
2909 case REDIS_ZSET: freeZsetObject(o); break;
2910 case REDIS_HASH: freeHashObject(o); break;
2911 default: redisPanic("Unknown object type"); break;
2912 }
2913 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2914 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2915 !listAddNodeHead(server.objfreelist,o))
2916 zfree(o);
2917 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2918 }
2919}
2920
2921static robj *lookupKey(redisDb *db, robj *key) {
2922 dictEntry *de = dictFind(db->dict,key);
2923 if (de) {
2924 robj *key = dictGetEntryKey(de);
2925 robj *val = dictGetEntryVal(de);
2926
2927 if (server.vm_enabled) {
2928 if (key->storage == REDIS_VM_MEMORY ||
2929 key->storage == REDIS_VM_SWAPPING)
2930 {
2931 /* If we were swapping the object out, stop it, this key
2932 * was requested. */
2933 if (key->storage == REDIS_VM_SWAPPING)
2934 vmCancelThreadedIOJob(key);
2935 /* Update the access time of the key for the aging algorithm. */
2936 key->vm.atime = server.unixtime;
2937 } else {
2938 int notify = (key->storage == REDIS_VM_LOADING);
2939
2940 /* Our value was swapped on disk. Bring it at home. */
2941 redisAssert(val == NULL);
2942 val = vmLoadObject(key);
2943 dictGetEntryVal(de) = val;
2944
2945 /* Clients blocked by the VM subsystem may be waiting for
2946 * this key... */
2947 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2948 }
2949 }
2950 return val;
2951 } else {
2952 return NULL;
2953 }
2954}
2955
2956static robj *lookupKeyRead(redisDb *db, robj *key) {
2957 expireIfNeeded(db,key);
2958 return lookupKey(db,key);
2959}
2960
2961static robj *lookupKeyWrite(redisDb *db, robj *key) {
2962 deleteIfVolatile(db,key);
2963 return lookupKey(db,key);
2964}
2965
2966static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2967 robj *o = lookupKeyRead(c->db, key);
2968 if (!o) addReply(c,reply);
2969 return o;
2970}
2971
2972static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2973 robj *o = lookupKeyWrite(c->db, key);
2974 if (!o) addReply(c,reply);
2975 return o;
2976}
2977
2978static int checkType(redisClient *c, robj *o, int type) {
2979 if (o->type != type) {
2980 addReply(c,shared.wrongtypeerr);
2981 return 1;
2982 }
2983 return 0;
2984}
2985
2986static int deleteKey(redisDb *db, robj *key) {
2987 int retval;
2988
2989 /* We need to protect key from destruction: after the first dictDelete()
2990 * it may happen that 'key' is no longer valid if we don't increment
2991 * it's count. This may happen when we get the object reference directly
2992 * from the hash table with dictRandomKey() or dict iterators */
2993 incrRefCount(key);
2994 if (dictSize(db->expires)) dictDelete(db->expires,key);
2995 retval = dictDelete(db->dict,key);
2996 decrRefCount(key);
2997
2998 return retval == DICT_OK;
2999}
3000
3001/* Check if the nul-terminated string 's' can be represented by a long
3002 * (that is, is a number that fits into long without any other space or
3003 * character before or after the digits).
3004 *
3005 * If so, the function returns REDIS_OK and *longval is set to the value
3006 * of the number. Otherwise REDIS_ERR is returned */
3007static int isStringRepresentableAsLong(sds s, long *longval) {
3008 char buf[32], *endptr;
3009 long value;
3010 int slen;
3011
3012 value = strtol(s, &endptr, 10);
3013 if (endptr[0] != '\0') return REDIS_ERR;
3014 slen = snprintf(buf,32,"%ld",value);
3015
3016 /* If the number converted back into a string is not identical
3017 * then it's not possible to encode the string as integer */
3018 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3019 if (longval) *longval = value;
3020 return REDIS_OK;
3021}
3022
3023/* Try to encode a string object in order to save space */
3024static robj *tryObjectEncoding(robj *o) {
3025 long value;
3026 sds s = o->ptr;
3027
3028 if (o->encoding != REDIS_ENCODING_RAW)
3029 return o; /* Already encoded */
3030
3031 /* It's not safe to encode shared objects: shared objects can be shared
3032 * everywhere in the "object space" of Redis. Encoded objects can only
3033 * appear as "values" (and not, for instance, as keys) */
3034 if (o->refcount > 1) return o;
3035
3036 /* Currently we try to encode only strings */
3037 redisAssert(o->type == REDIS_STRING);
3038
3039 /* Check if we can represent this string as a long integer */
3040 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3041
3042 /* Ok, this object can be encoded */
3043 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3044 decrRefCount(o);
3045 incrRefCount(shared.integers[value]);
3046 return shared.integers[value];
3047 } else {
3048 o->encoding = REDIS_ENCODING_INT;
3049 sdsfree(o->ptr);
3050 o->ptr = (void*) value;
3051 return o;
3052 }
3053}
3054
3055/* Get a decoded version of an encoded object (returned as a new object).
3056 * If the object is already raw-encoded just increment the ref count. */
3057static robj *getDecodedObject(robj *o) {
3058 robj *dec;
3059
3060 if (o->encoding == REDIS_ENCODING_RAW) {
3061 incrRefCount(o);
3062 return o;
3063 }
3064 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3065 char buf[32];
3066
3067 snprintf(buf,32,"%ld",(long)o->ptr);
3068 dec = createStringObject(buf,strlen(buf));
3069 return dec;
3070 } else {
3071 redisPanic("Unknown encoding type");
3072 }
3073}
3074
3075/* Compare two string objects via strcmp() or alike.
3076 * Note that the objects may be integer-encoded. In such a case we
3077 * use snprintf() to get a string representation of the numbers on the stack
3078 * and compare the strings, it's much faster than calling getDecodedObject().
3079 *
3080 * Important note: if objects are not integer encoded, but binary-safe strings,
3081 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3082 * binary safe. */
3083static int compareStringObjects(robj *a, robj *b) {
3084 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3085 char bufa[128], bufb[128], *astr, *bstr;
3086 int bothsds = 1;
3087
3088 if (a == b) return 0;
3089 if (a->encoding != REDIS_ENCODING_RAW) {
3090 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3091 astr = bufa;
3092 bothsds = 0;
3093 } else {
3094 astr = a->ptr;
3095 }
3096 if (b->encoding != REDIS_ENCODING_RAW) {
3097 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3098 bstr = bufb;
3099 bothsds = 0;
3100 } else {
3101 bstr = b->ptr;
3102 }
3103 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3104}
3105
3106static size_t stringObjectLen(robj *o) {
3107 redisAssert(o->type == REDIS_STRING);
3108 if (o->encoding == REDIS_ENCODING_RAW) {
3109 return sdslen(o->ptr);
3110 } else {
3111 char buf[32];
3112
3113 return snprintf(buf,32,"%ld",(long)o->ptr);
3114 }
3115}
3116
3117static int getDoubleFromObject(robj *o, double *target) {
3118 double value;
3119 char *eptr;
3120
3121 if (o == NULL) {
3122 value = 0;
3123 } else {
3124 redisAssert(o->type == REDIS_STRING);
3125 if (o->encoding == REDIS_ENCODING_RAW) {
3126 value = strtod(o->ptr, &eptr);
3127 if (eptr[0] != '\0') return REDIS_ERR;
3128 } else if (o->encoding == REDIS_ENCODING_INT) {
3129 value = (long)o->ptr;
3130 } else {
3131 redisAssert(1 != 1);
3132 }
3133 }
3134
3135 *target = value;
3136 return REDIS_OK;
3137}
3138
3139static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3140 double value;
3141 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3142 if (msg != NULL) {
3143 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3144 } else {
3145 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3146 }
3147 return REDIS_ERR;
3148 }
3149
3150 *target = value;
3151 return REDIS_OK;
3152}
3153
3154static int getLongLongFromObject(robj *o, long long *target) {
3155 long long value;
3156 char *eptr;
3157
3158 if (o == NULL) {
3159 value = 0;
3160 } else {
3161 redisAssert(o->type == REDIS_STRING);
3162 if (o->encoding == REDIS_ENCODING_RAW) {
3163 value = strtoll(o->ptr, &eptr, 10);
3164 if (eptr[0] != '\0') return REDIS_ERR;
3165 } else if (o->encoding == REDIS_ENCODING_INT) {
3166 value = (long)o->ptr;
3167 } else {
3168 redisAssert(1 != 1);
3169 }
3170 }
3171
3172 *target = value;
3173 return REDIS_OK;
3174}
3175
3176static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3177 long long value;
3178 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3179 if (msg != NULL) {
3180 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3181 } else {
3182 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3183 }
3184 return REDIS_ERR;
3185 }
3186
3187 *target = value;
3188 return REDIS_OK;
3189}
3190
3191static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3192 long long value;
3193
3194 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3195 if (value < LONG_MIN || value > LONG_MAX) {
3196 if (msg != NULL) {
3197 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3198 } else {
3199 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3200 }
3201 return REDIS_ERR;
3202 }
3203
3204 *target = value;
3205 return REDIS_OK;
3206}
3207
3208/*============================ RDB saving/loading =========================== */
3209
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if fwrite() wrote nothing. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3214
/* Write the time 't' to 'fp' as a 4 byte int32_t, in the byte order of the
 * host (the value is truncated to 32 bits by the cast).
 * Returns 0 on success, -1 if fwrite() wrote nothing. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
3220
3221/* check rdbLoadLen() comments for more info */
3222static int rdbSaveLen(FILE *fp, uint32_t len) {
3223 unsigned char buf[2];
3224
3225 if (len < (1<<6)) {
3226 /* Save a 6 bit len */
3227 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3228 if (fwrite(buf,1,1,fp) == 0) return -1;
3229 } else if (len < (1<<14)) {
3230 /* Save a 14 bit len */
3231 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3232 buf[1] = len&0xFF;
3233 if (fwrite(buf,2,1,fp) == 0) return -1;
3234 } else {
3235 /* Save a 32 bit len */
3236 buf[0] = (REDIS_RDB_32BITLEN<<6);
3237 if (fwrite(buf,1,1,fp) == 0) return -1;
3238 len = htonl(len);
3239 if (fwrite(&len,4,1,fp) == 0) return -1;
3240 }
3241 return 0;
3242}
3243
3244/* String objects in the form "2391" "-100" without any space and with a
3245 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3246 * encoded as integers to save space */
3247static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3248 long long value;
3249 char *endptr, buf[32];
3250
3251 /* Check if it's possible to encode this value as a number */
3252 value = strtoll(s, &endptr, 10);
3253 if (endptr[0] != '\0') return 0;
3254 snprintf(buf,32,"%lld",value);
3255
3256 /* If the number converted back into a string is not identical
3257 * then it's not possible to encode the string as integer */
3258 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3259
3260 /* Finally check if it fits in our ranges */
3261 if (value >= -(1<<7) && value <= (1<<7)-1) {
3262 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3263 enc[1] = value&0xFF;
3264 return 2;
3265 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3266 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3267 enc[1] = value&0xFF;
3268 enc[2] = (value>>8)&0xFF;
3269 return 3;
3270 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3271 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3272 enc[1] = value&0xFF;
3273 enc[2] = (value>>8)&0xFF;
3274 enc[3] = (value>>16)&0xFF;
3275 enc[4] = (value>>24)&0xFF;
3276 return 5;
3277 } else {
3278 return 0;
3279 }
3280}
3281
3282static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3283 size_t comprlen, outlen;
3284 unsigned char byte;
3285 void *out;
3286
3287 /* We require at least four bytes compression for this to be worth it */
3288 if (len <= 4) return 0;
3289 outlen = len-4;
3290 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3291 comprlen = lzf_compress(s, len, out, outlen);
3292 if (comprlen == 0) {
3293 zfree(out);
3294 return 0;
3295 }
3296 /* Data compressed! Let's save it on disk */
3297 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3298 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3299 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3300 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3301 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3302 zfree(out);
3303 return comprlen;
3304
3305writeerr:
3306 zfree(out);
3307 return -1;
3308}
3309
3310/* Save a string objet as [len][data] on disk. If the object is a string
3311 * representation of an integer value we try to safe it in a special form */
3312static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3313 int enclen;
3314
3315 /* Try integer encoding */
3316 if (len <= 11) {
3317 unsigned char buf[5];
3318 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3319 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3320 return 0;
3321 }
3322 }
3323
3324 /* Try LZF compression - under 20 bytes it's unable to compress even
3325 * aaaaaaaaaaaaaaaaaa so skip it */
3326 if (server.rdbcompression && len > 20) {
3327 int retval;
3328
3329 retval = rdbSaveLzfStringObject(fp,s,len);
3330 if (retval == -1) return -1;
3331 if (retval > 0) return 0;
3332 /* retval == 0 means data can't be compressed, save the old way */
3333 }
3334
3335 /* Store verbatim */
3336 if (rdbSaveLen(fp,len) == -1) return -1;
3337 if (len && fwrite(s,len,1,fp) == 0) return -1;
3338 return 0;
3339}
3340
3341/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3342static int rdbSaveStringObject(FILE *fp, robj *obj) {
3343 int retval;
3344
3345 /* Avoid incr/decr ref count business when possible.
3346 * This plays well with copy-on-write given that we are probably
3347 * in a child process (BGSAVE). Also this makes sure key objects
3348 * of swapped objects are not incRefCount-ed (an assert does not allow
3349 * this in order to avoid bugs) */
3350 if (obj->encoding != REDIS_ENCODING_RAW) {
3351 obj = getDecodedObject(obj);
3352 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3353 decrRefCount(obj);
3354 } else {
3355 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3356 }
3357 return retval;
3358}
3359
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values in order to
 * specify the following conditions, in which case no string follows:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int total;

    if (isnan(val)) {
        out[0] = 253;
        total = 1;
    } else if (isfinite(val)) {
        char *text = (char*)out+1;

        snprintf(text,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(text);
        total = out[0]+1;
    } else {
        out[0] = (val < 0) ? 255 : 254;
        total = 1;
    }
    return (fwrite(out,total,1,fp) == 0) ? -1 : 0;
}
3386
3387/* Save a Redis object. */
3388static int rdbSaveObject(FILE *fp, robj *o) {
3389 if (o->type == REDIS_STRING) {
3390 /* Save a string value */
3391 if (rdbSaveStringObject(fp,o) == -1) return -1;
3392 } else if (o->type == REDIS_LIST) {
3393 /* Save a list value */
3394 list *list = o->ptr;
3395 listIter li;
3396 listNode *ln;
3397
3398 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3399 listRewind(list,&li);
3400 while((ln = listNext(&li))) {
3401 robj *eleobj = listNodeValue(ln);
3402
3403 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3404 }
3405 } else if (o->type == REDIS_SET) {
3406 /* Save a set value */
3407 dict *set = o->ptr;
3408 dictIterator *di = dictGetIterator(set);
3409 dictEntry *de;
3410
3411 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3412 while((de = dictNext(di)) != NULL) {
3413 robj *eleobj = dictGetEntryKey(de);
3414
3415 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3416 }
3417 dictReleaseIterator(di);
3418 } else if (o->type == REDIS_ZSET) {
3419 /* Save a set value */
3420 zset *zs = o->ptr;
3421 dictIterator *di = dictGetIterator(zs->dict);
3422 dictEntry *de;
3423
3424 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3425 while((de = dictNext(di)) != NULL) {
3426 robj *eleobj = dictGetEntryKey(de);
3427 double *score = dictGetEntryVal(de);
3428
3429 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3430 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3431 }
3432 dictReleaseIterator(di);
3433 } else if (o->type == REDIS_HASH) {
3434 /* Save a hash value */
3435 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3436 unsigned char *p = zipmapRewind(o->ptr);
3437 unsigned int count = zipmapLen(o->ptr);
3438 unsigned char *key, *val;
3439 unsigned int klen, vlen;
3440
3441 if (rdbSaveLen(fp,count) == -1) return -1;
3442 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3443 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3444 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3445 }
3446 } else {
3447 dictIterator *di = dictGetIterator(o->ptr);
3448 dictEntry *de;
3449
3450 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3451 while((de = dictNext(di)) != NULL) {
3452 robj *key = dictGetEntryKey(de);
3453 robj *val = dictGetEntryVal(de);
3454
3455 if (rdbSaveStringObject(fp,key) == -1) return -1;
3456 if (rdbSaveStringObject(fp,val) == -1) return -1;
3457 }
3458 dictReleaseIterator(di);
3459 }
3460 } else {
3461 redisPanic("Unknown object type");
3462 }
3463 return 0;
3464}
3465
3466/* Return the length the object will have on disk if saved with
3467 * the rdbSaveObject() function. Currently we use a trick to get
3468 * this length with very little changes to the code. In the future
3469 * we could switch to a faster solution. */
3470static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3471 if (fp == NULL) fp = server.devnull;
3472 rewind(fp);
3473 assert(rdbSaveObject(fp,o) != 1);
3474 return ftello(fp);
3475}
3476
3477/* Return the number of pages required to save this object in the swap file */
3478static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3479 off_t bytes = rdbSavedObjectLen(o,fp);
3480
3481 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3482}
3483
3484/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3485static int rdbSave(char *filename) {
3486 dictIterator *di = NULL;
3487 dictEntry *de;
3488 FILE *fp;
3489 char tmpfile[256];
3490 int j;
3491 time_t now = time(NULL);
3492
3493 /* Wait for I/O therads to terminate, just in case this is a
3494 * foreground-saving, to avoid seeking the swap file descriptor at the
3495 * same time. */
3496 if (server.vm_enabled)
3497 waitEmptyIOJobsQueue();
3498
3499 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3500 fp = fopen(tmpfile,"w");
3501 if (!fp) {
3502 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3503 return REDIS_ERR;
3504 }
3505 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3506 for (j = 0; j < server.dbnum; j++) {
3507 redisDb *db = server.db+j;
3508 dict *d = db->dict;
3509 if (dictSize(d) == 0) continue;
3510 di = dictGetIterator(d);
3511 if (!di) {
3512 fclose(fp);
3513 return REDIS_ERR;
3514 }
3515
3516 /* Write the SELECT DB opcode */
3517 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3518 if (rdbSaveLen(fp,j) == -1) goto werr;
3519
3520 /* Iterate this DB writing every entry */
3521 while((de = dictNext(di)) != NULL) {
3522 robj *key = dictGetEntryKey(de);
3523 robj *o = dictGetEntryVal(de);
3524 time_t expiretime = getExpire(db,key);
3525
3526 /* Save the expire time */
3527 if (expiretime != -1) {
3528 /* If this key is already expired skip it */
3529 if (expiretime < now) continue;
3530 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3531 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3532 }
3533 /* Save the key and associated value. This requires special
3534 * handling if the value is swapped out. */
3535 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3536 key->storage == REDIS_VM_SWAPPING) {
3537 /* Save type, key, value */
3538 if (rdbSaveType(fp,o->type) == -1) goto werr;
3539 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3540 if (rdbSaveObject(fp,o) == -1) goto werr;
3541 } else {
3542 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3543 robj *po;
3544 /* Get a preview of the object in memory */
3545 po = vmPreviewObject(key);
3546 /* Save type, key, value */
3547 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3548 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3549 if (rdbSaveObject(fp,po) == -1) goto werr;
3550 /* Remove the loaded object from memory */
3551 decrRefCount(po);
3552 }
3553 }
3554 dictReleaseIterator(di);
3555 }
3556 /* EOF opcode */
3557 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3558
3559 /* Make sure data will not remain on the OS's output buffers */
3560 fflush(fp);
3561 fsync(fileno(fp));
3562 fclose(fp);
3563
3564 /* Use RENAME to make sure the DB file is changed atomically only
3565 * if the generate DB file is ok. */
3566 if (rename(tmpfile,filename) == -1) {
3567 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3568 unlink(tmpfile);
3569 return REDIS_ERR;
3570 }
3571 redisLog(REDIS_NOTICE,"DB saved on disk");
3572 server.dirty = 0;
3573 server.lastsave = time(NULL);
3574 return REDIS_OK;
3575
3576werr:
3577 fclose(fp);
3578 unlink(tmpfile);
3579 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3580 if (di) dictReleaseIterator(di);
3581 return REDIS_ERR;
3582}
3583
3584static int rdbSaveBackground(char *filename) {
3585 pid_t childpid;
3586
3587 if (server.bgsavechildpid != -1) return REDIS_ERR;
3588 if (server.vm_enabled) waitEmptyIOJobsQueue();
3589 if ((childpid = fork()) == 0) {
3590 /* Child */
3591 if (server.vm_enabled) vmReopenSwapFile();
3592 close(server.fd);
3593 if (rdbSave(filename) == REDIS_OK) {
3594 _exit(0);
3595 } else {
3596 _exit(1);
3597 }
3598 } else {
3599 /* Parent */
3600 if (childpid == -1) {
3601 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3602 strerror(errno));
3603 return REDIS_ERR;
3604 }
3605 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3606 server.bgsavechildpid = childpid;
3607 updateDictResizePolicy();
3608 return REDIS_OK;
3609 }
3610 return REDIS_OK; /* unreached */
3611}
3612
/* Unlink the file "temp-<childpid>.rdb" (relative to the current working
 * directory). The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb",(int)childpid);
    unlink(path);
}
3619
/* Read one type byte from 'fp'. Returns the byte value (0-255), or -1 if
 * no byte could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,sizeof(byte),1,fp) == 1) ? (int)byte : -1;
}
3625
/* Read a 4 byte time value from 'fp' as a raw int32_t (no byte order
 * conversion is performed). Returns -1 if the read fails. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t)raw;
}
3631
3632/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3633 * of this file for a description of how this are stored on disk.
3634 *
3635 * isencoded is set to 1 if the readed length is not actually a length but
3636 * an "encoding type", check the above comments for more info */
3637static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3638 unsigned char buf[2];
3639 uint32_t len;
3640 int type;
3641
3642 if (isencoded) *isencoded = 0;
3643 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3644 type = (buf[0]&0xC0)>>6;
3645 if (type == REDIS_RDB_6BITLEN) {
3646 /* Read a 6 bit len */
3647 return buf[0]&0x3F;
3648 } else if (type == REDIS_RDB_ENCVAL) {
3649 /* Read a 6 bit len encoding type */
3650 if (isencoded) *isencoded = 1;
3651 return buf[0]&0x3F;
3652 } else if (type == REDIS_RDB_14BITLEN) {
3653 /* Read a 14 bit len */
3654 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3655 return ((buf[0]&0x3F)<<8)|buf[1];
3656 } else {
3657 /* Read a 32 bit len */
3658 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3659 return ntohl(len);
3660 }
3661}
3662
3663static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3664 unsigned char enc[4];
3665 long long val;
3666
3667 if (enctype == REDIS_RDB_ENC_INT8) {
3668 if (fread(enc,1,1,fp) == 0) return NULL;
3669 val = (signed char)enc[0];
3670 } else if (enctype == REDIS_RDB_ENC_INT16) {
3671 uint16_t v;
3672 if (fread(enc,2,1,fp) == 0) return NULL;
3673 v = enc[0]|(enc[1]<<8);
3674 val = (int16_t)v;
3675 } else if (enctype == REDIS_RDB_ENC_INT32) {
3676 uint32_t v;
3677 if (fread(enc,4,1,fp) == 0) return NULL;
3678 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3679 val = (int32_t)v;
3680 } else {
3681 val = 0; /* anti-warning */
3682 redisPanic("Unknown RDB integer encoding type");
3683 }
3684 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3685}
3686
3687static robj *rdbLoadLzfStringObject(FILE*fp) {
3688 unsigned int len, clen;
3689 unsigned char *c = NULL;
3690 sds val = NULL;
3691
3692 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3693 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3694 if ((c = zmalloc(clen)) == NULL) goto err;
3695 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3696 if (fread(c,clen,1,fp) == 0) goto err;
3697 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3698 zfree(c);
3699 return createObject(REDIS_STRING,val);
3700err:
3701 zfree(c);
3702 sdsfree(val);
3703 return NULL;
3704}
3705
3706static robj *rdbLoadStringObject(FILE*fp) {
3707 int isencoded;
3708 uint32_t len;
3709 sds val;
3710
3711 len = rdbLoadLen(fp,&isencoded);
3712 if (isencoded) {
3713 switch(len) {
3714 case REDIS_RDB_ENC_INT8:
3715 case REDIS_RDB_ENC_INT16:
3716 case REDIS_RDB_ENC_INT32:
3717 return rdbLoadIntegerObject(fp,len);
3718 case REDIS_RDB_ENC_LZF:
3719 return rdbLoadLzfStringObject(fp);
3720 default:
3721 redisPanic("Unknown RDB encoding type");
3722 }
3723 }
3724
3725 if (len == REDIS_RDB_LENERR) return NULL;
3726 val = sdsnewlen(NULL,len);
3727 if (len && fread(val,len,1,fp) == 0) {
3728 sdsfree(val);
3729 return NULL;
3730 }
3731 return createObject(REDIS_STRING,val);
3732}
3733
3734/* For information about double serialization check rdbSaveDoubleValue() */
3735static int rdbLoadDoubleValue(FILE *fp, double *val) {
3736 char buf[128];
3737 unsigned char len;
3738
3739 if (fread(&len,1,1,fp) == 0) return -1;
3740 switch(len) {
3741 case 255: *val = R_NegInf; return 0;
3742 case 254: *val = R_PosInf; return 0;
3743 case 253: *val = R_Nan; return 0;
3744 default:
3745 if (fread(buf,len,1,fp) == 0) return -1;
3746 buf[len] = '\0';
3747 sscanf(buf, "%lg", val);
3748 return 0;
3749 }
3750}
3751
3752/* Load a Redis object of the specified type from the specified file.
3753 * On success a newly allocated object is returned, otherwise NULL. */
3754static robj *rdbLoadObject(int type, FILE *fp) {
3755 robj *o;
3756
3757 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3758 if (type == REDIS_STRING) {
3759 /* Read string value */
3760 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3761 o = tryObjectEncoding(o);
3762 } else if (type == REDIS_LIST || type == REDIS_SET) {
3763 /* Read list/set value */
3764 uint32_t listlen;
3765
3766 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3767 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3768 /* It's faster to expand the dict to the right size asap in order
3769 * to avoid rehashing */
3770 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3771 dictExpand(o->ptr,listlen);
3772 /* Load every single element of the list/set */
3773 while(listlen--) {
3774 robj *ele;
3775
3776 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3777 ele = tryObjectEncoding(ele);
3778 if (type == REDIS_LIST) {
3779 listAddNodeTail((list*)o->ptr,ele);
3780 } else {
3781 dictAdd((dict*)o->ptr,ele,NULL);
3782 }
3783 }
3784 } else if (type == REDIS_ZSET) {
3785 /* Read list/set value */
3786 size_t zsetlen;
3787 zset *zs;
3788
3789 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3790 o = createZsetObject();
3791 zs = o->ptr;
3792 /* Load every single element of the list/set */
3793 while(zsetlen--) {
3794 robj *ele;
3795 double *score = zmalloc(sizeof(double));
3796
3797 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3798 ele = tryObjectEncoding(ele);
3799 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3800 dictAdd(zs->dict,ele,score);
3801 zslInsert(zs->zsl,*score,ele);
3802 incrRefCount(ele); /* added to skiplist */
3803 }
3804 } else if (type == REDIS_HASH) {
3805 size_t hashlen;
3806
3807 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3808 o = createHashObject();
3809 /* Too many entries? Use an hash table. */
3810 if (hashlen > server.hash_max_zipmap_entries)
3811 convertToRealHash(o);
3812 /* Load every key/value, then set it into the zipmap or hash
3813 * table, as needed. */
3814 while(hashlen--) {
3815 robj *key, *val;
3816
3817 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3818 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3819 /* If we are using a zipmap and there are too big values
3820 * the object is converted to real hash table encoding. */
3821 if (o->encoding != REDIS_ENCODING_HT &&
3822 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3823 sdslen(val->ptr) > server.hash_max_zipmap_value))
3824 {
3825 convertToRealHash(o);
3826 }
3827
3828 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3829 unsigned char *zm = o->ptr;
3830
3831 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3832 val->ptr,sdslen(val->ptr),NULL);
3833 o->ptr = zm;
3834 decrRefCount(key);
3835 decrRefCount(val);
3836 } else {
3837 key = tryObjectEncoding(key);
3838 val = tryObjectEncoding(val);
3839 dictAdd((dict*)o->ptr,key,val);
3840 }
3841 }
3842 } else {
3843 redisPanic("Unknown object type");
3844 }
3845 return o;
3846}
3847
3848static int rdbLoad(char *filename) {
3849 FILE *fp;
3850 robj *keyobj = NULL;
3851 uint32_t dbid;
3852 int type, retval, rdbver;
3853 dict *d = server.db[0].dict;
3854 redisDb *db = server.db+0;
3855 char buf[1024];
3856 time_t expiretime = -1, now = time(NULL);
3857 long long loadedkeys = 0;
3858
3859 fp = fopen(filename,"r");
3860 if (!fp) return REDIS_ERR;
3861 if (fread(buf,9,1,fp) == 0) goto eoferr;
3862 buf[9] = '\0';
3863 if (memcmp(buf,"REDIS",5) != 0) {
3864 fclose(fp);
3865 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3866 return REDIS_ERR;
3867 }
3868 rdbver = atoi(buf+5);
3869 if (rdbver != 1) {
3870 fclose(fp);
3871 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3872 return REDIS_ERR;
3873 }
3874 while(1) {
3875 robj *o;
3876
3877 /* Read type. */
3878 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3879 if (type == REDIS_EXPIRETIME) {
3880 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3881 /* We read the time so we need to read the object type again */
3882 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3883 }
3884 if (type == REDIS_EOF) break;
3885 /* Handle SELECT DB opcode as a special case */
3886 if (type == REDIS_SELECTDB) {
3887 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3888 goto eoferr;
3889 if (dbid >= (unsigned)server.dbnum) {
3890 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3891 exit(1);
3892 }
3893 db = server.db+dbid;
3894 d = db->dict;
3895 continue;
3896 }
3897 /* Read key */
3898 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3899 /* Read value */
3900 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3901 /* Add the new object in the hash table */
3902 retval = dictAdd(d,keyobj,o);
3903 if (retval == DICT_ERR) {
3904 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3905 exit(1);
3906 }
3907 /* Set the expire time if needed */
3908 if (expiretime != -1) {
3909 setExpire(db,keyobj,expiretime);
3910 /* Delete this key if already expired */
3911 if (expiretime < now) deleteKey(db,keyobj);
3912 expiretime = -1;
3913 }
3914 keyobj = o = NULL;
3915 /* Handle swapping while loading big datasets when VM is on */
3916 loadedkeys++;
3917 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3918 while (zmalloc_used_memory() > server.vm_max_memory) {
3919 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3920 }
3921 }
3922 }
3923 fclose(fp);
3924 return REDIS_OK;
3925
3926eoferr: /* unexpected end of file is handled here with a fatal exit */
3927 if (keyobj) decrRefCount(keyobj);
3928 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3929 exit(1);
3930 return REDIS_ERR; /* Just to avoid warning */
3931}
3932
3933/*================================== Commands =============================== */
3934
3935static void authCommand(redisClient *c) {
3936 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3937 c->authenticated = 1;
3938 addReply(c,shared.ok);
3939 } else {
3940 c->authenticated = 0;
3941 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3942 }
3943}
3944
3945static void pingCommand(redisClient *c) {
3946 addReply(c,shared.pong);
3947}
3948
3949static void echoCommand(redisClient *c) {
3950 addReplyBulk(c,c->argv[1]);
3951}
3952
3953/*=================================== Strings =============================== */
3954
3955static void setGenericCommand(redisClient *c, int nx) {
3956 int retval;
3957
3958 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3959 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3960 if (retval == DICT_ERR) {
3961 if (!nx) {
3962 /* If the key is about a swapped value, we want a new key object
3963 * to overwrite the old. So we delete the old key in the database.
3964 * This will also make sure that swap pages about the old object
3965 * will be marked as free. */
3966 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3967 incrRefCount(c->argv[1]);
3968 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3969 incrRefCount(c->argv[2]);
3970 } else {
3971 addReply(c,shared.czero);
3972 return;
3973 }
3974 } else {
3975 incrRefCount(c->argv[1]);
3976 incrRefCount(c->argv[2]);
3977 }
3978 server.dirty++;
3979 removeExpire(c->db,c->argv[1]);
3980 addReply(c, nx ? shared.cone : shared.ok);
3981}
3982
3983static void setCommand(redisClient *c) {
3984 setGenericCommand(c,0);
3985}
3986
3987static void setnxCommand(redisClient *c) {
3988 setGenericCommand(c,1);
3989}
3990
3991static int getGenericCommand(redisClient *c) {
3992 robj *o;
3993
3994 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3995 return REDIS_OK;
3996
3997 if (o->type != REDIS_STRING) {
3998 addReply(c,shared.wrongtypeerr);
3999 return REDIS_ERR;
4000 } else {
4001 addReplyBulk(c,o);
4002 return REDIS_OK;
4003 }
4004}
4005
4006static void getCommand(redisClient *c) {
4007 getGenericCommand(c);
4008}
4009
4010static void getsetCommand(redisClient *c) {
4011 if (getGenericCommand(c) == REDIS_ERR) return;
4012 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4013 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4014 } else {
4015 incrRefCount(c->argv[1]);
4016 }
4017 incrRefCount(c->argv[2]);
4018 server.dirty++;
4019 removeExpire(c->db,c->argv[1]);
4020}
4021
4022static void mgetCommand(redisClient *c) {
4023 int j;
4024
4025 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4026 for (j = 1; j < c->argc; j++) {
4027 robj *o = lookupKeyRead(c->db,c->argv[j]);
4028 if (o == NULL) {
4029 addReply(c,shared.nullbulk);
4030 } else {
4031 if (o->type != REDIS_STRING) {
4032 addReply(c,shared.nullbulk);
4033 } else {
4034 addReplyBulk(c,o);
4035 }
4036 }
4037 }
4038}
4039
4040static void msetGenericCommand(redisClient *c, int nx) {
4041 int j, busykeys = 0;
4042
4043 if ((c->argc % 2) == 0) {
4044 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4045 return;
4046 }
4047 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4048 * set nothing at all if at least one already key exists. */
4049 if (nx) {
4050 for (j = 1; j < c->argc; j += 2) {
4051 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4052 busykeys++;
4053 }
4054 }
4055 }
4056 if (busykeys) {
4057 addReply(c, shared.czero);
4058 return;
4059 }
4060
4061 for (j = 1; j < c->argc; j += 2) {
4062 int retval;
4063
4064 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4065 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4066 if (retval == DICT_ERR) {
4067 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4068 incrRefCount(c->argv[j+1]);
4069 } else {
4070 incrRefCount(c->argv[j]);
4071 incrRefCount(c->argv[j+1]);
4072 }
4073 removeExpire(c->db,c->argv[j]);
4074 }
4075 server.dirty += (c->argc-1)/2;
4076 addReply(c, nx ? shared.cone : shared.ok);
4077}
4078
4079static void msetCommand(redisClient *c) {
4080 msetGenericCommand(c,0);
4081}
4082
4083static void msetnxCommand(redisClient *c) {
4084 msetGenericCommand(c,1);
4085}
4086
4087static void incrDecrCommand(redisClient *c, long long incr) {
4088 long long value;
4089 int retval;
4090 robj *o;
4091
4092 o = lookupKeyWrite(c->db,c->argv[1]);
4093
4094 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
4095
4096 value += incr;
4097 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4098 o = tryObjectEncoding(o);
4099 retval = dictAdd(c->db->dict,c->argv[1],o);
4100 if (retval == DICT_ERR) {
4101 dictReplace(c->db->dict,c->argv[1],o);
4102 removeExpire(c->db,c->argv[1]);
4103 } else {
4104 incrRefCount(c->argv[1]);
4105 }
4106 server.dirty++;
4107 addReply(c,shared.colon);
4108 addReply(c,o);
4109 addReply(c,shared.crlf);
4110}
4111
4112static void incrCommand(redisClient *c) {
4113 incrDecrCommand(c,1);
4114}
4115
4116static void decrCommand(redisClient *c) {
4117 incrDecrCommand(c,-1);
4118}
4119
4120static void incrbyCommand(redisClient *c) {
4121 long long incr;
4122
4123 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4124 incrDecrCommand(c,incr);
4125}
4126
4127static void decrbyCommand(redisClient *c) {
4128 long long incr;
4129
4130 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4131 incrDecrCommand(c,-incr);
4132}
4133
4134static void appendCommand(redisClient *c) {
4135 int retval;
4136 size_t totlen;
4137 robj *o;
4138
4139 o = lookupKeyWrite(c->db,c->argv[1]);
4140 if (o == NULL) {
4141 /* Create the key */
4142 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4143 incrRefCount(c->argv[1]);
4144 incrRefCount(c->argv[2]);
4145 totlen = stringObjectLen(c->argv[2]);
4146 } else {
4147 dictEntry *de;
4148
4149 de = dictFind(c->db->dict,c->argv[1]);
4150 assert(de != NULL);
4151
4152 o = dictGetEntryVal(de);
4153 if (o->type != REDIS_STRING) {
4154 addReply(c,shared.wrongtypeerr);
4155 return;
4156 }
4157 /* If the object is specially encoded or shared we have to make
4158 * a copy */
4159 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4160 robj *decoded = getDecodedObject(o);
4161
4162 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4163 decrRefCount(decoded);
4164 dictReplace(c->db->dict,c->argv[1],o);
4165 }
4166 /* APPEND! */
4167 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4168 o->ptr = sdscatlen(o->ptr,
4169 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4170 } else {
4171 o->ptr = sdscatprintf(o->ptr, "%ld",
4172 (unsigned long) c->argv[2]->ptr);
4173 }
4174 totlen = sdslen(o->ptr);
4175 }
4176 server.dirty++;
4177 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4178}
4179
4180static void substrCommand(redisClient *c) {
4181 robj *o;
4182 long start = atoi(c->argv[2]->ptr);
4183 long end = atoi(c->argv[3]->ptr);
4184 size_t rangelen, strlen;
4185 sds range;
4186
4187 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4188 checkType(c,o,REDIS_STRING)) return;
4189
4190 o = getDecodedObject(o);
4191 strlen = sdslen(o->ptr);
4192
4193 /* convert negative indexes */
4194 if (start < 0) start = strlen+start;
4195 if (end < 0) end = strlen+end;
4196 if (start < 0) start = 0;
4197 if (end < 0) end = 0;
4198
4199 /* indexes sanity checks */
4200 if (start > end || (size_t)start >= strlen) {
4201 /* Out of range start or start > end result in null reply */
4202 addReply(c,shared.nullbulk);
4203 decrRefCount(o);
4204 return;
4205 }
4206 if ((size_t)end >= strlen) end = strlen-1;
4207 rangelen = (end-start)+1;
4208
4209 /* Return the result */
4210 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4211 range = sdsnewlen((char*)o->ptr+start,rangelen);
4212 addReplySds(c,range);
4213 addReply(c,shared.crlf);
4214 decrRefCount(o);
4215}
4216
4217/* ========================= Type agnostic commands ========================= */
4218
4219static void delCommand(redisClient *c) {
4220 int deleted = 0, j;
4221
4222 for (j = 1; j < c->argc; j++) {
4223 if (deleteKey(c->db,c->argv[j])) {
4224 server.dirty++;
4225 deleted++;
4226 }
4227 }
4228 addReplyLong(c,deleted);
4229}
4230
4231static void existsCommand(redisClient *c) {
4232 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4233}
4234
4235static void selectCommand(redisClient *c) {
4236 int id = atoi(c->argv[1]->ptr);
4237
4238 if (selectDb(c,id) == REDIS_ERR) {
4239 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4240 } else {
4241 addReply(c,shared.ok);
4242 }
4243}
4244
4245static void randomkeyCommand(redisClient *c) {
4246 dictEntry *de;
4247
4248 while(1) {
4249 de = dictGetRandomKey(c->db->dict);
4250 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4251 }
4252 if (de == NULL) {
4253 addReply(c,shared.plus);
4254 addReply(c,shared.crlf);
4255 } else {
4256 addReply(c,shared.plus);
4257 addReply(c,dictGetEntryKey(de));
4258 addReply(c,shared.crlf);
4259 }
4260}
4261
4262static void keysCommand(redisClient *c) {
4263 dictIterator *di;
4264 dictEntry *de;
4265 sds pattern = c->argv[1]->ptr;
4266 int plen = sdslen(pattern);
4267 unsigned long numkeys = 0;
4268 robj *lenobj = createObject(REDIS_STRING,NULL);
4269
4270 di = dictGetIterator(c->db->dict);
4271 addReply(c,lenobj);
4272 decrRefCount(lenobj);
4273 while((de = dictNext(di)) != NULL) {
4274 robj *keyobj = dictGetEntryKey(de);
4275
4276 sds key = keyobj->ptr;
4277 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4278 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4279 if (expireIfNeeded(c->db,keyobj) == 0) {
4280 addReplyBulk(c,keyobj);
4281 numkeys++;
4282 }
4283 }
4284 }
4285 dictReleaseIterator(di);
4286 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4287}
4288
4289static void dbsizeCommand(redisClient *c) {
4290 addReplySds(c,
4291 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4292}
4293
4294static void lastsaveCommand(redisClient *c) {
4295 addReplySds(c,
4296 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4297}
4298
4299static void typeCommand(redisClient *c) {
4300 robj *o;
4301 char *type;
4302
4303 o = lookupKeyRead(c->db,c->argv[1]);
4304 if (o == NULL) {
4305 type = "+none";
4306 } else {
4307 switch(o->type) {
4308 case REDIS_STRING: type = "+string"; break;
4309 case REDIS_LIST: type = "+list"; break;
4310 case REDIS_SET: type = "+set"; break;
4311 case REDIS_ZSET: type = "+zset"; break;
4312 case REDIS_HASH: type = "+hash"; break;
4313 default: type = "+unknown"; break;
4314 }
4315 }
4316 addReplySds(c,sdsnew(type));
4317 addReply(c,shared.crlf);
4318}
4319
4320static void saveCommand(redisClient *c) {
4321 if (server.bgsavechildpid != -1) {
4322 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4323 return;
4324 }
4325 if (rdbSave(server.dbfilename) == REDIS_OK) {
4326 addReply(c,shared.ok);
4327 } else {
4328 addReply(c,shared.err);
4329 }
4330}
4331
4332static void bgsaveCommand(redisClient *c) {
4333 if (server.bgsavechildpid != -1) {
4334 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4335 return;
4336 }
4337 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4338 char *status = "+Background saving started\r\n";
4339 addReplySds(c,sdsnew(status));
4340 } else {
4341 addReply(c,shared.err);
4342 }
4343}
4344
4345static void shutdownCommand(redisClient *c) {
4346 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4347 /* Kill the saving child if there is a background saving in progress.
4348 We want to avoid race conditions, for instance our saving child may
4349 overwrite the synchronous saving did by SHUTDOWN. */
4350 if (server.bgsavechildpid != -1) {
4351 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4352 kill(server.bgsavechildpid,SIGKILL);
4353 rdbRemoveTempFile(server.bgsavechildpid);
4354 }
4355 if (server.appendonly) {
4356 /* Append only file: fsync() the AOF and exit */
4357 fsync(server.appendfd);
4358 if (server.vm_enabled) unlink(server.vm_swap_file);
4359 exit(0);
4360 } else {
4361 /* Snapshotting. Perform a SYNC SAVE and exit */
4362 if (rdbSave(server.dbfilename) == REDIS_OK) {
4363 if (server.daemonize)
4364 unlink(server.pidfile);
4365 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4366 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4367 if (server.vm_enabled) unlink(server.vm_swap_file);
4368 exit(0);
4369 } else {
4370 /* Ooops.. error saving! The best we can do is to continue
4371 * operating. Note that if there was a background saving process,
4372 * in the next cron() Redis will be notified that the background
4373 * saving aborted, handling special stuff like slaves pending for
4374 * synchronization... */
4375 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4376 addReplySds(c,
4377 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4378 }
4379 }
4380}
4381
4382static void renameGenericCommand(redisClient *c, int nx) {
4383 robj *o;
4384
4385 /* To use the same key as src and dst is probably an error */
4386 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4387 addReply(c,shared.sameobjecterr);
4388 return;
4389 }
4390
4391 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4392 return;
4393
4394 incrRefCount(o);
4395 deleteIfVolatile(c->db,c->argv[2]);
4396 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4397 if (nx) {
4398 decrRefCount(o);
4399 addReply(c,shared.czero);
4400 return;
4401 }
4402 dictReplace(c->db->dict,c->argv[2],o);
4403 } else {
4404 incrRefCount(c->argv[2]);
4405 }
4406 deleteKey(c->db,c->argv[1]);
4407 server.dirty++;
4408 addReply(c,nx ? shared.cone : shared.ok);
4409}
4410
4411static void renameCommand(redisClient *c) {
4412 renameGenericCommand(c,0);
4413}
4414
4415static void renamenxCommand(redisClient *c) {
4416 renameGenericCommand(c,1);
4417}
4418
4419static void moveCommand(redisClient *c) {
4420 robj *o;
4421 redisDb *src, *dst;
4422 int srcid;
4423
4424 /* Obtain source and target DB pointers */
4425 src = c->db;
4426 srcid = c->db->id;
4427 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4428 addReply(c,shared.outofrangeerr);
4429 return;
4430 }
4431 dst = c->db;
4432 selectDb(c,srcid); /* Back to the source DB */
4433
4434 /* If the user is moving using as target the same
4435 * DB as the source DB it is probably an error. */
4436 if (src == dst) {
4437 addReply(c,shared.sameobjecterr);
4438 return;
4439 }
4440
4441 /* Check if the element exists and get a reference */
4442 o = lookupKeyWrite(c->db,c->argv[1]);
4443 if (!o) {
4444 addReply(c,shared.czero);
4445 return;
4446 }
4447
4448 /* Try to add the element to the target DB */
4449 deleteIfVolatile(dst,c->argv[1]);
4450 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4451 addReply(c,shared.czero);
4452 return;
4453 }
4454 incrRefCount(c->argv[1]);
4455 incrRefCount(o);
4456
4457 /* OK! key moved, free the entry in the source DB */
4458 deleteKey(src,c->argv[1]);
4459 server.dirty++;
4460 addReply(c,shared.cone);
4461}
4462
4463/* =================================== Lists ================================ */
4464static void pushGenericCommand(redisClient *c, int where) {
4465 robj *lobj;
4466 list *list;
4467
4468 lobj = lookupKeyWrite(c->db,c->argv[1]);
4469 if (lobj == NULL) {
4470 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4471 addReply(c,shared.cone);
4472 return;
4473 }
4474 lobj = createListObject();
4475 list = lobj->ptr;
4476 if (where == REDIS_HEAD) {
4477 listAddNodeHead(list,c->argv[2]);
4478 } else {
4479 listAddNodeTail(list,c->argv[2]);
4480 }
4481 dictAdd(c->db->dict,c->argv[1],lobj);
4482 incrRefCount(c->argv[1]);
4483 incrRefCount(c->argv[2]);
4484 } else {
4485 if (lobj->type != REDIS_LIST) {
4486 addReply(c,shared.wrongtypeerr);
4487 return;
4488 }
4489 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4490 addReply(c,shared.cone);
4491 return;
4492 }
4493 list = lobj->ptr;
4494 if (where == REDIS_HEAD) {
4495 listAddNodeHead(list,c->argv[2]);
4496 } else {
4497 listAddNodeTail(list,c->argv[2]);
4498 }
4499 incrRefCount(c->argv[2]);
4500 }
4501 server.dirty++;
4502 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4503}
4504
4505static void lpushCommand(redisClient *c) {
4506 pushGenericCommand(c,REDIS_HEAD);
4507}
4508
4509static void rpushCommand(redisClient *c) {
4510 pushGenericCommand(c,REDIS_TAIL);
4511}
4512
4513static void llenCommand(redisClient *c) {
4514 robj *o;
4515 list *l;
4516
4517 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4518 checkType(c,o,REDIS_LIST)) return;
4519
4520 l = o->ptr;
4521 addReplyUlong(c,listLength(l));
4522}
4523
4524static void lindexCommand(redisClient *c) {
4525 robj *o;
4526 int index = atoi(c->argv[2]->ptr);
4527 list *list;
4528 listNode *ln;
4529
4530 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4531 checkType(c,o,REDIS_LIST)) return;
4532 list = o->ptr;
4533
4534 ln = listIndex(list, index);
4535 if (ln == NULL) {
4536 addReply(c,shared.nullbulk);
4537 } else {
4538 robj *ele = listNodeValue(ln);
4539 addReplyBulk(c,ele);
4540 }
4541}
4542
4543static void lsetCommand(redisClient *c) {
4544 robj *o;
4545 int index = atoi(c->argv[2]->ptr);
4546 list *list;
4547 listNode *ln;
4548
4549 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4550 checkType(c,o,REDIS_LIST)) return;
4551 list = o->ptr;
4552
4553 ln = listIndex(list, index);
4554 if (ln == NULL) {
4555 addReply(c,shared.outofrangeerr);
4556 } else {
4557 robj *ele = listNodeValue(ln);
4558
4559 decrRefCount(ele);
4560 listNodeValue(ln) = c->argv[3];
4561 incrRefCount(c->argv[3]);
4562 addReply(c,shared.ok);
4563 server.dirty++;
4564 }
4565}
4566
4567static void popGenericCommand(redisClient *c, int where) {
4568 robj *o;
4569 list *list;
4570 listNode *ln;
4571
4572 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4573 checkType(c,o,REDIS_LIST)) return;
4574 list = o->ptr;
4575
4576 if (where == REDIS_HEAD)
4577 ln = listFirst(list);
4578 else
4579 ln = listLast(list);
4580
4581 if (ln == NULL) {
4582 addReply(c,shared.nullbulk);
4583 } else {
4584 robj *ele = listNodeValue(ln);
4585 addReplyBulk(c,ele);
4586 listDelNode(list,ln);
4587 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4588 server.dirty++;
4589 }
4590}
4591
4592static void lpopCommand(redisClient *c) {
4593 popGenericCommand(c,REDIS_HEAD);
4594}
4595
4596static void rpopCommand(redisClient *c) {
4597 popGenericCommand(c,REDIS_TAIL);
4598}
4599
4600static void lrangeCommand(redisClient *c) {
4601 robj *o;
4602 int start = atoi(c->argv[2]->ptr);
4603 int end = atoi(c->argv[3]->ptr);
4604 int llen;
4605 int rangelen, j;
4606 list *list;
4607 listNode *ln;
4608 robj *ele;
4609
4610 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4611 || checkType(c,o,REDIS_LIST)) return;
4612 list = o->ptr;
4613 llen = listLength(list);
4614
4615 /* convert negative indexes */
4616 if (start < 0) start = llen+start;
4617 if (end < 0) end = llen+end;
4618 if (start < 0) start = 0;
4619 if (end < 0) end = 0;
4620
4621 /* indexes sanity checks */
4622 if (start > end || start >= llen) {
4623 /* Out of range start or start > end result in empty list */
4624 addReply(c,shared.emptymultibulk);
4625 return;
4626 }
4627 if (end >= llen) end = llen-1;
4628 rangelen = (end-start)+1;
4629
4630 /* Return the result in form of a multi-bulk reply */
4631 ln = listIndex(list, start);
4632 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4633 for (j = 0; j < rangelen; j++) {
4634 ele = listNodeValue(ln);
4635 addReplyBulk(c,ele);
4636 ln = ln->next;
4637 }
4638}
4639
4640static void ltrimCommand(redisClient *c) {
4641 robj *o;
4642 int start = atoi(c->argv[2]->ptr);
4643 int end = atoi(c->argv[3]->ptr);
4644 int llen;
4645 int j, ltrim, rtrim;
4646 list *list;
4647 listNode *ln;
4648
4649 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4650 checkType(c,o,REDIS_LIST)) return;
4651 list = o->ptr;
4652 llen = listLength(list);
4653
4654 /* convert negative indexes */
4655 if (start < 0) start = llen+start;
4656 if (end < 0) end = llen+end;
4657 if (start < 0) start = 0;
4658 if (end < 0) end = 0;
4659
4660 /* indexes sanity checks */
4661 if (start > end || start >= llen) {
4662 /* Out of range start or start > end result in empty list */
4663 ltrim = llen;
4664 rtrim = 0;
4665 } else {
4666 if (end >= llen) end = llen-1;
4667 ltrim = start;
4668 rtrim = llen-end-1;
4669 }
4670
4671 /* Remove list elements to perform the trim */
4672 for (j = 0; j < ltrim; j++) {
4673 ln = listFirst(list);
4674 listDelNode(list,ln);
4675 }
4676 for (j = 0; j < rtrim; j++) {
4677 ln = listLast(list);
4678 listDelNode(list,ln);
4679 }
4680 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4681 server.dirty++;
4682 addReply(c,shared.ok);
4683}
4684
4685static void lremCommand(redisClient *c) {
4686 robj *o;
4687 list *list;
4688 listNode *ln, *next;
4689 int toremove = atoi(c->argv[2]->ptr);
4690 int removed = 0;
4691 int fromtail = 0;
4692
4693 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4694 checkType(c,o,REDIS_LIST)) return;
4695 list = o->ptr;
4696
4697 if (toremove < 0) {
4698 toremove = -toremove;
4699 fromtail = 1;
4700 }
4701 ln = fromtail ? list->tail : list->head;
4702 while (ln) {
4703 robj *ele = listNodeValue(ln);
4704
4705 next = fromtail ? ln->prev : ln->next;
4706 if (compareStringObjects(ele,c->argv[3]) == 0) {
4707 listDelNode(list,ln);
4708 server.dirty++;
4709 removed++;
4710 if (toremove && removed == toremove) break;
4711 }
4712 ln = next;
4713 }
4714 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4715 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4716}
4717
4718/* This is the semantic of this command:
4719 * RPOPLPUSH srclist dstlist:
4720 * IF LLEN(srclist) > 0
4721 * element = RPOP srclist
4722 * LPUSH dstlist element
4723 * RETURN element
4724 * ELSE
4725 * RETURN nil
4726 * END
4727 * END
4728 *
4729 * The idea is to be able to get an element from a list in a reliable way
4730 * since the element is not just returned but pushed against another list
4731 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4732 */
4733static void rpoplpushcommand(redisClient *c) {
4734 robj *sobj;
4735 list *srclist;
4736 listNode *ln;
4737
4738 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4739 checkType(c,sobj,REDIS_LIST)) return;
4740 srclist = sobj->ptr;
4741 ln = listLast(srclist);
4742
4743 if (ln == NULL) {
4744 addReply(c,shared.nullbulk);
4745 } else {
4746 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4747 robj *ele = listNodeValue(ln);
4748 list *dstlist;
4749
4750 if (dobj && dobj->type != REDIS_LIST) {
4751 addReply(c,shared.wrongtypeerr);
4752 return;
4753 }
4754
4755 /* Add the element to the target list (unless it's directly
4756 * passed to some BLPOP-ing client */
4757 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4758 if (dobj == NULL) {
4759 /* Create the list if the key does not exist */
4760 dobj = createListObject();
4761 dictAdd(c->db->dict,c->argv[2],dobj);
4762 incrRefCount(c->argv[2]);
4763 }
4764 dstlist = dobj->ptr;
4765 listAddNodeHead(dstlist,ele);
4766 incrRefCount(ele);
4767 }
4768
4769 /* Send the element to the client as reply as well */
4770 addReplyBulk(c,ele);
4771
4772 /* Finally remove the element from the source list */
4773 listDelNode(srclist,ln);
4774 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4775 server.dirty++;
4776 }
4777}
4778
4779/* ==================================== Sets ================================ */
4780
4781static void saddCommand(redisClient *c) {
4782 robj *set;
4783
4784 set = lookupKeyWrite(c->db,c->argv[1]);
4785 if (set == NULL) {
4786 set = createSetObject();
4787 dictAdd(c->db->dict,c->argv[1],set);
4788 incrRefCount(c->argv[1]);
4789 } else {
4790 if (set->type != REDIS_SET) {
4791 addReply(c,shared.wrongtypeerr);
4792 return;
4793 }
4794 }
4795 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4796 incrRefCount(c->argv[2]);
4797 server.dirty++;
4798 addReply(c,shared.cone);
4799 } else {
4800 addReply(c,shared.czero);
4801 }
4802}
4803
4804static void sremCommand(redisClient *c) {
4805 robj *set;
4806
4807 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4808 checkType(c,set,REDIS_SET)) return;
4809
4810 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4811 server.dirty++;
4812 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4813 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4814 addReply(c,shared.cone);
4815 } else {
4816 addReply(c,shared.czero);
4817 }
4818}
4819
4820static void smoveCommand(redisClient *c) {
4821 robj *srcset, *dstset;
4822
4823 srcset = lookupKeyWrite(c->db,c->argv[1]);
4824 dstset = lookupKeyWrite(c->db,c->argv[2]);
4825
4826 /* If the source key does not exist return 0, if it's of the wrong type
4827 * raise an error */
4828 if (srcset == NULL || srcset->type != REDIS_SET) {
4829 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4830 return;
4831 }
4832 /* Error if the destination key is not a set as well */
4833 if (dstset && dstset->type != REDIS_SET) {
4834 addReply(c,shared.wrongtypeerr);
4835 return;
4836 }
4837 /* Remove the element from the source set */
4838 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4839 /* Key not found in the src set! return zero */
4840 addReply(c,shared.czero);
4841 return;
4842 }
4843 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4844 deleteKey(c->db,c->argv[1]);
4845 server.dirty++;
4846 /* Add the element to the destination set */
4847 if (!dstset) {
4848 dstset = createSetObject();
4849 dictAdd(c->db->dict,c->argv[2],dstset);
4850 incrRefCount(c->argv[2]);
4851 }
4852 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4853 incrRefCount(c->argv[3]);
4854 addReply(c,shared.cone);
4855}
4856
4857static void sismemberCommand(redisClient *c) {
4858 robj *set;
4859
4860 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4861 checkType(c,set,REDIS_SET)) return;
4862
4863 if (dictFind(set->ptr,c->argv[2]))
4864 addReply(c,shared.cone);
4865 else
4866 addReply(c,shared.czero);
4867}
4868
4869static void scardCommand(redisClient *c) {
4870 robj *o;
4871 dict *s;
4872
4873 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4874 checkType(c,o,REDIS_SET)) return;
4875
4876 s = o->ptr;
4877 addReplyUlong(c,dictSize(s));
4878}
4879
4880static void spopCommand(redisClient *c) {
4881 robj *set;
4882 dictEntry *de;
4883
4884 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4885 checkType(c,set,REDIS_SET)) return;
4886
4887 de = dictGetRandomKey(set->ptr);
4888 if (de == NULL) {
4889 addReply(c,shared.nullbulk);
4890 } else {
4891 robj *ele = dictGetEntryKey(de);
4892
4893 addReplyBulk(c,ele);
4894 dictDelete(set->ptr,ele);
4895 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4896 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4897 server.dirty++;
4898 }
4899}
4900
4901static void srandmemberCommand(redisClient *c) {
4902 robj *set;
4903 dictEntry *de;
4904
4905 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4906 checkType(c,set,REDIS_SET)) return;
4907
4908 de = dictGetRandomKey(set->ptr);
4909 if (de == NULL) {
4910 addReply(c,shared.nullbulk);
4911 } else {
4912 robj *ele = dictGetEntryKey(de);
4913
4914 addReplyBulk(c,ele);
4915 }
4916}
4917
4918static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4919 dict **d1 = (void*) s1, **d2 = (void*) s2;
4920
4921 return dictSize(*d1)-dictSize(*d2);
4922}
4923
4924static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4925 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4926 dictIterator *di;
4927 dictEntry *de;
4928 robj *lenobj = NULL, *dstset = NULL;
4929 unsigned long j, cardinality = 0;
4930
4931 for (j = 0; j < setsnum; j++) {
4932 robj *setobj;
4933
4934 setobj = dstkey ?
4935 lookupKeyWrite(c->db,setskeys[j]) :
4936 lookupKeyRead(c->db,setskeys[j]);
4937 if (!setobj) {
4938 zfree(dv);
4939 if (dstkey) {
4940 if (deleteKey(c->db,dstkey))
4941 server.dirty++;
4942 addReply(c,shared.czero);
4943 } else {
4944 addReply(c,shared.emptymultibulk);
4945 }
4946 return;
4947 }
4948 if (setobj->type != REDIS_SET) {
4949 zfree(dv);
4950 addReply(c,shared.wrongtypeerr);
4951 return;
4952 }
4953 dv[j] = setobj->ptr;
4954 }
4955 /* Sort sets from the smallest to largest, this will improve our
4956 * algorithm's performace */
4957 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4958
4959 /* The first thing we should output is the total number of elements...
4960 * since this is a multi-bulk write, but at this stage we don't know
4961 * the intersection set size, so we use a trick, append an empty object
4962 * to the output list and save the pointer to later modify it with the
4963 * right length */
4964 if (!dstkey) {
4965 lenobj = createObject(REDIS_STRING,NULL);
4966 addReply(c,lenobj);
4967 decrRefCount(lenobj);
4968 } else {
4969 /* If we have a target key where to store the resulting set
4970 * create this key with an empty set inside */
4971 dstset = createSetObject();
4972 }
4973
4974 /* Iterate all the elements of the first (smallest) set, and test
4975 * the element against all the other sets, if at least one set does
4976 * not include the element it is discarded */
4977 di = dictGetIterator(dv[0]);
4978
4979 while((de = dictNext(di)) != NULL) {
4980 robj *ele;
4981
4982 for (j = 1; j < setsnum; j++)
4983 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4984 if (j != setsnum)
4985 continue; /* at least one set does not contain the member */
4986 ele = dictGetEntryKey(de);
4987 if (!dstkey) {
4988 addReplyBulk(c,ele);
4989 cardinality++;
4990 } else {
4991 dictAdd(dstset->ptr,ele,NULL);
4992 incrRefCount(ele);
4993 }
4994 }
4995 dictReleaseIterator(di);
4996
4997 if (dstkey) {
4998 /* Store the resulting set into the target, if the intersection
4999 * is not an empty set. */
5000 deleteKey(c->db,dstkey);
5001 if (dictSize((dict*)dstset->ptr) > 0) {
5002 dictAdd(c->db->dict,dstkey,dstset);
5003 incrRefCount(dstkey);
5004 addReplyLong(c,dictSize((dict*)dstset->ptr));
5005 } else {
5006 decrRefCount(dstset);
5007 addReply(c,shared.czero);
5008 }
5009 server.dirty++;
5010 } else {
5011 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5012 }
5013 zfree(dv);
5014}
5015
5016static void sinterCommand(redisClient *c) {
5017 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5018}
5019
5020static void sinterstoreCommand(redisClient *c) {
5021 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5022}
5023
5024#define REDIS_OP_UNION 0
5025#define REDIS_OP_DIFF 1
5026#define REDIS_OP_INTER 2
5027
5028static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5029 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5030 dictIterator *di;
5031 dictEntry *de;
5032 robj *dstset = NULL;
5033 int j, cardinality = 0;
5034
5035 for (j = 0; j < setsnum; j++) {
5036 robj *setobj;
5037
5038 setobj = dstkey ?
5039 lookupKeyWrite(c->db,setskeys[j]) :
5040 lookupKeyRead(c->db,setskeys[j]);
5041 if (!setobj) {
5042 dv[j] = NULL;
5043 continue;
5044 }
5045 if (setobj->type != REDIS_SET) {
5046 zfree(dv);
5047 addReply(c,shared.wrongtypeerr);
5048 return;
5049 }
5050 dv[j] = setobj->ptr;
5051 }
5052
5053 /* We need a temp set object to store our union. If the dstkey
5054 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5055 * this set object will be the resulting object to set into the target key*/
5056 dstset = createSetObject();
5057
5058 /* Iterate all the elements of all the sets, add every element a single
5059 * time to the result set */
5060 for (j = 0; j < setsnum; j++) {
5061 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5062 if (!dv[j]) continue; /* non existing keys are like empty sets */
5063
5064 di = dictGetIterator(dv[j]);
5065
5066 while((de = dictNext(di)) != NULL) {
5067 robj *ele;
5068
5069 /* dictAdd will not add the same element multiple times */
5070 ele = dictGetEntryKey(de);
5071 if (op == REDIS_OP_UNION || j == 0) {
5072 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5073 incrRefCount(ele);
5074 cardinality++;
5075 }
5076 } else if (op == REDIS_OP_DIFF) {
5077 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5078 cardinality--;
5079 }
5080 }
5081 }
5082 dictReleaseIterator(di);
5083
5084 /* result set is empty? Exit asap. */
5085 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5086 }
5087
5088 /* Output the content of the resulting set, if not in STORE mode */
5089 if (!dstkey) {
5090 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5091 di = dictGetIterator(dstset->ptr);
5092 while((de = dictNext(di)) != NULL) {
5093 robj *ele;
5094
5095 ele = dictGetEntryKey(de);
5096 addReplyBulk(c,ele);
5097 }
5098 dictReleaseIterator(di);
5099 decrRefCount(dstset);
5100 } else {
5101 /* If we have a target key where to store the resulting set
5102 * create this key with the result set inside */
5103 deleteKey(c->db,dstkey);
5104 if (dictSize((dict*)dstset->ptr) > 0) {
5105 dictAdd(c->db->dict,dstkey,dstset);
5106 incrRefCount(dstkey);
5107 addReplyLong(c,dictSize((dict*)dstset->ptr));
5108 } else {
5109 decrRefCount(dstset);
5110 addReply(c,shared.czero);
5111 }
5112 server.dirty++;
5113 }
5114 zfree(dv);
5115}
5116
5117static void sunionCommand(redisClient *c) {
5118 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5119}
5120
5121static void sunionstoreCommand(redisClient *c) {
5122 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5123}
5124
5125static void sdiffCommand(redisClient *c) {
5126 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5127}
5128
5129static void sdiffstoreCommand(redisClient *c) {
5130 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5131}
5132
5133/* ==================================== ZSets =============================== */
5134
5135/* ZSETs are ordered sets using two data structures to hold the same elements
5136 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5137 * data structure.
5138 *
5139 * The elements are added to a hash table mapping Redis objects to scores.
5140 * At the same time the elements are added to a skip list mapping scores
5141 * to Redis objects (so objects are sorted by scores in this "view"). */
5142
5143/* This skiplist implementation is almost a C translation of the original
5144 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5145 * Alternative to Balanced Trees", modified in three ways:
5146 * a) this implementation allows for repeated values.
5147 * b) the comparison is not just by key (our 'score') but by satellite data.
5148 * c) there is a back pointer, so it's a doubly linked list with the back
5149 * pointers being only at "level 1". This allows to traverse the list
5150 * from tail to head, useful for ZREVRANGE. */
5151
5152static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5153 zskiplistNode *zn = zmalloc(sizeof(*zn));
5154
5155 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5156 if (level > 0)
5157 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5158 zn->score = score;
5159 zn->obj = obj;
5160 return zn;
5161}
5162
5163static zskiplist *zslCreate(void) {
5164 int j;
5165 zskiplist *zsl;
5166
5167 zsl = zmalloc(sizeof(*zsl));
5168 zsl->level = 1;
5169 zsl->length = 0;
5170 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5171 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5172 zsl->header->forward[j] = NULL;
5173
5174 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5175 if (j < ZSKIPLIST_MAXLEVEL-1)
5176 zsl->header->span[j] = 0;
5177 }
5178 zsl->header->backward = NULL;
5179 zsl->tail = NULL;
5180 return zsl;
5181}
5182
5183static void zslFreeNode(zskiplistNode *node) {
5184 decrRefCount(node->obj);
5185 zfree(node->forward);
5186 zfree(node->span);
5187 zfree(node);
5188}
5189
5190static void zslFree(zskiplist *zsl) {
5191 zskiplistNode *node = zsl->header->forward[0], *next;
5192
5193 zfree(zsl->header->forward);
5194 zfree(zsl->header->span);
5195 zfree(zsl->header);
5196 while(node) {
5197 next = node->forward[0];
5198 zslFreeNode(node);
5199 node = next;
5200 }
5201 zfree(zsl);
5202}
5203
5204static int zslRandomLevel(void) {
5205 int level = 1;
5206 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5207 level += 1;
5208 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5209}
5210
5211static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5212 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5213 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5214 int i, level;
5215
5216 x = zsl->header;
5217 for (i = zsl->level-1; i >= 0; i--) {
5218 /* store rank that is crossed to reach the insert position */
5219 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5220
5221 while (x->forward[i] &&
5222 (x->forward[i]->score < score ||
5223 (x->forward[i]->score == score &&
5224 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5225 rank[i] += i > 0 ? x->span[i-1] : 1;
5226 x = x->forward[i];
5227 }
5228 update[i] = x;
5229 }
5230 /* we assume the key is not already inside, since we allow duplicated
5231 * scores, and the re-insertion of score and redis object should never
5232 * happpen since the caller of zslInsert() should test in the hash table
5233 * if the element is already inside or not. */
5234 level = zslRandomLevel();
5235 if (level > zsl->level) {
5236 for (i = zsl->level; i < level; i++) {
5237 rank[i] = 0;
5238 update[i] = zsl->header;
5239 update[i]->span[i-1] = zsl->length;
5240 }
5241 zsl->level = level;
5242 }
5243 x = zslCreateNode(level,score,obj);
5244 for (i = 0; i < level; i++) {
5245 x->forward[i] = update[i]->forward[i];
5246 update[i]->forward[i] = x;
5247
5248 /* update span covered by update[i] as x is inserted here */
5249 if (i > 0) {
5250 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5251 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5252 }
5253 }
5254
5255 /* increment span for untouched levels */
5256 for (i = level; i < zsl->level; i++) {
5257 update[i]->span[i-1]++;
5258 }
5259
5260 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5261 if (x->forward[0])
5262 x->forward[0]->backward = x;
5263 else
5264 zsl->tail = x;
5265 zsl->length++;
5266}
5267
5268/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5269void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5270 int i;
5271 for (i = 0; i < zsl->level; i++) {
5272 if (update[i]->forward[i] == x) {
5273 if (i > 0) {
5274 update[i]->span[i-1] += x->span[i-1] - 1;
5275 }
5276 update[i]->forward[i] = x->forward[i];
5277 } else {
5278 /* invariant: i > 0, because update[0]->forward[0]
5279 * is always equal to x */
5280 update[i]->span[i-1] -= 1;
5281 }
5282 }
5283 if (x->forward[0]) {
5284 x->forward[0]->backward = x->backward;
5285 } else {
5286 zsl->tail = x->backward;
5287 }
5288 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5289 zsl->level--;
5290 zsl->length--;
5291}
5292
5293/* Delete an element with matching score/object from the skiplist. */
5294static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5295 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5296 int i;
5297
5298 x = zsl->header;
5299 for (i = zsl->level-1; i >= 0; i--) {
5300 while (x->forward[i] &&
5301 (x->forward[i]->score < score ||
5302 (x->forward[i]->score == score &&
5303 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5304 x = x->forward[i];
5305 update[i] = x;
5306 }
5307 /* We may have multiple elements with the same score, what we need
5308 * is to find the element with both the right score and object. */
5309 x = x->forward[0];
5310 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5311 zslDeleteNode(zsl, x, update);
5312 zslFreeNode(x);
5313 return 1;
5314 } else {
5315 return 0; /* not found */
5316 }
5317 return 0; /* not found */
5318}
5319
5320/* Delete all the elements with score between min and max from the skiplist.
5321 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5322 * Note that this function takes the reference to the hash table view of the
5323 * sorted set, in order to remove the elements from the hash table too. */
5324static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5325 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5326 unsigned long removed = 0;
5327 int i;
5328
5329 x = zsl->header;
5330 for (i = zsl->level-1; i >= 0; i--) {
5331 while (x->forward[i] && x->forward[i]->score < min)
5332 x = x->forward[i];
5333 update[i] = x;
5334 }
5335 /* We may have multiple elements with the same score, what we need
5336 * is to find the element with both the right score and object. */
5337 x = x->forward[0];
5338 while (x && x->score <= max) {
5339 zskiplistNode *next = x->forward[0];
5340 zslDeleteNode(zsl, x, update);
5341 dictDelete(dict,x->obj);
5342 zslFreeNode(x);
5343 removed++;
5344 x = next;
5345 }
5346 return removed; /* not found */
5347}
5348
5349/* Delete all the elements with rank between start and end from the skiplist.
5350 * Start and end are inclusive. Note that start and end need to be 1-based */
5351static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5352 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5353 unsigned long traversed = 0, removed = 0;
5354 int i;
5355
5356 x = zsl->header;
5357 for (i = zsl->level-1; i >= 0; i--) {
5358 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5359 traversed += i > 0 ? x->span[i-1] : 1;
5360 x = x->forward[i];
5361 }
5362 update[i] = x;
5363 }
5364
5365 traversed++;
5366 x = x->forward[0];
5367 while (x && traversed <= end) {
5368 zskiplistNode *next = x->forward[0];
5369 zslDeleteNode(zsl, x, update);
5370 dictDelete(dict,x->obj);
5371 zslFreeNode(x);
5372 removed++;
5373 traversed++;
5374 x = next;
5375 }
5376 return removed;
5377}
5378
5379/* Find the first node having a score equal or greater than the specified one.
5380 * Returns NULL if there is no match. */
5381static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5382 zskiplistNode *x;
5383 int i;
5384
5385 x = zsl->header;
5386 for (i = zsl->level-1; i >= 0; i--) {
5387 while (x->forward[i] && x->forward[i]->score < score)
5388 x = x->forward[i];
5389 }
5390 /* We may have multiple elements with the same score, what we need
5391 * is to find the element with both the right score and object. */
5392 return x->forward[0];
5393}
5394
5395/* Find the rank for an element by both score and key.
5396 * Returns 0 when the element cannot be found, rank otherwise.
5397 * Note that the rank is 1-based due to the span of zsl->header to the
5398 * first element. */
5399static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5400 zskiplistNode *x;
5401 unsigned long rank = 0;
5402 int i;
5403
5404 x = zsl->header;
5405 for (i = zsl->level-1; i >= 0; i--) {
5406 while (x->forward[i] &&
5407 (x->forward[i]->score < score ||
5408 (x->forward[i]->score == score &&
5409 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5410 rank += i > 0 ? x->span[i-1] : 1;
5411 x = x->forward[i];
5412 }
5413
5414 /* x might be equal to zsl->header, so test if obj is non-NULL */
5415 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5416 return rank;
5417 }
5418 }
5419 return 0;
5420}
5421
5422/* Finds an element by its rank. The rank argument needs to be 1-based. */
5423zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5424 zskiplistNode *x;
5425 unsigned long traversed = 0;
5426 int i;
5427
5428 x = zsl->header;
5429 for (i = zsl->level-1; i >= 0; i--) {
5430 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5431 {
5432 traversed += i > 0 ? x->span[i-1] : 1;
5433 x = x->forward[i];
5434 }
5435 if (traversed == rank) {
5436 return x;
5437 }
5438 }
5439 return NULL;
5440}
5441
5442/* The actual Z-commands implementations */
5443
5444/* This generic command implements both ZADD and ZINCRBY.
5445 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5446 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5447static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5448 robj *zsetobj;
5449 zset *zs;
5450 double *score;
5451
5452 zsetobj = lookupKeyWrite(c->db,key);
5453 if (zsetobj == NULL) {
5454 zsetobj = createZsetObject();
5455 dictAdd(c->db->dict,key,zsetobj);
5456 incrRefCount(key);
5457 } else {
5458 if (zsetobj->type != REDIS_ZSET) {
5459 addReply(c,shared.wrongtypeerr);
5460 return;
5461 }
5462 }
5463 zs = zsetobj->ptr;
5464
5465 /* Ok now since we implement both ZADD and ZINCRBY here the code
5466 * needs to handle the two different conditions. It's all about setting
5467 * '*score', that is, the new score to set, to the right value. */
5468 score = zmalloc(sizeof(double));
5469 if (doincrement) {
5470 dictEntry *de;
5471
5472 /* Read the old score. If the element was not present starts from 0 */
5473 de = dictFind(zs->dict,ele);
5474 if (de) {
5475 double *oldscore = dictGetEntryVal(de);
5476 *score = *oldscore + scoreval;
5477 } else {
5478 *score = scoreval;
5479 }
5480 } else {
5481 *score = scoreval;
5482 }
5483
5484 /* What follows is a simple remove and re-insert operation that is common
5485 * to both ZADD and ZINCRBY... */
5486 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5487 /* case 1: New element */
5488 incrRefCount(ele); /* added to hash */
5489 zslInsert(zs->zsl,*score,ele);
5490 incrRefCount(ele); /* added to skiplist */
5491 server.dirty++;
5492 if (doincrement)
5493 addReplyDouble(c,*score);
5494 else
5495 addReply(c,shared.cone);
5496 } else {
5497 dictEntry *de;
5498 double *oldscore;
5499
5500 /* case 2: Score update operation */
5501 de = dictFind(zs->dict,ele);
5502 redisAssert(de != NULL);
5503 oldscore = dictGetEntryVal(de);
5504 if (*score != *oldscore) {
5505 int deleted;
5506
5507 /* Remove and insert the element in the skip list with new score */
5508 deleted = zslDelete(zs->zsl,*oldscore,ele);
5509 redisAssert(deleted != 0);
5510 zslInsert(zs->zsl,*score,ele);
5511 incrRefCount(ele);
5512 /* Update the score in the hash table */
5513 dictReplace(zs->dict,ele,score);
5514 server.dirty++;
5515 } else {
5516 zfree(score);
5517 }
5518 if (doincrement)
5519 addReplyDouble(c,*score);
5520 else
5521 addReply(c,shared.czero);
5522 }
5523}
5524
5525static void zaddCommand(redisClient *c) {
5526 double scoreval;
5527
5528 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5529 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5530}
5531
5532static void zincrbyCommand(redisClient *c) {
5533 double scoreval;
5534
5535 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5536 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5537}
5538
5539static void zremCommand(redisClient *c) {
5540 robj *zsetobj;
5541 zset *zs;
5542 dictEntry *de;
5543 double *oldscore;
5544 int deleted;
5545
5546 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5547 checkType(c,zsetobj,REDIS_ZSET)) return;
5548
5549 zs = zsetobj->ptr;
5550 de = dictFind(zs->dict,c->argv[2]);
5551 if (de == NULL) {
5552 addReply(c,shared.czero);
5553 return;
5554 }
5555 /* Delete from the skiplist */
5556 oldscore = dictGetEntryVal(de);
5557 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5558 redisAssert(deleted != 0);
5559
5560 /* Delete from the hash table */
5561 dictDelete(zs->dict,c->argv[2]);
5562 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5563 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5564 server.dirty++;
5565 addReply(c,shared.cone);
5566}
5567
5568static void zremrangebyscoreCommand(redisClient *c) {
5569 double min;
5570 double max;
5571 long deleted;
5572 robj *zsetobj;
5573 zset *zs;
5574
5575 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5576 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5577
5578 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5579 checkType(c,zsetobj,REDIS_ZSET)) return;
5580
5581 zs = zsetobj->ptr;
5582 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5583 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5584 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5585 server.dirty += deleted;
5586 addReplyLong(c,deleted);
5587}
5588
5589static void zremrangebyrankCommand(redisClient *c) {
5590 long start;
5591 long end;
5592 int llen;
5593 long deleted;
5594 robj *zsetobj;
5595 zset *zs;
5596
5597 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5598 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5599
5600 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5601 checkType(c,zsetobj,REDIS_ZSET)) return;
5602 zs = zsetobj->ptr;
5603 llen = zs->zsl->length;
5604
5605 /* convert negative indexes */
5606 if (start < 0) start = llen+start;
5607 if (end < 0) end = llen+end;
5608 if (start < 0) start = 0;
5609 if (end < 0) end = 0;
5610
5611 /* indexes sanity checks */
5612 if (start > end || start >= llen) {
5613 addReply(c,shared.czero);
5614 return;
5615 }
5616 if (end >= llen) end = llen-1;
5617
5618 /* increment start and end because zsl*Rank functions
5619 * use 1-based rank */
5620 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5621 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5622 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5623 server.dirty += deleted;
5624 addReplyLong(c, deleted);
5625}
5626
5627typedef struct {
5628 dict *dict;
5629 double weight;
5630} zsetopsrc;
5631
5632static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5633 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5634 unsigned long size1, size2;
5635 size1 = d1->dict ? dictSize(d1->dict) : 0;
5636 size2 = d2->dict ? dictSize(d2->dict) : 0;
5637 return size1 - size2;
5638}
5639
/* How zunionInterAggregate() combines the scores of the same element. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Combine 'val' into '*target' according to 'aggregate': REDIS_AGGR_SUM
 * adds val to the target, REDIS_AGGR_MIN / REDIS_AGGR_MAX replace the
 * target with val when val is respectively smaller / greater. Any other
 * value of 'aggregate' ends up in redisPanic(). */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
5656
5657static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5658 int i, j, zsetnum;
5659 int aggregate = REDIS_AGGR_SUM;
5660 zsetopsrc *src;
5661 robj *dstobj;
5662 zset *dstzset;
5663 dictIterator *di;
5664 dictEntry *de;
5665
5666 /* expect zsetnum input keys to be given */
5667 zsetnum = atoi(c->argv[2]->ptr);
5668 if (zsetnum < 1) {
5669 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5670 return;
5671 }
5672
5673 /* test if the expected number of keys would overflow */
5674 if (3+zsetnum > c->argc) {
5675 addReply(c,shared.syntaxerr);
5676 return;
5677 }
5678
5679 /* read keys to be used for input */
5680 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5681 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5682 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5683 if (!zsetobj) {
5684 src[i].dict = NULL;
5685 } else {
5686 if (zsetobj->type != REDIS_ZSET) {
5687 zfree(src);
5688 addReply(c,shared.wrongtypeerr);
5689 return;
5690 }
5691 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5692 }
5693
5694 /* default all weights to 1 */
5695 src[i].weight = 1.0;
5696 }
5697
5698 /* parse optional extra arguments */
5699 if (j < c->argc) {
5700 int remaining = c->argc - j;
5701
5702 while (remaining) {
5703 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5704 j++; remaining--;
5705 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5706 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5707 return;
5708 }
5709 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5710 j++; remaining--;
5711 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5712 aggregate = REDIS_AGGR_SUM;
5713 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5714 aggregate = REDIS_AGGR_MIN;
5715 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5716 aggregate = REDIS_AGGR_MAX;
5717 } else {
5718 zfree(src);
5719 addReply(c,shared.syntaxerr);
5720 return;
5721 }
5722 j++; remaining--;
5723 } else {
5724 zfree(src);
5725 addReply(c,shared.syntaxerr);
5726 return;
5727 }
5728 }
5729 }
5730
5731 /* sort sets from the smallest to largest, this will improve our
5732 * algorithm's performance */
5733 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5734
5735 dstobj = createZsetObject();
5736 dstzset = dstobj->ptr;
5737
5738 if (op == REDIS_OP_INTER) {
5739 /* skip going over all entries if the smallest zset is NULL or empty */
5740 if (src[0].dict && dictSize(src[0].dict) > 0) {
5741 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5742 * from small to large, all src[i > 0].dict are non-empty too */
5743 di = dictGetIterator(src[0].dict);
5744 while((de = dictNext(di)) != NULL) {
5745 double *score = zmalloc(sizeof(double)), value;
5746 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5747
5748 for (j = 1; j < zsetnum; j++) {
5749 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5750 if (other) {
5751 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5752 zunionInterAggregate(score, value, aggregate);
5753 } else {
5754 break;
5755 }
5756 }
5757
5758 /* skip entry when not present in every source dict */
5759 if (j != zsetnum) {
5760 zfree(score);
5761 } else {
5762 robj *o = dictGetEntryKey(de);
5763 dictAdd(dstzset->dict,o,score);
5764 incrRefCount(o); /* added to dictionary */
5765 zslInsert(dstzset->zsl,*score,o);
5766 incrRefCount(o); /* added to skiplist */
5767 }
5768 }
5769 dictReleaseIterator(di);
5770 }
5771 } else if (op == REDIS_OP_UNION) {
5772 for (i = 0; i < zsetnum; i++) {
5773 if (!src[i].dict) continue;
5774
5775 di = dictGetIterator(src[i].dict);
5776 while((de = dictNext(di)) != NULL) {
5777 /* skip key when already processed */
5778 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5779
5780 double *score = zmalloc(sizeof(double)), value;
5781 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5782
5783 /* because the zsets are sorted by size, its only possible
5784 * for sets at larger indices to hold this entry */
5785 for (j = (i+1); j < zsetnum; j++) {
5786 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5787 if (other) {
5788 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5789 zunionInterAggregate(score, value, aggregate);
5790 }
5791 }
5792
5793 robj *o = dictGetEntryKey(de);
5794 dictAdd(dstzset->dict,o,score);
5795 incrRefCount(o); /* added to dictionary */
5796 zslInsert(dstzset->zsl,*score,o);
5797 incrRefCount(o); /* added to skiplist */
5798 }
5799 dictReleaseIterator(di);
5800 }
5801 } else {
5802 /* unknown operator */
5803 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5804 }
5805
5806 deleteKey(c->db,dstkey);
5807 if (dstzset->zsl->length) {
5808 dictAdd(c->db->dict,dstkey,dstobj);
5809 incrRefCount(dstkey);
5810 addReplyLong(c, dstzset->zsl->length);
5811 server.dirty++;
5812 } else {
5813 decrRefCount(dstobj);
5814 addReply(c, shared.czero);
5815 }
5816 zfree(src);
5817}
5818
5819static void zunionCommand(redisClient *c) {
5820 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5821}
5822
5823static void zinterCommand(redisClient *c) {
5824 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5825}
5826
5827static void zrangeGenericCommand(redisClient *c, int reverse) {
5828 robj *o;
5829 long start;
5830 long end;
5831 int withscores = 0;
5832 int llen;
5833 int rangelen, j;
5834 zset *zsetobj;
5835 zskiplist *zsl;
5836 zskiplistNode *ln;
5837 robj *ele;
5838
5839 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5840 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5841
5842 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5843 withscores = 1;
5844 } else if (c->argc >= 5) {
5845 addReply(c,shared.syntaxerr);
5846 return;
5847 }
5848
5849 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5850 || checkType(c,o,REDIS_ZSET)) return;
5851 zsetobj = o->ptr;
5852 zsl = zsetobj->zsl;
5853 llen = zsl->length;
5854
5855 /* convert negative indexes */
5856 if (start < 0) start = llen+start;
5857 if (end < 0) end = llen+end;
5858 if (start < 0) start = 0;
5859 if (end < 0) end = 0;
5860
5861 /* indexes sanity checks */
5862 if (start > end || start >= llen) {
5863 /* Out of range start or start > end result in empty list */
5864 addReply(c,shared.emptymultibulk);
5865 return;
5866 }
5867 if (end >= llen) end = llen-1;
5868 rangelen = (end-start)+1;
5869
5870 /* check if starting point is trivial, before searching
5871 * the element in log(N) time */
5872 if (reverse) {
5873 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5874 } else {
5875 ln = start == 0 ?
5876 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5877 }
5878
5879 /* Return the result in form of a multi-bulk reply */
5880 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5881 withscores ? (rangelen*2) : rangelen));
5882 for (j = 0; j < rangelen; j++) {
5883 ele = ln->obj;
5884 addReplyBulk(c,ele);
5885 if (withscores)
5886 addReplyDouble(c,ln->score);
5887 ln = reverse ? ln->backward : ln->forward[0];
5888 }
5889}
5890
5891static void zrangeCommand(redisClient *c) {
5892 zrangeGenericCommand(c,0);
5893}
5894
5895static void zrevrangeCommand(redisClient *c) {
5896 zrangeGenericCommand(c,1);
5897}
5898
5899/* This command implements both ZRANGEBYSCORE and ZCOUNT.
5900 * If justcount is non-zero, just the count is returned. */
5901static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5902 robj *o;
5903 double min, max;
5904 int minex = 0, maxex = 0; /* are min or max exclusive? */
5905 int offset = 0, limit = -1;
5906 int withscores = 0;
5907 int badsyntax = 0;
5908
5909 /* Parse the min-max interval. If one of the values is prefixed
5910 * by the "(" character, it's considered "open". For instance
5911 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5912 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5913 if (((char*)c->argv[2]->ptr)[0] == '(') {
5914 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5915 minex = 1;
5916 } else {
5917 min = strtod(c->argv[2]->ptr,NULL);
5918 }
5919 if (((char*)c->argv[3]->ptr)[0] == '(') {
5920 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5921 maxex = 1;
5922 } else {
5923 max = strtod(c->argv[3]->ptr,NULL);
5924 }
5925
5926 /* Parse "WITHSCORES": note that if the command was called with
5927 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5928 * enter the following paths to parse WITHSCORES and LIMIT. */
5929 if (c->argc == 5 || c->argc == 8) {
5930 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5931 withscores = 1;
5932 else
5933 badsyntax = 1;
5934 }
5935 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5936 badsyntax = 1;
5937 if (badsyntax) {
5938 addReplySds(c,
5939 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5940 return;
5941 }
5942
5943 /* Parse "LIMIT" */
5944 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5945 addReply(c,shared.syntaxerr);
5946 return;
5947 } else if (c->argc == (7 + withscores)) {
5948 offset = atoi(c->argv[5]->ptr);
5949 limit = atoi(c->argv[6]->ptr);
5950 if (offset < 0) offset = 0;
5951 }
5952
5953 /* Ok, lookup the key and get the range */
5954 o = lookupKeyRead(c->db,c->argv[1]);
5955 if (o == NULL) {
5956 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5957 } else {
5958 if (o->type != REDIS_ZSET) {
5959 addReply(c,shared.wrongtypeerr);
5960 } else {
5961 zset *zsetobj = o->ptr;
5962 zskiplist *zsl = zsetobj->zsl;
5963 zskiplistNode *ln;
5964 robj *ele, *lenobj = NULL;
5965 unsigned long rangelen = 0;
5966
5967 /* Get the first node with the score >= min, or with
5968 * score > min if 'minex' is true. */
5969 ln = zslFirstWithScore(zsl,min);
5970 while (minex && ln && ln->score == min) ln = ln->forward[0];
5971
5972 if (ln == NULL) {
5973 /* No element matching the speciifed interval */
5974 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5975 return;
5976 }
5977
5978 /* We don't know in advance how many matching elements there
5979 * are in the list, so we push this object that will represent
5980 * the multi-bulk length in the output buffer, and will "fix"
5981 * it later */
5982 if (!justcount) {
5983 lenobj = createObject(REDIS_STRING,NULL);
5984 addReply(c,lenobj);
5985 decrRefCount(lenobj);
5986 }
5987
5988 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5989 if (offset) {
5990 offset--;
5991 ln = ln->forward[0];
5992 continue;
5993 }
5994 if (limit == 0) break;
5995 if (!justcount) {
5996 ele = ln->obj;
5997 addReplyBulk(c,ele);
5998 if (withscores)
5999 addReplyDouble(c,ln->score);
6000 }
6001 ln = ln->forward[0];
6002 rangelen++;
6003 if (limit > 0) limit--;
6004 }
6005 if (justcount) {
6006 addReplyLong(c,(long)rangelen);
6007 } else {
6008 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6009 withscores ? (rangelen*2) : rangelen);
6010 }
6011 }
6012 }
6013}
6014
6015static void zrangebyscoreCommand(redisClient *c) {
6016 genericZrangebyscoreCommand(c,0);
6017}
6018
6019static void zcountCommand(redisClient *c) {
6020 genericZrangebyscoreCommand(c,1);
6021}
6022
6023static void zcardCommand(redisClient *c) {
6024 robj *o;
6025 zset *zs;
6026
6027 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6028 checkType(c,o,REDIS_ZSET)) return;
6029
6030 zs = o->ptr;
6031 addReplyUlong(c,zs->zsl->length);
6032}
6033
6034static void zscoreCommand(redisClient *c) {
6035 robj *o;
6036 zset *zs;
6037 dictEntry *de;
6038
6039 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6040 checkType(c,o,REDIS_ZSET)) return;
6041
6042 zs = o->ptr;
6043 de = dictFind(zs->dict,c->argv[2]);
6044 if (!de) {
6045 addReply(c,shared.nullbulk);
6046 } else {
6047 double *score = dictGetEntryVal(de);
6048
6049 addReplyDouble(c,*score);
6050 }
6051}
6052
6053static void zrankGenericCommand(redisClient *c, int reverse) {
6054 robj *o;
6055 zset *zs;
6056 zskiplist *zsl;
6057 dictEntry *de;
6058 unsigned long rank;
6059 double *score;
6060
6061 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6062 checkType(c,o,REDIS_ZSET)) return;
6063
6064 zs = o->ptr;
6065 zsl = zs->zsl;
6066 de = dictFind(zs->dict,c->argv[2]);
6067 if (!de) {
6068 addReply(c,shared.nullbulk);
6069 return;
6070 }
6071
6072 score = dictGetEntryVal(de);
6073 rank = zslGetRank(zsl, *score, c->argv[2]);
6074 if (rank) {
6075 if (reverse) {
6076 addReplyLong(c, zsl->length - rank);
6077 } else {
6078 addReplyLong(c, rank-1);
6079 }
6080 } else {
6081 addReply(c,shared.nullbulk);
6082 }
6083}
6084
6085static void zrankCommand(redisClient *c) {
6086 zrankGenericCommand(c, 0);
6087}
6088
6089static void zrevrankCommand(redisClient *c) {
6090 zrankGenericCommand(c, 1);
6091}
6092
6093/* ========================= Hashes utility functions ======================= */
6094#define REDIS_HASH_KEY 1
6095#define REDIS_HASH_VALUE 2
6096
6097/* Check the length of a number of objects to see if we need to convert a
6098 * zipmap to a real hash. Note that we only check string encoded objects
6099 * as their string length can be queried in constant time. */
6100static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6101 int i;
6102 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6103
6104 for (i = start; i <= end; i++) {
6105 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6106 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6107 {
6108 convertToRealHash(subject);
6109 return;
6110 }
6111 }
6112}
6113
6114/* Encode given objects in-place when the hash uses a dict. */
6115static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6116 if (subject->encoding == REDIS_ENCODING_HT) {
6117 if (o1) *o1 = tryObjectEncoding(*o1);
6118 if (o2) *o2 = tryObjectEncoding(*o2);
6119 }
6120}
6121
6122/* Get the value from a hash identified by key. Returns either a string
6123 * object or NULL if the value cannot be found. The refcount of the object
6124 * is always increased by 1 when the value was found. */
6125static robj *hashGet(robj *o, robj *key) {
6126 robj *value = NULL;
6127 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6128 unsigned char *v;
6129 unsigned int vlen;
6130 key = getDecodedObject(key);
6131 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6132 value = createStringObject((char*)v,vlen);
6133 }
6134 decrRefCount(key);
6135 } else {
6136 dictEntry *de = dictFind(o->ptr,key);
6137 if (de != NULL) {
6138 value = dictGetEntryVal(de);
6139 incrRefCount(value);
6140 }
6141 }
6142 return value;
6143}
6144
6145/* Test if the key exists in the given hash. Returns 1 if the key
6146 * exists and 0 when it doesn't. */
6147static int hashExists(robj *o, robj *key) {
6148 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6149 key = getDecodedObject(key);
6150 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6151 decrRefCount(key);
6152 return 1;
6153 }
6154 decrRefCount(key);
6155 } else {
6156 if (dictFind(o->ptr,key) != NULL) {
6157 return 1;
6158 }
6159 }
6160 return 0;
6161}
6162
6163/* Add an element, discard the old if the key already exists.
6164 * Return 0 on insert and 1 on update. */
6165static int hashSet(robj *o, robj *key, robj *value) {
6166 int update = 0;
6167 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6168 key = getDecodedObject(key);
6169 value = getDecodedObject(value);
6170 o->ptr = zipmapSet(o->ptr,
6171 key->ptr,sdslen(key->ptr),
6172 value->ptr,sdslen(value->ptr), &update);
6173 decrRefCount(key);
6174 decrRefCount(value);
6175
6176 /* Check if the zipmap needs to be upgraded to a real hash table */
6177 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6178 convertToRealHash(o);
6179 } else {
6180 if (dictReplace(o->ptr,key,value)) {
6181 /* Insert */
6182 incrRefCount(key);
6183 } else {
6184 /* Update */
6185 update = 1;
6186 }
6187 incrRefCount(value);
6188 }
6189 return update;
6190}
6191
6192/* Delete an element from a hash.
6193 * Return 1 on deleted and 0 on not found. */
6194static int hashDelete(robj *o, robj *key) {
6195 int deleted = 0;
6196 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6197 key = getDecodedObject(key);
6198 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6199 decrRefCount(key);
6200 } else {
6201 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6202 /* Always check if the dictionary needs a resize after a delete. */
6203 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6204 }
6205 return deleted;
6206}
6207
6208/* Return the number of elements in a hash. */
6209static unsigned long hashLength(robj *o) {
6210 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6211 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6212}
6213
6214/* Structure to hold hash iteration abstration. Note that iteration over
6215 * hashes involves both fields and values. Because it is possible that
6216 * not both are required, store pointers in the iterator to avoid
6217 * unnecessary memory allocation for fields/values. */
6218typedef struct {
6219 int encoding;
6220 unsigned char *zi;
6221 unsigned char *zk, *zv;
6222 unsigned int zklen, zvlen;
6223
6224 dictIterator *di;
6225 dictEntry *de;
6226} hashIterator;
6227
6228static hashIterator *hashInitIterator(robj *subject) {
6229 hashIterator *hi = zmalloc(sizeof(hashIterator));
6230 hi->encoding = subject->encoding;
6231 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6232 hi->zi = zipmapRewind(subject->ptr);
6233 } else if (hi->encoding == REDIS_ENCODING_HT) {
6234 hi->di = dictGetIterator(subject->ptr);
6235 } else {
6236 redisAssert(NULL);
6237 }
6238 return hi;
6239}
6240
6241static void hashReleaseIterator(hashIterator *hi) {
6242 if (hi->encoding == REDIS_ENCODING_HT) {
6243 dictReleaseIterator(hi->di);
6244 }
6245 zfree(hi);
6246}
6247
6248/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6249 * could be found and REDIS_ERR when the iterator reaches the end. */
6250static int hashNext(hashIterator *hi) {
6251 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6252 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6253 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6254 } else {
6255 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6256 }
6257 return REDIS_OK;
6258}
6259
6260/* Get key or value object at current iteration position.
6261 * This increases the refcount of the field object by 1. */
6262static robj *hashCurrent(hashIterator *hi, int what) {
6263 robj *o;
6264 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6265 if (what & REDIS_HASH_KEY) {
6266 o = createStringObject((char*)hi->zk,hi->zklen);
6267 } else {
6268 o = createStringObject((char*)hi->zv,hi->zvlen);
6269 }
6270 } else {
6271 if (what & REDIS_HASH_KEY) {
6272 o = dictGetEntryKey(hi->de);
6273 } else {
6274 o = dictGetEntryVal(hi->de);
6275 }
6276 incrRefCount(o);
6277 }
6278 return o;
6279}
6280
6281static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6282 robj *o = lookupKeyWrite(c->db,key);
6283 if (o == NULL) {
6284 o = createHashObject();
6285 dictAdd(c->db->dict,key,o);
6286 incrRefCount(key);
6287 } else {
6288 if (o->type != REDIS_HASH) {
6289 addReply(c,shared.wrongtypeerr);
6290 return NULL;
6291 }
6292 }
6293 return o;
6294}
6295
6296/* ============================= Hash commands ============================== */
6297static void hsetCommand(redisClient *c) {
6298 int update;
6299 robj *o;
6300
6301 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6302 hashTryConversion(o,c->argv,2,3);
6303 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6304 update = hashSet(o,c->argv[2],c->argv[3]);
6305 addReply(c, update ? shared.czero : shared.cone);
6306 server.dirty++;
6307}
6308
6309static void hsetnxCommand(redisClient *c) {
6310 robj *o;
6311 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6312 hashTryConversion(o,c->argv,2,3);
6313
6314 if (hashExists(o, c->argv[2])) {
6315 addReply(c, shared.czero);
6316 } else {
6317 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6318 hashSet(o,c->argv[2],c->argv[3]);
6319 addReply(c, shared.cone);
6320 server.dirty++;
6321 }
6322}
6323
6324static void hmsetCommand(redisClient *c) {
6325 int i;
6326 robj *o;
6327
6328 if ((c->argc % 2) == 1) {
6329 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6330 return;
6331 }
6332
6333 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6334 hashTryConversion(o,c->argv,2,c->argc-1);
6335 for (i = 2; i < c->argc; i += 2) {
6336 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6337 hashSet(o,c->argv[i],c->argv[i+1]);
6338 }
6339 addReply(c, shared.ok);
6340 server.dirty++;
6341}
6342
6343static void hincrbyCommand(redisClient *c) {
6344 long long value, incr;
6345 robj *o, *current, *new;
6346
6347 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6348 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6349 if ((current = hashGet(o,c->argv[2])) != NULL) {
6350 if (current->encoding == REDIS_ENCODING_RAW)
6351 value = strtoll(current->ptr,NULL,10);
6352 else if (current->encoding == REDIS_ENCODING_INT)
6353 value = (long)current->ptr;
6354 else
6355 redisAssert(1 != 1);
6356 decrRefCount(current);
6357 } else {
6358 value = 0;
6359 }
6360
6361 value += incr;
6362 new = createStringObjectFromLongLong(value);
6363 hashTryObjectEncoding(o,&c->argv[2],NULL);
6364 hashSet(o,c->argv[2],new);
6365 decrRefCount(new);
6366 addReplyLongLong(c,value);
6367 server.dirty++;
6368}
6369
6370static void hgetCommand(redisClient *c) {
6371 robj *o, *value;
6372 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6373 checkType(c,o,REDIS_HASH)) return;
6374
6375 if ((value = hashGet(o,c->argv[2])) != NULL) {
6376 addReplyBulk(c,value);
6377 decrRefCount(value);
6378 } else {
6379 addReply(c,shared.nullbulk);
6380 }
6381}
6382
6383static void hmgetCommand(redisClient *c) {
6384 int i;
6385 robj *o, *value;
6386 o = lookupKeyRead(c->db,c->argv[1]);
6387 if (o != NULL && o->type != REDIS_HASH) {
6388 addReply(c,shared.wrongtypeerr);
6389 }
6390
6391 /* Note the check for o != NULL happens inside the loop. This is
6392 * done because objects that cannot be found are considered to be
6393 * an empty hash. The reply should then be a series of NULLs. */
6394 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6395 for (i = 2; i < c->argc; i++) {
6396 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6397 addReplyBulk(c,value);
6398 decrRefCount(value);
6399 } else {
6400 addReply(c,shared.nullbulk);
6401 }
6402 }
6403}
6404
6405static void hdelCommand(redisClient *c) {
6406 robj *o;
6407 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6408 checkType(c,o,REDIS_HASH)) return;
6409
6410 if (hashDelete(o,c->argv[2])) {
6411 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6412 addReply(c,shared.cone);
6413 server.dirty++;
6414 } else {
6415 addReply(c,shared.czero);
6416 }
6417}
6418
6419static void hlenCommand(redisClient *c) {
6420 robj *o;
6421 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6422 checkType(c,o,REDIS_HASH)) return;
6423
6424 addReplyUlong(c,hashLength(o));
6425}
6426
6427static void genericHgetallCommand(redisClient *c, int flags) {
6428 robj *o, *lenobj, *obj;
6429 unsigned long count = 0;
6430 hashIterator *hi;
6431
6432 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6433 || checkType(c,o,REDIS_HASH)) return;
6434
6435 lenobj = createObject(REDIS_STRING,NULL);
6436 addReply(c,lenobj);
6437 decrRefCount(lenobj);
6438
6439 hi = hashInitIterator(o);
6440 while (hashNext(hi) != REDIS_ERR) {
6441 if (flags & REDIS_HASH_KEY) {
6442 obj = hashCurrent(hi,REDIS_HASH_KEY);
6443 addReplyBulk(c,obj);
6444 decrRefCount(obj);
6445 count++;
6446 }
6447 if (flags & REDIS_HASH_VALUE) {
6448 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6449 addReplyBulk(c,obj);
6450 decrRefCount(obj);
6451 count++;
6452 }
6453 }
6454 hashReleaseIterator(hi);
6455
6456 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6457}
6458
6459static void hkeysCommand(redisClient *c) {
6460 genericHgetallCommand(c,REDIS_HASH_KEY);
6461}
6462
6463static void hvalsCommand(redisClient *c) {
6464 genericHgetallCommand(c,REDIS_HASH_VALUE);
6465}
6466
6467static void hgetallCommand(redisClient *c) {
6468 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6469}
6470
6471static void hexistsCommand(redisClient *c) {
6472 robj *o;
6473 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6474 checkType(c,o,REDIS_HASH)) return;
6475
6476 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6477}
6478
6479static void convertToRealHash(robj *o) {
6480 unsigned char *key, *val, *p, *zm = o->ptr;
6481 unsigned int klen, vlen;
6482 dict *dict = dictCreate(&hashDictType,NULL);
6483
6484 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6485 p = zipmapRewind(zm);
6486 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6487 robj *keyobj, *valobj;
6488
6489 keyobj = createStringObject((char*)key,klen);
6490 valobj = createStringObject((char*)val,vlen);
6491 keyobj = tryObjectEncoding(keyobj);
6492 valobj = tryObjectEncoding(valobj);
6493 dictAdd(dict,keyobj,valobj);
6494 }
6495 o->encoding = REDIS_ENCODING_HT;
6496 o->ptr = dict;
6497 zfree(zm);
6498}
6499
6500/* ========================= Non type-specific commands ==================== */
6501
6502static void flushdbCommand(redisClient *c) {
6503 server.dirty += dictSize(c->db->dict);
6504 dictEmpty(c->db->dict);
6505 dictEmpty(c->db->expires);
6506 addReply(c,shared.ok);
6507}
6508
6509static void flushallCommand(redisClient *c) {
6510 server.dirty += emptyDb();
6511 addReply(c,shared.ok);
6512 if (server.bgsavechildpid != -1) {
6513 kill(server.bgsavechildpid,SIGKILL);
6514 rdbRemoveTempFile(server.bgsavechildpid);
6515 }
6516 rdbSave(server.dbfilename);
6517 server.dirty++;
6518}
6519
6520static redisSortOperation *createSortOperation(int type, robj *pattern) {
6521 redisSortOperation *so = zmalloc(sizeof(*so));
6522 so->type = type;
6523 so->pattern = pattern;
6524 return so;
6525}
6526
6527/* Return the value associated to the key with a name obtained
6528 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6529 * The returned object will always have its refcount increased by 1
6530 * when it is non-NULL. */
6531static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6532 char *p, *f;
6533 sds spat, ssub;
6534 robj keyobj, fieldobj, *o;
6535 int prefixlen, sublen, postfixlen, fieldlen;
6536 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6537 struct {
6538 long len;
6539 long free;
6540 char buf[REDIS_SORTKEY_MAX+1];
6541 } keyname, fieldname;
6542
6543 /* If the pattern is "#" return the substitution object itself in order
6544 * to implement the "SORT ... GET #" feature. */
6545 spat = pattern->ptr;
6546 if (spat[0] == '#' && spat[1] == '\0') {
6547 incrRefCount(subst);
6548 return subst;
6549 }
6550
6551 /* The substitution object may be specially encoded. If so we create
6552 * a decoded object on the fly. Otherwise getDecodedObject will just
6553 * increment the ref count, that we'll decrement later. */
6554 subst = getDecodedObject(subst);
6555
6556 ssub = subst->ptr;
6557 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6558 p = strchr(spat,'*');
6559 if (!p) {
6560 decrRefCount(subst);
6561 return NULL;
6562 }
6563
6564 /* Find out if we're dealing with a hash dereference. */
6565 if ((f = strstr(p+1, "->")) != NULL) {
6566 fieldlen = sdslen(spat)-(f-spat);
6567 /* this also copies \0 character */
6568 memcpy(fieldname.buf,f+2,fieldlen-1);
6569 fieldname.len = fieldlen-2;
6570 } else {
6571 fieldlen = 0;
6572 }
6573
6574 prefixlen = p-spat;
6575 sublen = sdslen(ssub);
6576 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6577 memcpy(keyname.buf,spat,prefixlen);
6578 memcpy(keyname.buf+prefixlen,ssub,sublen);
6579 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6580 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6581 keyname.len = prefixlen+sublen+postfixlen;
6582 decrRefCount(subst);
6583
6584 /* Lookup substituted key */
6585 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6586 o = lookupKeyRead(db,&keyobj);
6587 if (o == NULL) return NULL;
6588
6589 if (fieldlen > 0) {
6590 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6591
6592 /* Retrieve value from hash by the field name. This operation
6593 * already increases the refcount of the returned object. */
6594 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6595 o = hashGet(o, &fieldobj);
6596 } else {
6597 if (o->type != REDIS_STRING) return NULL;
6598
6599 /* Every object that this function returns needs to have its refcount
6600 * increased. sortCommand decreases it again. */
6601 incrRefCount(o);
6602 }
6603
6604 return o;
6605}
6606
6607/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6608 * the additional parameter is not standard but a BSD-specific we have to
6609 * pass sorting parameters via the global 'server' structure */
6610static int sortCompare(const void *s1, const void *s2) {
6611 const redisSortObject *so1 = s1, *so2 = s2;
6612 int cmp;
6613
6614 if (!server.sort_alpha) {
6615 /* Numeric sorting. Here it's trivial as we precomputed scores */
6616 if (so1->u.score > so2->u.score) {
6617 cmp = 1;
6618 } else if (so1->u.score < so2->u.score) {
6619 cmp = -1;
6620 } else {
6621 cmp = 0;
6622 }
6623 } else {
6624 /* Alphanumeric sorting */
6625 if (server.sort_bypattern) {
6626 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6627 /* At least one compare object is NULL */
6628 if (so1->u.cmpobj == so2->u.cmpobj)
6629 cmp = 0;
6630 else if (so1->u.cmpobj == NULL)
6631 cmp = -1;
6632 else
6633 cmp = 1;
6634 } else {
6635 /* We have both the objects, use strcoll */
6636 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6637 }
6638 } else {
6639 /* Compare elements directly. */
6640 cmp = compareStringObjects(so1->obj,so2->obj);
6641 }
6642 }
6643 return server.sort_desc ? -cmp : cmp;
6644}
6645
6646/* The SORT command is the most complex command in Redis. Warning: this code
6647 * is optimized for speed and a bit less for readability */
6648static void sortCommand(redisClient *c) {
6649 list *operations;
6650 int outputlen = 0;
6651 int desc = 0, alpha = 0;
6652 int limit_start = 0, limit_count = -1, start, end;
6653 int j, dontsort = 0, vectorlen;
6654 int getop = 0; /* GET operation counter */
6655 robj *sortval, *sortby = NULL, *storekey = NULL;
6656 redisSortObject *vector; /* Resulting vector to sort */
6657
6658 /* Lookup the key to sort. It must be of the right types */
6659 sortval = lookupKeyRead(c->db,c->argv[1]);
6660 if (sortval == NULL) {
6661 addReply(c,shared.emptymultibulk);
6662 return;
6663 }
6664 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6665 sortval->type != REDIS_ZSET)
6666 {
6667 addReply(c,shared.wrongtypeerr);
6668 return;
6669 }
6670
6671 /* Create a list of operations to perform for every sorted element.
6672 * Operations can be GET/DEL/INCR/DECR */
6673 operations = listCreate();
6674 listSetFreeMethod(operations,zfree);
6675 j = 2;
6676
6677 /* Now we need to protect sortval incrementing its count, in the future
6678 * SORT may have options able to overwrite/delete keys during the sorting
6679 * and the sorted key itself may get destroied */
6680 incrRefCount(sortval);
6681
6682 /* The SORT command has an SQL-alike syntax, parse it */
6683 while(j < c->argc) {
6684 int leftargs = c->argc-j-1;
6685 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6686 desc = 0;
6687 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6688 desc = 1;
6689 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6690 alpha = 1;
6691 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6692 limit_start = atoi(c->argv[j+1]->ptr);
6693 limit_count = atoi(c->argv[j+2]->ptr);
6694 j+=2;
6695 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6696 storekey = c->argv[j+1];
6697 j++;
6698 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6699 sortby = c->argv[j+1];
6700 /* If the BY pattern does not contain '*', i.e. it is constant,
6701 * we don't need to sort nor to lookup the weight keys. */
6702 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6703 j++;
6704 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6705 listAddNodeTail(operations,createSortOperation(
6706 REDIS_SORT_GET,c->argv[j+1]));
6707 getop++;
6708 j++;
6709 } else {
6710 decrRefCount(sortval);
6711 listRelease(operations);
6712 addReply(c,shared.syntaxerr);
6713 return;
6714 }
6715 j++;
6716 }
6717
6718 /* Load the sorting vector with all the objects to sort */
6719 switch(sortval->type) {
6720 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6721 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6722 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6723 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6724 }
6725 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6726 j = 0;
6727
6728 if (sortval->type == REDIS_LIST) {
6729 list *list = sortval->ptr;
6730 listNode *ln;
6731 listIter li;
6732
6733 listRewind(list,&li);
6734 while((ln = listNext(&li))) {
6735 robj *ele = ln->value;
6736 vector[j].obj = ele;
6737 vector[j].u.score = 0;
6738 vector[j].u.cmpobj = NULL;
6739 j++;
6740 }
6741 } else {
6742 dict *set;
6743 dictIterator *di;
6744 dictEntry *setele;
6745
6746 if (sortval->type == REDIS_SET) {
6747 set = sortval->ptr;
6748 } else {
6749 zset *zs = sortval->ptr;
6750 set = zs->dict;
6751 }
6752
6753 di = dictGetIterator(set);
6754 while((setele = dictNext(di)) != NULL) {
6755 vector[j].obj = dictGetEntryKey(setele);
6756 vector[j].u.score = 0;
6757 vector[j].u.cmpobj = NULL;
6758 j++;
6759 }
6760 dictReleaseIterator(di);
6761 }
6762 redisAssert(j == vectorlen);
6763
6764 /* Now it's time to load the right scores in the sorting vector */
6765 if (dontsort == 0) {
6766 for (j = 0; j < vectorlen; j++) {
6767 robj *byval;
6768 if (sortby) {
6769 /* lookup value to sort by */
6770 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6771 if (!byval) continue;
6772 } else {
6773 /* use object itself to sort by */
6774 byval = vector[j].obj;
6775 }
6776
6777 if (alpha) {
6778 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6779 } else {
6780 if (byval->encoding == REDIS_ENCODING_RAW) {
6781 vector[j].u.score = strtod(byval->ptr,NULL);
6782 } else if (byval->encoding == REDIS_ENCODING_INT) {
6783 /* Don't need to decode the object if it's
6784 * integer-encoded (the only encoding supported) so
6785 * far. We can just cast it */
6786 vector[j].u.score = (long)byval->ptr;
6787 } else {
6788 redisAssert(1 != 1);
6789 }
6790 }
6791
6792 /* when the object was retrieved using lookupKeyByPattern,
6793 * its refcount needs to be decreased. */
6794 if (sortby) {
6795 decrRefCount(byval);
6796 }
6797 }
6798 }
6799
6800 /* We are ready to sort the vector... perform a bit of sanity check
6801 * on the LIMIT option too. We'll use a partial version of quicksort. */
6802 start = (limit_start < 0) ? 0 : limit_start;
6803 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6804 if (start >= vectorlen) {
6805 start = vectorlen-1;
6806 end = vectorlen-2;
6807 }
6808 if (end >= vectorlen) end = vectorlen-1;
6809
6810 if (dontsort == 0) {
6811 server.sort_desc = desc;
6812 server.sort_alpha = alpha;
6813 server.sort_bypattern = sortby ? 1 : 0;
6814 if (sortby && (start != 0 || end != vectorlen-1))
6815 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6816 else
6817 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6818 }
6819
6820 /* Send command output to the output buffer, performing the specified
6821 * GET/DEL/INCR/DECR operations if any. */
6822 outputlen = getop ? getop*(end-start+1) : end-start+1;
6823 if (storekey == NULL) {
6824 /* STORE option not specified, sent the sorting result to client */
6825 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6826 for (j = start; j <= end; j++) {
6827 listNode *ln;
6828 listIter li;
6829
6830 if (!getop) addReplyBulk(c,vector[j].obj);
6831 listRewind(operations,&li);
6832 while((ln = listNext(&li))) {
6833 redisSortOperation *sop = ln->value;
6834 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6835 vector[j].obj);
6836
6837 if (sop->type == REDIS_SORT_GET) {
6838 if (!val) {
6839 addReply(c,shared.nullbulk);
6840 } else {
6841 addReplyBulk(c,val);
6842 decrRefCount(val);
6843 }
6844 } else {
6845 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6846 }
6847 }
6848 }
6849 } else {
6850 robj *listObject = createListObject();
6851 list *listPtr = (list*) listObject->ptr;
6852
6853 /* STORE option specified, set the sorting result as a List object */
6854 for (j = start; j <= end; j++) {
6855 listNode *ln;
6856 listIter li;
6857
6858 if (!getop) {
6859 listAddNodeTail(listPtr,vector[j].obj);
6860 incrRefCount(vector[j].obj);
6861 }
6862 listRewind(operations,&li);
6863 while((ln = listNext(&li))) {
6864 redisSortOperation *sop = ln->value;
6865 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6866 vector[j].obj);
6867
6868 if (sop->type == REDIS_SORT_GET) {
6869 if (!val) {
6870 listAddNodeTail(listPtr,createStringObject("",0));
6871 } else {
6872 /* We should do a incrRefCount on val because it is
6873 * added to the list, but also a decrRefCount because
6874 * it is returned by lookupKeyByPattern. This results
6875 * in doing nothing at all. */
6876 listAddNodeTail(listPtr,val);
6877 }
6878 } else {
6879 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6880 }
6881 }
6882 }
6883 if (dictReplace(c->db->dict,storekey,listObject)) {
6884 incrRefCount(storekey);
6885 }
6886 /* Note: we add 1 because the DB is dirty anyway since even if the
6887 * SORT result is empty a new key is set and maybe the old content
6888 * replaced. */
6889 server.dirty += 1+outputlen;
6890 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6891 }
6892
6893 /* Cleanup */
6894 decrRefCount(sortval);
6895 listRelease(operations);
6896 for (j = 0; j < vectorlen; j++) {
6897 if (alpha && vector[j].u.cmpobj)
6898 decrRefCount(vector[j].u.cmpobj);
6899 }
6900 zfree(vector);
6901}
6902
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * output is a handful of characters, e.g. "1024.00K"). Values below 1024
 * are printed as an integer number of bytes; everything else is printed
 * with two decimals using the largest binary unit (K, M, G, T, P, E) that
 * keeps the integral part below 1024. Every possible value of 'n' produces
 * output, so 's' is always NUL terminated on return. */
static void bytesToHuman(char *s, unsigned long long n) {
    static const char units[] = "KMGTPE";
    unsigned long long unit = 1024ULL;
    int i = 0;

    if (n < 1024ULL) {
        /* Bytes */
        sprintf(s,"%lluB",n);
        return;
    }

    /* Climb to the next unit while the value would still be >= 1024 in the
     * current one. Comparing n/unit instead of multiplying keeps the
     * arithmetic free of overflow, and the units[] bound stops at E, which
     * is enough for any 64 bit value. */
    while (units[i+1] != '\0' && n/unit >= 1024ULL) {
        unit *= 1024ULL;
        i++;
    }
    sprintf(s,"%.2f%c",(double)n/(double)unit,units[i]);
}
6923
6924/* Create the string returned by the INFO command. This is decoupled
6925 * by the INFO command itself as we need to report the same information
6926 * on memory corruption problems. */
6927static sds genRedisInfoString(void) {
6928 sds info;
6929 time_t uptime = time(NULL)-server.stat_starttime;
6930 int j;
6931 char hmem[64];
6932
6933 bytesToHuman(hmem,zmalloc_used_memory());
6934 info = sdscatprintf(sdsempty(),
6935 "redis_version:%s\r\n"
6936 "arch_bits:%s\r\n"
6937 "multiplexing_api:%s\r\n"
6938 "process_id:%ld\r\n"
6939 "uptime_in_seconds:%ld\r\n"
6940 "uptime_in_days:%ld\r\n"
6941 "connected_clients:%d\r\n"
6942 "connected_slaves:%d\r\n"
6943 "blocked_clients:%d\r\n"
6944 "used_memory:%zu\r\n"
6945 "used_memory_human:%s\r\n"
6946 "changes_since_last_save:%lld\r\n"
6947 "bgsave_in_progress:%d\r\n"
6948 "last_save_time:%ld\r\n"
6949 "bgrewriteaof_in_progress:%d\r\n"
6950 "total_connections_received:%lld\r\n"
6951 "total_commands_processed:%lld\r\n"
6952 "expired_keys:%lld\r\n"
6953 "hash_max_zipmap_entries:%ld\r\n"
6954 "hash_max_zipmap_value:%ld\r\n"
6955 "pubsub_channels:%ld\r\n"
6956 "pubsub_patterns:%u\r\n"
6957 "vm_enabled:%d\r\n"
6958 "role:%s\r\n"
6959 ,REDIS_VERSION,
6960 (sizeof(long) == 8) ? "64" : "32",
6961 aeGetApiName(),
6962 (long) getpid(),
6963 uptime,
6964 uptime/(3600*24),
6965 listLength(server.clients)-listLength(server.slaves),
6966 listLength(server.slaves),
6967 server.blpop_blocked_clients,
6968 zmalloc_used_memory(),
6969 hmem,
6970 server.dirty,
6971 server.bgsavechildpid != -1,
6972 server.lastsave,
6973 server.bgrewritechildpid != -1,
6974 server.stat_numconnections,
6975 server.stat_numcommands,
6976 server.stat_expiredkeys,
6977 server.hash_max_zipmap_entries,
6978 server.hash_max_zipmap_value,
6979 dictSize(server.pubsub_channels),
6980 listLength(server.pubsub_patterns),
6981 server.vm_enabled != 0,
6982 server.masterhost == NULL ? "master" : "slave"
6983 );
6984 if (server.masterhost) {
6985 info = sdscatprintf(info,
6986 "master_host:%s\r\n"
6987 "master_port:%d\r\n"
6988 "master_link_status:%s\r\n"
6989 "master_last_io_seconds_ago:%d\r\n"
6990 ,server.masterhost,
6991 server.masterport,
6992 (server.replstate == REDIS_REPL_CONNECTED) ?
6993 "up" : "down",
6994 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6995 );
6996 }
6997 if (server.vm_enabled) {
6998 lockThreadedIO();
6999 info = sdscatprintf(info,
7000 "vm_conf_max_memory:%llu\r\n"
7001 "vm_conf_page_size:%llu\r\n"
7002 "vm_conf_pages:%llu\r\n"
7003 "vm_stats_used_pages:%llu\r\n"
7004 "vm_stats_swapped_objects:%llu\r\n"
7005 "vm_stats_swappin_count:%llu\r\n"
7006 "vm_stats_swappout_count:%llu\r\n"
7007 "vm_stats_io_newjobs_len:%lu\r\n"
7008 "vm_stats_io_processing_len:%lu\r\n"
7009 "vm_stats_io_processed_len:%lu\r\n"
7010 "vm_stats_io_active_threads:%lu\r\n"
7011 "vm_stats_blocked_clients:%lu\r\n"
7012 ,(unsigned long long) server.vm_max_memory,
7013 (unsigned long long) server.vm_page_size,
7014 (unsigned long long) server.vm_pages,
7015 (unsigned long long) server.vm_stats_used_pages,
7016 (unsigned long long) server.vm_stats_swapped_objects,
7017 (unsigned long long) server.vm_stats_swapins,
7018 (unsigned long long) server.vm_stats_swapouts,
7019 (unsigned long) listLength(server.io_newjobs),
7020 (unsigned long) listLength(server.io_processing),
7021 (unsigned long) listLength(server.io_processed),
7022 (unsigned long) server.io_active_threads,
7023 (unsigned long) server.vm_blocked_clients
7024 );
7025 unlockThreadedIO();
7026 }
7027 for (j = 0; j < server.dbnum; j++) {
7028 long long keys, vkeys;
7029
7030 keys = dictSize(server.db[j].dict);
7031 vkeys = dictSize(server.db[j].expires);
7032 if (keys || vkeys) {
7033 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7034 j, keys, vkeys);
7035 }
7036 }
7037 return info;
7038}
7039
7040static void infoCommand(redisClient *c) {
7041 sds info = genRedisInfoString();
7042 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7043 (unsigned long)sdslen(info)));
7044 addReplySds(c,info);
7045 addReply(c,shared.crlf);
7046}
7047
7048static void monitorCommand(redisClient *c) {
7049 /* ignore MONITOR if aleady slave or in monitor mode */
7050 if (c->flags & REDIS_SLAVE) return;
7051
7052 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7053 c->slaveseldb = 0;
7054 listAddNodeTail(server.monitors,c);
7055 addReply(c,shared.ok);
7056}
7057
7058/* ================================= Expire ================================= */
7059static int removeExpire(redisDb *db, robj *key) {
7060 if (dictDelete(db->expires,key) == DICT_OK) {
7061 return 1;
7062 } else {
7063 return 0;
7064 }
7065}
7066
7067static int setExpire(redisDb *db, robj *key, time_t when) {
7068 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7069 return 0;
7070 } else {
7071 incrRefCount(key);
7072 return 1;
7073 }
7074}
7075
7076/* Return the expire time of the specified key, or -1 if no expire
7077 * is associated with this key (i.e. the key is non volatile) */
7078static time_t getExpire(redisDb *db, robj *key) {
7079 dictEntry *de;
7080
7081 /* No expire? return ASAP */
7082 if (dictSize(db->expires) == 0 ||
7083 (de = dictFind(db->expires,key)) == NULL) return -1;
7084
7085 return (time_t) dictGetEntryVal(de);
7086}
7087
7088static int expireIfNeeded(redisDb *db, robj *key) {
7089 time_t when;
7090 dictEntry *de;
7091
7092 /* No expire? return ASAP */
7093 if (dictSize(db->expires) == 0 ||
7094 (de = dictFind(db->expires,key)) == NULL) return 0;
7095
7096 /* Lookup the expire */
7097 when = (time_t) dictGetEntryVal(de);
7098 if (time(NULL) <= when) return 0;
7099
7100 /* Delete the key */
7101 dictDelete(db->expires,key);
7102 server.stat_expiredkeys++;
7103 return dictDelete(db->dict,key) == DICT_OK;
7104}
7105
7106static int deleteIfVolatile(redisDb *db, robj *key) {
7107 dictEntry *de;
7108
7109 /* No expire? return ASAP */
7110 if (dictSize(db->expires) == 0 ||
7111 (de = dictFind(db->expires,key)) == NULL) return 0;
7112
7113 /* Delete the key */
7114 server.dirty++;
7115 server.stat_expiredkeys++;
7116 dictDelete(db->expires,key);
7117 return dictDelete(db->dict,key) == DICT_OK;
7118}
7119
7120static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7121 dictEntry *de;
7122 time_t seconds;
7123
7124 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7125
7126 seconds -= offset;
7127
7128 de = dictFind(c->db->dict,key);
7129 if (de == NULL) {
7130 addReply(c,shared.czero);
7131 return;
7132 }
7133 if (seconds <= 0) {
7134 if (deleteKey(c->db,key)) server.dirty++;
7135 addReply(c, shared.cone);
7136 return;
7137 } else {
7138 time_t when = time(NULL)+seconds;
7139 if (setExpire(c->db,key,when)) {
7140 addReply(c,shared.cone);
7141 server.dirty++;
7142 } else {
7143 addReply(c,shared.czero);
7144 }
7145 return;
7146 }
7147}
7148
7149static void expireCommand(redisClient *c) {
7150 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7151}
7152
7153static void expireatCommand(redisClient *c) {
7154 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7155}
7156
7157static void ttlCommand(redisClient *c) {
7158 time_t expire;
7159 int ttl = -1;
7160
7161 expire = getExpire(c->db,c->argv[1]);
7162 if (expire != -1) {
7163 ttl = (int) (expire-time(NULL));
7164 if (ttl < 0) ttl = -1;
7165 }
7166 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7167}
7168
7169/* ================================ MULTI/EXEC ============================== */
7170
7171/* Client state initialization for MULTI/EXEC */
7172static void initClientMultiState(redisClient *c) {
7173 c->mstate.commands = NULL;
7174 c->mstate.count = 0;
7175}
7176
7177/* Release all the resources associated with MULTI/EXEC state */
7178static void freeClientMultiState(redisClient *c) {
7179 int j;
7180
7181 for (j = 0; j < c->mstate.count; j++) {
7182 int i;
7183 multiCmd *mc = c->mstate.commands+j;
7184
7185 for (i = 0; i < mc->argc; i++)
7186 decrRefCount(mc->argv[i]);
7187 zfree(mc->argv);
7188 }
7189 zfree(c->mstate.commands);
7190}
7191
7192/* Add a new command into the MULTI commands queue */
7193static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7194 multiCmd *mc;
7195 int j;
7196
7197 c->mstate.commands = zrealloc(c->mstate.commands,
7198 sizeof(multiCmd)*(c->mstate.count+1));
7199 mc = c->mstate.commands+c->mstate.count;
7200 mc->cmd = cmd;
7201 mc->argc = c->argc;
7202 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7203 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7204 for (j = 0; j < c->argc; j++)
7205 incrRefCount(mc->argv[j]);
7206 c->mstate.count++;
7207}
7208
7209static void multiCommand(redisClient *c) {
7210 c->flags |= REDIS_MULTI;
7211 addReply(c,shared.ok);
7212}
7213
7214static void discardCommand(redisClient *c) {
7215 if (!(c->flags & REDIS_MULTI)) {
7216 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7217 return;
7218 }
7219
7220 freeClientMultiState(c);
7221 initClientMultiState(c);
7222 c->flags &= (~REDIS_MULTI);
7223 addReply(c,shared.ok);
7224}
7225
7226/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7227 * implememntation for more information. */
7228static void execCommandReplicateMulti(redisClient *c) {
7229 struct redisCommand *cmd;
7230 robj *multistring = createStringObject("MULTI",5);
7231
7232 cmd = lookupCommand("multi");
7233 if (server.appendonly)
7234 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7235 if (listLength(server.slaves))
7236 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7237 decrRefCount(multistring);
7238}
7239
7240static void execCommand(redisClient *c) {
7241 int j;
7242 robj **orig_argv;
7243 int orig_argc;
7244
7245 if (!(c->flags & REDIS_MULTI)) {
7246 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7247 return;
7248 }
7249
7250 /* Replicate a MULTI request now that we are sure the block is executed.
7251 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7252 * both the AOF and the replication link will have the same consistency
7253 * and atomicity guarantees. */
7254 execCommandReplicateMulti(c);
7255
7256 /* Exec all the queued commands */
7257 orig_argv = c->argv;
7258 orig_argc = c->argc;
7259 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7260 for (j = 0; j < c->mstate.count; j++) {
7261 c->argc = c->mstate.commands[j].argc;
7262 c->argv = c->mstate.commands[j].argv;
7263 call(c,c->mstate.commands[j].cmd);
7264 }
7265 c->argv = orig_argv;
7266 c->argc = orig_argc;
7267 freeClientMultiState(c);
7268 initClientMultiState(c);
7269 c->flags &= (~REDIS_MULTI);
7270 /* Make sure the EXEC command is always replicated / AOF, since we
7271 * always send the MULTI command (we can't know beforehand if the
7272 * next operations will contain at least a modification to the DB). */
7273 server.dirty++;
7274}
7275
7276/* =========================== Blocking Operations ========================= */
7277
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7306
7307/* Set a client in blocking mode for the specified key, with the specified
7308 * timeout */
7309static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7310 dictEntry *de;
7311 list *l;
7312 int j;
7313
7314 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7315 c->blockingkeysnum = numkeys;
7316 c->blockingto = timeout;
7317 for (j = 0; j < numkeys; j++) {
7318 /* Add the key in the client structure, to map clients -> keys */
7319 c->blockingkeys[j] = keys[j];
7320 incrRefCount(keys[j]);
7321
7322 /* And in the other "side", to map keys -> clients */
7323 de = dictFind(c->db->blockingkeys,keys[j]);
7324 if (de == NULL) {
7325 int retval;
7326
7327 /* For every key we take a list of clients blocked for it */
7328 l = listCreate();
7329 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7330 incrRefCount(keys[j]);
7331 assert(retval == DICT_OK);
7332 } else {
7333 l = dictGetEntryVal(de);
7334 }
7335 listAddNodeTail(l,c);
7336 }
7337 /* Mark the client as a blocked client */
7338 c->flags |= REDIS_BLOCKED;
7339 server.blpop_blocked_clients++;
7340}
7341
7342/* Unblock a client that's waiting in a blocking operation such as BLPOP */
7343static void unblockClientWaitingData(redisClient *c) {
7344 dictEntry *de;
7345 list *l;
7346 int j;
7347
7348 assert(c->blockingkeys != NULL);
7349 /* The client may wait for multiple keys, so unblock it for every key. */
7350 for (j = 0; j < c->blockingkeysnum; j++) {
7351 /* Remove this client from the list of clients waiting for this key. */
7352 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7353 assert(de != NULL);
7354 l = dictGetEntryVal(de);
7355 listDelNode(l,listSearchKey(l,c));
7356 /* If the list is empty we need to remove it to avoid wasting memory */
7357 if (listLength(l) == 0)
7358 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7359 decrRefCount(c->blockingkeys[j]);
7360 }
7361 /* Cleanup the client structure */
7362 zfree(c->blockingkeys);
7363 c->blockingkeys = NULL;
7364 c->flags &= (~REDIS_BLOCKED);
7365 server.blpop_blocked_clients--;
7366 /* We want to process data if there is some command waiting
7367 * in the input buffer. Note that this is safe even if
7368 * unblockClientWaitingData() gets called from freeClient() because
7369 * freeClient() will be smart enough to call this function
7370 * *after* c->querybuf was set to NULL. */
7371 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7372}
7373
7374/* This should be called from any function PUSHing into lists.
7375 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7376 * 'ele' is the element pushed.
7377 *
7378 * If the function returns 0 there was no client waiting for a list push
7379 * against this key.
7380 *
7381 * If the function returns 1 there was a client waiting for a list push
7382 * against this key, the element was passed to this client thus it's not
7383 * needed to actually add it to the list and the caller should return asap. */
7384static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7385 struct dictEntry *de;
7386 redisClient *receiver;
7387 list *l;
7388 listNode *ln;
7389
7390 de = dictFind(c->db->blockingkeys,key);
7391 if (de == NULL) return 0;
7392 l = dictGetEntryVal(de);
7393 ln = listFirst(l);
7394 assert(ln != NULL);
7395 receiver = ln->value;
7396
7397 addReplySds(receiver,sdsnew("*2\r\n"));
7398 addReplyBulk(receiver,key);
7399 addReplyBulk(receiver,ele);
7400 unblockClientWaitingData(receiver);
7401 return 1;
7402}
7403
7404/* Blocking RPOP/LPOP */
7405static void blockingPopGenericCommand(redisClient *c, int where) {
7406 robj *o;
7407 time_t timeout;
7408 int j;
7409
7410 for (j = 1; j < c->argc-1; j++) {
7411 o = lookupKeyWrite(c->db,c->argv[j]);
7412 if (o != NULL) {
7413 if (o->type != REDIS_LIST) {
7414 addReply(c,shared.wrongtypeerr);
7415 return;
7416 } else {
7417 list *list = o->ptr;
7418 if (listLength(list) != 0) {
7419 /* If the list contains elements fall back to the usual
7420 * non-blocking POP operation */
7421 robj *argv[2], **orig_argv;
7422 int orig_argc;
7423
7424 /* We need to alter the command arguments before to call
7425 * popGenericCommand() as the command takes a single key. */
7426 orig_argv = c->argv;
7427 orig_argc = c->argc;
7428 argv[1] = c->argv[j];
7429 c->argv = argv;
7430 c->argc = 2;
7431
7432 /* Also the return value is different, we need to output
7433 * the multi bulk reply header and the key name. The
7434 * "real" command will add the last element (the value)
7435 * for us. If this souds like an hack to you it's just
7436 * because it is... */
7437 addReplySds(c,sdsnew("*2\r\n"));
7438 addReplyBulk(c,argv[1]);
7439 popGenericCommand(c,where);
7440
7441 /* Fix the client structure with the original stuff */
7442 c->argv = orig_argv;
7443 c->argc = orig_argc;
7444 return;
7445 }
7446 }
7447 }
7448 }
7449 /* If the list is empty or the key does not exists we must block */
7450 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7451 if (timeout > 0) timeout += time(NULL);
7452 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7453}
7454
7455static void blpopCommand(redisClient *c) {
7456 blockingPopGenericCommand(c,REDIS_HEAD);
7457}
7458
7459static void brpopCommand(redisClient *c) {
7460 blockingPopGenericCommand(c,REDIS_TAIL);
7461}
7462
7463/* =============================== Replication ============================= */
7464
7465static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7466 ssize_t nwritten, ret = size;
7467 time_t start = time(NULL);
7468
7469 timeout++;
7470 while(size) {
7471 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7472 nwritten = write(fd,ptr,size);
7473 if (nwritten == -1) return -1;
7474 ptr += nwritten;
7475 size -= nwritten;
7476 }
7477 if ((time(NULL)-start) > timeout) {
7478 errno = ETIMEDOUT;
7479 return -1;
7480 }
7481 }
7482 return ret;
7483}
7484
7485static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7486 ssize_t nread, totread = 0;
7487 time_t start = time(NULL);
7488
7489 timeout++;
7490 while(size) {
7491 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7492 nread = read(fd,ptr,size);
7493 if (nread == -1) return -1;
7494 ptr += nread;
7495 size -= nread;
7496 totread += nread;
7497 }
7498 if ((time(NULL)-start) > timeout) {
7499 errno = ETIMEDOUT;
7500 return -1;
7501 }
7502 }
7503 return totread;
7504}
7505
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time using syncRead() with the given per-byte timeout.
 *
 * Reading stops at the first '\n', which is not stored. The buffer is kept
 * NUL terminated, and a '\r' just before the newline is replaced with a
 * NUL terminator (it is still counted in the return value). At most size-1
 * characters are stored: if the buffer fills up before a newline is seen
 * the function returns what was read so far.
 *
 * Returns the number of characters stored, or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* Reserve room for the NUL terminator */
    size--;
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* One less byte available: without this the loop would never
             * stop on a long line and would write past the buffer. */
            size--;
        }
    }
    return nread;
}
7526
7527static void syncCommand(redisClient *c) {
7528 /* ignore SYNC if aleady slave or in monitor mode */
7529 if (c->flags & REDIS_SLAVE) return;
7530
7531 /* SYNC can't be issued when the server has pending data to send to
7532 * the client about already issued commands. We need a fresh reply
7533 * buffer registering the differences between the BGSAVE and the current
7534 * dataset, so that we can copy to other slaves if needed. */
7535 if (listLength(c->reply) != 0) {
7536 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7537 return;
7538 }
7539
7540 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7541 /* Here we need to check if there is a background saving operation
7542 * in progress, or if it is required to start one */
7543 if (server.bgsavechildpid != -1) {
7544 /* Ok a background save is in progress. Let's check if it is a good
7545 * one for replication, i.e. if there is another slave that is
7546 * registering differences since the server forked to save */
7547 redisClient *slave;
7548 listNode *ln;
7549 listIter li;
7550
7551 listRewind(server.slaves,&li);
7552 while((ln = listNext(&li))) {
7553 slave = ln->value;
7554 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7555 }
7556 if (ln) {
7557 /* Perfect, the server is already registering differences for
7558 * another slave. Set the right state, and copy the buffer. */
7559 listRelease(c->reply);
7560 c->reply = listDup(slave->reply);
7561 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7562 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7563 } else {
7564 /* No way, we need to wait for the next BGSAVE in order to
7565 * register differences */
7566 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7567 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7568 }
7569 } else {
7570 /* Ok we don't have a BGSAVE in progress, let's start one */
7571 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7572 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7573 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7574 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7575 return;
7576 }
7577 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7578 }
7579 c->repldbfd = -1;
7580 c->flags |= REDIS_SLAVE;
7581 c->slaveseldb = 0;
7582 listAddNodeTail(server.slaves,c);
7583 return;
7584}
7585
7586static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7587 redisClient *slave = privdata;
7588 REDIS_NOTUSED(el);
7589 REDIS_NOTUSED(mask);
7590 char buf[REDIS_IOBUF_LEN];
7591 ssize_t nwritten, buflen;
7592
7593 if (slave->repldboff == 0) {
7594 /* Write the bulk write count before to transfer the DB. In theory here
7595 * we don't know how much room there is in the output buffer of the
7596 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7597 * operations) will never be smaller than the few bytes we need. */
7598 sds bulkcount;
7599
7600 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7601 slave->repldbsize);
7602 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7603 {
7604 sdsfree(bulkcount);
7605 freeClient(slave);
7606 return;
7607 }
7608 sdsfree(bulkcount);
7609 }
7610 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7611 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7612 if (buflen <= 0) {
7613 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7614 (buflen == 0) ? "premature EOF" : strerror(errno));
7615 freeClient(slave);
7616 return;
7617 }
7618 if ((nwritten = write(fd,buf,buflen)) == -1) {
7619 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7620 strerror(errno));
7621 freeClient(slave);
7622 return;
7623 }
7624 slave->repldboff += nwritten;
7625 if (slave->repldboff == slave->repldbsize) {
7626 close(slave->repldbfd);
7627 slave->repldbfd = -1;
7628 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7629 slave->replstate = REDIS_REPL_ONLINE;
7630 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7631 sendReplyToClient, slave) == AE_ERR) {
7632 freeClient(slave);
7633 return;
7634 }
7635 addReplySds(slave,sdsempty());
7636 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7637 }
7638}
7639
7640/* This function is called at the end of every backgrond saving.
7641 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7642 * otherwise REDIS_ERR is passed to the function.
7643 *
7644 * The goal of this function is to handle slaves waiting for a successful
7645 * background saving in order to perform non-blocking synchronization. */
7646static void updateSlavesWaitingBgsave(int bgsaveerr) {
7647 listNode *ln;
7648 int startbgsave = 0;
7649 listIter li;
7650
7651 listRewind(server.slaves,&li);
7652 while((ln = listNext(&li))) {
7653 redisClient *slave = ln->value;
7654
7655 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7656 startbgsave = 1;
7657 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7658 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7659 struct redis_stat buf;
7660
7661 if (bgsaveerr != REDIS_OK) {
7662 freeClient(slave);
7663 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7664 continue;
7665 }
7666 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7667 redis_fstat(slave->repldbfd,&buf) == -1) {
7668 freeClient(slave);
7669 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7670 continue;
7671 }
7672 slave->repldboff = 0;
7673 slave->repldbsize = buf.st_size;
7674 slave->replstate = REDIS_REPL_SEND_BULK;
7675 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7676 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7677 freeClient(slave);
7678 continue;
7679 }
7680 }
7681 }
7682 if (startbgsave) {
7683 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7684 listIter li;
7685
7686 listRewind(server.slaves,&li);
7687 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7688 while((ln = listNext(&li))) {
7689 redisClient *slave = ln->value;
7690
7691 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7692 freeClient(slave);
7693 }
7694 }
7695 }
7696}
7697
7698static int syncWithMaster(void) {
7699 char buf[1024], tmpfile[256], authcmd[1024];
7700 long dumpsize;
7701 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7702 int dfd, maxtries = 5;
7703
7704 if (fd == -1) {
7705 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7706 strerror(errno));
7707 return REDIS_ERR;
7708 }
7709
7710 /* AUTH with the master if required. */
7711 if(server.masterauth) {
7712 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7713 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7714 close(fd);
7715 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7716 strerror(errno));
7717 return REDIS_ERR;
7718 }
7719 /* Read the AUTH result. */
7720 if (syncReadLine(fd,buf,1024,3600) == -1) {
7721 close(fd);
7722 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7723 strerror(errno));
7724 return REDIS_ERR;
7725 }
7726 if (buf[0] != '+') {
7727 close(fd);
7728 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7729 return REDIS_ERR;
7730 }
7731 }
7732
7733 /* Issue the SYNC command */
7734 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7735 close(fd);
7736 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7737 strerror(errno));
7738 return REDIS_ERR;
7739 }
7740 /* Read the bulk write count */
7741 if (syncReadLine(fd,buf,1024,3600) == -1) {
7742 close(fd);
7743 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7744 strerror(errno));
7745 return REDIS_ERR;
7746 }
7747 if (buf[0] != '$') {
7748 close(fd);
7749 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7750 return REDIS_ERR;
7751 }
7752 dumpsize = strtol(buf+1,NULL,10);
7753 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7754 /* Read the bulk write data on a temp file */
7755 while(maxtries--) {
7756 snprintf(tmpfile,256,
7757 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7758 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7759 if (dfd != -1) break;
7760 sleep(1);
7761 }
7762 if (dfd == -1) {
7763 close(fd);
7764 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7765 return REDIS_ERR;
7766 }
7767 while(dumpsize) {
7768 int nread, nwritten;
7769
7770 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7771 if (nread == -1) {
7772 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7773 strerror(errno));
7774 close(fd);
7775 close(dfd);
7776 return REDIS_ERR;
7777 }
7778 nwritten = write(dfd,buf,nread);
7779 if (nwritten == -1) {
7780 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7781 close(fd);
7782 close(dfd);
7783 return REDIS_ERR;
7784 }
7785 dumpsize -= nread;
7786 }
7787 close(dfd);
7788 if (rename(tmpfile,server.dbfilename) == -1) {
7789 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7790 unlink(tmpfile);
7791 close(fd);
7792 return REDIS_ERR;
7793 }
7794 emptyDb();
7795 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7796 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7797 close(fd);
7798 return REDIS_ERR;
7799 }
7800 server.master = createClient(fd);
7801 server.master->flags |= REDIS_MASTER;
7802 server.master->authenticated = 1;
7803 server.replstate = REDIS_REPL_CONNECTED;
7804 return REDIS_OK;
7805}
7806
7807static void slaveofCommand(redisClient *c) {
7808 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7809 !strcasecmp(c->argv[2]->ptr,"one")) {
7810 if (server.masterhost) {
7811 sdsfree(server.masterhost);
7812 server.masterhost = NULL;
7813 if (server.master) freeClient(server.master);
7814 server.replstate = REDIS_REPL_NONE;
7815 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7816 }
7817 } else {
7818 sdsfree(server.masterhost);
7819 server.masterhost = sdsdup(c->argv[1]->ptr);
7820 server.masterport = atoi(c->argv[2]->ptr);
7821 if (server.master) freeClient(server.master);
7822 server.replstate = REDIS_REPL_CONNECT;
7823 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7824 server.masterhost, server.masterport);
7825 }
7826 addReply(c,shared.ok);
7827}
7828
7829/* ============================ Maxmemory directive ======================== */
7830
7831/* Try to free one object form the pre-allocated objects free list.
7832 * This is useful under low mem conditions as by default we take 1 million
7833 * free objects allocated. On success REDIS_OK is returned, otherwise
7834 * REDIS_ERR. */
7835static int tryFreeOneObjectFromFreelist(void) {
7836 robj *o;
7837
7838 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7839 if (listLength(server.objfreelist)) {
7840 listNode *head = listFirst(server.objfreelist);
7841 o = listNodeValue(head);
7842 listDelNode(server.objfreelist,head);
7843 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7844 zfree(o);
7845 return REDIS_OK;
7846 } else {
7847 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7848 return REDIS_ERR;
7849 }
7850}
7851
7852/* This function gets called when 'maxmemory' is set on the config file to limit
7853 * the max memory used by the server, and we are out of memory.
7854 * This function will try to, in order:
7855 *
7856 * - Free objects from the free list
7857 * - Try to remove keys with an EXPIRE set
7858 *
7859 * It is not possible to free enough memory to reach used-memory < maxmemory
7860 * the server will start refusing commands that will enlarge even more the
7861 * memory usage.
7862 */
7863static void freeMemoryIfNeeded(void) {
7864 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7865 int j, k, freed = 0;
7866
7867 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7868 for (j = 0; j < server.dbnum; j++) {
7869 int minttl = -1;
7870 robj *minkey = NULL;
7871 struct dictEntry *de;
7872
7873 if (dictSize(server.db[j].expires)) {
7874 freed = 1;
7875 /* From a sample of three keys drop the one nearest to
7876 * the natural expire */
7877 for (k = 0; k < 3; k++) {
7878 time_t t;
7879
7880 de = dictGetRandomKey(server.db[j].expires);
7881 t = (time_t) dictGetEntryVal(de);
7882 if (minttl == -1 || t < minttl) {
7883 minkey = dictGetEntryKey(de);
7884 minttl = t;
7885 }
7886 }
7887 deleteKey(server.db+j,minkey);
7888 }
7889 }
7890 if (!freed) return; /* nothing to free... */
7891 }
7892}
7893
7894/* ============================== Append Only file ========================== */
7895
7896static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7897 sds buf = sdsempty();
7898 int j;
7899 ssize_t nwritten;
7900 time_t now;
7901 robj *tmpargv[3];
7902
7903 /* The DB this command was targetting is not the same as the last command
7904 * we appendend. To issue a SELECT command is needed. */
7905 if (dictid != server.appendseldb) {
7906 char seldb[64];
7907
7908 snprintf(seldb,sizeof(seldb),"%d",dictid);
7909 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7910 (unsigned long)strlen(seldb),seldb);
7911 server.appendseldb = dictid;
7912 }
7913
7914 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7915 * EXPIREs into EXPIREATs calls */
7916 if (cmd->proc == expireCommand) {
7917 long when;
7918
7919 tmpargv[0] = createStringObject("EXPIREAT",8);
7920 tmpargv[1] = argv[1];
7921 incrRefCount(argv[1]);
7922 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7923 tmpargv[2] = createObject(REDIS_STRING,
7924 sdscatprintf(sdsempty(),"%ld",when));
7925 argv = tmpargv;
7926 }
7927
7928 /* Append the actual command */
7929 buf = sdscatprintf(buf,"*%d\r\n",argc);
7930 for (j = 0; j < argc; j++) {
7931 robj *o = argv[j];
7932
7933 o = getDecodedObject(o);
7934 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7935 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7936 buf = sdscatlen(buf,"\r\n",2);
7937 decrRefCount(o);
7938 }
7939
7940 /* Free the objects from the modified argv for EXPIREAT */
7941 if (cmd->proc == expireCommand) {
7942 for (j = 0; j < 3; j++)
7943 decrRefCount(argv[j]);
7944 }
7945
7946 /* We want to perform a single write. This should be guaranteed atomic
7947 * at least if the filesystem we are writing is a real physical one.
7948 * While this will save us against the server being killed I don't think
7949 * there is much to do about the whole server stopping for power problems
7950 * or alike */
7951 nwritten = write(server.appendfd,buf,sdslen(buf));
7952 if (nwritten != (signed)sdslen(buf)) {
7953 /* Ooops, we are in troubles. The best thing to do for now is
7954 * to simply exit instead to give the illusion that everything is
7955 * working as expected. */
7956 if (nwritten == -1) {
7957 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7958 } else {
7959 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7960 }
7961 exit(1);
7962 }
7963 /* If a background append only file rewriting is in progress we want to
7964 * accumulate the differences between the child DB and the current one
7965 * in a buffer, so that when the child process will do its work we
7966 * can append the differences to the new append only file. */
7967 if (server.bgrewritechildpid != -1)
7968 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7969
7970 sdsfree(buf);
7971 now = time(NULL);
7972 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7973 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7974 now-server.lastfsync > 1))
7975 {
7976 fsync(server.appendfd); /* Let's try to get this data on the disk */
7977 server.lastfsync = now;
7978 }
7979}
7980
7981/* In Redis commands are always executed in the context of a client, so in
7982 * order to load the append only file we need to create a fake client. */
7983static struct redisClient *createFakeClient(void) {
7984 struct redisClient *c = zmalloc(sizeof(*c));
7985
7986 selectDb(c,0);
7987 c->fd = -1;
7988 c->querybuf = sdsempty();
7989 c->argc = 0;
7990 c->argv = NULL;
7991 c->flags = 0;
7992 /* We set the fake client as a slave waiting for the synchronization
7993 * so that Redis will not try to send replies to this client. */
7994 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7995 c->reply = listCreate();
7996 listSetFreeMethod(c->reply,decrRefCount);
7997 listSetDupMethod(c->reply,dupClientReplyValue);
7998 return c;
7999}
8000
8001static void freeFakeClient(struct redisClient *c) {
8002 sdsfree(c->querybuf);
8003 listRelease(c->reply);
8004 zfree(c);
8005}
8006
8007/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8008 * error (the append only file is zero-length) REDIS_ERR is returned. On
8009 * fatal error an error message is logged and the program exists. */
8010int loadAppendOnlyFile(char *filename) {
8011 struct redisClient *fakeClient;
8012 FILE *fp = fopen(filename,"r");
8013 struct redis_stat sb;
8014 unsigned long long loadedkeys = 0;
8015
8016 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8017 return REDIS_ERR;
8018
8019 if (fp == NULL) {
8020 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8021 exit(1);
8022 }
8023
8024 fakeClient = createFakeClient();
8025 while(1) {
8026 int argc, j;
8027 unsigned long len;
8028 robj **argv;
8029 char buf[128];
8030 sds argsds;
8031 struct redisCommand *cmd;
8032
8033 if (fgets(buf,sizeof(buf),fp) == NULL) {
8034 if (feof(fp))
8035 break;
8036 else
8037 goto readerr;
8038 }
8039 if (buf[0] != '*') goto fmterr;
8040 argc = atoi(buf+1);
8041 argv = zmalloc(sizeof(robj*)*argc);
8042 for (j = 0; j < argc; j++) {
8043 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8044 if (buf[0] != '$') goto fmterr;
8045 len = strtol(buf+1,NULL,10);
8046 argsds = sdsnewlen(NULL,len);
8047 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8048 argv[j] = createObject(REDIS_STRING,argsds);
8049 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8050 }
8051
8052 /* Command lookup */
8053 cmd = lookupCommand(argv[0]->ptr);
8054 if (!cmd) {
8055 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8056 exit(1);
8057 }
8058 /* Try object encoding */
8059 if (cmd->flags & REDIS_CMD_BULK)
8060 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8061 /* Run the command in the context of a fake client */
8062 fakeClient->argc = argc;
8063 fakeClient->argv = argv;
8064 cmd->proc(fakeClient);
8065 /* Discard the reply objects list from the fake client */
8066 while(listLength(fakeClient->reply))
8067 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8068 /* Clean up, ready for the next command */
8069 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8070 zfree(argv);
8071 /* Handle swapping while loading big datasets when VM is on */
8072 loadedkeys++;
8073 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8074 while (zmalloc_used_memory() > server.vm_max_memory) {
8075 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8076 }
8077 }
8078 }
8079 fclose(fp);
8080 freeFakeClient(fakeClient);
8081 return REDIS_OK;
8082
8083readerr:
8084 if (feof(fp)) {
8085 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8086 } else {
8087 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8088 }
8089 exit(1);
8090fmterr:
8091 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8092 exit(1);
8093}
8094
8095/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8096static int fwriteBulkObject(FILE *fp, robj *obj) {
8097 char buf[128];
8098 int decrrc = 0;
8099
8100 /* Avoid the incr/decr ref count business if possible to help
8101 * copy-on-write (we are often in a child process when this function
8102 * is called).
8103 * Also makes sure that key objects don't get incrRefCount-ed when VM
8104 * is enabled */
8105 if (obj->encoding != REDIS_ENCODING_RAW) {
8106 obj = getDecodedObject(obj);
8107 decrrc = 1;
8108 }
8109 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8110 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8111 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8112 goto err;
8113 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8114 if (decrrc) decrRefCount(obj);
8115 return 1;
8116err:
8117 if (decrrc) decrRefCount(obj);
8118 return 0;
8119}
8120
/* Write the binary-safe string 's' of 'len' bytes into a file in the
 * bulk format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: format it as such, so that the conversion
     * specifier matches the argument type and very large lengths are not
     * printed as negative numbers. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() with a zero size returns 0 even on success, so the payload
     * is written only when not empty. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8132
/* Write a double, formatted with 17 significant digits ("%.17g"), in the
 * bulk format $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t payloadlen;

    /* The payload is formatted together with its trailing CRLF, that is
     * not part of the advertised length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payloadlen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payloadlen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8143
/* Write a long, as a decimal string, in the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t payloadlen;

    /* The payload is formatted together with its trailing CRLF, that is
     * not part of the advertised length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payloadlen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payloadlen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8154
8155/* Write a sequence of commands able to fully rebuild the dataset into
8156 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8157static int rewriteAppendOnlyFile(char *filename) {
8158 dictIterator *di = NULL;
8159 dictEntry *de;
8160 FILE *fp;
8161 char tmpfile[256];
8162 int j;
8163 time_t now = time(NULL);
8164
8165 /* Note that we have to use a different temp name here compared to the
8166 * one used by rewriteAppendOnlyFileBackground() function. */
8167 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8168 fp = fopen(tmpfile,"w");
8169 if (!fp) {
8170 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8171 return REDIS_ERR;
8172 }
8173 for (j = 0; j < server.dbnum; j++) {
8174 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8175 redisDb *db = server.db+j;
8176 dict *d = db->dict;
8177 if (dictSize(d) == 0) continue;
8178 di = dictGetIterator(d);
8179 if (!di) {
8180 fclose(fp);
8181 return REDIS_ERR;
8182 }
8183
8184 /* SELECT the new DB */
8185 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8186 if (fwriteBulkLong(fp,j) == 0) goto werr;
8187
8188 /* Iterate this DB writing every entry */
8189 while((de = dictNext(di)) != NULL) {
8190 robj *key, *o;
8191 time_t expiretime;
8192 int swapped;
8193
8194 key = dictGetEntryKey(de);
8195 /* If the value for this key is swapped, load a preview in memory.
8196 * We use a "swapped" flag to remember if we need to free the
8197 * value object instead to just increment the ref count anyway
8198 * in order to avoid copy-on-write of pages if we are forked() */
8199 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8200 key->storage == REDIS_VM_SWAPPING) {
8201 o = dictGetEntryVal(de);
8202 swapped = 0;
8203 } else {
8204 o = vmPreviewObject(key);
8205 swapped = 1;
8206 }
8207 expiretime = getExpire(db,key);
8208
8209 /* Save the key and associated value */
8210 if (o->type == REDIS_STRING) {
8211 /* Emit a SET command */
8212 char cmd[]="*3\r\n$3\r\nSET\r\n";
8213 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8214 /* Key and value */
8215 if (fwriteBulkObject(fp,key) == 0) goto werr;
8216 if (fwriteBulkObject(fp,o) == 0) goto werr;
8217 } else if (o->type == REDIS_LIST) {
8218 /* Emit the RPUSHes needed to rebuild the list */
8219 list *list = o->ptr;
8220 listNode *ln;
8221 listIter li;
8222
8223 listRewind(list,&li);
8224 while((ln = listNext(&li))) {
8225 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8226 robj *eleobj = listNodeValue(ln);
8227
8228 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8229 if (fwriteBulkObject(fp,key) == 0) goto werr;
8230 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8231 }
8232 } else if (o->type == REDIS_SET) {
8233 /* Emit the SADDs needed to rebuild the set */
8234 dict *set = o->ptr;
8235 dictIterator *di = dictGetIterator(set);
8236 dictEntry *de;
8237
8238 while((de = dictNext(di)) != NULL) {
8239 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8240 robj *eleobj = dictGetEntryKey(de);
8241
8242 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8243 if (fwriteBulkObject(fp,key) == 0) goto werr;
8244 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8245 }
8246 dictReleaseIterator(di);
8247 } else if (o->type == REDIS_ZSET) {
8248 /* Emit the ZADDs needed to rebuild the sorted set */
8249 zset *zs = o->ptr;
8250 dictIterator *di = dictGetIterator(zs->dict);
8251 dictEntry *de;
8252
8253 while((de = dictNext(di)) != NULL) {
8254 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8255 robj *eleobj = dictGetEntryKey(de);
8256 double *score = dictGetEntryVal(de);
8257
8258 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8259 if (fwriteBulkObject(fp,key) == 0) goto werr;
8260 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8261 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8262 }
8263 dictReleaseIterator(di);
8264 } else if (o->type == REDIS_HASH) {
8265 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8266
8267 /* Emit the HSETs needed to rebuild the hash */
8268 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8269 unsigned char *p = zipmapRewind(o->ptr);
8270 unsigned char *field, *val;
8271 unsigned int flen, vlen;
8272
8273 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8274 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8275 if (fwriteBulkObject(fp,key) == 0) goto werr;
8276 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8277 return -1;
8278 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8279 return -1;
8280 }
8281 } else {
8282 dictIterator *di = dictGetIterator(o->ptr);
8283 dictEntry *de;
8284
8285 while((de = dictNext(di)) != NULL) {
8286 robj *field = dictGetEntryKey(de);
8287 robj *val = dictGetEntryVal(de);
8288
8289 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8290 if (fwriteBulkObject(fp,key) == 0) goto werr;
8291 if (fwriteBulkObject(fp,field) == -1) return -1;
8292 if (fwriteBulkObject(fp,val) == -1) return -1;
8293 }
8294 dictReleaseIterator(di);
8295 }
8296 } else {
8297 redisPanic("Unknown object type");
8298 }
8299 /* Save the expire time */
8300 if (expiretime != -1) {
8301 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8302 /* If this key is already expired skip it */
8303 if (expiretime < now) continue;
8304 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8305 if (fwriteBulkObject(fp,key) == 0) goto werr;
8306 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8307 }
8308 if (swapped) decrRefCount(o);
8309 }
8310 dictReleaseIterator(di);
8311 }
8312
8313 /* Make sure data will not remain on the OS's output buffers */
8314 fflush(fp);
8315 fsync(fileno(fp));
8316 fclose(fp);
8317
8318 /* Use RENAME to make sure the DB file is changed atomically only
8319 * if the generate DB file is ok. */
8320 if (rename(tmpfile,filename) == -1) {
8321 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8322 unlink(tmpfile);
8323 return REDIS_ERR;
8324 }
8325 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8326 return REDIS_OK;
8327
8328werr:
8329 fclose(fp);
8330 unlink(tmpfile);
8331 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8332 if (di) dictReleaseIterator(di);
8333 return REDIS_ERR;
8334}
8335
8336/* This is how rewriting of the append only file in background works:
8337 *
8338 * 1) The user calls BGREWRITEAOF
8339 * 2) Redis calls this function, that forks():
8340 * 2a) the child rewrite the append only file in a temp file.
8341 * 2b) the parent accumulates differences in server.bgrewritebuf.
8342 * 3) When the child finished '2a' exists.
8343 * 4) The parent will trap the exit code, if it's OK, will append the
8344 * data accumulated into server.bgrewritebuf into the temp file, and
8345 * finally will rename(2) the temp file in the actual file name.
8346 * The the new file is reopened as the new append only file. Profit!
8347 */
8348static int rewriteAppendOnlyFileBackground(void) {
8349 pid_t childpid;
8350
8351 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8352 if (server.vm_enabled) waitEmptyIOJobsQueue();
8353 if ((childpid = fork()) == 0) {
8354 /* Child */
8355 char tmpfile[256];
8356
8357 if (server.vm_enabled) vmReopenSwapFile();
8358 close(server.fd);
8359 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8360 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8361 _exit(0);
8362 } else {
8363 _exit(1);
8364 }
8365 } else {
8366 /* Parent */
8367 if (childpid == -1) {
8368 redisLog(REDIS_WARNING,
8369 "Can't rewrite append only file in background: fork: %s",
8370 strerror(errno));
8371 return REDIS_ERR;
8372 }
8373 redisLog(REDIS_NOTICE,
8374 "Background append only file rewriting started by pid %d",childpid);
8375 server.bgrewritechildpid = childpid;
8376 updateDictResizePolicy();
8377 /* We set appendseldb to -1 in order to force the next call to the
8378 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8379 * accumulated by the parent into server.bgrewritebuf will start
8380 * with a SELECT statement and it will be safe to merge. */
8381 server.appendseldb = -1;
8382 return REDIS_OK;
8383 }
8384 return REDIS_OK; /* unreached */
8385}
8386
8387static void bgrewriteaofCommand(redisClient *c) {
8388 if (server.bgrewritechildpid != -1) {
8389 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8390 return;
8391 }
8392 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8393 char *status = "+Background append only file rewriting started\r\n";
8394 addReplySds(c,sdsnew(status));
8395 } else {
8396 addReply(c,shared.err);
8397 }
8398}
8399
/* Remove the temp file used by the background append only file rewriting
 * child having the specified pid. The unlink(2) result is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8406
8407/* Virtual Memory is composed mainly of two subsystems:
8408 * - Blocking Virtual Memory
8409 * - Threaded Virtual Memory I/O
8410 * The two parts are not fully decoupled, but functions are split among two
8411 * different sections of the source code (delimited by comments) in order to
8412 * make more clear what functionality is about the blocking VM and what about
8413 * the threaded (not blocking) VM.
8414 *
8415 * Redis VM design:
8416 *
8417 * Redis VM is a blocking VM (one that blocks reading swapped values from
8418 * disk into memory when a value swapped out is needed in memory) that is made
8419 * unblocking by trying to examine the command argument vector in order to
8420 * load in background values that will likely be needed in order to exec
8421 * the command. The command is executed only once all the relevant keys
8422 * are loaded into memory.
8423 *
8424 * This is basically almost as simple as a blocking VM, but almost as parallel
8425 * as a fully non-blocking VM.
8426 */
8427
8428/* =================== Virtual Memory - Blocking Side ====================== */
8429
8430/* substitute the first occurrence of '%p' with the process pid in the
8431 * swap file name. */
8432static void expandVmSwapFilename(void) {
8433 char *p = strstr(server.vm_swap_file,"%p");
8434 sds new;
8435
8436 if (!p) return;
8437 new = sdsempty();
8438 *p = '\0';
8439 new = sdscat(new,server.vm_swap_file);
8440 new = sdscatprintf(new,"%ld",(long) getpid());
8441 new = sdscat(new,p+2);
8442 zfree(server.vm_swap_file);
8443 server.vm_swap_file = new;
8444}
8445
8446static void vmInit(void) {
8447 off_t totsize;
8448 int pipefds[2];
8449 size_t stacksize;
8450
8451 if (server.vm_max_threads != 0)
8452 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8453
8454 expandVmSwapFilename();
8455 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8456 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8457 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8458 }
8459 if (server.vm_fp == NULL) {
8460 redisLog(REDIS_WARNING,
8461 "Impossible to open the swap file: %s. Exiting.",
8462 strerror(errno));
8463 exit(1);
8464 }
8465 server.vm_fd = fileno(server.vm_fp);
8466 server.vm_next_page = 0;
8467 server.vm_near_pages = 0;
8468 server.vm_stats_used_pages = 0;
8469 server.vm_stats_swapped_objects = 0;
8470 server.vm_stats_swapouts = 0;
8471 server.vm_stats_swapins = 0;
8472 totsize = server.vm_pages*server.vm_page_size;
8473 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8474 if (ftruncate(server.vm_fd,totsize) == -1) {
8475 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8476 strerror(errno));
8477 exit(1);
8478 } else {
8479 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8480 }
8481 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8482 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8483 (long long) (server.vm_pages+7)/8, server.vm_pages);
8484 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8485
8486 /* Initialize threaded I/O (used by Virtual Memory) */
8487 server.io_newjobs = listCreate();
8488 server.io_processing = listCreate();
8489 server.io_processed = listCreate();
8490 server.io_ready_clients = listCreate();
8491 pthread_mutex_init(&server.io_mutex,NULL);
8492 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8493 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8494 server.io_active_threads = 0;
8495 if (pipe(pipefds) == -1) {
8496 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8497 ,strerror(errno));
8498 exit(1);
8499 }
8500 server.io_ready_pipe_read = pipefds[0];
8501 server.io_ready_pipe_write = pipefds[1];
8502 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8503 /* LZF requires a lot of stack */
8504 pthread_attr_init(&server.io_threads_attr);
8505 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8506 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8507 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8508 /* Listen for events in the threaded I/O pipe */
8509 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8510 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8511 oom("creating file event");
8512}
8513
8514/* Mark the page as used */
8515static void vmMarkPageUsed(off_t page) {
8516 off_t byte = page/8;
8517 int bit = page&7;
8518 redisAssert(vmFreePage(page) == 1);
8519 server.vm_bitmap[byte] |= 1<<bit;
8520}
8521
8522/* Mark N contiguous pages as used, with 'page' being the first. */
8523static void vmMarkPagesUsed(off_t page, off_t count) {
8524 off_t j;
8525
8526 for (j = 0; j < count; j++)
8527 vmMarkPageUsed(page+j);
8528 server.vm_stats_used_pages += count;
8529 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8530 (long long)count, (long long)page);
8531}
8532
8533/* Mark the page as free */
8534static void vmMarkPageFree(off_t page) {
8535 off_t byte = page/8;
8536 int bit = page&7;
8537 redisAssert(vmFreePage(page) == 0);
8538 server.vm_bitmap[byte] &= ~(1<<bit);
8539}
8540
8541/* Mark N contiguous pages as free, with 'page' being the first. */
8542static void vmMarkPagesFree(off_t page, off_t count) {
8543 off_t j;
8544
8545 for (j = 0; j < count; j++)
8546 vmMarkPageFree(page+j);
8547 server.vm_stats_used_pages -= count;
8548 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8549 (long long)count, (long long)page);
8550}
8551
8552/* Test if the page is free */
8553static int vmFreePage(off_t page) {
8554 off_t byte = page/8;
8555 int bit = page&7;
8556 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8557}
8558
8559/* Find N contiguous free pages storing the first page of the cluster in *first.
8560 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8561 * REDIS_ERR is returned.
8562 *
8563 * This function uses a simple algorithm: we try to allocate
8564 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8565 * again from the start of the swap file searching for free spaces.
8566 *
8567 * If it looks pretty clear that there are no free pages near our offset
8568 * we try to find less populated places doing a forward jump of
8569 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8570 * without hurry, and then we jump again and so forth...
8571 *
8572 * This function can be improved using a free list to avoid to guess
8573 * too much, since we could collect data about freed pages.
8574 *
8575 * note: I implemented this function just after watching an episode of
8576 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8577 */
8578static int vmFindContiguousPages(off_t *first, off_t n) {
8579 off_t base, offset = 0, since_jump = 0, numfree = 0;
8580
8581 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8582 server.vm_near_pages = 0;
8583 server.vm_next_page = 0;
8584 }
8585 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8586 base = server.vm_next_page;
8587
8588 while(offset < server.vm_pages) {
8589 off_t this = base+offset;
8590
8591 /* If we overflow, restart from page zero */
8592 if (this >= server.vm_pages) {
8593 this -= server.vm_pages;
8594 if (this == 0) {
8595 /* Just overflowed, what we found on tail is no longer
8596 * interesting, as it's no longer contiguous. */
8597 numfree = 0;
8598 }
8599 }
8600 if (vmFreePage(this)) {
8601 /* This is a free page */
8602 numfree++;
8603 /* Already got N free pages? Return to the caller, with success */
8604 if (numfree == n) {
8605 *first = this-(n-1);
8606 server.vm_next_page = this+1;
8607 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8608 return REDIS_OK;
8609 }
8610 } else {
8611 /* The current one is not a free page */
8612 numfree = 0;
8613 }
8614
8615 /* Fast-forward if the current page is not free and we already
8616 * searched enough near this place. */
8617 since_jump++;
8618 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8619 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8620 since_jump = 0;
8621 /* Note that even if we rewind after the jump, we are don't need
8622 * to make sure numfree is set to zero as we only jump *if* it
8623 * is set to zero. */
8624 } else {
8625 /* Otherwise just check the next page */
8626 offset++;
8627 }
8628 }
8629 return REDIS_ERR;
8630}
8631
8632/* Write the specified object at the specified page of the swap file */
8633static int vmWriteObjectOnSwap(robj *o, off_t page) {
8634 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8635 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8636 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8637 redisLog(REDIS_WARNING,
8638 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8639 strerror(errno));
8640 return REDIS_ERR;
8641 }
8642 rdbSaveObject(server.vm_fp,o);
8643 fflush(server.vm_fp);
8644 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8645 return REDIS_OK;
8646}
8647
8648/* Swap the 'val' object relative to 'key' into disk. Store all the information
8649 * needed to later retrieve the object into the key object.
8650 * If we can't find enough contiguous empty pages to swap the object on disk
8651 * REDIS_ERR is returned. */
8652static int vmSwapObjectBlocking(robj *key, robj *val) {
8653 off_t pages = rdbSavedObjectPages(val,NULL);
8654 off_t page;
8655
8656 assert(key->storage == REDIS_VM_MEMORY);
8657 assert(key->refcount == 1);
8658 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8659 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8660 key->vm.page = page;
8661 key->vm.usedpages = pages;
8662 key->storage = REDIS_VM_SWAPPED;
8663 key->vtype = val->type;
8664 decrRefCount(val); /* Deallocate the object from memory. */
8665 vmMarkPagesUsed(page,pages);
8666 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8667 (unsigned char*) key->ptr,
8668 (unsigned long long) page, (unsigned long long) pages);
8669 server.vm_stats_swapped_objects++;
8670 server.vm_stats_swapouts++;
8671 return REDIS_OK;
8672}
8673
8674static robj *vmReadObjectFromSwap(off_t page, int type) {
8675 robj *o;
8676
8677 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8678 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8679 redisLog(REDIS_WARNING,
8680 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8681 strerror(errno));
8682 _exit(1);
8683 }
8684 o = rdbLoadObject(type,server.vm_fp);
8685 if (o == NULL) {
8686 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8687 _exit(1);
8688 }
8689 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8690 return o;
8691}
8692
8693/* Load the value object relative to the 'key' object from swap to memory.
8694 * The newly allocated object is returned.
8695 *
8696 * If preview is true the unserialized object is returned to the caller but
8697 * no changes are made to the key object, nor the pages are marked as freed */
8698static robj *vmGenericLoadObject(robj *key, int preview) {
8699 robj *val;
8700
8701 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8702 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8703 if (!preview) {
8704 key->storage = REDIS_VM_MEMORY;
8705 key->vm.atime = server.unixtime;
8706 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8707 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8708 (unsigned char*) key->ptr);
8709 server.vm_stats_swapped_objects--;
8710 } else {
8711 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8712 (unsigned char*) key->ptr);
8713 }
8714 server.vm_stats_swapins++;
8715 return val;
8716}
8717
8718/* Plain object loading, from swap to memory */
8719static robj *vmLoadObject(robj *key) {
8720 /* If we are loading the object in background, stop it, we
8721 * need to load this object synchronously ASAP. */
8722 if (key->storage == REDIS_VM_LOADING)
8723 vmCancelThreadedIOJob(key);
8724 return vmGenericLoadObject(key,0);
8725}
8726
8727/* Just load the value on disk, without to modify the key.
8728 * This is useful when we want to perform some operation on the value
8729 * without to really bring it from swap to memory, like while saving the
8730 * dataset or rewriting the append only log. */
8731static robj *vmPreviewObject(robj *key) {
8732 return vmGenericLoadObject(key,1);
8733}
8734
8735/* How a good candidate is this object for swapping?
8736 * The better candidate it is, the greater the returned value.
8737 *
8738 * Currently we try to perform a fast estimation of the object size in
8739 * memory, and combine it with aging informations.
8740 *
8741 * Basically swappability = idle-time * log(estimated size)
8742 *
8743 * Bigger objects are preferred over smaller objects, but not
8744 * proportionally, this is why we use the logarithm. This algorithm is
8745 * just a first try and will probably be tuned later. */
8746static double computeObjectSwappability(robj *o) {
8747 time_t age = server.unixtime - o->vm.atime;
8748 long asize = 0;
8749 list *l;
8750 dict *d;
8751 struct dictEntry *de;
8752 int z;
8753
8754 if (age <= 0) return 0;
8755 switch(o->type) {
8756 case REDIS_STRING:
8757 if (o->encoding != REDIS_ENCODING_RAW) {
8758 asize = sizeof(*o);
8759 } else {
8760 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8761 }
8762 break;
8763 case REDIS_LIST:
8764 l = o->ptr;
8765 listNode *ln = listFirst(l);
8766
8767 asize = sizeof(list);
8768 if (ln) {
8769 robj *ele = ln->value;
8770 long elesize;
8771
8772 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8773 (sizeof(*o)+sdslen(ele->ptr)) :
8774 sizeof(*o);
8775 asize += (sizeof(listNode)+elesize)*listLength(l);
8776 }
8777 break;
8778 case REDIS_SET:
8779 case REDIS_ZSET:
8780 z = (o->type == REDIS_ZSET);
8781 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8782
8783 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8784 if (z) asize += sizeof(zset)-sizeof(dict);
8785 if (dictSize(d)) {
8786 long elesize;
8787 robj *ele;
8788
8789 de = dictGetRandomKey(d);
8790 ele = dictGetEntryKey(de);
8791 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8792 (sizeof(*o)+sdslen(ele->ptr)) :
8793 sizeof(*o);
8794 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8795 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8796 }
8797 break;
8798 case REDIS_HASH:
8799 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8800 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8801 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8802 unsigned int klen, vlen;
8803 unsigned char *key, *val;
8804
8805 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8806 klen = 0;
8807 vlen = 0;
8808 }
8809 asize = len*(klen+vlen+3);
8810 } else if (o->encoding == REDIS_ENCODING_HT) {
8811 d = o->ptr;
8812 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8813 if (dictSize(d)) {
8814 long elesize;
8815 robj *ele;
8816
8817 de = dictGetRandomKey(d);
8818 ele = dictGetEntryKey(de);
8819 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8820 (sizeof(*o)+sdslen(ele->ptr)) :
8821 sizeof(*o);
8822 ele = dictGetEntryVal(de);
8823 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8824 (sizeof(*o)+sdslen(ele->ptr)) :
8825 sizeof(*o);
8826 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8827 }
8828 }
8829 break;
8830 }
8831 return (double)age*log(1+asize);
8832}
8833
8834/* Try to swap an object that's a good candidate for swapping.
8835 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8836 * to swap any object at all.
8837 *
8838 * If 'usethreaded' is true, Redis will try to swap the object in background
8839 * using I/O threads. */
8840static int vmSwapOneObject(int usethreads) {
8841 int j, i;
8842 struct dictEntry *best = NULL;
8843 double best_swappability = 0;
8844 redisDb *best_db = NULL;
8845 robj *key, *val;
8846
8847 for (j = 0; j < server.dbnum; j++) {
8848 redisDb *db = server.db+j;
8849 /* Why maxtries is set to 100?
8850 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8851 * are swappable objects */
8852 int maxtries = 100;
8853
8854 if (dictSize(db->dict) == 0) continue;
8855 for (i = 0; i < 5; i++) {
8856 dictEntry *de;
8857 double swappability;
8858
8859 if (maxtries) maxtries--;
8860 de = dictGetRandomKey(db->dict);
8861 key = dictGetEntryKey(de);
8862 val = dictGetEntryVal(de);
8863 /* Only swap objects that are currently in memory.
8864 *
8865 * Also don't swap shared objects if threaded VM is on, as we
8866 * try to ensure that the main thread does not touch the
8867 * object while the I/O thread is using it, but we can't
8868 * control other keys without adding additional mutex. */
8869 if (key->storage != REDIS_VM_MEMORY ||
8870 (server.vm_max_threads != 0 && val->refcount != 1)) {
8871 if (maxtries) i--; /* don't count this try */
8872 continue;
8873 }
8874 swappability = computeObjectSwappability(val);
8875 if (!best || swappability > best_swappability) {
8876 best = de;
8877 best_swappability = swappability;
8878 best_db = db;
8879 }
8880 }
8881 }
8882 if (best == NULL) return REDIS_ERR;
8883 key = dictGetEntryKey(best);
8884 val = dictGetEntryVal(best);
8885
8886 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8887 key->ptr, best_swappability);
8888
8889 /* Unshare the key if needed */
8890 if (key->refcount > 1) {
8891 robj *newkey = dupStringObject(key);
8892 decrRefCount(key);
8893 key = dictGetEntryKey(best) = newkey;
8894 }
8895 /* Swap it */
8896 if (usethreads) {
8897 vmSwapObjectThreaded(key,val,best_db);
8898 return REDIS_OK;
8899 } else {
8900 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8901 dictGetEntryVal(best) = NULL;
8902 return REDIS_OK;
8903 } else {
8904 return REDIS_ERR;
8905 }
8906 }
8907}
8908
/* Swap one object out synchronously.
 * Returns whatever vmSwapOneObject() returns when called without threads. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8912
/* Swap one object out using the I/O threads.
 * Returns whatever vmSwapOneObject() returns when called with threads. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8916
8917/* Return true if it's safe to swap out objects in a given moment.
8918 * Basically we don't want to swap objects out while there is a BGSAVE
8919 * or a BGAEOREWRITE running in backgroud. */
8920static int vmCanSwapOut(void) {
8921 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8922}
8923
8924/* Delete a key if swapped. Returns 1 if the key was found, was swapped
8925 * and was deleted. Otherwise 0 is returned. */
8926static int deleteIfSwapped(redisDb *db, robj *key) {
8927 dictEntry *de;
8928 robj *foundkey;
8929
8930 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8931 foundkey = dictGetEntryKey(de);
8932 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8933 deleteKey(db,key);
8934 return 1;
8935}
8936
8937/* =================== Virtual Memory - Threaded I/O ======================= */
8938
8939static void freeIOJob(iojob *j) {
8940 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8941 j->type == REDIS_IOJOB_DO_SWAP ||
8942 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8943 decrRefCount(j->val);
8944 /* We don't decrRefCount the j->key field as we did't incremented
8945 * the count creating IO Jobs. This is because the key field here is
8946 * just used as an indentifier and if a key is removed the Job should
8947 * never be touched again. */
8948 zfree(j);
8949}
8950
8951/* Every time a thread finished a Job, it writes a byte into the write side
8952 * of an unix pipe in order to "awake" the main thread, and this function
8953 * is called.
 *
 * One byte at a time is read from 'fd'. For every byte the oldest job is
 * removed from server.io_processed (with the threaded I/O lock held) and
 * post-processed according to its type:
 *
 * - REDIS_IOJOB_LOAD: the key is flagged as in memory, its swap pages are
 *   freed, the loaded value is stored into the dictionary entry and the
 *   clients blocked on the key are handled.
 * - REDIS_IOJOB_PREPARE_SWAP: contiguous pages are searched for the object.
 *   On success the pages are marked as used and the job is queued again as
 *   REDIS_IOJOB_DO_SWAP, otherwise the key storage is set back to
 *   REDIS_VM_MEMORY and the job is freed.
 * - REDIS_IOJOB_DO_SWAP: the swap location is recorded into the key, the
 *   value in memory is released, and more swap jobs may be queued while the
 *   used memory is over server.vm_max_memory.
 *
 * The function returns after processing a percentage
 * (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the jobs found in the processed
 * queue at the first iteration, with a minimum of one job.
 *
 * The 'el', 'privdata' and 'mask' arguments are not used. */
8954static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8955 int mask)
8956{
8957 char buf[1];
8958 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8959 REDIS_NOTUSED(el);
8960 REDIS_NOTUSED(mask);
8961 REDIS_NOTUSED(privdata);
8962
8963 /* For every byte we read in the read side of the pipe, there is one
8964 * I/O job completed to process. */
8965 while((retval = read(fd,buf,1)) == 1) {
8966 iojob *j;
8967 listNode *ln;
8968 robj *key;
8969 struct dictEntry *de;
8970
8971 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8972
8973 /* Get the processed element (the oldest one) */
8974 lockThreadedIO();
8975 assert(listLength(server.io_processed) != 0);
/* The limit is computed only once, at the first iteration. */
8976 if (toprocess == -1) {
8977 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8978 if (toprocess <= 0) toprocess = 1;
8979 }
8980 ln = listFirst(server.io_processed);
8981 j = ln->value;
8982 listDelNode(server.io_processed,ln);
8983 unlockThreadedIO();
8984 /* If this job is marked as canceled, just ignore it */
/* NOTE(review): the I/O thread also sets j->canceled when writing the
 * object on swap fails. In that case nothing here restores key->storage or
 * frees the pages marked as used for the job -- confirm this is intended. */
8985 if (j->canceled) {
8986 freeIOJob(j);
8987 continue;
8988 }
8989 /* Post process it in the main thread, as there are things we
8990 * can do just here to avoid race conditions and/or invasive locks */
8991 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8992 de = dictFind(j->db->dict,j->key);
8993 assert(de != NULL);
8994 key = dictGetEntryKey(de);
8995 if (j->type == REDIS_IOJOB_LOAD) {
8996 redisDb *db;
8997
8998 /* Key loaded, bring it at home */
8999 key->storage = REDIS_VM_MEMORY;
9000 key->vm.atime = server.unixtime;
9001 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9002 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9003 (unsigned char*) key->ptr);
9004 server.vm_stats_swapped_objects--;
9005 server.vm_stats_swapins++;
9006 dictGetEntryVal(de) = j->val;
/* The dictionary entry now references the value too: freeIOJob() below
 * drops the reference held by the job. */
9007 incrRefCount(j->val);
/* Save the db pointer as the job is freed before it is used. */
9008 db = j->db;
9009 freeIOJob(j);
9010 /* Handle clients waiting for this key to be loaded. */
9011 handleClientsBlockedOnSwappedKey(db,key);
9012 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9013 /* Now we know the amount of pages required to swap this object.
9014 * Let's find some space for it, and queue this task again
9015 * rebranded as REDIS_IOJOB_DO_SWAP. */
9016 if (!vmCanSwapOut() ||
9017 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9018 {
9019 /* Ooops... no space or we can't swap as there is
9020 * a fork()ed Redis trying to save stuff on disk. */
9021 freeIOJob(j);
9022 key->storage = REDIS_VM_MEMORY; /* undo operation */
9023 } else {
9024 /* Note that we need to mark this pages as used now,
9025 * if the job will be canceled, we'll mark them as freed
9026 * again. */
9027 vmMarkPagesUsed(j->page,j->pages);
9028 j->type = REDIS_IOJOB_DO_SWAP;
9029 lockThreadedIO();
9030 queueIOJob(j);
9031 unlockThreadedIO();
9032 }
9033 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9034 robj *val;
9035
9036 /* Key swapped. We can finally free some memory. */
/* Debugging aid: dump the state on stdout before the assertion below
 * fails. */
9037 if (key->storage != REDIS_VM_SWAPPING) {
9038 printf("key->storage: %d\n",key->storage);
9039 printf("key->name: %s\n",(char*)key->ptr);
9040 printf("key->refcount: %d\n",key->refcount);
9041 printf("val: %p\n",(void*)j->val);
9042 printf("val->type: %d\n",j->val->type);
9043 printf("val->ptr: %s\n",(char*)j->val->ptr);
9044 }
9045 redisAssert(key->storage == REDIS_VM_SWAPPING);
9046 val = dictGetEntryVal(de);
9047 key->vm.page = j->page;
9048 key->vm.usedpages = j->pages;
9049 key->storage = REDIS_VM_SWAPPED;
9050 key->vtype = j->val->type;
9051 decrRefCount(val); /* Deallocate the object from memory. */
9052 dictGetEntryVal(de) = NULL;
9053 redisLog(REDIS_DEBUG,
9054 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9055 (unsigned char*) key->ptr,
9056 (unsigned long long) j->page, (unsigned long long) j->pages);
9057 server.vm_stats_swapped_objects++;
9058 server.vm_stats_swapouts++;
9059 freeIOJob(j);
9060 /* Put a few more swap requests in queue if we are still
9061 * out of memory */
9062 if (trytoswap && vmCanSwapOut() &&
9063 zmalloc_used_memory() > server.vm_max_memory)
9064 {
9065 int more = 1;
9066 while(more) {
9067 lockThreadedIO();
9068 more = listLength(server.io_newjobs) <
9069 (unsigned) server.vm_max_threads;
9070 unlockThreadedIO();
9071 /* Don't waste CPU time if swappable objects are rare. */
9072 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9073 trytoswap = 0;
9074 break;
9075 }
9076 }
9077 }
9078 }
9079 processed++;
9080 if (processed == toprocess) return;
9081 }
/* EAGAIN is not reported as an error. */
9082 if (retval < 0 && errno != EAGAIN) {
9083 redisLog(REDIS_WARNING,
9084 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9085 strerror(errno));
9086 }
9087}
9088
9089static void lockThreadedIO(void) {
9090 pthread_mutex_lock(&server.io_mutex);
9091}
9092
9093static void unlockThreadedIO(void) {
9094 pthread_mutex_unlock(&server.io_mutex);
9095}
9096
9097/* Remove the specified object from the threaded I/O queue if still not
9098 * processed, otherwise make sure to flag it as canceled.
 *
 * 'o' is the key object of the job to cancel: its storage must be either
 * REDIS_VM_LOADING or REDIS_VM_SWAPPING. The three job queues (new jobs,
 * processing, processed) are scanned, with the threaded I/O lock held, for
 * a non canceled job whose key is 'o':
 *
 * - new jobs: the job is freed and removed from the queue.
 * - processing: the lock is released and the whole search is retried after
 *   a short sleep, until the job moves to the processed queue.
 * - processed: the job is just flagged as canceled.
 *
 * Finally the storage of 'o' is reverted (LOADING -> SWAPPED,
 * SWAPPING -> MEMORY). */
9099static void vmCancelThreadedIOJob(robj *o) {
9100 list *lists[3] = {
9101 server.io_newjobs, /* 0 */
9102 server.io_processing, /* 1 */
9103 server.io_processed /* 2 */
9104 };
9105 int i;
9106
9107 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9108again:
9109 lockThreadedIO();
9110 /* Search for a matching key in one of the queues */
9111 for (i = 0; i < 3; i++) {
9112 listNode *ln;
9113 listIter li;
9114
9115 listRewind(lists[i],&li);
9116 while ((ln = listNext(&li)) != NULL) {
9117 iojob *job = ln->value;
9118
9119 if (job->canceled) continue; /* Skip this, already canceled. */
/* Jobs are matched by key pointer identity. */
9120 if (job->key == o) {
9121 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9122 (void*)job, (char*)o->ptr, job->type, i);
9123 /* Mark the pages as free since the swap didn't happen
9124 * or happened but is now discarded. */
/* Jobs found in the processing queue (i == 1) are skipped here, as in that
 * case we retry from scratch below. */
9125 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9126 vmMarkPagesFree(job->page,job->pages);
9127 /* Cancel the job. It depends on the list the job is
9128 * living in. */
9129 switch(i) {
9130 case 0: /* io_newjobs */
9131 /* If the job was yet not processed the best thing to do
9132 * is to remove it from the queue at all */
9133 freeIOJob(job);
9134 listDelNode(lists[i],ln);
9135 break;
9136 case 1: /* io_processing */
9137 /* Oh Shi- the thread is messing with the Job:
9138 *
9139 * Probably it's accessing the object if this is a
9140 * PREPARE_SWAP or DO_SWAP job.
9141 * If it's a LOAD job it may be reading from disk and
9142 * if we don't wait for the job to terminate before to
9143 * cancel it, maybe in a few microseconds data can be
9144 * corrupted in this pages. So the short story is:
9145 *
9146 * Better to wait for the job to move into the
9147 * next queue (processed)... */
9148
9149 /* We try again and again until the job is completed. */
9150 unlockThreadedIO();
9151 /* But let's wait some time for the I/O thread
9152 * to finish with this job. After all this condition
9153 * should be very rare. */
9154 usleep(1);
9155 goto again;
9156 case 2: /* io_processed */
9157 /* The job was already processed, that's easy...
9158 * just mark it as canceled so that we'll ignore it
9159 * when processing completed jobs. */
9160 job->canceled = 1;
9161 break;
9162 }
9163 /* Finally we have to adjust the storage type of the object
9164 * in order to "UNDO" the operation. */
9165 if (o->storage == REDIS_VM_LOADING)
9166 o->storage = REDIS_VM_SWAPPED;
9167 else if (o->storage == REDIS_VM_SWAPPING)
9168 o->storage = REDIS_VM_MEMORY;
9169 unlockThreadedIO();
9170 return;
9171 }
9172 }
9173 }
9174 unlockThreadedIO();
9175 assert(1 != 1); /* We should never reach this */
9176}
9177
9178static void *IOThreadEntryPoint(void *arg) {
9179 iojob *j;
9180 listNode *ln;
9181 REDIS_NOTUSED(arg);
9182
9183 pthread_detach(pthread_self());
9184 while(1) {
9185 /* Get a new job to process */
9186 lockThreadedIO();
9187 if (listLength(server.io_newjobs) == 0) {
9188 /* No new jobs in queue, exit. */
9189 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9190 (long) pthread_self());
9191 server.io_active_threads--;
9192 unlockThreadedIO();
9193 return NULL;
9194 }
9195 ln = listFirst(server.io_newjobs);
9196 j = ln->value;
9197 listDelNode(server.io_newjobs,ln);
9198 /* Add the job in the processing queue */
9199 j->thread = pthread_self();
9200 listAddNodeTail(server.io_processing,j);
9201 ln = listLast(server.io_processing); /* We use ln later to remove it */
9202 unlockThreadedIO();
9203 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9204 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9205
9206 /* Process the Job */
9207 if (j->type == REDIS_IOJOB_LOAD) {
9208 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9209 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9210 FILE *fp = fopen("/dev/null","w+");
9211 j->pages = rdbSavedObjectPages(j->val,fp);
9212 fclose(fp);
9213 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9214 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9215 j->canceled = 1;
9216 }
9217
9218 /* Done: insert the job into the processed queue */
9219 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9220 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9221 lockThreadedIO();
9222 listDelNode(server.io_processing,ln);
9223 listAddNodeTail(server.io_processed,j);
9224 unlockThreadedIO();
9225
9226 /* Signal the main thread there is new stuff to process */
9227 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9228 }
9229 return NULL; /* never reached */
9230}
9231
9232static void spawnIOThread(void) {
9233 pthread_t thread;
9234 sigset_t mask, omask;
9235 int err;
9236
9237 sigemptyset(&mask);
9238 sigaddset(&mask,SIGCHLD);
9239 sigaddset(&mask,SIGHUP);
9240 sigaddset(&mask,SIGPIPE);
9241 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9242 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9243 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9244 strerror(err));
9245 usleep(1000000);
9246 }
9247 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9248 server.io_active_threads++;
9249}
9250
9251/* We need to wait for the last thread to exit before we are able to
9252 * fork() in order to BGSAVE or BGREWRITEAOF. */
9253static void waitEmptyIOJobsQueue(void) {
9254 while(1) {
9255 int io_processed_len;
9256
9257 lockThreadedIO();
9258 if (listLength(server.io_newjobs) == 0 &&
9259 listLength(server.io_processing) == 0 &&
9260 server.io_active_threads == 0)
9261 {
9262 unlockThreadedIO();
9263 return;
9264 }
9265 /* While waiting for empty jobs queue condition we post-process some
9266 * finshed job, as I/O threads may be hanging trying to write against
9267 * the io_ready_pipe_write FD but there are so much pending jobs that
9268 * it's blocking. */
9269 io_processed_len = listLength(server.io_processed);
9270 unlockThreadedIO();
9271 if (io_processed_len) {
9272 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9273 usleep(1000); /* 1 millisecond */
9274 } else {
9275 usleep(10000); /* 10 milliseconds */
9276 }
9277 }
9278}
9279
9280static void vmReopenSwapFile(void) {
9281 /* Note: we don't close the old one as we are in the child process
9282 * and don't want to mess at all with the original file object. */
9283 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9284 if (server.vm_fp == NULL) {
9285 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9286 server.vm_swap_file);
9287 _exit(1);
9288 }
9289 server.vm_fd = fileno(server.vm_fp);
9290}
9291
9292/* This function must be called while with threaded IO locked */
9293static void queueIOJob(iojob *j) {
9294 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9295 (void*)j, j->type, (char*)j->key->ptr);
9296 listAddNodeTail(server.io_newjobs,j);
9297 if (server.io_active_threads < server.vm_max_threads)
9298 spawnIOThread();
9299}
9300
9301static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9302 iojob *j;
9303
9304 assert(key->storage == REDIS_VM_MEMORY);
9305 assert(key->refcount == 1);
9306
9307 j = zmalloc(sizeof(*j));
9308 j->type = REDIS_IOJOB_PREPARE_SWAP;
9309 j->db = db;
9310 j->key = key;
9311 j->val = val;
9312 incrRefCount(val);
9313 j->canceled = 0;
9314 j->thread = (pthread_t) -1;
9315 key->storage = REDIS_VM_SWAPPING;
9316
9317 lockThreadedIO();
9318 queueIOJob(j);
9319 unlockThreadedIO();
9320 return REDIS_OK;
9321}
9322
9323/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9324
9325/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9326 * If there is not already a job loading the key, it is craeted.
9327 * The key is added to the io_keys list in the client structure, and also
9328 * in the hash table mapping swapped keys to waiting clients, that is,
9329 * server.io_waited_keys. */
9330static int waitForSwappedKey(redisClient *c, robj *key) {
9331 struct dictEntry *de;
9332 robj *o;
9333 list *l;
9334
9335 /* If the key does not exist or is already in RAM we don't need to
9336 * block the client at all. */
9337 de = dictFind(c->db->dict,key);
9338 if (de == NULL) return 0;
9339 o = dictGetEntryKey(de);
9340 if (o->storage == REDIS_VM_MEMORY) {
9341 return 0;
9342 } else if (o->storage == REDIS_VM_SWAPPING) {
9343 /* We were swapping the key, undo it! */
9344 vmCancelThreadedIOJob(o);
9345 return 0;
9346 }
9347
9348 /* OK: the key is either swapped, or being loaded just now. */
9349
9350 /* Add the key to the list of keys this client is waiting for.
9351 * This maps clients to keys they are waiting for. */
9352 listAddNodeTail(c->io_keys,key);
9353 incrRefCount(key);
9354
9355 /* Add the client to the swapped keys => clients waiting map. */
9356 de = dictFind(c->db->io_keys,key);
9357 if (de == NULL) {
9358 int retval;
9359
9360 /* For every key we take a list of clients blocked for it */
9361 l = listCreate();
9362 retval = dictAdd(c->db->io_keys,key,l);
9363 incrRefCount(key);
9364 assert(retval == DICT_OK);
9365 } else {
9366 l = dictGetEntryVal(de);
9367 }
9368 listAddNodeTail(l,c);
9369
9370 /* Are we already loading the key from disk? If not create a job */
9371 if (o->storage == REDIS_VM_SWAPPED) {
9372 iojob *j;
9373
9374 o->storage = REDIS_VM_LOADING;
9375 j = zmalloc(sizeof(*j));
9376 j->type = REDIS_IOJOB_LOAD;
9377 j->db = c->db;
9378 j->key = o;
9379 j->key->vtype = o->vtype;
9380 j->page = o->vm.page;
9381 j->val = NULL;
9382 j->canceled = 0;
9383 j->thread = (pthread_t) -1;
9384 lockThreadedIO();
9385 queueIOJob(j);
9386 unlockThreadedIO();
9387 }
9388 return 1;
9389}
9390
9391/* Preload keys needed for the ZUNION and ZINTER commands. */
9392static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9393 int i, num;
9394 num = atoi(c->argv[2]->ptr);
9395 for (i = 0; i < num; i++) {
9396 waitForSwappedKey(c,c->argv[3+i]);
9397 }
9398}
9399
9400/* Is this client attempting to run a command against swapped keys?
9401 * If so, block it ASAP, load the keys in background, then resume it.
9402 *
9403 * The important idea about this function is that it can fail! If keys will
9404 * still be swapped when the client is resumed, this key lookups will
9405 * just block loading keys from disk. In practical terms this should only
9406 * happen with SORT BY command or if there is a bug in this function.
9407 *
9408 * Return 1 if the client is marked as blocked, 0 if the client can
9409 * continue as the keys it is going to access appear to be in memory. */
9410static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9411 int j, last;
9412
9413 if (cmd->vm_preload_proc != NULL) {
9414 cmd->vm_preload_proc(c);
9415 } else {
9416 if (cmd->vm_firstkey == 0) return 0;
9417 last = cmd->vm_lastkey;
9418 if (last < 0) last = c->argc+last;
9419 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9420 waitForSwappedKey(c,c->argv[j]);
9421 }
9422
9423 /* If the client was blocked for at least one key, mark it as blocked. */
9424 if (listLength(c->io_keys)) {
9425 c->flags |= REDIS_IO_WAIT;
9426 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9427 server.vm_blocked_clients++;
9428 return 1;
9429 } else {
9430 return 0;
9431 }
9432}
9433
9434/* Remove the 'key' from the list of blocked keys for a given client.
9435 *
9436 * The function returns 1 when there are no longer blocking keys after
9437 * the current one was removed (and the client can be unblocked). */
9438static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9439 list *l;
9440 listNode *ln;
9441 listIter li;
9442 struct dictEntry *de;
9443
9444 /* Remove the key from the list of keys this client is waiting for. */
9445 listRewind(c->io_keys,&li);
9446 while ((ln = listNext(&li)) != NULL) {
9447 if (compareStringObjects(ln->value,key) == 0) {
9448 listDelNode(c->io_keys,ln);
9449 break;
9450 }
9451 }
9452 assert(ln != NULL);
9453
9454 /* Remove the client form the key => waiting clients map. */
9455 de = dictFind(c->db->io_keys,key);
9456 assert(de != NULL);
9457 l = dictGetEntryVal(de);
9458 ln = listSearchKey(l,c);
9459 assert(ln != NULL);
9460 listDelNode(l,ln);
9461 if (listLength(l) == 0)
9462 dictDelete(c->db->io_keys,key);
9463
9464 return listLength(c->io_keys) == 0;
9465}
9466
9467static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9468 struct dictEntry *de;
9469 list *l;
9470 listNode *ln;
9471 int len;
9472
9473 de = dictFind(db->io_keys,key);
9474 if (!de) return;
9475
9476 l = dictGetEntryVal(de);
9477 len = listLength(l);
9478 /* Note: we can't use something like while(listLength(l)) as the list
9479 * can be freed by the calling function when we remove the last element. */
9480 while (len--) {
9481 ln = listFirst(l);
9482 redisClient *c = ln->value;
9483
9484 if (dontWaitForSwappedKey(c,key)) {
9485 /* Put the client in the list of clients ready to go as we
9486 * loaded all the keys about it. */
9487 listAddNodeTail(server.io_ready_clients,c);
9488 }
9489 }
9490}
9491
9492/* =========================== Remote Configuration ========================= */
9493
9494static void configSetCommand(redisClient *c) {
9495 robj *o = getDecodedObject(c->argv[3]);
9496 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9497 zfree(server.dbfilename);
9498 server.dbfilename = zstrdup(o->ptr);
9499 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9500 zfree(server.requirepass);
9501 server.requirepass = zstrdup(o->ptr);
9502 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9503 zfree(server.masterauth);
9504 server.masterauth = zstrdup(o->ptr);
9505 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9506 server.maxmemory = strtoll(o->ptr, NULL, 10);
9507 } else {
9508 addReplySds(c,sdscatprintf(sdsempty(),
9509 "-ERR not supported CONFIG parameter %s\r\n",
9510 (char*)c->argv[2]->ptr));
9511 decrRefCount(o);
9512 return;
9513 }
9514 decrRefCount(o);
9515 addReply(c,shared.ok);
9516}
9517
9518static void configGetCommand(redisClient *c) {
9519 robj *o = getDecodedObject(c->argv[2]);
9520 robj *lenobj = createObject(REDIS_STRING,NULL);
9521 char *pattern = o->ptr;
9522 int matches = 0;
9523
9524 addReply(c,lenobj);
9525 decrRefCount(lenobj);
9526
9527 if (stringmatch(pattern,"dbfilename",0)) {
9528 addReplyBulkCString(c,"dbfilename");
9529 addReplyBulkCString(c,server.dbfilename);
9530 matches++;
9531 }
9532 if (stringmatch(pattern,"requirepass",0)) {
9533 addReplyBulkCString(c,"requirepass");
9534 addReplyBulkCString(c,server.requirepass);
9535 matches++;
9536 }
9537 if (stringmatch(pattern,"masterauth",0)) {
9538 addReplyBulkCString(c,"masterauth");
9539 addReplyBulkCString(c,server.masterauth);
9540 matches++;
9541 }
9542 if (stringmatch(pattern,"maxmemory",0)) {
9543 char buf[128];
9544
9545 snprintf(buf,128,"%llu\n",server.maxmemory);
9546 addReplyBulkCString(c,"maxmemory");
9547 addReplyBulkCString(c,buf);
9548 matches++;
9549 }
9550 decrRefCount(o);
9551 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9552}
9553
9554static void configCommand(redisClient *c) {
9555 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9556 if (c->argc != 4) goto badarity;
9557 configSetCommand(c);
9558 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9559 if (c->argc != 3) goto badarity;
9560 configGetCommand(c);
9561 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9562 if (c->argc != 2) goto badarity;
9563 server.stat_numcommands = 0;
9564 server.stat_numconnections = 0;
9565 server.stat_expiredkeys = 0;
9566 server.stat_starttime = time(NULL);
9567 addReply(c,shared.ok);
9568 } else {
9569 addReplySds(c,sdscatprintf(sdsempty(),
9570 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9571 }
9572 return;
9573
9574badarity:
9575 addReplySds(c,sdscatprintf(sdsempty(),
9576 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9577 (char*) c->argv[1]->ptr));
9578}
9579
9580/* =========================== Pubsub implementation ======================== */
9581
9582static void freePubsubPattern(void *p) {
9583 pubsubPattern *pat = p;
9584
9585 decrRefCount(pat->pattern);
9586 zfree(pat);
9587}
9588
9589static int listMatchPubsubPattern(void *a, void *b) {
9590 pubsubPattern *pa = a, *pb = b;
9591
9592 return (pa->client == pb->client) &&
9593 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9594}
9595
9596/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9597 * 0 if the client was already subscribed to that channel. */
9598static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9599 struct dictEntry *de;
9600 list *clients = NULL;
9601 int retval = 0;
9602
9603 /* Add the channel to the client -> channels hash table */
9604 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9605 retval = 1;
9606 incrRefCount(channel);
9607 /* Add the client to the channel -> list of clients hash table */
9608 de = dictFind(server.pubsub_channels,channel);
9609 if (de == NULL) {
9610 clients = listCreate();
9611 dictAdd(server.pubsub_channels,channel,clients);
9612 incrRefCount(channel);
9613 } else {
9614 clients = dictGetEntryVal(de);
9615 }
9616 listAddNodeTail(clients,c);
9617 }
9618 /* Notify the client */
9619 addReply(c,shared.mbulk3);
9620 addReply(c,shared.subscribebulk);
9621 addReplyBulk(c,channel);
9622 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9623 return retval;
9624}
9625
9626/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9627 * 0 if the client was not subscribed to the specified channel. */
9628static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9629 struct dictEntry *de;
9630 list *clients;
9631 listNode *ln;
9632 int retval = 0;
9633
9634 /* Remove the channel from the client -> channels hash table */
9635 incrRefCount(channel); /* channel may be just a pointer to the same object
9636 we have in the hash tables. Protect it... */
9637 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9638 retval = 1;
9639 /* Remove the client from the channel -> clients list hash table */
9640 de = dictFind(server.pubsub_channels,channel);
9641 assert(de != NULL);
9642 clients = dictGetEntryVal(de);
9643 ln = listSearchKey(clients,c);
9644 assert(ln != NULL);
9645 listDelNode(clients,ln);
9646 if (listLength(clients) == 0) {
9647 /* Free the list and associated hash entry at all if this was
9648 * the latest client, so that it will be possible to abuse
9649 * Redis PUBSUB creating millions of channels. */
9650 dictDelete(server.pubsub_channels,channel);
9651 }
9652 }
9653 /* Notify the client */
9654 if (notify) {
9655 addReply(c,shared.mbulk3);
9656 addReply(c,shared.unsubscribebulk);
9657 addReplyBulk(c,channel);
9658 addReplyLong(c,dictSize(c->pubsub_channels)+
9659 listLength(c->pubsub_patterns));
9660
9661 }
9662 decrRefCount(channel); /* it is finally safe to release it */
9663 return retval;
9664}
9665
9666/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9667static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9668 int retval = 0;
9669
9670 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9671 retval = 1;
9672 pubsubPattern *pat;
9673 listAddNodeTail(c->pubsub_patterns,pattern);
9674 incrRefCount(pattern);
9675 pat = zmalloc(sizeof(*pat));
9676 pat->pattern = getDecodedObject(pattern);
9677 pat->client = c;
9678 listAddNodeTail(server.pubsub_patterns,pat);
9679 }
9680 /* Notify the client */
9681 addReply(c,shared.mbulk3);
9682 addReply(c,shared.psubscribebulk);
9683 addReplyBulk(c,pattern);
9684 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9685 return retval;
9686}
9687
9688/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9689 * 0 if the client was not subscribed to the specified channel. */
9690static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9691 listNode *ln;
9692 pubsubPattern pat;
9693 int retval = 0;
9694
9695 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9696 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9697 retval = 1;
9698 listDelNode(c->pubsub_patterns,ln);
9699 pat.client = c;
9700 pat.pattern = pattern;
9701 ln = listSearchKey(server.pubsub_patterns,&pat);
9702 listDelNode(server.pubsub_patterns,ln);
9703 }
9704 /* Notify the client */
9705 if (notify) {
9706 addReply(c,shared.mbulk3);
9707 addReply(c,shared.punsubscribebulk);
9708 addReplyBulk(c,pattern);
9709 addReplyLong(c,dictSize(c->pubsub_channels)+
9710 listLength(c->pubsub_patterns));
9711 }
9712 decrRefCount(pattern);
9713 return retval;
9714}
9715
9716/* Unsubscribe from all the channels. Return the number of channels the
9717 * client was subscribed from. */
9718static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9719 dictIterator *di = dictGetIterator(c->pubsub_channels);
9720 dictEntry *de;
9721 int count = 0;
9722
9723 while((de = dictNext(di)) != NULL) {
9724 robj *channel = dictGetEntryKey(de);
9725
9726 count += pubsubUnsubscribeChannel(c,channel,notify);
9727 }
9728 dictReleaseIterator(di);
9729 return count;
9730}
9731
9732/* Unsubscribe from all the patterns. Return the number of patterns the
9733 * client was subscribed from. */
9734static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9735 listNode *ln;
9736 listIter li;
9737 int count = 0;
9738
9739 listRewind(c->pubsub_patterns,&li);
9740 while ((ln = listNext(&li)) != NULL) {
9741 robj *pattern = ln->value;
9742
9743 count += pubsubUnsubscribePattern(c,pattern,notify);
9744 }
9745 return count;
9746}
9747
9748/* Publish a message */
9749static int pubsubPublishMessage(robj *channel, robj *message) {
9750 int receivers = 0;
9751 struct dictEntry *de;
9752 listNode *ln;
9753 listIter li;
9754
9755 /* Send to clients listening for that channel */
9756 de = dictFind(server.pubsub_channels,channel);
9757 if (de) {
9758 list *list = dictGetEntryVal(de);
9759 listNode *ln;
9760 listIter li;
9761
9762 listRewind(list,&li);
9763 while ((ln = listNext(&li)) != NULL) {
9764 redisClient *c = ln->value;
9765
9766 addReply(c,shared.mbulk3);
9767 addReply(c,shared.messagebulk);
9768 addReplyBulk(c,channel);
9769 addReplyBulk(c,message);
9770 receivers++;
9771 }
9772 }
9773 /* Send to clients listening to matching channels */
9774 if (listLength(server.pubsub_patterns)) {
9775 listRewind(server.pubsub_patterns,&li);
9776 channel = getDecodedObject(channel);
9777 while ((ln = listNext(&li)) != NULL) {
9778 pubsubPattern *pat = ln->value;
9779
9780 if (stringmatchlen((char*)pat->pattern->ptr,
9781 sdslen(pat->pattern->ptr),
9782 (char*)channel->ptr,
9783 sdslen(channel->ptr),0)) {
9784 addReply(pat->client,shared.mbulk3);
9785 addReply(pat->client,shared.messagebulk);
9786 addReplyBulk(pat->client,channel);
9787 addReplyBulk(pat->client,message);
9788 receivers++;
9789 }
9790 }
9791 decrRefCount(channel);
9792 }
9793 return receivers;
9794}
9795
9796static void subscribeCommand(redisClient *c) {
9797 int j;
9798
9799 for (j = 1; j < c->argc; j++)
9800 pubsubSubscribeChannel(c,c->argv[j]);
9801}
9802
9803static void unsubscribeCommand(redisClient *c) {
9804 if (c->argc == 1) {
9805 pubsubUnsubscribeAllChannels(c,1);
9806 return;
9807 } else {
9808 int j;
9809
9810 for (j = 1; j < c->argc; j++)
9811 pubsubUnsubscribeChannel(c,c->argv[j],1);
9812 }
9813}
9814
9815static void psubscribeCommand(redisClient *c) {
9816 int j;
9817
9818 for (j = 1; j < c->argc; j++)
9819 pubsubSubscribePattern(c,c->argv[j]);
9820}
9821
9822static void punsubscribeCommand(redisClient *c) {
9823 if (c->argc == 1) {
9824 pubsubUnsubscribeAllPatterns(c,1);
9825 return;
9826 } else {
9827 int j;
9828
9829 for (j = 1; j < c->argc; j++)
9830 pubsubUnsubscribePattern(c,c->argv[j],1);
9831 }
9832}
9833
9834static void publishCommand(redisClient *c) {
9835 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9836 addReplyLong(c,receivers);
9837}
9838
9839/* ================================= Debugging ============================== */
9840
9841static void debugCommand(redisClient *c) {
9842 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9843 *((char*)-1) = 'x';
9844 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9845 if (rdbSave(server.dbfilename) != REDIS_OK) {
9846 addReply(c,shared.err);
9847 return;
9848 }
9849 emptyDb();
9850 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9851 addReply(c,shared.err);
9852 return;
9853 }
9854 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9855 addReply(c,shared.ok);
9856 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9857 emptyDb();
9858 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9859 addReply(c,shared.err);
9860 return;
9861 }
9862 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9863 addReply(c,shared.ok);
9864 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9865 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9866 robj *key, *val;
9867
9868 if (!de) {
9869 addReply(c,shared.nokeyerr);
9870 return;
9871 }
9872 key = dictGetEntryKey(de);
9873 val = dictGetEntryVal(de);
9874 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9875 key->storage == REDIS_VM_SWAPPING)) {
9876 char *strenc;
9877 char buf[128];
9878
9879 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9880 strenc = strencoding[val->encoding];
9881 } else {
9882 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9883 strenc = buf;
9884 }
9885 addReplySds(c,sdscatprintf(sdsempty(),
9886 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9887 "encoding:%s serializedlength:%lld\r\n",
9888 (void*)key, key->refcount, (void*)val, val->refcount,
9889 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9890 } else {
9891 addReplySds(c,sdscatprintf(sdsempty(),
9892 "+Key at:%p refcount:%d, value swapped at: page %llu "
9893 "using %llu pages\r\n",
9894 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9895 (unsigned long long) key->vm.usedpages));
9896 }
9897 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9898 lookupKeyRead(c->db,c->argv[2]);
9899 addReply(c,shared.ok);
9900 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9901 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9902 robj *key, *val;
9903
9904 if (!server.vm_enabled) {
9905 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9906 return;
9907 }
9908 if (!de) {
9909 addReply(c,shared.nokeyerr);
9910 return;
9911 }
9912 key = dictGetEntryKey(de);
9913 val = dictGetEntryVal(de);
9914 /* If the key is shared we want to create a copy */
9915 if (key->refcount > 1) {
9916 robj *newkey = dupStringObject(key);
9917 decrRefCount(key);
9918 key = dictGetEntryKey(de) = newkey;
9919 }
9920 /* Swap it */
9921 if (key->storage != REDIS_VM_MEMORY) {
9922 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9923 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9924 dictGetEntryVal(de) = NULL;
9925 addReply(c,shared.ok);
9926 } else {
9927 addReply(c,shared.err);
9928 }
9929 } else {
9930 addReplySds(c,sdsnew(
9931 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9932 }
9933}
9934
9935static void _redisAssert(char *estr, char *file, int line) {
9936 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9937 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9938#ifdef HAVE_BACKTRACE
9939 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9940 *((char*)-1) = 'x';
9941#endif
9942}
9943
9944static void _redisPanic(char *msg, char *file, int line) {
9945 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
9946 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
9947#ifdef HAVE_BACKTRACE
9948 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9949 *((char*)-1) = 'x';
9950#endif
9951}
9952
9953/* =================================== Main! ================================ */
9954
9955#ifdef __linux__
/* Read the current vm.overcommit_memory setting from procfs.
 * Returns the number found on the first line of the file, or -1 if the
 * file cannot be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return -1;
    got = fgets(line,64,procfile);
    fclose(procfile);
    return (got != NULL) ? atoi(line) : -1;
}
9969
9970void linuxOvercommitMemoryWarning(void) {
9971 if (linuxOvercommitMemoryValue() == 0) {
9972 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9973 }
9974}
9975#endif /* __linux__ */
9976
9977static void daemonize(void) {
9978 int fd;
9979 FILE *fp;
9980
9981 if (fork() != 0) exit(0); /* parent exits */
9982 setsid(); /* create a new session */
9983
9984 /* Every output goes to /dev/null. If Redis is daemonized but
9985 * the 'logfile' is set to 'stdout' in the configuration file
9986 * it will not log at all. */
9987 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9988 dup2(fd, STDIN_FILENO);
9989 dup2(fd, STDOUT_FILENO);
9990 dup2(fd, STDERR_FILENO);
9991 if (fd > STDERR_FILENO) close(fd);
9992 }
9993 /* Try to write the pid file */
9994 fp = fopen(server.pidfile,"w");
9995 if (fp) {
9996 fprintf(fp,"%d\n",getpid());
9997 fclose(fp);
9998 }
9999}
10000
10001static void version() {
10002 printf("Redis server version %s\n", REDIS_VERSION);
10003 exit(0);
10004}
10005
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n", stderr);
    fputs(" ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
10011
10012int main(int argc, char **argv) {
10013 time_t start;
10014
10015 initServerConfig();
10016 if (argc == 2) {
10017 if (strcmp(argv[1], "-v") == 0 ||
10018 strcmp(argv[1], "--version") == 0) version();
10019 if (strcmp(argv[1], "--help") == 0) usage();
10020 resetServerSaveParams();
10021 loadServerConfig(argv[1]);
10022 } else if ((argc > 2)) {
10023 usage();
10024 } else {
10025 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10026 }
10027 if (server.daemonize) daemonize();
10028 initServer();
10029 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10030#ifdef __linux__
10031 linuxOvercommitMemoryWarning();
10032#endif
10033 start = time(NULL);
10034 if (server.appendonly) {
10035 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10036 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10037 } else {
10038 if (rdbLoad(server.dbfilename) == REDIS_OK)
10039 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10040 }
10041 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10042 aeSetBeforeSleepProc(server.el,beforeSleep);
10043 aeMain(server.el);
10044 aeDeleteEventLoop(server.el);
10045 return 0;
10046}
10047
10048/* ============================= Backtrace support ========================= */
10049
10050#ifdef HAVE_BACKTRACE
10051static char *findFuncName(void *pointer, unsigned long *offset);
10052
/* Return, as a void pointer, the saved program counter field stored in the
 * machine context of 'uc' (the eip / rip / sc_ip member, depending on the
 * platform selected at compile time). Returns NULL on platforms that none
 * of the branches below covers. */
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__FreeBSD__)
    return (void*) uc->uc_mcontext.mc_eip;
#elif defined(__dietlibc__)
    return (void*) uc->uc_mcontext.eip;
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
  #if __x86_64__
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
  #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
    /* NOTE(review): this branch also covers x86-64 but indexes gregs with
     * REG_EIP -- confirm that constant is defined, and designates the
     * instruction pointer, on 64 bit targets. */
    return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
#elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
#else
    return NULL;
#endif
}
10078
/* Handler for fatal signals (installed with SA_SIGINFO, so 'secret' is the
 * ucontext of the interrupted code).
 *
 * Logs the signal number and the Redis version, then the output of
 * genRedisInfoString(), then one line per stack frame, and finally
 * terminates the process with _exit(0). 'info' is not used. */
static void segvHandler(int sig, siginfo_t *info, void *secret) {
    void *trace[100];
    char **messages = NULL;
    int i, trace_size = 0;
    unsigned long offset=0;
    ucontext_t *uc = (ucontext_t*) secret;
    sds infostring;
    REDIS_NOTUSED(info);

    redisLog(REDIS_WARNING,
        "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
    infostring = genRedisInfoString();
    redisLog(REDIS_WARNING, "%s",infostring);
    /* It's not safe to sdsfree() the returned string under memory
     * corruption conditions. Let it leak as we are going to abort */

    trace_size = backtrace(trace, 100);
    /* overwrite sigaction with caller's address */
    if (getMcontextEip(uc) != NULL) {
        trace[1] = getMcontextEip(uc);
    }
    /* NOTE(review): the result is used below without a NULL check --
     * confirm whether backtrace_symbols() can fail here. */
    messages = backtrace_symbols(trace, trace_size);

    /* Frame 0 is skipped. For every other frame the name found in the
     * static symbols table is printed, unless no name was found or the
     * number following '+' in the backtrace_symbols() text is smaller than
     * the offset computed from our table: in that case the text provided
     * by backtrace_symbols() is logged as it is. */
    for (i=1; i<trace_size; ++i) {
        char *fn = findFuncName(trace[i], &offset), *p;

        p = strchr(messages[i],'+');
        if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
            redisLog(REDIS_WARNING,"%s", messages[i]);
        } else {
            redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
        }
    }
    /* free(messages); Don't call free() with possibly corrupted memory. */
    /* NOTE(review): the exit status is 0 even though we got here because
     * of a fatal signal -- confirm this is what supervisors should see. */
    _exit(0);
}
10115
10116static void setupSigSegvAction(void) {
10117 struct sigaction act;
10118
10119 sigemptyset (&act.sa_mask);
10120 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10121 * is used. Otherwise, sa_handler is used */
10122 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10123 act.sa_sigaction = segvHandler;
10124 sigaction (SIGSEGV, &act, NULL);
10125 sigaction (SIGBUS, &act, NULL);
10126 sigaction (SIGFPE, &act, NULL);
10127 sigaction (SIGILL, &act, NULL);
10128 sigaction (SIGBUS, &act, NULL);
10129 return;
10130}
10131
10132#include "staticsymbols.h"
10133/* This function try to convert a pointer into a function name. It's used in
10134 * oreder to provide a backtrace under segmentation fault that's able to
10135 * display functions declared as static (otherwise the backtrace is useless). */
10136static char *findFuncName(void *pointer, unsigned long *offset){
10137 int i, ret = -1;
10138 unsigned long off, minoff = 0;
10139
10140 /* Try to match against the Symbol with the smallest offset */
10141 for (i=0; symsTable[i].pointer; i++) {
10142 unsigned long lp = (unsigned long) pointer;
10143
10144 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10145 off=lp-symsTable[i].pointer;
10146 if (ret < 0 || off < minoff) {
10147 minoff=off;
10148 ret=i;
10149 }
10150 }
10151 }
10152 if (ret == -1) return NULL;
10153 *offset = minoff;
10154 return symsTable[ret].name;
10155}
10156#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no handler is installed
 * and the function does nothing (presumably kept so that callers do not
 * need their own conditional compilation -- confirm at the call site). */
static void setupSigSegvAction(void) {
}
10159#endif /* HAVE_BACKTRACE */
10160
10161
10162
10163/* The End */
10164
10165
10166