]> git.saurik.com Git - redis.git/blob - redis.c
6ba7a3b9deae6cb3c6997e753bb4902cdb914b2c
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
/* Status codes returned by functions that either succeed or fail.
 * REDIS_ERR is parenthesized so the negative literal always expands as one
 * operand, whatever expression surrounds the macro. */
#define REDIS_OK    0
#define REDIS_ERR   (-1)
84
/* Compile-time server defaults and limits */
#define REDIS_SERVERPORT                6379            /* TCP port */
#define REDIS_MAXIDLETIME               (60*5)          /* default client timeout */
#define REDIS_IOBUF_LEN                 1024
#define REDIS_LOADBUF_LEN               1024
#define REDIS_STATIC_ARGS               8
#define REDIS_DEFAULT_DBNUM             16
#define REDIS_CONFIGLINE_MAX            1024
#define REDIS_OBJFREELIST_MAX           1000000         /* cap on the number of cached objects */
#define REDIS_MAX_SYNC_TIME             60              /* a slave can't take longer than this to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    10              /* expires looked up per loop */
#define REDIS_MAX_WRITE_PER_EVENT       (1024*64)
#define REDIS_REQUEST_MAX_SIZE          (1024*1024*256) /* max bytes in an inline command */

/* writev is used once more than REDIS_WRITEV_THRESHOLD write packets are
 * pending; each writev call carries at most REDIS_WRITEV_IOVEC_COUNT iovecs. */
#define REDIS_WRITEV_THRESHOLD          3
#define REDIS_WRITEV_IOVEC_COUNT        256
103
/* Hash table tuning */
#define REDIS_HT_MINFILL            10      /* minimal hash table fill, in percent */

/* Command flags: a bit mask, one bit per flag. */
#define REDIS_CMD_BULK              (1<<0)  /* Bulk write command */
#define REDIS_CMD_INLINE            (1<<1)  /* Inline command */
/* Commands carrying REDIS_CMD_DENYOOM return an error when the 'maxmemory'
 * option is set in the config file and the server is using more than
 * maxmemory bytes of memory: they are denied under low memory conditions. */
#define REDIS_CMD_DENYOOM           (1<<2)
#define REDIS_CMD_FORCE_REPLICATION (1<<3)  /* replicate even if dirty is 0 */
116
/* Object types */
#define REDIS_STRING    0
#define REDIS_LIST      1
#define REDIS_SET       2
#define REDIS_ZSET      3
#define REDIS_HASH      4

/* Object encodings. Some object types (Strings, Hashes) have more than one
 * internal representation; the 'encoding' field of the object holds one of
 * the values below. */
#define REDIS_ENCODING_RAW      0   /* Raw representation */
#define REDIS_ENCODING_INT      1   /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP   2   /* Encoded as zipmap */
#define REDIS_ENCODING_HT       3   /* Encoded as a hash table */

/* Printable name of every encoding, indexed by REDIS_ENCODING_* value. */
static char *strencoding[] = {
    [REDIS_ENCODING_RAW]    = "raw",
    [REDIS_ENCODING_INT]    = "int",
    [REDIS_ENCODING_ZIPMAP] = "zipmap",
    [REDIS_ENCODING_HT]     = "hashtable"
};

/* Type codes that only appear when dumping to disk */
#define REDIS_EXPIRETIME    253
#define REDIS_SELECTDB      254
#define REDIS_EOF           255
140
/* Defines related to the dump file format. Storing a full 32 bit length for
 * short keys would waste a lot of space, so the two most significant bits of
 * the first byte are checked to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => if it's 01, the len is 14 bits: 6 bits + 8 bits of the next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: a specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN   0
#define REDIS_RDB_14BITLEN  1
#define REDIS_RDB_32BITLEN  2
#define REDIS_RDB_ENCVAL    3
#define REDIS_RDB_LENERR    UINT_MAX

/* When the length of a string object stored on disk has its first two bits
 * set, the remaining bits of that byte select one of the special encodings
 * below. */
#define REDIS_RDB_ENC_INT8  0   /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1   /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2   /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF   3   /* string compressed with FASTLZ */
167
/* Virtual memory: where an object currently lives. */
#define REDIS_VM_MEMORY     0   /* The object is in memory */
#define REDIS_VM_SWAPPED    1   /* The object is on disk */
#define REDIS_VM_SWAPPING   2   /* Redis is swapping this object to disk */
#define REDIS_VM_LOADING    3   /* Redis is loading this object from disk */

/* Virtual memory static configuration.
 * See vmFindContiguousPages() for the reasoning behind these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (1024*1024*4)
/* *Percentage* of completed I/O jobs to process each time the handler is
 * called. Virtual Memory I/O operations are performed by threads, but once
 * completed they must be processed by the main thread in order to take
 * effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185
/* Client flags: a bit mask, one bit per flag. */
#define REDIS_SLAVE     (1<<0)  /* This client is a slave server */
#define REDIS_MASTER    (1<<1)  /* This client is a master server */
#define REDIS_MONITOR   (1<<2)  /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI     (1<<3)  /* This client is in a MULTI context */
#define REDIS_BLOCKED   (1<<4)  /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT   (1<<5)  /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS (1<<6)  /* Watched keys modified. EXEC will fail. */

/* Replication state as seen by the slave */
#define REDIS_REPL_NONE         0   /* No active replication */
#define REDIS_REPL_CONNECT      1   /* Must connect to master */
#define REDIS_REPL_CONNECTED    2   /* Connected to master */

/* Replication state of a slave as seen by the master.
 * In the SEND_BULK and ONLINE states the slave receives new updates in its
 * output queue. In the WAIT_BGSAVE state the server is instead waiting to
 * start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START    3   /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END      4   /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK            5   /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE               6   /* bulk DB already transmitted, receive updates */

/* List ends */
#define REDIS_HEAD  0
#define REDIS_TAIL  1
212
/* Sort operations */
#define REDIS_SORT_GET      0
#define REDIS_SORT_ASC      1
#define REDIS_SORT_DESC     2
#define REDIS_SORTKEY_MAX   1024

/* Log levels, in increasing numeric order */
#define REDIS_DEBUG     0
#define REDIS_VERBOSE   1
#define REDIS_NOTICE    2
#define REDIS_WARNING   3
224
/* Anti-warning macro: evaluates V and discards the result by casting it to
 * void. The argument is parenthesized so the cast applies to the whole
 * expression and not just to its first operand. */
#define REDIS_NOTUSED(V) ((void)(V))
227
/* Skiplist parameters */
#define ZSKIPLIST_MAXLEVEL  32      /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P         0.25    /* Skiplist P = 1/4 */

/* Append only file fsync policies */
#define APPENDFSYNC_NO          0
#define APPENDFSYNC_ALWAYS      1
#define APPENDFSYNC_EVERYSEC    2

/* Defaults for hashes */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
239
/* Assertion helpers.
 *
 * redisAssert(_e) evaluates _e. When it is false, the stringified expression
 * together with __FILE__ and __LINE__ is handed to _redisAssert() and the
 * process then terminates through _exit(1). When _e is true the macro is a
 * void no-op.
 *
 * redisPanic(_e) unconditionally hands the stringified argument and the
 * source location to _redisPanic(), then calls _exit(1). The expansion is
 * wrapped in parentheses so the comma expression stays a single operand in
 * any context (conditional operator, function argument, ...). */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
245
246 /*================================= Data types ============================== */
247
248 /* A redis object, that is a type able to hold a string / list / set */
249
/* The VM object structure.
 *
 * This is a type definition only: no object is declared here, so no global
 * named 'vm' is introduced alongside the type. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
256
257 /* The actual Redis Object */
258 typedef struct redisObject {
259 void *ptr;
260 unsigned char type;
261 unsigned char encoding;
262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
266 int refcount;
267 /* VM fields, this are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
272 } robj;
273
/* Macro used to initialize a Redis object allocated on the stack.
 *
 * _var is the object itself (not a pointer to it) and _ptr is the value
 * stored in its 'ptr' field. The object becomes a raw-encoded string with a
 * reference count of 1; 'storage' is set to REDIS_VM_MEMORY only when
 * server.vm_enabled is true and is left untouched otherwise.
 *
 * The macro is kept near the structure definition so that it is updated
 * whenever the structure changes, avoiding bugs like bug #85, which was
 * introduced exactly in this way.
 *
 * The expansion is one statement without its terminating semicolon: the
 * caller writes initStaticStringObject(obj,ptr); as usual, and the macro can
 * be used safely as the body of an if/else. Both arguments are
 * parenthesized inside the expansion. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
286 typedef struct redisDb {
287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
290 dict *io_keys; /* Keys with clients waiting for VM I/O */
291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
292 int id;
293 } redisDb;
294
295 /* Client MULTI/EXEC state */
296 typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300 } multiCmd;
301
302 typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305 } multiState;
306
307 /* With multiplexing we need to take per-clinet state.
308 * Clients are taken in a liked list. */
309 typedef struct redisClient {
310 int fd;
311 redisDb *db;
312 int dictid;
313 sds querybuf;
314 robj **argv, **mbargv;
315 int argc, mbargc;
316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
317 int multibulk; /* multi bulk command format active */
318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
326 long repldboff; /* replication DB file offset */
327 off_t repldbsize; /* replication DB file size */
328 multiState mstate; /* MULTI/EXEC state */
329 robj **blocking_keys; /* The key we are waiting to terminate a blocking
330 * operation such as BLPOP. Otherwise NULL. */
331 int blocking_keys_num; /* Number of blocking keys */
332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
339 } redisClient;
340
/* A pair made of a time span in seconds and a number of changes. */
struct saveparam {
    time_t seconds;
    int changes;
};
345
346 /* Global server state structure */
347 struct redisServer {
348 int port;
349 int fd;
350 redisDb *db;
351 long long dirty; /* changes to DB from the last save */
352 list *clients;
353 list *slaves, *monitors;
354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function run */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of last save succeeede */
359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
363 long long stat_expiredkeys; /* number of expired keys */
364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
370 int appendonly;
371 int appendfsync;
372 int shutdown_asap;
373 time_t lastfsync;
374 int appendfd;
375 int appendseldb;
376 char *pidfile;
377 pid_t bgsavechildpid;
378 pid_t bgrewritechildpid;
379 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
380 sds aofbuf; /* AOF buffer, written before entering the event loop */
381 struct saveparam *saveparams;
382 int saveparamslen;
383 char *logfile;
384 char *bindaddr;
385 char *dbfilename;
386 char *appendfilename;
387 char *requirepass;
388 int rdbcompression;
389 int activerehashing;
390 /* Replication related */
391 int isslave;
392 char *masterauth;
393 char *masterhost;
394 int masterport;
395 redisClient *master; /* client that is master for this slave */
396 int replstate;
397 unsigned int maxclients;
398 unsigned long long maxmemory;
399 unsigned int blpop_blocked_clients;
400 unsigned int vm_blocked_clients;
401 /* Sort parameters - qsort_r() is only available under BSD so we
402 * have to take this state global, in order to pass it to sortCompare() */
403 int sort_desc;
404 int sort_alpha;
405 int sort_bypattern;
406 /* Virtual memory configuration */
407 int vm_enabled;
408 char *vm_swap_file;
409 off_t vm_page_size;
410 off_t vm_pages;
411 unsigned long long vm_max_memory;
412 /* Hashes config */
413 size_t hash_max_zipmap_entries;
414 size_t hash_max_zipmap_value;
415 /* Virtual memory state */
416 FILE *vm_fp;
417 int vm_fd;
418 off_t vm_next_page; /* Next probably empty page */
419 off_t vm_near_pages; /* Number of pages allocated sequentially */
420 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
421 time_t unixtime; /* Unix time sampled every second. */
422 /* Virtual memory I/O threads stuff */
423 /* An I/O thread process an element taken from the io_jobs queue and
424 * put the result of the operation in the io_done list. While the
425 * job is being processed, it's put on io_processing queue. */
426 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
427 list *io_processing; /* List of VM I/O jobs being processed */
428 list *io_processed; /* List of VM I/O jobs already processed */
429 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
430 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
431 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
432 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
433 pthread_attr_t io_threads_attr; /* attributes for threads creation */
434 int io_active_threads; /* Number of running I/O threads */
435 int vm_max_threads; /* Max number of I/O threads running at the same time */
436 /* Our main thread is blocked on the event loop, locking for sockets ready
437 * to be read or written, so when a threaded I/O operation is ready to be
438 * processed by the main thread, the I/O thread will use a unix pipe to
439 * awake the main thread. The followings are the two pipe FDs. */
440 int io_ready_pipe_read;
441 int io_ready_pipe_write;
442 /* Virtual memory stats */
443 unsigned long long vm_stats_used_pages;
444 unsigned long long vm_stats_swapped_objects;
445 unsigned long long vm_stats_swapouts;
446 unsigned long long vm_stats_swapins;
447 /* Pubsub */
448 dict *pubsub_channels; /* Map channels to list of subscribed clients */
449 list *pubsub_patterns; /* A list of pubsub_patterns */
450 /* Misc */
451 FILE *devnull;
452 };
453
454 typedef struct pubsubPattern {
455 redisClient *client;
456 robj *pattern;
457 } pubsubPattern;
458
459 typedef void redisCommandProc(redisClient *c);
460 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
461 struct redisCommand {
462 char *name;
463 redisCommandProc *proc;
464 int arity;
465 int flags;
466 /* Use a function to determine which keys need to be loaded
467 * in the background prior to executing this command. Takes precedence
468 * over vm_firstkey and others, ignored when NULL */
469 redisVmPreloadProc *vm_preload_proc;
470 /* What keys should be loaded in background when calling this command? */
471 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
472 int vm_lastkey; /* THe last argument that's a key */
473 int vm_keystep; /* The step between first and last key */
474 };
475
/* A name paired with an unsigned long value. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
480
481 typedef struct _redisSortObject {
482 robj *obj;
483 union {
484 double score;
485 robj *cmpobj;
486 } u;
487 } redisSortObject;
488
489 typedef struct _redisSortOperation {
490 int type;
491 robj *pattern;
492 } redisSortOperation;
493
494 /* ZSETs use a specialized version of Skiplists */
495
496 typedef struct zskiplistNode {
497 struct zskiplistNode **forward;
498 struct zskiplistNode *backward;
499 unsigned int *span;
500 double score;
501 robj *obj;
502 } zskiplistNode;
503
504 typedef struct zskiplist {
505 struct zskiplistNode *header, *tail;
506 unsigned long length;
507 int level;
508 } zskiplist;
509
510 typedef struct zset {
511 dict *dict;
512 zskiplist *zsl;
513 } zset;
514
515 /* Our shared "common" objects */
516
517 #define REDIS_SHARED_INTEGERS 10000
518 struct sharedObjectsStruct {
519 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
520 *colon, *nullbulk, *nullmultibulk, *queued,
521 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
522 *outofrangeerr, *plus,
523 *select0, *select1, *select2, *select3, *select4,
524 *select5, *select6, *select7, *select8, *select9,
525 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
526 *mbulk4, *psubscribebulk, *punsubscribebulk,
527 *integers[REDIS_SHARED_INTEGERS];
528 } shared;
529
/* Globals that are actually used as constants. These doubles are used for
 * the on-disk serialization of doubles and are initialized at runtime, to
 * avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
535
536 /* VM threaded I/O request message */
537 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
538 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
539 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
540 typedef struct iojob {
541 int type; /* Request type, REDIS_IOJOB_* */
542 redisDb *db;/* Redis database */
543 robj *key; /* This I/O request is about swapping this key */
544 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
545 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
546 off_t page; /* Swap page where to read/write the object */
547 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
548 int canceled; /* True if this command was canceled by blocking side of VM */
549 pthread_t thread; /* ID of the thread processing this entry */
550 } iojob;
551
552 /*================================ Prototypes =============================== */
553
554 static void freeStringObject(robj *o);
555 static void freeListObject(robj *o);
556 static void freeSetObject(robj *o);
557 static void decrRefCount(void *o);
558 static robj *createObject(int type, void *ptr);
559 static void freeClient(redisClient *c);
560 static int rdbLoad(char *filename);
561 static void addReply(redisClient *c, robj *obj);
562 static void addReplySds(redisClient *c, sds s);
563 static void incrRefCount(robj *o);
564 static int rdbSaveBackground(char *filename);
565 static robj *createStringObject(char *ptr, size_t len);
566 static robj *dupStringObject(robj *o);
567 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
568 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
569 static void flushAppendOnlyFile(void);
570 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
571 static int syncWithMaster(void);
572 static robj *tryObjectEncoding(robj *o);
573 static robj *getDecodedObject(robj *o);
574 static int removeExpire(redisDb *db, robj *key);
575 static int expireIfNeeded(redisDb *db, robj *key);
576 static int deleteIfVolatile(redisDb *db, robj *key);
577 static int deleteIfSwapped(redisDb *db, robj *key);
578 static int deleteKey(redisDb *db, robj *key);
579 static time_t getExpire(redisDb *db, robj *key);
580 static int setExpire(redisDb *db, robj *key, time_t when);
581 static void updateSlavesWaitingBgsave(int bgsaveerr);
582 static void freeMemoryIfNeeded(void);
583 static int processCommand(redisClient *c);
584 static void setupSigSegvAction(void);
585 static void rdbRemoveTempFile(pid_t childpid);
586 static void aofRemoveTempFile(pid_t childpid);
587 static size_t stringObjectLen(robj *o);
588 static void processInputBuffer(redisClient *c);
589 static zskiplist *zslCreate(void);
590 static void zslFree(zskiplist *zsl);
591 static void zslInsert(zskiplist *zsl, double score, robj *obj);
592 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
593 static void initClientMultiState(redisClient *c);
594 static void freeClientMultiState(redisClient *c);
595 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
596 static void unblockClientWaitingData(redisClient *c);
597 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
598 static void vmInit(void);
599 static void vmMarkPagesFree(off_t page, off_t count);
600 static robj *vmLoadObject(robj *key);
601 static robj *vmPreviewObject(robj *key);
602 static int vmSwapOneObjectBlocking(void);
603 static int vmSwapOneObjectThreaded(void);
604 static int vmCanSwapOut(void);
605 static int tryFreeOneObjectFromFreelist(void);
606 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
607 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
608 static void vmCancelThreadedIOJob(robj *o);
609 static void lockThreadedIO(void);
610 static void unlockThreadedIO(void);
611 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
612 static void freeIOJob(iojob *j);
613 static void queueIOJob(iojob *j);
614 static int vmWriteObjectOnSwap(robj *o, off_t page);
615 static robj *vmReadObjectFromSwap(off_t page, int type);
616 static void waitEmptyIOJobsQueue(void);
617 static void vmReopenSwapFile(void);
618 static int vmFreePage(off_t page);
619 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
620 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
621 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
622 static int dontWaitForSwappedKey(redisClient *c, robj *key);
623 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
624 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
625 static struct redisCommand *lookupCommand(char *name);
626 static void call(redisClient *c, struct redisCommand *cmd);
627 static void resetClient(redisClient *c);
628 static void convertToRealHash(robj *o);
629 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
630 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
631 static void freePubsubPattern(void *p);
632 static int listMatchPubsubPattern(void *a, void *b);
633 static int compareStringObjects(robj *a, robj *b);
634 static int equalStringObjects(robj *a, robj *b);
635 static void usage();
636 static int rewriteAppendOnlyFileBackground(void);
637 static int vmSwapObjectBlocking(robj *key, robj *val);
638 static int prepareForShutdown();
639 static void touchWatchedKey(redisDb *db, robj *key);
640 static void touchWatchedKeysOnFlush(int dbid);
641 static void unwatchAllKeys(redisClient *c);
642
643 static void authCommand(redisClient *c);
644 static void pingCommand(redisClient *c);
645 static void echoCommand(redisClient *c);
646 static void setCommand(redisClient *c);
647 static void setnxCommand(redisClient *c);
648 static void setexCommand(redisClient *c);
649 static void getCommand(redisClient *c);
650 static void delCommand(redisClient *c);
651 static void existsCommand(redisClient *c);
652 static void incrCommand(redisClient *c);
653 static void decrCommand(redisClient *c);
654 static void incrbyCommand(redisClient *c);
655 static void decrbyCommand(redisClient *c);
656 static void selectCommand(redisClient *c);
657 static void randomkeyCommand(redisClient *c);
658 static void keysCommand(redisClient *c);
659 static void dbsizeCommand(redisClient *c);
660 static void lastsaveCommand(redisClient *c);
661 static void saveCommand(redisClient *c);
662 static void bgsaveCommand(redisClient *c);
663 static void bgrewriteaofCommand(redisClient *c);
664 static void shutdownCommand(redisClient *c);
665 static void moveCommand(redisClient *c);
666 static void renameCommand(redisClient *c);
667 static void renamenxCommand(redisClient *c);
668 static void lpushCommand(redisClient *c);
669 static void rpushCommand(redisClient *c);
670 static void lpopCommand(redisClient *c);
671 static void rpopCommand(redisClient *c);
672 static void llenCommand(redisClient *c);
673 static void lindexCommand(redisClient *c);
674 static void lrangeCommand(redisClient *c);
675 static void ltrimCommand(redisClient *c);
676 static void typeCommand(redisClient *c);
677 static void lsetCommand(redisClient *c);
678 static void saddCommand(redisClient *c);
679 static void sremCommand(redisClient *c);
680 static void smoveCommand(redisClient *c);
681 static void sismemberCommand(redisClient *c);
682 static void scardCommand(redisClient *c);
683 static void spopCommand(redisClient *c);
684 static void srandmemberCommand(redisClient *c);
685 static void sinterCommand(redisClient *c);
686 static void sinterstoreCommand(redisClient *c);
687 static void sunionCommand(redisClient *c);
688 static void sunionstoreCommand(redisClient *c);
689 static void sdiffCommand(redisClient *c);
690 static void sdiffstoreCommand(redisClient *c);
691 static void syncCommand(redisClient *c);
692 static void flushdbCommand(redisClient *c);
693 static void flushallCommand(redisClient *c);
694 static void sortCommand(redisClient *c);
695 static void lremCommand(redisClient *c);
696 static void rpoplpushcommand(redisClient *c);
697 static void infoCommand(redisClient *c);
698 static void mgetCommand(redisClient *c);
699 static void monitorCommand(redisClient *c);
700 static void expireCommand(redisClient *c);
701 static void expireatCommand(redisClient *c);
702 static void getsetCommand(redisClient *c);
703 static void ttlCommand(redisClient *c);
704 static void slaveofCommand(redisClient *c);
705 static void debugCommand(redisClient *c);
706 static void msetCommand(redisClient *c);
707 static void msetnxCommand(redisClient *c);
708 static void zaddCommand(redisClient *c);
709 static void zincrbyCommand(redisClient *c);
710 static void zrangeCommand(redisClient *c);
711 static void zrangebyscoreCommand(redisClient *c);
712 static void zcountCommand(redisClient *c);
713 static void zrevrangeCommand(redisClient *c);
714 static void zcardCommand(redisClient *c);
715 static void zremCommand(redisClient *c);
716 static void zscoreCommand(redisClient *c);
717 static void zremrangebyscoreCommand(redisClient *c);
718 static void multiCommand(redisClient *c);
719 static void execCommand(redisClient *c);
720 static void discardCommand(redisClient *c);
721 static void blpopCommand(redisClient *c);
722 static void brpopCommand(redisClient *c);
723 static void appendCommand(redisClient *c);
724 static void substrCommand(redisClient *c);
725 static void zrankCommand(redisClient *c);
726 static void zrevrankCommand(redisClient *c);
727 static void hsetCommand(redisClient *c);
728 static void hsetnxCommand(redisClient *c);
729 static void hgetCommand(redisClient *c);
730 static void hmsetCommand(redisClient *c);
731 static void hmgetCommand(redisClient *c);
732 static void hdelCommand(redisClient *c);
733 static void hlenCommand(redisClient *c);
734 static void zremrangebyrankCommand(redisClient *c);
735 static void zunionstoreCommand(redisClient *c);
736 static void zinterstoreCommand(redisClient *c);
737 static void hkeysCommand(redisClient *c);
738 static void hvalsCommand(redisClient *c);
739 static void hgetallCommand(redisClient *c);
740 static void hexistsCommand(redisClient *c);
741 static void configCommand(redisClient *c);
742 static void hincrbyCommand(redisClient *c);
743 static void subscribeCommand(redisClient *c);
744 static void unsubscribeCommand(redisClient *c);
745 static void psubscribeCommand(redisClient *c);
746 static void punsubscribeCommand(redisClient *c);
747 static void publishCommand(redisClient *c);
748 static void watchCommand(redisClient *c);
749 static void unwatchCommand(redisClient *c);
750
751 /*================================= Globals ================================= */
752
/* Global vars */
static struct redisServer server; /* server global state */
/* Active command table and its entry count. Neither is initialised here.
 * NOTE(review): presumably both are set up at startup from
 * readonlyCommandTable -- confirm against the initialisation code. */
static struct redisCommand *commandTable;
static unsigned int commandTableSize;
/* Static description of every command the server understands.
 *
 * Each entry lists, in order:
 *   - the command name as typed by the client;
 *   - the handler function;
 *   - an integer argument count. NOTE(review): negative values presumably
 *     mean "at least N arguments" -- confirm against struct redisCommand;
 *   - the REDIS_CMD_* flags (INLINE/BULK, DENYOOM, FORCE_REPLICATION);
 *   - an optional callback, set only by zunionstore/zinterstore and exec.
 *     NOTE(review): presumably used to preload swapped keys when VM is
 *     enabled -- confirm;
 *   - three trailing integers. NOTE(review): presumably first key, last key
 *     and key step used for VM key preloading -- confirm.
 *
 * The table is terminated by an all-NULL/zero sentinel entry. */
static struct redisCommand readonlyCommandTable[] = {
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
    {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {NULL,NULL,0,0,NULL,0,0,0}
};
868
869 /*============================ Utility functions ============================ */
870
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Neither buffer needs to be NUL
 * terminated: no byte at or past the given lengths is ever read.
 *
 * Supported syntax:
 *   *        any sequence of characters, including the empty one
 *   ?        exactly one character
 *   [abc]    one character out of the set
 *   [^abc]   one character not in the set
 *   [a-z]    one character in the range (bounds may be given reversed)
 *   \x       the character x taken literally
 * An unterminated '[' class is treated as if it were closed at the end of
 * the pattern.
 *
 * If 'nocase' is non zero the comparison is case insensitive (escaped
 * characters inside a class are still compared exactly).
 *
 * Returns 1 on match, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* Collapse a run of stars without looking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one character. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() needs an unsigned char value. */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Only trailing stars can still match the empty remainder. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
993
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * the lengths of both 'pattern' and 'string' are taken with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
997
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * case insensitive unit: "b" (or nothing) = 1, "k" = 1000, "kb" = 1024,
 * "m" = 1000^2, "mb" = 1024^2, "g" = 1000^3, "gb" = 1024^3.
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error and to 0
 * otherwise. Errors are:
 *  - unknown unit: the number is returned without any multiplier;
 *  - numeric part not fitting the internal buffer: LLONG_MAX is returned;
 *  - value out of the long long range, before or after applying the unit:
 *    LLONG_MAX or LLONG_MIN is returned, according to the sign.
 *
 * Note: errno is reset and possibly set to ERANGE by this function. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() requires a value representable as unsigned char. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already clamped the value to LLONG_MIN/LLONG_MAX. */
        if (err) *err = 1;
        return val;
    }
    /* Refuse to multiply when the product would overflow (mul is > 0). */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1044
/* Convert a long long into its decimal string representation.
 *
 * At most 'len' bytes are written into 's', nul terminator included; when
 * the buffer is too small only the leading len-1 characters of the number
 * are stored. Returns the number of characters actually written, not
 * counting the nul terminator. With len == 0 nothing is written and 0 is
 * returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in unsigned arithmetic: negating LLONG_MIN
     * as a signed quantity would overflow. */
    v = (value < 0) ? 0ULL - (unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1068
1069 static void redisLog(int level, const char *fmt, ...) {
1070 va_list ap;
1071 FILE *fp;
1072
1073 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1074 if (!fp) return;
1075
1076 va_start(ap, fmt);
1077 if (level >= server.verbosity) {
1078 char *c = ".-*#";
1079 char buf[64];
1080 time_t now;
1081
1082 now = time(NULL);
1083 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1084 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1085 vfprintf(fp, fmt, ap);
1086 fprintf(fp,"\n");
1087 fflush(fp);
1088 }
1089 va_end(ap);
1090
1091 if (server.logfile) fclose(fp);
1092 }
1093
1094 /*====================== Hash table type implementation ==================== */
1095
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1099
/* Dictionary destructor for entries that are plain zmalloc()ed buffers:
 * the pointer is simply handed to zfree(). 'privdata' is unused. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
1105
1106 static void dictListDestructor(void *privdata, void *val)
1107 {
1108 DICT_NOTUSED(privdata);
1109 listRelease((list*)val);
1110 }
1111
1112 static int sdsDictKeyCompare(void *privdata, const void *key1,
1113 const void *key2)
1114 {
1115 int l1,l2;
1116 DICT_NOTUSED(privdata);
1117
1118 l1 = sdslen((sds)key1);
1119 l2 = sdslen((sds)key2);
1120 if (l1 != l2) return 0;
1121 return memcmp(key1, key2, l1) == 0;
1122 }
1123
/* Dictionary destructor for redis objects: drops one reference.
 * NULL is accepted and ignored, since the values of swapped out keys are
 * set to NULL. 'privdata' is unused. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1131
1132 static int dictObjKeyCompare(void *privdata, const void *key1,
1133 const void *key2)
1134 {
1135 const robj *o1 = key1, *o2 = key2;
1136 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1137 }
1138
1139 static unsigned int dictObjHash(const void *key) {
1140 const robj *o = key;
1141 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1142 }
1143
1144 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1145 const void *key2)
1146 {
1147 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1148 int cmp;
1149
1150 if (o1->encoding == REDIS_ENCODING_INT &&
1151 o2->encoding == REDIS_ENCODING_INT)
1152 return o1->ptr == o2->ptr;
1153
1154 o1 = getDecodedObject(o1);
1155 o2 = getDecodedObject(o2);
1156 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1157 decrRefCount(o1);
1158 decrRefCount(o2);
1159 return cmp;
1160 }
1161
1162 static unsigned int dictEncObjHash(const void *key) {
1163 robj *o = (robj*) key;
1164
1165 if (o->encoding == REDIS_ENCODING_RAW) {
1166 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1167 } else {
1168 if (o->encoding == REDIS_ENCODING_INT) {
1169 char buf[32];
1170 int len;
1171
1172 len = ll2string(buf,32,(long)o->ptr);
1173 return dictGenHashFunction((unsigned char*)buf, len);
1174 } else {
1175 unsigned int hash;
1176
1177 o = getDecodedObject(o);
1178 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1179 decrRefCount(o);
1180 return hash;
1181 }
1182 }
1183 }
1184
/* Sets type and expires.
 * Keys may be encoded objects (hashed and compared with the dictEncObj*
 * callbacks) and are released with dictRedisObjectDestructor(); entries
 * carry no value that needs to be freed.
 * NOTE(review): keyptrDictType is the type labelled "Db->expires" in this
 * file -- confirm whether the "and expires" part still applies here. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1194
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys may be encoded objects; values are malloc(sizeof(double)) buffers
 * and are therefore released with dictVanillaFree(). */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1204
/* Db->dict: the main keyspace dictionary.
 * Keys are hashed and compared with the dictObj* callbacks; both keys and
 * values are released with dictRedisObjectDestructor(). */
static dictType dbDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1214
/* Db->expires.
 * Same key handling as Db->dict, but no value destructor is installed.
 * NOTE(review): serverCron() reads these values by casting them to time_t,
 * which suggests the expire time is stored directly in the value pointer
 * -- confirm. */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1224
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Keys may be encoded objects; both keys and values are released with
 * dictRedisObjectDestructor(). */
static dictType hashDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1234
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be
 * loaded. Values are released with dictListDestructor(). */
static dictType keylistDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictListDestructor         /* val destructor */
};
1246
1247 static void version();
1248
1249 /* ========================= Random utility functions ======================= */
1250
1251 /* Redis generally does not try to recover from out of memory conditions
1252 * when allocating objects or strings, it is not clear if it will be possible
1253 * to report this condition to the client since the networking layer itself
1254 * is based on heap allocation for send buffers, so we simply abort.
1255 * At least the code will be simpler to read... */
1256 static void oom(const char *msg) {
1257 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1258 sleep(1);
1259 abort();
1260 }
1261
1262 /* ====================== Redis server networking stuff ===================== */
1263 static void closeTimedoutClients(void) {
1264 redisClient *c;
1265 listNode *ln;
1266 time_t now = time(NULL);
1267 listIter li;
1268
1269 listRewind(server.clients,&li);
1270 while ((ln = listNext(&li)) != NULL) {
1271 c = listNodeValue(ln);
1272 if (server.maxidletime &&
1273 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1274 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1275 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1276 listLength(c->pubsub_patterns) == 0 &&
1277 (now - c->lastinteraction > server.maxidletime))
1278 {
1279 redisLog(REDIS_VERBOSE,"Closing idle client");
1280 freeClient(c);
1281 } else if (c->flags & REDIS_BLOCKED) {
1282 if (c->blockingto != 0 && c->blockingto < now) {
1283 addReply(c,shared.nullmultibulk);
1284 unblockClientWaitingData(c);
1285 }
1286 }
1287 }
1288 }
1289
1290 static int htNeedsResize(dict *dict) {
1291 long long size, used;
1292
1293 size = dictSlots(dict);
1294 used = dictSize(dict);
1295 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1296 (used*100/size < REDIS_HT_MINFILL));
1297 }
1298
1299 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1300 * we resize the hash table to save memory */
1301 static void tryResizeHashTables(void) {
1302 int j;
1303
1304 for (j = 0; j < server.dbnum; j++) {
1305 if (htNeedsResize(server.db[j].dict))
1306 dictResize(server.db[j].dict);
1307 if (htNeedsResize(server.db[j].expires))
1308 dictResize(server.db[j].expires);
1309 }
1310 }
1311
1312 /* Our hash table implementation performs rehashing incrementally while
1313 * we write/read from the hash table. Still if the server is idle, the hash
1314 * table will use two tables for a long time. So we try to use 1 millisecond
1315 * of CPU time at every serverCron() loop in order to rehash some key. */
1316 static void incrementallyRehash(void) {
1317 int j;
1318
1319 for (j = 0; j < server.dbnum; j++) {
1320 if (dictIsRehashing(server.db[j].dict)) {
1321 dictRehashMilliseconds(server.db[j].dict,1);
1322 break; /* already used our millisecond for this loop... */
1323 }
1324 }
1325 }
1326
1327 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1328 void backgroundSaveDoneHandler(int statloc) {
1329 int exitcode = WEXITSTATUS(statloc);
1330 int bysignal = WIFSIGNALED(statloc);
1331
1332 if (!bysignal && exitcode == 0) {
1333 redisLog(REDIS_NOTICE,
1334 "Background saving terminated with success");
1335 server.dirty = 0;
1336 server.lastsave = time(NULL);
1337 } else if (!bysignal && exitcode != 0) {
1338 redisLog(REDIS_WARNING, "Background saving error");
1339 } else {
1340 redisLog(REDIS_WARNING,
1341 "Background saving terminated by signal %d", WTERMSIG(statloc));
1342 rdbRemoveTempFile(server.bgsavechildpid);
1343 }
1344 server.bgsavechildpid = -1;
1345 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1346 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1347 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1348 }
1349
1350 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1351 * Handle this. */
1352 void backgroundRewriteDoneHandler(int statloc) {
1353 int exitcode = WEXITSTATUS(statloc);
1354 int bysignal = WIFSIGNALED(statloc);
1355
1356 if (!bysignal && exitcode == 0) {
1357 int fd;
1358 char tmpfile[256];
1359
1360 redisLog(REDIS_NOTICE,
1361 "Background append only file rewriting terminated with success");
1362 /* Now it's time to flush the differences accumulated by the parent */
1363 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1364 fd = open(tmpfile,O_WRONLY|O_APPEND);
1365 if (fd == -1) {
1366 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1367 goto cleanup;
1368 }
1369 /* Flush our data... */
1370 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1371 (signed) sdslen(server.bgrewritebuf)) {
1372 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1373 close(fd);
1374 goto cleanup;
1375 }
1376 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1377 /* Now our work is to rename the temp file into the stable file. And
1378 * switch the file descriptor used by the server for append only. */
1379 if (rename(tmpfile,server.appendfilename) == -1) {
1380 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1381 close(fd);
1382 goto cleanup;
1383 }
1384 /* Mission completed... almost */
1385 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1386 if (server.appendfd != -1) {
1387 /* If append only is actually enabled... */
1388 close(server.appendfd);
1389 server.appendfd = fd;
1390 fsync(fd);
1391 server.appendseldb = -1; /* Make sure it will issue SELECT */
1392 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1393 } else {
1394 /* If append only is disabled we just generate a dump in this
1395 * format. Why not? */
1396 close(fd);
1397 }
1398 } else if (!bysignal && exitcode != 0) {
1399 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1400 } else {
1401 redisLog(REDIS_WARNING,
1402 "Background append only file rewriting terminated by signal %d",
1403 WTERMSIG(statloc));
1404 }
1405 cleanup:
1406 sdsfree(server.bgrewritebuf);
1407 server.bgrewritebuf = sdsempty();
1408 aofRemoveTempFile(server.bgrewritechildpid);
1409 server.bgrewritechildpid = -1;
1410 }
1411
1412 /* This function is called once a background process of some kind terminates,
1413 * as we want to avoid resizing the hash tables when there is a child in order
1414 * to play well with copy-on-write (otherwise when a resize happens lots of
1415 * memory pages are copied). The goal of this function is to update the ability
1416 * for dict.c to resize the hash tables accordingly to the fact we have o not
1417 * running childs. */
1418 static void updateDictResizePolicy(void) {
1419 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1420 dictEnableResize();
1421 else
1422 dictDisableResize();
1423 }
1424
/* Periodic housekeeping callback. In order, every call:
 *  - refreshes the cached unix time;
 *  - performs the shutdown requested via SIGTERM, if any;
 *  - logs statistics about databases and clients (every 50 loops);
 *  - resizes / incrementally rehashes the hash tables when no child exists;
 *  - closes timed out clients and unblocks clients whose blocking timeout
 *    expired;
 *  - collects a terminated BGSAVE / BGREWRITEAOF child, or starts a BGSAVE
 *    if one of the configured save points was reached;
 *  - expires a sample of volatile keys;
 *  - swaps objects out if VM is enabled and we are over the memory limit;
 *  - tries to connect to the master if replication requires it.
 *
 * The three arguments are unused. Always returns 100.
 * NOTE(review): presumably the return value is the delay in milliseconds
 * before the event loop fires this timer again -- confirm against ae.c. */
static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    int j, loops = server.cronloops++;
    REDIS_NOTUSED(eventLoop);
    REDIS_NOTUSED(id);
    REDIS_NOTUSED(clientData);

    /* We take a cached value of the unix time in the global state because
     * with virtual memory and aging we need to store the current time
     * in objects at every object access, and accuracy is not needed.
     * To access a global var is faster than calling time(NULL) */
    server.unixtime = time(NULL);

    /* We received a SIGTERM, shutting down here in a safe way, as it is
     * not ok doing so inside the signal handler. */
    if (server.shutdown_asap) {
        if (prepareForShutdown() == REDIS_OK) exit(0);
        redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
    }

    /* Show some info about non-empty databases */
    for (j = 0; j < server.dbnum; j++) {
        long long size, used, vkeys;

        size = dictSlots(server.db[j].dict);
        used = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (!(loops % 50) && (used || vkeys)) {
            redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
            /* dictPrintStats(server.dict); */
        }
    }

    /* We don't want to resize the hash tables while a background saving
     * is in progress: the saving child is created using fork() that is
     * implemented with a copy-on-write semantic in most modern systems, so
     * if we resize the HT while there is the saving child at work actually
     * a lot of memory movements in the parent will cause a lot of pages
     * copied. */
    if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
        if (!(loops % 10)) tryResizeHashTables();
        if (server.activerehashing) incrementallyRehash();
    }

    /* Show information about connected clients */
    if (!(loops % 50)) {
        redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
            listLength(server.clients)-listLength(server.slaves),
            listLength(server.slaves),
            zmalloc_used_memory());
    }

    /* Close connections of timedout clients. The scan also runs at every
     * loop while there are clients blocked in BLPOP/BRPOP, so that their
     * timeouts are honoured promptly. */
    if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
        closeTimedoutClients();

    /* Check if a background saving or AOF rewrite in progress terminated */
    if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
        int statloc;
        pid_t pid;

        /* Non blocking wait: a return value of zero means that no child
         * changed state yet.
         * NOTE(review): any non zero value that is not the BGSAVE child pid
         * is handled as the AOF rewrite child -- confirm this is intended
         * also for the -1 error return. */
        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
            if (pid == server.bgsavechildpid) {
                backgroundSaveDoneHandler(statloc);
            } else {
                backgroundRewriteDoneHandler(statloc);
            }
            updateDictResizePolicy();
        }
    } else {
        /* If there is not a background saving in progress check if
         * we have to save now */
        time_t now = time(NULL);
        for (j = 0; j < server.saveparamslen; j++) {
            struct saveparam *sp = server.saveparams+j;

            if (server.dirty >= sp->changes &&
                now-server.lastsave > sp->seconds) {
                redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
                    sp->changes, sp->seconds);
                rdbSaveBackground(server.dbfilename);
                break;
            }
        }
    }

    /* Try to expire a few timed out keys. The algorithm used is adaptive and
     * will use few CPU cycles if there are few expiring keys, otherwise
     * it will get more aggressive to avoid that too much memory is used by
     * keys that can be removed from the keyspace. */
    for (j = 0; j < server.dbnum; j++) {
        int expired;
        redisDb *db = server.db+j;

        /* Continue to expire if at the end of the cycle more than 25%
         * of the keys were expired. */
        do {
            long num = dictSize(db->expires);
            time_t now = time(NULL);

            expired = 0;
            if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
                num = REDIS_EXPIRELOOKUPS_PER_CRON;
            while (num--) {
                dictEntry *de;
                time_t t;

                if ((de = dictGetRandomKey(db->expires)) == NULL) break;
                /* The entry value is read directly as the expire time. */
                t = (time_t) dictGetEntryVal(de);
                if (now > t) {
                    deleteKey(db,dictGetEntryKey(de));
                    expired++;
                    server.stat_expiredkeys++;
                }
            }
        } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
    }

    /* Swap a few keys on disk if we are over the memory limit and VM
     * is enabled. Try to free objects from the free list first. */
    if (vmCanSwapOut()) {
        while (server.vm_enabled && zmalloc_used_memory() >
                server.vm_max_memory)
        {
            int retval;

            if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
            retval = (server.vm_max_threads == 0) ?
                        vmSwapOneObjectBlocking() :
                        vmSwapOneObjectThreaded();
            if (retval == REDIS_ERR && !(loops % 300) &&
                zmalloc_used_memory() >
                (server.vm_max_memory+server.vm_max_memory/10))
            {
                redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
            }
            /* Note that when using threaded I/O we free just one object,
             * because anyway when the I/O thread in charge to swap this
             * object out will finish, the handler of completed jobs
             * will try to swap more objects if we are still out of memory. */
            if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
        }
    }

    /* Check if we should connect to a MASTER */
    if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
        redisLog(REDIS_NOTICE,"Connecting to MASTER...");
        if (syncWithMaster() == REDIS_OK) {
            redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
            if (server.appendonly) rewriteAppendOnlyFileBackground();
        }
    }
    return 100;
}
1578
1579 /* This function gets called every time Redis is entering the
1580 * main loop of the event driven library, that is, before to sleep
1581 * for ready file descriptors. */
1582 static void beforeSleep(struct aeEventLoop *eventLoop) {
1583 REDIS_NOTUSED(eventLoop);
1584
1585 /* Awake clients that got all the swapped keys they requested */
1586 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1587 listIter li;
1588 listNode *ln;
1589
1590 listRewind(server.io_ready_clients,&li);
1591 while((ln = listNext(&li))) {
1592 redisClient *c = ln->value;
1593 struct redisCommand *cmd;
1594
1595 /* Resume the client. */
1596 listDelNode(server.io_ready_clients,ln);
1597 c->flags &= (~REDIS_IO_WAIT);
1598 server.vm_blocked_clients--;
1599 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1600 readQueryFromClient, c);
1601 cmd = lookupCommand(c->argv[0]->ptr);
1602 assert(cmd != NULL);
1603 call(c,cmd);
1604 resetClient(c);
1605 /* There may be more data to process in the input buffer. */
1606 if (c->querybuf && sdslen(c->querybuf) > 0)
1607 processInputBuffer(c);
1608 }
1609 }
1610 /* Write the AOF buffer on disk */
1611 flushAppendOnlyFile();
1612 }
1613
1614 static void createSharedObjects(void) {
1615 int j;
1616
1617 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1618 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1619 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1620 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1621 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1622 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1623 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1624 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1625 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1626 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1627 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1628 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1629 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1630 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1631 "-ERR no such key\r\n"));
1632 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1633 "-ERR syntax error\r\n"));
1634 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1635 "-ERR source and destination objects are the same\r\n"));
1636 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1637 "-ERR index out of range\r\n"));
1638 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1639 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1640 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1641 shared.select0 = createStringObject("select 0\r\n",10);
1642 shared.select1 = createStringObject("select 1\r\n",10);
1643 shared.select2 = createStringObject("select 2\r\n",10);
1644 shared.select3 = createStringObject("select 3\r\n",10);
1645 shared.select4 = createStringObject("select 4\r\n",10);
1646 shared.select5 = createStringObject("select 5\r\n",10);
1647 shared.select6 = createStringObject("select 6\r\n",10);
1648 shared.select7 = createStringObject("select 7\r\n",10);
1649 shared.select8 = createStringObject("select 8\r\n",10);
1650 shared.select9 = createStringObject("select 9\r\n",10);
1651 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1652 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1653 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1654 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1655 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1656 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1657 shared.mbulk3 = createStringObject("*3\r\n",4);
1658 shared.mbulk4 = createStringObject("*4\r\n",4);
1659 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1660 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1661 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1662 }
1663 }
1664
1665 static void appendServerSaveParams(time_t seconds, int changes) {
1666 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1667 server.saveparams[server.saveparamslen].seconds = seconds;
1668 server.saveparams[server.saveparamslen].changes = changes;
1669 server.saveparamslen++;
1670 }
1671
1672 static void resetServerSaveParams() {
1673 zfree(server.saveparams);
1674 server.saveparams = NULL;
1675 server.saveparamslen = 0;
1676 }
1677
1678 static void initServerConfig() {
1679 server.dbnum = REDIS_DEFAULT_DBNUM;
1680 server.port = REDIS_SERVERPORT;
1681 server.verbosity = REDIS_VERBOSE;
1682 server.maxidletime = REDIS_MAXIDLETIME;
1683 server.saveparams = NULL;
1684 server.logfile = NULL; /* NULL = log on standard output */
1685 server.bindaddr = NULL;
1686 server.glueoutputbuf = 1;
1687 server.daemonize = 0;
1688 server.appendonly = 0;
1689 server.appendfsync = APPENDFSYNC_EVERYSEC;
1690 server.lastfsync = time(NULL);
1691 server.appendfd = -1;
1692 server.appendseldb = -1; /* Make sure the first time will not match */
1693 server.pidfile = zstrdup("/var/run/redis.pid");
1694 server.dbfilename = zstrdup("dump.rdb");
1695 server.appendfilename = zstrdup("appendonly.aof");
1696 server.requirepass = NULL;
1697 server.rdbcompression = 1;
1698 server.activerehashing = 1;
1699 server.maxclients = 0;
1700 server.blpop_blocked_clients = 0;
1701 server.maxmemory = 0;
1702 server.vm_enabled = 0;
1703 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1704 server.vm_page_size = 256; /* 256 bytes per page */
1705 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1706 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1707 server.vm_max_threads = 4;
1708 server.vm_blocked_clients = 0;
1709 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1710 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1711 server.shutdown_asap = 0;
1712
1713 resetServerSaveParams();
1714
1715 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1716 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1717 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1718 /* Replication related */
1719 server.isslave = 0;
1720 server.masterauth = NULL;
1721 server.masterhost = NULL;
1722 server.masterport = 6379;
1723 server.master = NULL;
1724 server.replstate = REDIS_REPL_NONE;
1725
1726 /* Double constants initialization */
1727 R_Zero = 0.0;
1728 R_PosInf = 1.0/R_Zero;
1729 R_NegInf = -1.0/R_Zero;
1730 R_Nan = R_Zero/R_Zero;
1731 }
1732
1733 static void initServer() {
1734 int j;
1735
1736 signal(SIGHUP, SIG_IGN);
1737 signal(SIGPIPE, SIG_IGN);
1738 setupSigSegvAction();
1739
1740 server.devnull = fopen("/dev/null","w");
1741 if (server.devnull == NULL) {
1742 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1743 exit(1);
1744 }
1745 server.clients = listCreate();
1746 server.slaves = listCreate();
1747 server.monitors = listCreate();
1748 server.objfreelist = listCreate();
1749 createSharedObjects();
1750 server.el = aeCreateEventLoop();
1751 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1752 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1753 if (server.fd == -1) {
1754 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1755 exit(1);
1756 }
1757 for (j = 0; j < server.dbnum; j++) {
1758 server.db[j].dict = dictCreate(&dbDictType,NULL);
1759 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1760 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1761 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1762 if (server.vm_enabled)
1763 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1764 server.db[j].id = j;
1765 }
1766 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1767 server.pubsub_patterns = listCreate();
1768 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1769 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1770 server.cronloops = 0;
1771 server.bgsavechildpid = -1;
1772 server.bgrewritechildpid = -1;
1773 server.bgrewritebuf = sdsempty();
1774 server.aofbuf = sdsempty();
1775 server.lastsave = time(NULL);
1776 server.dirty = 0;
1777 server.stat_numcommands = 0;
1778 server.stat_numconnections = 0;
1779 server.stat_expiredkeys = 0;
1780 server.stat_starttime = time(NULL);
1781 server.unixtime = time(NULL);
1782 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1783 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1784 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1785
1786 if (server.appendonly) {
1787 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1788 if (server.appendfd == -1) {
1789 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1790 strerror(errno));
1791 exit(1);
1792 }
1793 }
1794
1795 if (server.vm_enabled) vmInit();
1796 }
1797
1798 /* Empty the whole database */
1799 static long long emptyDb() {
1800 int j;
1801 long long removed = 0;
1802
1803 for (j = 0; j < server.dbnum; j++) {
1804 removed += dictSize(server.db[j].dict);
1805 dictEmpty(server.db[j].dict);
1806 dictEmpty(server.db[j].expires);
1807 }
1808 return removed;
1809 }
1810
/* Convert a case insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1816
1817 /* I agree, this is a very rudimental way to load a configuration...
1818 will improve later if the config gets more complex */
1819 static void loadServerConfig(char *filename) {
1820 FILE *fp;
1821 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1822 int linenum = 0;
1823 sds line = NULL;
1824
1825 if (filename[0] == '-' && filename[1] == '\0')
1826 fp = stdin;
1827 else {
1828 if ((fp = fopen(filename,"r")) == NULL) {
1829 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1830 exit(1);
1831 }
1832 }
1833
1834 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1835 sds *argv;
1836 int argc, j;
1837
1838 linenum++;
1839 line = sdsnew(buf);
1840 line = sdstrim(line," \t\r\n");
1841
1842 /* Skip comments and blank lines*/
1843 if (line[0] == '#' || line[0] == '\0') {
1844 sdsfree(line);
1845 continue;
1846 }
1847
1848 /* Split into arguments */
1849 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1850 sdstolower(argv[0]);
1851
1852 /* Execute config directives */
1853 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1854 server.maxidletime = atoi(argv[1]);
1855 if (server.maxidletime < 0) {
1856 err = "Invalid timeout value"; goto loaderr;
1857 }
1858 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1859 server.port = atoi(argv[1]);
1860 if (server.port < 1 || server.port > 65535) {
1861 err = "Invalid port"; goto loaderr;
1862 }
1863 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1864 server.bindaddr = zstrdup(argv[1]);
1865 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1866 int seconds = atoi(argv[1]);
1867 int changes = atoi(argv[2]);
1868 if (seconds < 1 || changes < 0) {
1869 err = "Invalid save parameters"; goto loaderr;
1870 }
1871 appendServerSaveParams(seconds,changes);
1872 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1873 if (chdir(argv[1]) == -1) {
1874 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1875 argv[1], strerror(errno));
1876 exit(1);
1877 }
1878 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1879 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1880 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1881 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1882 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1883 else {
1884 err = "Invalid log level. Must be one of debug, notice, warning";
1885 goto loaderr;
1886 }
1887 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1888 FILE *logfp;
1889
1890 server.logfile = zstrdup(argv[1]);
1891 if (!strcasecmp(server.logfile,"stdout")) {
1892 zfree(server.logfile);
1893 server.logfile = NULL;
1894 }
1895 if (server.logfile) {
1896 /* Test if we are able to open the file. The server will not
1897 * be able to abort just for this problem later... */
1898 logfp = fopen(server.logfile,"a");
1899 if (logfp == NULL) {
1900 err = sdscatprintf(sdsempty(),
1901 "Can't open the log file: %s", strerror(errno));
1902 goto loaderr;
1903 }
1904 fclose(logfp);
1905 }
1906 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1907 server.dbnum = atoi(argv[1]);
1908 if (server.dbnum < 1) {
1909 err = "Invalid number of databases"; goto loaderr;
1910 }
1911 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1912 loadServerConfig(argv[1]);
1913 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1914 server.maxclients = atoi(argv[1]);
1915 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1916 server.maxmemory = memtoll(argv[1],NULL);
1917 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1918 server.masterhost = sdsnew(argv[1]);
1919 server.masterport = atoi(argv[2]);
1920 server.replstate = REDIS_REPL_CONNECT;
1921 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1922 server.masterauth = zstrdup(argv[1]);
1923 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1924 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1925 err = "argument must be 'yes' or 'no'"; goto loaderr;
1926 }
1927 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1928 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1929 err = "argument must be 'yes' or 'no'"; goto loaderr;
1930 }
1931 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1932 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1933 err = "argument must be 'yes' or 'no'"; goto loaderr;
1934 }
1935 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1936 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1937 err = "argument must be 'yes' or 'no'"; goto loaderr;
1938 }
1939 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1940 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1941 err = "argument must be 'yes' or 'no'"; goto loaderr;
1942 }
1943 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1944 zfree(server.appendfilename);
1945 server.appendfilename = zstrdup(argv[1]);
1946 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1947 if (!strcasecmp(argv[1],"no")) {
1948 server.appendfsync = APPENDFSYNC_NO;
1949 } else if (!strcasecmp(argv[1],"always")) {
1950 server.appendfsync = APPENDFSYNC_ALWAYS;
1951 } else if (!strcasecmp(argv[1],"everysec")) {
1952 server.appendfsync = APPENDFSYNC_EVERYSEC;
1953 } else {
1954 err = "argument must be 'no', 'always' or 'everysec'";
1955 goto loaderr;
1956 }
1957 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1958 server.requirepass = zstrdup(argv[1]);
1959 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1960 zfree(server.pidfile);
1961 server.pidfile = zstrdup(argv[1]);
1962 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1963 zfree(server.dbfilename);
1964 server.dbfilename = zstrdup(argv[1]);
1965 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1966 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1967 err = "argument must be 'yes' or 'no'"; goto loaderr;
1968 }
1969 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1970 zfree(server.vm_swap_file);
1971 server.vm_swap_file = zstrdup(argv[1]);
1972 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1973 server.vm_max_memory = memtoll(argv[1],NULL);
1974 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1975 server.vm_page_size = memtoll(argv[1], NULL);
1976 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1977 server.vm_pages = memtoll(argv[1], NULL);
1978 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1979 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1980 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1981 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1982 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1983 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1984 } else {
1985 err = "Bad directive or wrong number of arguments"; goto loaderr;
1986 }
1987 for (j = 0; j < argc; j++)
1988 sdsfree(argv[j]);
1989 zfree(argv);
1990 sdsfree(line);
1991 }
1992 if (fp != stdin) fclose(fp);
1993 return;
1994
1995 loaderr:
1996 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1997 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1998 fprintf(stderr, ">>> '%s'\n", line);
1999 fprintf(stderr, "%s\n", err);
2000 exit(1);
2001 }
2002
2003 static void freeClientArgv(redisClient *c) {
2004 int j;
2005
2006 for (j = 0; j < c->argc; j++)
2007 decrRefCount(c->argv[j]);
2008 for (j = 0; j < c->mbargc; j++)
2009 decrRefCount(c->mbargv[j]);
2010 c->argc = 0;
2011 c->mbargc = 0;
2012 }
2013
2014 static void freeClient(redisClient *c) {
2015 listNode *ln;
2016
2017 /* Note that if the client we are freeing is blocked into a blocking
2018 * call, we have to set querybuf to NULL *before* to call
2019 * unblockClientWaitingData() to avoid processInputBuffer() will get
2020 * called. Also it is important to remove the file events after
2021 * this, because this call adds the READABLE event. */
2022 sdsfree(c->querybuf);
2023 c->querybuf = NULL;
2024 if (c->flags & REDIS_BLOCKED)
2025 unblockClientWaitingData(c);
2026
2027 /* UNWATCH all the keys */
2028 unwatchAllKeys(c);
2029 listRelease(c->watched_keys);
2030 /* Unsubscribe from all the pubsub channels */
2031 pubsubUnsubscribeAllChannels(c,0);
2032 pubsubUnsubscribeAllPatterns(c,0);
2033 dictRelease(c->pubsub_channels);
2034 listRelease(c->pubsub_patterns);
2035 /* Obvious cleanup */
2036 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2037 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2038 listRelease(c->reply);
2039 freeClientArgv(c);
2040 close(c->fd);
2041 /* Remove from the list of clients */
2042 ln = listSearchKey(server.clients,c);
2043 redisAssert(ln != NULL);
2044 listDelNode(server.clients,ln);
2045 /* Remove from the list of clients that are now ready to be restarted
2046 * after waiting for swapped keys */
2047 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2048 ln = listSearchKey(server.io_ready_clients,c);
2049 if (ln) {
2050 listDelNode(server.io_ready_clients,ln);
2051 server.vm_blocked_clients--;
2052 }
2053 }
2054 /* Remove from the list of clients waiting for swapped keys */
2055 while (server.vm_enabled && listLength(c->io_keys)) {
2056 ln = listFirst(c->io_keys);
2057 dontWaitForSwappedKey(c,ln->value);
2058 }
2059 listRelease(c->io_keys);
2060 /* Master/slave cleanup */
2061 if (c->flags & REDIS_SLAVE) {
2062 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2063 close(c->repldbfd);
2064 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2065 ln = listSearchKey(l,c);
2066 redisAssert(ln != NULL);
2067 listDelNode(l,ln);
2068 }
2069 if (c->flags & REDIS_MASTER) {
2070 server.master = NULL;
2071 server.replstate = REDIS_REPL_CONNECT;
2072 }
2073 /* Release memory */
2074 zfree(c->argv);
2075 zfree(c->mbargv);
2076 freeClientMultiState(c);
2077 zfree(c);
2078 }
2079
2080 #define GLUEREPLY_UP_TO (1024)
2081 static void glueReplyBuffersIfNeeded(redisClient *c) {
2082 int copylen = 0;
2083 char buf[GLUEREPLY_UP_TO];
2084 listNode *ln;
2085 listIter li;
2086 robj *o;
2087
2088 listRewind(c->reply,&li);
2089 while((ln = listNext(&li))) {
2090 int objlen;
2091
2092 o = ln->value;
2093 objlen = sdslen(o->ptr);
2094 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2095 memcpy(buf+copylen,o->ptr,objlen);
2096 copylen += objlen;
2097 listDelNode(c->reply,ln);
2098 } else {
2099 if (copylen == 0) return;
2100 break;
2101 }
2102 }
2103 /* Now the output buffer is empty, add the new single element */
2104 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2105 listAddNodeHead(c->reply,o);
2106 }
2107
2108 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2109 redisClient *c = privdata;
2110 int nwritten = 0, totwritten = 0, objlen;
2111 robj *o;
2112 REDIS_NOTUSED(el);
2113 REDIS_NOTUSED(mask);
2114
2115 /* Use writev() if we have enough buffers to send */
2116 if (!server.glueoutputbuf &&
2117 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2118 !(c->flags & REDIS_MASTER))
2119 {
2120 sendReplyToClientWritev(el, fd, privdata, mask);
2121 return;
2122 }
2123
2124 while(listLength(c->reply)) {
2125 if (server.glueoutputbuf && listLength(c->reply) > 1)
2126 glueReplyBuffersIfNeeded(c);
2127
2128 o = listNodeValue(listFirst(c->reply));
2129 objlen = sdslen(o->ptr);
2130
2131 if (objlen == 0) {
2132 listDelNode(c->reply,listFirst(c->reply));
2133 continue;
2134 }
2135
2136 if (c->flags & REDIS_MASTER) {
2137 /* Don't reply to a master */
2138 nwritten = objlen - c->sentlen;
2139 } else {
2140 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2141 if (nwritten <= 0) break;
2142 }
2143 c->sentlen += nwritten;
2144 totwritten += nwritten;
2145 /* If we fully sent the object on head go to the next one */
2146 if (c->sentlen == objlen) {
2147 listDelNode(c->reply,listFirst(c->reply));
2148 c->sentlen = 0;
2149 }
2150 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2151 * bytes, in a single threaded server it's a good idea to serve
2152 * other clients as well, even if a very large request comes from
2153 * super fast link that is always able to accept data (in real world
2154 * scenario think about 'KEYS *' against the loopback interfae) */
2155 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2156 }
2157 if (nwritten == -1) {
2158 if (errno == EAGAIN) {
2159 nwritten = 0;
2160 } else {
2161 redisLog(REDIS_VERBOSE,
2162 "Error writing to client: %s", strerror(errno));
2163 freeClient(c);
2164 return;
2165 }
2166 }
2167 if (totwritten > 0) c->lastinteraction = time(NULL);
2168 if (listLength(c->reply) == 0) {
2169 c->sentlen = 0;
2170 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2171 }
2172 }
2173
2174 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2175 {
2176 redisClient *c = privdata;
2177 int nwritten = 0, totwritten = 0, objlen, willwrite;
2178 robj *o;
2179 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2180 int offset, ion = 0;
2181 REDIS_NOTUSED(el);
2182 REDIS_NOTUSED(mask);
2183
2184 listNode *node;
2185 while (listLength(c->reply)) {
2186 offset = c->sentlen;
2187 ion = 0;
2188 willwrite = 0;
2189
2190 /* fill-in the iov[] array */
2191 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2192 o = listNodeValue(node);
2193 objlen = sdslen(o->ptr);
2194
2195 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2196 break;
2197
2198 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2199 break; /* no more iovecs */
2200
2201 iov[ion].iov_base = ((char*)o->ptr) + offset;
2202 iov[ion].iov_len = objlen - offset;
2203 willwrite += objlen - offset;
2204 offset = 0; /* just for the first item */
2205 ion++;
2206 }
2207
2208 if(willwrite == 0)
2209 break;
2210
2211 /* write all collected blocks at once */
2212 if((nwritten = writev(fd, iov, ion)) < 0) {
2213 if (errno != EAGAIN) {
2214 redisLog(REDIS_VERBOSE,
2215 "Error writing to client: %s", strerror(errno));
2216 freeClient(c);
2217 return;
2218 }
2219 break;
2220 }
2221
2222 totwritten += nwritten;
2223 offset = c->sentlen;
2224
2225 /* remove written robjs from c->reply */
2226 while (nwritten && listLength(c->reply)) {
2227 o = listNodeValue(listFirst(c->reply));
2228 objlen = sdslen(o->ptr);
2229
2230 if(nwritten >= objlen - offset) {
2231 listDelNode(c->reply, listFirst(c->reply));
2232 nwritten -= objlen - offset;
2233 c->sentlen = 0;
2234 } else {
2235 /* partial write */
2236 c->sentlen += nwritten;
2237 break;
2238 }
2239 offset = 0;
2240 }
2241 }
2242
2243 if (totwritten > 0)
2244 c->lastinteraction = time(NULL);
2245
2246 if (listLength(c->reply) == 0) {
2247 c->sentlen = 0;
2248 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2249 }
2250 }
2251
2252 static int qsortRedisCommands(const void *r1, const void *r2) {
2253 return strcasecmp(
2254 ((struct redisCommand*)r1)->name,
2255 ((struct redisCommand*)r2)->name);
2256 }
2257
2258 static void sortCommandTable() {
2259 int i = 0, size = 0;
2260
2261 /* Determine and store the size of the command table */
2262 while(readonlyCommandTable[i++].name != NULL) size++;
2263 commandTableSize = size;
2264
2265 /* Copy and sort the read-only version of the command table */
2266 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2267 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2268 qsort(commandTable,size,sizeof(struct redisCommand),qsortRedisCommands);
2269 }
2270
2271 static struct redisCommand *lookupCommand(char *name) {
2272 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2273 return bsearch(
2274 &tmp,
2275 commandTable,
2276 commandTableSize,
2277 sizeof(struct redisCommand),
2278 qsortRedisCommands);
2279 }
2280
2281 /* resetClient prepare the client to process the next command */
2282 static void resetClient(redisClient *c) {
2283 freeClientArgv(c);
2284 c->bulklen = -1;
2285 c->multibulk = 0;
2286 }
2287
2288 /* Call() is the core of Redis execution of a command */
2289 static void call(redisClient *c, struct redisCommand *cmd) {
2290 long long dirty;
2291
2292 dirty = server.dirty;
2293 cmd->proc(c);
2294 dirty = server.dirty-dirty;
2295
2296 if (server.appendonly && dirty)
2297 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2298 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2299 listLength(server.slaves))
2300 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2301 if (listLength(server.monitors))
2302 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2303 server.stat_numcommands++;
2304 }
2305
2306 /* If this function gets called we already read a whole
2307 * command, argments are in the client argv/argc fields.
2308 * processCommand() execute the command or prepare the
2309 * server for a bulk read from the client.
2310 *
2311 * If 1 is returned the client is still alive and valid and
2312 * and other operations can be performed by the caller. Otherwise
2313 * if 0 is returned the client was destroied (i.e. after QUIT). */
2314 static int processCommand(redisClient *c) {
2315 struct redisCommand *cmd;
2316
2317 /* Free some memory if needed (maxmemory setting) */
2318 if (server.maxmemory) freeMemoryIfNeeded();
2319
2320 /* Handle the multi bulk command type. This is an alternative protocol
2321 * supported by Redis in order to receive commands that are composed of
2322 * multiple binary-safe "bulk" arguments. The latency of processing is
2323 * a bit higher but this allows things like multi-sets, so if this
2324 * protocol is used only for MSET and similar commands this is a big win. */
2325 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2326 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2327 if (c->multibulk <= 0) {
2328 resetClient(c);
2329 return 1;
2330 } else {
2331 decrRefCount(c->argv[c->argc-1]);
2332 c->argc--;
2333 return 1;
2334 }
2335 } else if (c->multibulk) {
2336 if (c->bulklen == -1) {
2337 if (((char*)c->argv[0]->ptr)[0] != '$') {
2338 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2339 resetClient(c);
2340 return 1;
2341 } else {
2342 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2343 decrRefCount(c->argv[0]);
2344 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2345 c->argc--;
2346 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2347 resetClient(c);
2348 return 1;
2349 }
2350 c->argc--;
2351 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2352 return 1;
2353 }
2354 } else {
2355 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2356 c->mbargv[c->mbargc] = c->argv[0];
2357 c->mbargc++;
2358 c->argc--;
2359 c->multibulk--;
2360 if (c->multibulk == 0) {
2361 robj **auxargv;
2362 int auxargc;
2363
2364 /* Here we need to swap the multi-bulk argc/argv with the
2365 * normal argc/argv of the client structure. */
2366 auxargv = c->argv;
2367 c->argv = c->mbargv;
2368 c->mbargv = auxargv;
2369
2370 auxargc = c->argc;
2371 c->argc = c->mbargc;
2372 c->mbargc = auxargc;
2373
2374 /* We need to set bulklen to something different than -1
2375 * in order for the code below to process the command without
2376 * to try to read the last argument of a bulk command as
2377 * a special argument. */
2378 c->bulklen = 0;
2379 /* continue below and process the command */
2380 } else {
2381 c->bulklen = -1;
2382 return 1;
2383 }
2384 }
2385 }
2386 /* -- end of multi bulk commands processing -- */
2387
2388 /* The QUIT command is handled as a special case. Normal command
2389 * procs are unable to close the client connection safely */
2390 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2391 freeClient(c);
2392 return 0;
2393 }
2394
2395 /* Now lookup the command and check ASAP about trivial error conditions
2396 * such wrong arity, bad command name and so forth. */
2397 cmd = lookupCommand(c->argv[0]->ptr);
2398 if (!cmd) {
2399 addReplySds(c,
2400 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2401 (char*)c->argv[0]->ptr));
2402 resetClient(c);
2403 return 1;
2404 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2405 (c->argc < -cmd->arity)) {
2406 addReplySds(c,
2407 sdscatprintf(sdsempty(),
2408 "-ERR wrong number of arguments for '%s' command\r\n",
2409 cmd->name));
2410 resetClient(c);
2411 return 1;
2412 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2413 /* This is a bulk command, we have to read the last argument yet. */
2414 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2415
2416 decrRefCount(c->argv[c->argc-1]);
2417 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2418 c->argc--;
2419 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2420 resetClient(c);
2421 return 1;
2422 }
2423 c->argc--;
2424 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2425 /* It is possible that the bulk read is already in the
2426 * buffer. Check this condition and handle it accordingly.
2427 * This is just a fast path, alternative to call processInputBuffer().
2428 * It's a good idea since the code is small and this condition
2429 * happens most of the times. */
2430 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2431 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2432 c->argc++;
2433 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2434 } else {
2435 /* Otherwise return... there is to read the last argument
2436 * from the socket. */
2437 return 1;
2438 }
2439 }
2440 /* Let's try to encode the bulk object to save space. */
2441 if (cmd->flags & REDIS_CMD_BULK)
2442 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2443
2444 /* Check if the user is authenticated */
2445 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2446 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2447 resetClient(c);
2448 return 1;
2449 }
2450
2451 /* Handle the maxmemory directive */
2452 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2453 zmalloc_used_memory() > server.maxmemory)
2454 {
2455 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2456 resetClient(c);
2457 return 1;
2458 }
2459
2460 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2461 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2462 &&
2463 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2464 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2465 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2466 resetClient(c);
2467 return 1;
2468 }
2469
2470 /* Exec the command */
2471 if (c->flags & REDIS_MULTI &&
2472 cmd->proc != execCommand && cmd->proc != discardCommand &&
2473 cmd->proc != multiCommand && cmd->proc != watchCommand)
2474 {
2475 queueMultiCommand(c,cmd);
2476 addReply(c,shared.queued);
2477 } else {
2478 if (server.vm_enabled && server.vm_max_threads > 0 &&
2479 blockClientOnSwappedKeys(c,cmd)) return 1;
2480 call(c,cmd);
2481 }
2482
2483 /* Prepare the client for the next command */
2484 resetClient(c);
2485 return 1;
2486 }
2487
2488 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2489 listNode *ln;
2490 listIter li;
2491 int outc = 0, j;
2492 robj **outv;
2493 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2494 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2495 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2496 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2497 robj *lenobj;
2498
2499 if (argc <= REDIS_STATIC_ARGS) {
2500 outv = static_outv;
2501 } else {
2502 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2503 }
2504
2505 lenobj = createObject(REDIS_STRING,
2506 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2507 lenobj->refcount = 0;
2508 outv[outc++] = lenobj;
2509 for (j = 0; j < argc; j++) {
2510 lenobj = createObject(REDIS_STRING,
2511 sdscatprintf(sdsempty(),"$%lu\r\n",
2512 (unsigned long) stringObjectLen(argv[j])));
2513 lenobj->refcount = 0;
2514 outv[outc++] = lenobj;
2515 outv[outc++] = argv[j];
2516 outv[outc++] = shared.crlf;
2517 }
2518
2519 /* Increment all the refcounts at start and decrement at end in order to
2520 * be sure to free objects if there is no slave in a replication state
2521 * able to be feed with commands */
2522 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2523 listRewind(slaves,&li);
2524 while((ln = listNext(&li))) {
2525 redisClient *slave = ln->value;
2526
2527 /* Don't feed slaves that are still waiting for BGSAVE to start */
2528 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2529
2530 /* Feed all the other slaves, MONITORs and so on */
2531 if (slave->slaveseldb != dictid) {
2532 robj *selectcmd;
2533
2534 switch(dictid) {
2535 case 0: selectcmd = shared.select0; break;
2536 case 1: selectcmd = shared.select1; break;
2537 case 2: selectcmd = shared.select2; break;
2538 case 3: selectcmd = shared.select3; break;
2539 case 4: selectcmd = shared.select4; break;
2540 case 5: selectcmd = shared.select5; break;
2541 case 6: selectcmd = shared.select6; break;
2542 case 7: selectcmd = shared.select7; break;
2543 case 8: selectcmd = shared.select8; break;
2544 case 9: selectcmd = shared.select9; break;
2545 default:
2546 selectcmd = createObject(REDIS_STRING,
2547 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2548 selectcmd->refcount = 0;
2549 break;
2550 }
2551 addReply(slave,selectcmd);
2552 slave->slaveseldb = dictid;
2553 }
2554 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2555 }
2556 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2557 if (outv != static_outv) zfree(outv);
2558 }
2559
2560 static sds sdscatrepr(sds s, char *p, size_t len) {
2561 s = sdscatlen(s,"\"",1);
2562 while(len--) {
2563 switch(*p) {
2564 case '\\':
2565 case '"':
2566 s = sdscatprintf(s,"\\%c",*p);
2567 break;
2568 case '\n': s = sdscatlen(s,"\\n",1); break;
2569 case '\r': s = sdscatlen(s,"\\r",1); break;
2570 case '\t': s = sdscatlen(s,"\\t",1); break;
2571 case '\a': s = sdscatlen(s,"\\a",1); break;
2572 case '\b': s = sdscatlen(s,"\\b",1); break;
2573 default:
2574 if (isprint(*p))
2575 s = sdscatprintf(s,"%c",*p);
2576 else
2577 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2578 break;
2579 }
2580 p++;
2581 }
2582 return sdscatlen(s,"\"",1);
2583 }
2584
2585 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2586 listNode *ln;
2587 listIter li;
2588 int j;
2589 sds cmdrepr = sdsnew("+");
2590 robj *cmdobj;
2591 struct timeval tv;
2592
2593 gettimeofday(&tv,NULL);
2594 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2595 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2596
2597 for (j = 0; j < argc; j++) {
2598 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2599 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2600 } else {
2601 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2602 sdslen(argv[j]->ptr));
2603 }
2604 if (j != argc-1)
2605 cmdrepr = sdscatlen(cmdrepr," ",1);
2606 }
2607 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2608 cmdobj = createObject(REDIS_STRING,cmdrepr);
2609
2610 listRewind(monitors,&li);
2611 while((ln = listNext(&li))) {
2612 redisClient *monitor = ln->value;
2613 addReply(monitor,cmdobj);
2614 }
2615 decrRefCount(cmdobj);
2616 }
2617
2618 static void processInputBuffer(redisClient *c) {
2619 again:
2620 /* Before to process the input buffer, make sure the client is not
2621 * waitig for a blocking operation such as BLPOP. Note that the first
2622 * iteration the client is never blocked, otherwise the processInputBuffer
2623 * would not be called at all, but after the execution of the first commands
2624 * in the input buffer the client may be blocked, and the "goto again"
2625 * will try to reiterate. The following line will make it return asap. */
2626 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2627 if (c->bulklen == -1) {
2628 /* Read the first line of the query */
2629 char *p = strchr(c->querybuf,'\n');
2630 size_t querylen;
2631
2632 if (p) {
2633 sds query, *argv;
2634 int argc, j;
2635
2636 query = c->querybuf;
2637 c->querybuf = sdsempty();
2638 querylen = 1+(p-(query));
2639 if (sdslen(query) > querylen) {
2640 /* leave data after the first line of the query in the buffer */
2641 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2642 }
2643 *p = '\0'; /* remove "\n" */
2644 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2645 sdsupdatelen(query);
2646
2647 /* Now we can split the query in arguments */
2648 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2649 sdsfree(query);
2650
2651 if (c->argv) zfree(c->argv);
2652 c->argv = zmalloc(sizeof(robj*)*argc);
2653
2654 for (j = 0; j < argc; j++) {
2655 if (sdslen(argv[j])) {
2656 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2657 c->argc++;
2658 } else {
2659 sdsfree(argv[j]);
2660 }
2661 }
2662 zfree(argv);
2663 if (c->argc) {
2664 /* Execute the command. If the client is still valid
2665 * after processCommand() return and there is something
2666 * on the query buffer try to process the next command. */
2667 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2668 } else {
2669 /* Nothing to process, argc == 0. Just process the query
2670 * buffer if it's not empty or return to the caller */
2671 if (sdslen(c->querybuf)) goto again;
2672 }
2673 return;
2674 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2675 redisLog(REDIS_VERBOSE, "Client protocol error");
2676 freeClient(c);
2677 return;
2678 }
2679 } else {
2680 /* Bulk read handling. Note that if we are at this point
2681 the client already sent a command terminated with a newline,
2682 we are reading the bulk data that is actually the last
2683 argument of the command. */
2684 int qbl = sdslen(c->querybuf);
2685
2686 if (c->bulklen <= qbl) {
2687 /* Copy everything but the final CRLF as final argument */
2688 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2689 c->argc++;
2690 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2691 /* Process the command. If the client is still valid after
2692 * the processing and there is more data in the buffer
2693 * try to parse it. */
2694 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2695 return;
2696 }
2697 }
2698 }
2699
2700 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2701 redisClient *c = (redisClient*) privdata;
2702 char buf[REDIS_IOBUF_LEN];
2703 int nread;
2704 REDIS_NOTUSED(el);
2705 REDIS_NOTUSED(mask);
2706
2707 nread = read(fd, buf, REDIS_IOBUF_LEN);
2708 if (nread == -1) {
2709 if (errno == EAGAIN) {
2710 nread = 0;
2711 } else {
2712 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2713 freeClient(c);
2714 return;
2715 }
2716 } else if (nread == 0) {
2717 redisLog(REDIS_VERBOSE, "Client closed connection");
2718 freeClient(c);
2719 return;
2720 }
2721 if (nread) {
2722 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2723 c->lastinteraction = time(NULL);
2724 } else {
2725 return;
2726 }
2727 processInputBuffer(c);
2728 }
2729
2730 static int selectDb(redisClient *c, int id) {
2731 if (id < 0 || id >= server.dbnum)
2732 return REDIS_ERR;
2733 c->db = &server.db[id];
2734 return REDIS_OK;
2735 }
2736
2737 static void *dupClientReplyValue(void *o) {
2738 incrRefCount((robj*)o);
2739 return o;
2740 }
2741
2742 static int listMatchObjects(void *a, void *b) {
2743 return equalStringObjects(a,b);
2744 }
2745
2746 static redisClient *createClient(int fd) {
2747 redisClient *c = zmalloc(sizeof(*c));
2748
2749 anetNonBlock(NULL,fd);
2750 anetTcpNoDelay(NULL,fd);
2751 if (!c) return NULL;
2752 selectDb(c,0);
2753 c->fd = fd;
2754 c->querybuf = sdsempty();
2755 c->argc = 0;
2756 c->argv = NULL;
2757 c->bulklen = -1;
2758 c->multibulk = 0;
2759 c->mbargc = 0;
2760 c->mbargv = NULL;
2761 c->sentlen = 0;
2762 c->flags = 0;
2763 c->lastinteraction = time(NULL);
2764 c->authenticated = 0;
2765 c->replstate = REDIS_REPL_NONE;
2766 c->reply = listCreate();
2767 listSetFreeMethod(c->reply,decrRefCount);
2768 listSetDupMethod(c->reply,dupClientReplyValue);
2769 c->blocking_keys = NULL;
2770 c->blocking_keys_num = 0;
2771 c->io_keys = listCreate();
2772 c->watched_keys = listCreate();
2773 listSetFreeMethod(c->io_keys,decrRefCount);
2774 c->pubsub_channels = dictCreate(&setDictType,NULL);
2775 c->pubsub_patterns = listCreate();
2776 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2777 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2778 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2779 readQueryFromClient, c) == AE_ERR) {
2780 freeClient(c);
2781 return NULL;
2782 }
2783 listAddNodeTail(server.clients,c);
2784 initClientMultiState(c);
2785 return c;
2786 }
2787
2788 static void addReply(redisClient *c, robj *obj) {
2789 if (listLength(c->reply) == 0 &&
2790 (c->replstate == REDIS_REPL_NONE ||
2791 c->replstate == REDIS_REPL_ONLINE) &&
2792 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2793 sendReplyToClient, c) == AE_ERR) return;
2794
2795 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2796 obj = dupStringObject(obj);
2797 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2798 }
2799 listAddNodeTail(c->reply,getDecodedObject(obj));
2800 }
2801
2802 static void addReplySds(redisClient *c, sds s) {
2803 robj *o = createObject(REDIS_STRING,s);
2804 addReply(c,o);
2805 decrRefCount(o);
2806 }
2807
2808 static void addReplyDouble(redisClient *c, double d) {
2809 char buf[128];
2810
2811 snprintf(buf,sizeof(buf),"%.17g",d);
2812 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2813 (unsigned long) strlen(buf),buf));
2814 }
2815
2816 static void addReplyLongLong(redisClient *c, long long ll) {
2817 char buf[128];
2818 size_t len;
2819
2820 if (ll == 0) {
2821 addReply(c,shared.czero);
2822 return;
2823 } else if (ll == 1) {
2824 addReply(c,shared.cone);
2825 return;
2826 }
2827 buf[0] = ':';
2828 len = ll2string(buf+1,sizeof(buf)-1,ll);
2829 buf[len+1] = '\r';
2830 buf[len+2] = '\n';
2831 addReplySds(c,sdsnewlen(buf,len+3));
2832 }
2833
2834 static void addReplyUlong(redisClient *c, unsigned long ul) {
2835 char buf[128];
2836 size_t len;
2837
2838 if (ul == 0) {
2839 addReply(c,shared.czero);
2840 return;
2841 } else if (ul == 1) {
2842 addReply(c,shared.cone);
2843 return;
2844 }
2845 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2846 addReplySds(c,sdsnewlen(buf,len));
2847 }
2848
2849 static void addReplyBulkLen(redisClient *c, robj *obj) {
2850 size_t len, intlen;
2851 char buf[128];
2852
2853 if (obj->encoding == REDIS_ENCODING_RAW) {
2854 len = sdslen(obj->ptr);
2855 } else {
2856 long n = (long)obj->ptr;
2857
2858 /* Compute how many bytes will take this integer as a radix 10 string */
2859 len = 1;
2860 if (n < 0) {
2861 len++;
2862 n = -n;
2863 }
2864 while((n = n/10) != 0) {
2865 len++;
2866 }
2867 }
2868 buf[0] = '$';
2869 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2870 buf[intlen+1] = '\r';
2871 buf[intlen+2] = '\n';
2872 addReplySds(c,sdsnewlen(buf,intlen+3));
2873 }
2874
2875 static void addReplyBulk(redisClient *c, robj *obj) {
2876 addReplyBulkLen(c,obj);
2877 addReply(c,obj);
2878 addReply(c,shared.crlf);
2879 }
2880
2881 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2882 static void addReplyBulkCString(redisClient *c, char *s) {
2883 if (s == NULL) {
2884 addReply(c,shared.nullbulk);
2885 } else {
2886 robj *o = createStringObject(s,strlen(s));
2887 addReplyBulk(c,o);
2888 decrRefCount(o);
2889 }
2890 }
2891
2892 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2893 int cport, cfd;
2894 char cip[128];
2895 redisClient *c;
2896 REDIS_NOTUSED(el);
2897 REDIS_NOTUSED(mask);
2898 REDIS_NOTUSED(privdata);
2899
2900 cfd = anetAccept(server.neterr, fd, cip, &cport);
2901 if (cfd == AE_ERR) {
2902 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2903 return;
2904 }
2905 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2906 if ((c = createClient(cfd)) == NULL) {
2907 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2908 close(cfd); /* May be already closed, just ingore errors */
2909 return;
2910 }
2911 /* If maxclient directive is set and this is one client more... close the
2912 * connection. Note that we create the client instead to check before
2913 * for this condition, since now the socket is already set in nonblocking
2914 * mode and we can send an error for free using the Kernel I/O */
2915 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2916 char *err = "-ERR max number of clients reached\r\n";
2917
2918 /* That's a best effort error message, don't check write errors */
2919 if (write(c->fd,err,strlen(err)) == -1) {
2920 /* Nothing to do, Just to avoid the warning... */
2921 }
2922 freeClient(c);
2923 return;
2924 }
2925 server.stat_numconnections++;
2926 }
2927
2928 /* ======================= Redis objects implementation ===================== */
2929
2930 static robj *createObject(int type, void *ptr) {
2931 robj *o;
2932
2933 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2934 if (listLength(server.objfreelist)) {
2935 listNode *head = listFirst(server.objfreelist);
2936 o = listNodeValue(head);
2937 listDelNode(server.objfreelist,head);
2938 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2939 } else {
2940 if (server.vm_enabled) {
2941 pthread_mutex_unlock(&server.obj_freelist_mutex);
2942 o = zmalloc(sizeof(*o));
2943 } else {
2944 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2945 }
2946 }
2947 o->type = type;
2948 o->encoding = REDIS_ENCODING_RAW;
2949 o->ptr = ptr;
2950 o->refcount = 1;
2951 if (server.vm_enabled) {
2952 /* Note that this code may run in the context of an I/O thread
2953 * and accessing to server.unixtime in theory is an error
2954 * (no locks). But in practice this is safe, and even if we read
2955 * garbage Redis will not fail, as it's just a statistical info */
2956 o->vm.atime = server.unixtime;
2957 o->storage = REDIS_VM_MEMORY;
2958 }
2959 return o;
2960 }
2961
2962 static robj *createStringObject(char *ptr, size_t len) {
2963 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2964 }
2965
2966 static robj *createStringObjectFromLongLong(long long value) {
2967 robj *o;
2968 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2969 incrRefCount(shared.integers[value]);
2970 o = shared.integers[value];
2971 } else {
2972 if (value >= LONG_MIN && value <= LONG_MAX) {
2973 o = createObject(REDIS_STRING, NULL);
2974 o->encoding = REDIS_ENCODING_INT;
2975 o->ptr = (void*)((long)value);
2976 } else {
2977 o = createObject(REDIS_STRING,sdsfromlonglong(value));
2978 }
2979 }
2980 return o;
2981 }
2982
2983 static robj *dupStringObject(robj *o) {
2984 assert(o->encoding == REDIS_ENCODING_RAW);
2985 return createStringObject(o->ptr,sdslen(o->ptr));
2986 }
2987
2988 static robj *createListObject(void) {
2989 list *l = listCreate();
2990
2991 listSetFreeMethod(l,decrRefCount);
2992 return createObject(REDIS_LIST,l);
2993 }
2994
2995 static robj *createSetObject(void) {
2996 dict *d = dictCreate(&setDictType,NULL);
2997 return createObject(REDIS_SET,d);
2998 }
2999
3000 static robj *createHashObject(void) {
3001 /* All the Hashes start as zipmaps. Will be automatically converted
3002 * into hash tables if there are enough elements or big elements
3003 * inside. */
3004 unsigned char *zm = zipmapNew();
3005 robj *o = createObject(REDIS_HASH,zm);
3006 o->encoding = REDIS_ENCODING_ZIPMAP;
3007 return o;
3008 }
3009
3010 static robj *createZsetObject(void) {
3011 zset *zs = zmalloc(sizeof(*zs));
3012
3013 zs->dict = dictCreate(&zsetDictType,NULL);
3014 zs->zsl = zslCreate();
3015 return createObject(REDIS_ZSET,zs);
3016 }
3017
3018 static void freeStringObject(robj *o) {
3019 if (o->encoding == REDIS_ENCODING_RAW) {
3020 sdsfree(o->ptr);
3021 }
3022 }
3023
3024 static void freeListObject(robj *o) {
3025 listRelease((list*) o->ptr);
3026 }
3027
3028 static void freeSetObject(robj *o) {
3029 dictRelease((dict*) o->ptr);
3030 }
3031
3032 static void freeZsetObject(robj *o) {
3033 zset *zs = o->ptr;
3034
3035 dictRelease(zs->dict);
3036 zslFree(zs->zsl);
3037 zfree(zs);
3038 }
3039
3040 static void freeHashObject(robj *o) {
3041 switch (o->encoding) {
3042 case REDIS_ENCODING_HT:
3043 dictRelease((dict*) o->ptr);
3044 break;
3045 case REDIS_ENCODING_ZIPMAP:
3046 zfree(o->ptr);
3047 break;
3048 default:
3049 redisPanic("Unknown hash encoding type");
3050 break;
3051 }
3052 }
3053
3054 static void incrRefCount(robj *o) {
3055 o->refcount++;
3056 }
3057
3058 static void decrRefCount(void *obj) {
3059 robj *o = obj;
3060
3061 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3062 /* Object is a key of a swapped out value, or in the process of being
3063 * loaded. */
3064 if (server.vm_enabled &&
3065 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3066 {
3067 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
3068 redisAssert(o->type == REDIS_STRING);
3069 freeStringObject(o);
3070 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
3071 pthread_mutex_lock(&server.obj_freelist_mutex);
3072 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3073 !listAddNodeHead(server.objfreelist,o))
3074 zfree(o);
3075 pthread_mutex_unlock(&server.obj_freelist_mutex);
3076 server.vm_stats_swapped_objects--;
3077 return;
3078 }
3079 /* Object is in memory, or in the process of being swapped out. */
3080 if (--(o->refcount) == 0) {
3081 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3082 vmCancelThreadedIOJob(obj);
3083 switch(o->type) {
3084 case REDIS_STRING: freeStringObject(o); break;
3085 case REDIS_LIST: freeListObject(o); break;
3086 case REDIS_SET: freeSetObject(o); break;
3087 case REDIS_ZSET: freeZsetObject(o); break;
3088 case REDIS_HASH: freeHashObject(o); break;
3089 default: redisPanic("Unknown object type"); break;
3090 }
3091 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3092 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3093 !listAddNodeHead(server.objfreelist,o))
3094 zfree(o);
3095 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3096 }
3097 }
3098
3099 static robj *lookupKey(redisDb *db, robj *key) {
3100 dictEntry *de = dictFind(db->dict,key);
3101 if (de) {
3102 robj *key = dictGetEntryKey(de);
3103 robj *val = dictGetEntryVal(de);
3104
3105 if (server.vm_enabled) {
3106 if (key->storage == REDIS_VM_MEMORY ||
3107 key->storage == REDIS_VM_SWAPPING)
3108 {
3109 /* If we were swapping the object out, stop it, this key
3110 * was requested. */
3111 if (key->storage == REDIS_VM_SWAPPING)
3112 vmCancelThreadedIOJob(key);
3113 /* Update the access time of the key for the aging algorithm. */
3114 key->vm.atime = server.unixtime;
3115 } else {
3116 int notify = (key->storage == REDIS_VM_LOADING);
3117
3118 /* Our value was swapped on disk. Bring it at home. */
3119 redisAssert(val == NULL);
3120 val = vmLoadObject(key);
3121 dictGetEntryVal(de) = val;
3122
3123 /* Clients blocked by the VM subsystem may be waiting for
3124 * this key... */
3125 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3126 }
3127 }
3128 return val;
3129 } else {
3130 return NULL;
3131 }
3132 }
3133
3134 static robj *lookupKeyRead(redisDb *db, robj *key) {
3135 expireIfNeeded(db,key);
3136 return lookupKey(db,key);
3137 }
3138
3139 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3140 deleteIfVolatile(db,key);
3141 touchWatchedKey(db,key);
3142 return lookupKey(db,key);
3143 }
3144
3145 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3146 robj *o = lookupKeyRead(c->db, key);
3147 if (!o) addReply(c,reply);
3148 return o;
3149 }
3150
3151 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3152 robj *o = lookupKeyWrite(c->db, key);
3153 if (!o) addReply(c,reply);
3154 return o;
3155 }
3156
3157 static int checkType(redisClient *c, robj *o, int type) {
3158 if (o->type != type) {
3159 addReply(c,shared.wrongtypeerr);
3160 return 1;
3161 }
3162 return 0;
3163 }
3164
3165 static int deleteKey(redisDb *db, robj *key) {
3166 int retval;
3167
3168 /* We need to protect key from destruction: after the first dictDelete()
3169 * it may happen that 'key' is no longer valid if we don't increment
3170 * it's count. This may happen when we get the object reference directly
3171 * from the hash table with dictRandomKey() or dict iterators */
3172 incrRefCount(key);
3173 if (dictSize(db->expires)) dictDelete(db->expires,key);
3174 retval = dictDelete(db->dict,key);
3175 decrRefCount(key);
3176
3177 return retval == DICT_OK;
3178 }
3179
3180 /* Check if the nul-terminated string 's' can be represented by a long
3181 * (that is, is a number that fits into long without any other space or
3182 * character before or after the digits).
3183 *
3184 * If so, the function returns REDIS_OK and *longval is set to the value
3185 * of the number. Otherwise REDIS_ERR is returned */
3186 static int isStringRepresentableAsLong(sds s, long *longval) {
3187 char buf[32], *endptr;
3188 long value;
3189 int slen;
3190
3191 value = strtol(s, &endptr, 10);
3192 if (endptr[0] != '\0') return REDIS_ERR;
3193 slen = ll2string(buf,32,value);
3194
3195 /* If the number converted back into a string is not identical
3196 * then it's not possible to encode the string as integer */
3197 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3198 if (longval) *longval = value;
3199 return REDIS_OK;
3200 }
3201
3202 /* Try to encode a string object in order to save space */
3203 static robj *tryObjectEncoding(robj *o) {
3204 long value;
3205 sds s = o->ptr;
3206
3207 if (o->encoding != REDIS_ENCODING_RAW)
3208 return o; /* Already encoded */
3209
3210 /* It's not safe to encode shared objects: shared objects can be shared
3211 * everywhere in the "object space" of Redis. Encoded objects can only
3212 * appear as "values" (and not, for instance, as keys) */
3213 if (o->refcount > 1) return o;
3214
3215 /* Currently we try to encode only strings */
3216 redisAssert(o->type == REDIS_STRING);
3217
3218 /* Check if we can represent this string as a long integer */
3219 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3220
3221 /* Ok, this object can be encoded */
3222 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3223 decrRefCount(o);
3224 incrRefCount(shared.integers[value]);
3225 return shared.integers[value];
3226 } else {
3227 o->encoding = REDIS_ENCODING_INT;
3228 sdsfree(o->ptr);
3229 o->ptr = (void*) value;
3230 return o;
3231 }
3232 }
3233
3234 /* Get a decoded version of an encoded object (returned as a new object).
3235 * If the object is already raw-encoded just increment the ref count. */
3236 static robj *getDecodedObject(robj *o) {
3237 robj *dec;
3238
3239 if (o->encoding == REDIS_ENCODING_RAW) {
3240 incrRefCount(o);
3241 return o;
3242 }
3243 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3244 char buf[32];
3245
3246 ll2string(buf,32,(long)o->ptr);
3247 dec = createStringObject(buf,strlen(buf));
3248 return dec;
3249 } else {
3250 redisPanic("Unknown encoding type");
3251 }
3252 }
3253
3254 /* Compare two string objects via strcmp() or alike.
3255 * Note that the objects may be integer-encoded. In such a case we
3256 * use ll2string() to get a string representation of the numbers on the stack
3257 * and compare the strings, it's much faster than calling getDecodedObject().
3258 *
3259 * Important note: if objects are not integer encoded, but binary-safe strings,
3260 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3261 * binary safe. */
3262 static int compareStringObjects(robj *a, robj *b) {
3263 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3264 char bufa[128], bufb[128], *astr, *bstr;
3265 int bothsds = 1;
3266
3267 if (a == b) return 0;
3268 if (a->encoding != REDIS_ENCODING_RAW) {
3269 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3270 astr = bufa;
3271 bothsds = 0;
3272 } else {
3273 astr = a->ptr;
3274 }
3275 if (b->encoding != REDIS_ENCODING_RAW) {
3276 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3277 bstr = bufb;
3278 bothsds = 0;
3279 } else {
3280 bstr = b->ptr;
3281 }
3282 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3283 }
3284
3285 /* Equal string objects return 1 if the two objects are the same from the
3286 * point of view of a string comparison, otherwise 0 is returned. Note that
3287 * this function is faster then checking for (compareStringObject(a,b) == 0)
3288 * because it can perform some more optimization. */
3289 static int equalStringObjects(robj *a, robj *b) {
3290 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3291 return a->ptr == b->ptr;
3292 } else {
3293 return compareStringObjects(a,b) == 0;
3294 }
3295 }
3296
3297 static size_t stringObjectLen(robj *o) {
3298 redisAssert(o->type == REDIS_STRING);
3299 if (o->encoding == REDIS_ENCODING_RAW) {
3300 return sdslen(o->ptr);
3301 } else {
3302 char buf[32];
3303
3304 return ll2string(buf,32,(long)o->ptr);
3305 }
3306 }
3307
3308 static int getDoubleFromObject(robj *o, double *target) {
3309 double value;
3310 char *eptr;
3311
3312 if (o == NULL) {
3313 value = 0;
3314 } else {
3315 redisAssert(o->type == REDIS_STRING);
3316 if (o->encoding == REDIS_ENCODING_RAW) {
3317 value = strtod(o->ptr, &eptr);
3318 if (eptr[0] != '\0') return REDIS_ERR;
3319 } else if (o->encoding == REDIS_ENCODING_INT) {
3320 value = (long)o->ptr;
3321 } else {
3322 redisPanic("Unknown string encoding");
3323 }
3324 }
3325
3326 *target = value;
3327 return REDIS_OK;
3328 }
3329
3330 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3331 double value;
3332 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3333 if (msg != NULL) {
3334 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3335 } else {
3336 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3337 }
3338 return REDIS_ERR;
3339 }
3340
3341 *target = value;
3342 return REDIS_OK;
3343 }
3344
3345 static int getLongLongFromObject(robj *o, long long *target) {
3346 long long value;
3347 char *eptr;
3348
3349 if (o == NULL) {
3350 value = 0;
3351 } else {
3352 redisAssert(o->type == REDIS_STRING);
3353 if (o->encoding == REDIS_ENCODING_RAW) {
3354 value = strtoll(o->ptr, &eptr, 10);
3355 if (eptr[0] != '\0') return REDIS_ERR;
3356 } else if (o->encoding == REDIS_ENCODING_INT) {
3357 value = (long)o->ptr;
3358 } else {
3359 redisPanic("Unknown string encoding");
3360 }
3361 }
3362
3363 *target = value;
3364 return REDIS_OK;
3365 }
3366
3367 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3368 long long value;
3369 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3370 if (msg != NULL) {
3371 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3372 } else {
3373 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3374 }
3375 return REDIS_ERR;
3376 }
3377
3378 *target = value;
3379 return REDIS_OK;
3380 }
3381
3382 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3383 long long value;
3384
3385 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3386 if (value < LONG_MIN || value > LONG_MAX) {
3387 if (msg != NULL) {
3388 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3389 } else {
3390 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3391 }
3392 return REDIS_ERR;
3393 }
3394
3395 *target = value;
3396 return REDIS_OK;
3397 }
3398
3399 /*============================ RDB saving/loading =========================== */
3400
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * write failed. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3405
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 if the write failed. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;
    size_t written = fwrite(&t32,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3411
3412 /* check rdbLoadLen() comments for more info */
3413 static int rdbSaveLen(FILE *fp, uint32_t len) {
3414 unsigned char buf[2];
3415
3416 if (len < (1<<6)) {
3417 /* Save a 6 bit len */
3418 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3419 if (fwrite(buf,1,1,fp) == 0) return -1;
3420 } else if (len < (1<<14)) {
3421 /* Save a 14 bit len */
3422 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3423 buf[1] = len&0xFF;
3424 if (fwrite(buf,2,1,fp) == 0) return -1;
3425 } else {
3426 /* Save a 32 bit len */
3427 buf[0] = (REDIS_RDB_32BITLEN<<6);
3428 if (fwrite(buf,1,1,fp) == 0) return -1;
3429 len = htonl(len);
3430 if (fwrite(&len,4,1,fp) == 0) return -1;
3431 }
3432 return 0;
3433 }
3434
3435 /* Encode 'value' as an integer if possible (if integer will fit the
3436 * supported range). If the function sucessful encoded the integer
3437 * then the (up to 5 bytes) encoded representation is written in the
3438 * string pointed by 'enc' and the length is returned. Otherwise
3439 * 0 is returned. */
3440 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3441 /* Finally check if it fits in our ranges */
3442 if (value >= -(1<<7) && value <= (1<<7)-1) {
3443 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3444 enc[1] = value&0xFF;
3445 return 2;
3446 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3447 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3448 enc[1] = value&0xFF;
3449 enc[2] = (value>>8)&0xFF;
3450 return 3;
3451 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3452 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3453 enc[1] = value&0xFF;
3454 enc[2] = (value>>8)&0xFF;
3455 enc[3] = (value>>16)&0xFF;
3456 enc[4] = (value>>24)&0xFF;
3457 return 5;
3458 } else {
3459 return 0;
3460 }
3461 }
3462
/* String objects in the form "2391" "-100", without any space and with a
 * value that can fit in an 8, 16 or 32 bit signed integer, can be encoded
 * as integers to save space.
 *
 * 's' is parsed with strtoll(), so it must be nul terminated; 'len' is its
 * length. Returns the value returned by rdbEncodeInteger() (the encoding
 * is written in 'enc'), or 0 if the string is not the canonical
 * representation of a number. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    char roundtrip[32], *end;
    long long parsed;

    /* Check if it's possible to parse the whole string as a number */
    parsed = strtoll(s, &end, 10);
    if (end[0] != '\0') return 0;

    /* Convert the number back into a string: unless the result is
     * identical to the input, the string can't be encoded as an integer. */
    ll2string(roundtrip,32,parsed);
    if (strlen(roundtrip) != len) return 0;
    if (memcmp(roundtrip,s,len) != 0) return 0;

    return rdbEncodeInteger(parsed,enc);
}
3481
3482 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3483 size_t comprlen, outlen;
3484 unsigned char byte;
3485 void *out;
3486
3487 /* We require at least four bytes compression for this to be worth it */
3488 if (len <= 4) return 0;
3489 outlen = len-4;
3490 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3491 comprlen = lzf_compress(s, len, out, outlen);
3492 if (comprlen == 0) {
3493 zfree(out);
3494 return 0;
3495 }
3496 /* Data compressed! Let's save it on disk */
3497 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3498 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3499 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3500 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3501 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3502 zfree(out);
3503 return comprlen;
3504
3505 writeerr:
3506 zfree(out);
3507 return -1;
3508 }
3509
3510 /* Save a string objet as [len][data] on disk. If the object is a string
3511 * representation of an integer value we try to safe it in a special form */
3512 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3513 int enclen;
3514
3515 /* Try integer encoding */
3516 if (len <= 11) {
3517 unsigned char buf[5];
3518 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3519 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3520 return 0;
3521 }
3522 }
3523
3524 /* Try LZF compression - under 20 bytes it's unable to compress even
3525 * aaaaaaaaaaaaaaaaaa so skip it */
3526 if (server.rdbcompression && len > 20) {
3527 int retval;
3528
3529 retval = rdbSaveLzfStringObject(fp,s,len);
3530 if (retval == -1) return -1;
3531 if (retval > 0) return 0;
3532 /* retval == 0 means data can't be compressed, save the old way */
3533 }
3534
3535 /* Store verbatim */
3536 if (rdbSaveLen(fp,len) == -1) return -1;
3537 if (len && fwrite(s,len,1,fp) == 0) return -1;
3538 return 0;
3539 }
3540
3541 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3542 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3543 int retval;
3544
3545 /* Avoid to decode the object, then encode it again, if the
3546 * object is alrady integer encoded. */
3547 if (obj->encoding == REDIS_ENCODING_INT) {
3548 long val = (long) obj->ptr;
3549 unsigned char buf[5];
3550 int enclen;
3551
3552 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3553 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3554 return 0;
3555 }
3556 /* otherwise... fall throught and continue with the usual
3557 * code path. */
3558 }
3559
3560 /* Avoid incr/decr ref count business when possible.
3561 * This plays well with copy-on-write given that we are probably
3562 * in a child process (BGSAVE). Also this makes sure key objects
3563 * of swapped objects are not incRefCount-ed (an assert does not allow
3564 * this in order to avoid bugs) */
3565 if (obj->encoding != REDIS_ENCODING_RAW) {
3566 obj = getDecodedObject(obj);
3567 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3568 decrRefCount(obj);
3569 } else {
3570 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3571 }
3572 return retval;
3573 }
3574
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        /* First byte is the length of the textual representation. */
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3617
3618 /* Save a Redis object. */
3619 static int rdbSaveObject(FILE *fp, robj *o) {
3620 if (o->type == REDIS_STRING) {
3621 /* Save a string value */
3622 if (rdbSaveStringObject(fp,o) == -1) return -1;
3623 } else if (o->type == REDIS_LIST) {
3624 /* Save a list value */
3625 list *list = o->ptr;
3626 listIter li;
3627 listNode *ln;
3628
3629 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3630 listRewind(list,&li);
3631 while((ln = listNext(&li))) {
3632 robj *eleobj = listNodeValue(ln);
3633
3634 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3635 }
3636 } else if (o->type == REDIS_SET) {
3637 /* Save a set value */
3638 dict *set = o->ptr;
3639 dictIterator *di = dictGetIterator(set);
3640 dictEntry *de;
3641
3642 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3643 while((de = dictNext(di)) != NULL) {
3644 robj *eleobj = dictGetEntryKey(de);
3645
3646 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3647 }
3648 dictReleaseIterator(di);
3649 } else if (o->type == REDIS_ZSET) {
3650 /* Save a set value */
3651 zset *zs = o->ptr;
3652 dictIterator *di = dictGetIterator(zs->dict);
3653 dictEntry *de;
3654
3655 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3656 while((de = dictNext(di)) != NULL) {
3657 robj *eleobj = dictGetEntryKey(de);
3658 double *score = dictGetEntryVal(de);
3659
3660 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3661 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3662 }
3663 dictReleaseIterator(di);
3664 } else if (o->type == REDIS_HASH) {
3665 /* Save a hash value */
3666 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3667 unsigned char *p = zipmapRewind(o->ptr);
3668 unsigned int count = zipmapLen(o->ptr);
3669 unsigned char *key, *val;
3670 unsigned int klen, vlen;
3671
3672 if (rdbSaveLen(fp,count) == -1) return -1;
3673 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3674 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3675 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3676 }
3677 } else {
3678 dictIterator *di = dictGetIterator(o->ptr);
3679 dictEntry *de;
3680
3681 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3682 while((de = dictNext(di)) != NULL) {
3683 robj *key = dictGetEntryKey(de);
3684 robj *val = dictGetEntryVal(de);
3685
3686 if (rdbSaveStringObject(fp,key) == -1) return -1;
3687 if (rdbSaveStringObject(fp,val) == -1) return -1;
3688 }
3689 dictReleaseIterator(di);
3690 }
3691 } else {
3692 redisPanic("Unknown object type");
3693 }
3694 return 0;
3695 }
3696
3697 /* Return the length the object will have on disk if saved with
3698 * the rdbSaveObject() function. Currently we use a trick to get
3699 * this length with very little changes to the code. In the future
3700 * we could switch to a faster solution. */
3701 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3702 if (fp == NULL) fp = server.devnull;
3703 rewind(fp);
3704 assert(rdbSaveObject(fp,o) != 1);
3705 return ftello(fp);
3706 }
3707
3708 /* Return the number of pages required to save this object in the swap file */
3709 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3710 off_t bytes = rdbSavedObjectLen(o,fp);
3711
3712 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3713 }
3714
3715 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3716 static int rdbSave(char *filename) {
3717 dictIterator *di = NULL;
3718 dictEntry *de;
3719 FILE *fp;
3720 char tmpfile[256];
3721 int j;
3722 time_t now = time(NULL);
3723
3724 /* Wait for I/O therads to terminate, just in case this is a
3725 * foreground-saving, to avoid seeking the swap file descriptor at the
3726 * same time. */
3727 if (server.vm_enabled)
3728 waitEmptyIOJobsQueue();
3729
3730 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3731 fp = fopen(tmpfile,"w");
3732 if (!fp) {
3733 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3734 return REDIS_ERR;
3735 }
3736 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3737 for (j = 0; j < server.dbnum; j++) {
3738 redisDb *db = server.db+j;
3739 dict *d = db->dict;
3740 if (dictSize(d) == 0) continue;
3741 di = dictGetIterator(d);
3742 if (!di) {
3743 fclose(fp);
3744 return REDIS_ERR;
3745 }
3746
3747 /* Write the SELECT DB opcode */
3748 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3749 if (rdbSaveLen(fp,j) == -1) goto werr;
3750
3751 /* Iterate this DB writing every entry */
3752 while((de = dictNext(di)) != NULL) {
3753 robj *key = dictGetEntryKey(de);
3754 robj *o = dictGetEntryVal(de);
3755 time_t expiretime = getExpire(db,key);
3756
3757 /* Save the expire time */
3758 if (expiretime != -1) {
3759 /* If this key is already expired skip it */
3760 if (expiretime < now) continue;
3761 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3762 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3763 }
3764 /* Save the key and associated value. This requires special
3765 * handling if the value is swapped out. */
3766 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3767 key->storage == REDIS_VM_SWAPPING) {
3768 /* Save type, key, value */
3769 if (rdbSaveType(fp,o->type) == -1) goto werr;
3770 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3771 if (rdbSaveObject(fp,o) == -1) goto werr;
3772 } else {
3773 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3774 robj *po;
3775 /* Get a preview of the object in memory */
3776 po = vmPreviewObject(key);
3777 /* Save type, key, value */
3778 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3779 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3780 if (rdbSaveObject(fp,po) == -1) goto werr;
3781 /* Remove the loaded object from memory */
3782 decrRefCount(po);
3783 }
3784 }
3785 dictReleaseIterator(di);
3786 }
3787 /* EOF opcode */
3788 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3789
3790 /* Make sure data will not remain on the OS's output buffers */
3791 fflush(fp);
3792 fsync(fileno(fp));
3793 fclose(fp);
3794
3795 /* Use RENAME to make sure the DB file is changed atomically only
3796 * if the generate DB file is ok. */
3797 if (rename(tmpfile,filename) == -1) {
3798 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3799 unlink(tmpfile);
3800 return REDIS_ERR;
3801 }
3802 redisLog(REDIS_NOTICE,"DB saved on disk");
3803 server.dirty = 0;
3804 server.lastsave = time(NULL);
3805 return REDIS_OK;
3806
3807 werr:
3808 fclose(fp);
3809 unlink(tmpfile);
3810 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3811 if (di) dictReleaseIterator(di);
3812 return REDIS_ERR;
3813 }
3814
3815 static int rdbSaveBackground(char *filename) {
3816 pid_t childpid;
3817
3818 if (server.bgsavechildpid != -1) return REDIS_ERR;
3819 if (server.vm_enabled) waitEmptyIOJobsQueue();
3820 if ((childpid = fork()) == 0) {
3821 /* Child */
3822 if (server.vm_enabled) vmReopenSwapFile();
3823 close(server.fd);
3824 if (rdbSave(filename) == REDIS_OK) {
3825 _exit(0);
3826 } else {
3827 _exit(1);
3828 }
3829 } else {
3830 /* Parent */
3831 if (childpid == -1) {
3832 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3833 strerror(errno));
3834 return REDIS_ERR;
3835 }
3836 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3837 server.bgsavechildpid = childpid;
3838 updateDictResizePolicy();
3839 return REDIS_OK;
3840 }
3841 return REDIS_OK; /* unreached */
3842 }
3843
/* Remove the temporary dump file "temp-<pid>.rdb" belonging to the
 * process 'childpid'. The result of unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-%d.rdb",pid);
    unlink(path);
}
3850
/* Read a one byte type/opcode from 'fp'.
 * Returns the byte as a non negative int, or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3856
/* Read a time value stored as a raw 32 bit signed integer (in the byte
 * order of the host) from 'fp'. Returns -1 if the read fails. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;
    size_t got = fread(&raw,4,1,fp);

    if (got == 0) return -1;
    return (time_t) raw;
}
3862
3863 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3864 * of this file for a description of how this are stored on disk.
3865 *
3866 * isencoded is set to 1 if the readed length is not actually a length but
3867 * an "encoding type", check the above comments for more info */
3868 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3869 unsigned char buf[2];
3870 uint32_t len;
3871 int type;
3872
3873 if (isencoded) *isencoded = 0;
3874 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3875 type = (buf[0]&0xC0)>>6;
3876 if (type == REDIS_RDB_6BITLEN) {
3877 /* Read a 6 bit len */
3878 return buf[0]&0x3F;
3879 } else if (type == REDIS_RDB_ENCVAL) {
3880 /* Read a 6 bit len encoding type */
3881 if (isencoded) *isencoded = 1;
3882 return buf[0]&0x3F;
3883 } else if (type == REDIS_RDB_14BITLEN) {
3884 /* Read a 14 bit len */
3885 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3886 return ((buf[0]&0x3F)<<8)|buf[1];
3887 } else {
3888 /* Read a 32 bit len */
3889 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3890 return ntohl(len);
3891 }
3892 }
3893
3894 /* Load an integer-encoded object from file 'fp', with the specified
3895 * encoding type 'enctype'. If encode is true the function may return
3896 * an integer-encoded object as reply, otherwise the returned object
3897 * will always be encoded as a raw string. */
3898 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
3899 unsigned char enc[4];
3900 long long val;
3901
3902 if (enctype == REDIS_RDB_ENC_INT8) {
3903 if (fread(enc,1,1,fp) == 0) return NULL;
3904 val = (signed char)enc[0];
3905 } else if (enctype == REDIS_RDB_ENC_INT16) {
3906 uint16_t v;
3907 if (fread(enc,2,1,fp) == 0) return NULL;
3908 v = enc[0]|(enc[1]<<8);
3909 val = (int16_t)v;
3910 } else if (enctype == REDIS_RDB_ENC_INT32) {
3911 uint32_t v;
3912 if (fread(enc,4,1,fp) == 0) return NULL;
3913 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3914 val = (int32_t)v;
3915 } else {
3916 val = 0; /* anti-warning */
3917 redisPanic("Unknown RDB integer encoding type");
3918 }
3919 if (encode)
3920 return createStringObjectFromLongLong(val);
3921 else
3922 return createObject(REDIS_STRING,sdsfromlonglong(val));
3923 }
3924
3925 static robj *rdbLoadLzfStringObject(FILE*fp) {
3926 unsigned int len, clen;
3927 unsigned char *c = NULL;
3928 sds val = NULL;
3929
3930 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3931 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3932 if ((c = zmalloc(clen)) == NULL) goto err;
3933 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3934 if (fread(c,clen,1,fp) == 0) goto err;
3935 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3936 zfree(c);
3937 return createObject(REDIS_STRING,val);
3938 err:
3939 zfree(c);
3940 sdsfree(val);
3941 return NULL;
3942 }
3943
3944 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
3945 int isencoded;
3946 uint32_t len;
3947 sds val;
3948
3949 len = rdbLoadLen(fp,&isencoded);
3950 if (isencoded) {
3951 switch(len) {
3952 case REDIS_RDB_ENC_INT8:
3953 case REDIS_RDB_ENC_INT16:
3954 case REDIS_RDB_ENC_INT32:
3955 return rdbLoadIntegerObject(fp,len,encode);
3956 case REDIS_RDB_ENC_LZF:
3957 return rdbLoadLzfStringObject(fp);
3958 default:
3959 redisPanic("Unknown RDB encoding type");
3960 }
3961 }
3962
3963 if (len == REDIS_RDB_LENERR) return NULL;
3964 val = sdsnewlen(NULL,len);
3965 if (len && fread(val,len,1,fp) == 0) {
3966 sdsfree(val);
3967 return NULL;
3968 }
3969 return createObject(REDIS_STRING,val);
3970 }
3971
3972 static robj *rdbLoadStringObject(FILE *fp) {
3973 return rdbGenericLoadStringObject(fp,0);
3974 }
3975
3976 static robj *rdbLoadEncodedStringObject(FILE *fp) {
3977 return rdbGenericLoadStringObject(fp,1);
3978 }
3979
3980 /* For information about double serialization check rdbSaveDoubleValue() */
3981 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3982 char buf[128];
3983 unsigned char len;
3984
3985 if (fread(&len,1,1,fp) == 0) return -1;
3986 switch(len) {
3987 case 255: *val = R_NegInf; return 0;
3988 case 254: *val = R_PosInf; return 0;
3989 case 253: *val = R_Nan; return 0;
3990 default:
3991 if (fread(buf,len,1,fp) == 0) return -1;
3992 buf[len] = '\0';
3993 sscanf(buf, "%lg", val);
3994 return 0;
3995 }
3996 }
3997
3998 /* Load a Redis object of the specified type from the specified file.
3999 * On success a newly allocated object is returned, otherwise NULL. */
4000 static robj *rdbLoadObject(int type, FILE *fp) {
4001 robj *o;
4002
4003 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4004 if (type == REDIS_STRING) {
4005 /* Read string value */
4006 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4007 o = tryObjectEncoding(o);
4008 } else if (type == REDIS_LIST || type == REDIS_SET) {
4009 /* Read list/set value */
4010 uint32_t listlen;
4011
4012 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4013 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
4014 /* It's faster to expand the dict to the right size asap in order
4015 * to avoid rehashing */
4016 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4017 dictExpand(o->ptr,listlen);
4018 /* Load every single element of the list/set */
4019 while(listlen--) {
4020 robj *ele;
4021
4022 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4023 ele = tryObjectEncoding(ele);
4024 if (type == REDIS_LIST) {
4025 listAddNodeTail((list*)o->ptr,ele);
4026 } else {
4027 dictAdd((dict*)o->ptr,ele,NULL);
4028 }
4029 }
4030 } else if (type == REDIS_ZSET) {
4031 /* Read list/set value */
4032 size_t zsetlen;
4033 zset *zs;
4034
4035 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4036 o = createZsetObject();
4037 zs = o->ptr;
4038 /* Load every single element of the list/set */
4039 while(zsetlen--) {
4040 robj *ele;
4041 double *score = zmalloc(sizeof(double));
4042
4043 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4044 ele = tryObjectEncoding(ele);
4045 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4046 dictAdd(zs->dict,ele,score);
4047 zslInsert(zs->zsl,*score,ele);
4048 incrRefCount(ele); /* added to skiplist */
4049 }
4050 } else if (type == REDIS_HASH) {
4051 size_t hashlen;
4052
4053 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4054 o = createHashObject();
4055 /* Too many entries? Use an hash table. */
4056 if (hashlen > server.hash_max_zipmap_entries)
4057 convertToRealHash(o);
4058 /* Load every key/value, then set it into the zipmap or hash
4059 * table, as needed. */
4060 while(hashlen--) {
4061 robj *key, *val;
4062
4063 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4064 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4065 /* If we are using a zipmap and there are too big values
4066 * the object is converted to real hash table encoding. */
4067 if (o->encoding != REDIS_ENCODING_HT &&
4068 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4069 sdslen(val->ptr) > server.hash_max_zipmap_value))
4070 {
4071 convertToRealHash(o);
4072 }
4073
4074 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4075 unsigned char *zm = o->ptr;
4076
4077 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4078 val->ptr,sdslen(val->ptr),NULL);
4079 o->ptr = zm;
4080 decrRefCount(key);
4081 decrRefCount(val);
4082 } else {
4083 key = tryObjectEncoding(key);
4084 val = tryObjectEncoding(val);
4085 dictAdd((dict*)o->ptr,key,val);
4086 }
4087 }
4088 } else {
4089 redisPanic("Unknown object type");
4090 }
4091 return o;
4092 }
4093
4094 static int rdbLoad(char *filename) {
4095 FILE *fp;
4096 uint32_t dbid;
4097 int type, retval, rdbver;
4098 int swap_all_values = 0;
4099 dict *d = server.db[0].dict;
4100 redisDb *db = server.db+0;
4101 char buf[1024];
4102 time_t expiretime, now = time(NULL);
4103 long long loadedkeys = 0;
4104
4105 fp = fopen(filename,"r");
4106 if (!fp) return REDIS_ERR;
4107 if (fread(buf,9,1,fp) == 0) goto eoferr;
4108 buf[9] = '\0';
4109 if (memcmp(buf,"REDIS",5) != 0) {
4110 fclose(fp);
4111 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4112 return REDIS_ERR;
4113 }
4114 rdbver = atoi(buf+5);
4115 if (rdbver != 1) {
4116 fclose(fp);
4117 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4118 return REDIS_ERR;
4119 }
4120 while(1) {
4121 robj *key, *val;
4122
4123 expiretime = -1;
4124 /* Read type. */
4125 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4126 if (type == REDIS_EXPIRETIME) {
4127 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4128 /* We read the time so we need to read the object type again */
4129 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4130 }
4131 if (type == REDIS_EOF) break;
4132 /* Handle SELECT DB opcode as a special case */
4133 if (type == REDIS_SELECTDB) {
4134 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4135 goto eoferr;
4136 if (dbid >= (unsigned)server.dbnum) {
4137 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4138 exit(1);
4139 }
4140 db = server.db+dbid;
4141 d = db->dict;
4142 continue;
4143 }
4144 /* Read key */
4145 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4146 /* Read value */
4147 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4148 /* Check if the key already expired */
4149 if (expiretime != -1 && expiretime < now) {
4150 decrRefCount(key);
4151 decrRefCount(val);
4152 continue;
4153 }
4154 /* Add the new object in the hash table */
4155 retval = dictAdd(d,key,val);
4156 if (retval == DICT_ERR) {
4157 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4158 exit(1);
4159 }
4160 loadedkeys++;
4161 /* Set the expire time if needed */
4162 if (expiretime != -1) setExpire(db,key,expiretime);
4163
4164 /* Handle swapping while loading big datasets when VM is on */
4165
4166 /* If we detecter we are hopeless about fitting something in memory
4167 * we just swap every new key on disk. Directly...
4168 * Note that's important to check for this condition before resorting
4169 * to random sampling, otherwise we may try to swap already
4170 * swapped keys. */
4171 if (swap_all_values) {
4172 dictEntry *de = dictFind(d,key);
4173
4174 /* de may be NULL since the key already expired */
4175 if (de) {
4176 key = dictGetEntryKey(de);
4177 val = dictGetEntryVal(de);
4178
4179 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
4180 dictGetEntryVal(de) = NULL;
4181 }
4182 }
4183 continue;
4184 }
4185
4186 /* If we have still some hope of having some value fitting memory
4187 * then we try random sampling. */
4188 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
4189 while (zmalloc_used_memory() > server.vm_max_memory) {
4190 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4191 }
4192 if (zmalloc_used_memory() > server.vm_max_memory)
4193 swap_all_values = 1; /* We are already using too much mem */
4194 }
4195 }
4196 fclose(fp);
4197 return REDIS_OK;
4198
4199 eoferr: /* unexpected end of file is handled here with a fatal exit */
4200 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4201 exit(1);
4202 return REDIS_ERR; /* Just to avoid warning */
4203 }
4204
4205 /*================================== Shutdown =============================== */
4206 static int prepareForShutdown() {
4207 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4208 /* Kill the saving child if there is a background saving in progress.
4209 We want to avoid race conditions, for instance our saving child may
4210 overwrite the synchronous saving did by SHUTDOWN. */
4211 if (server.bgsavechildpid != -1) {
4212 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4213 kill(server.bgsavechildpid,SIGKILL);
4214 rdbRemoveTempFile(server.bgsavechildpid);
4215 }
4216 if (server.appendonly) {
4217 /* Append only file: fsync() the AOF and exit */
4218 fsync(server.appendfd);
4219 if (server.vm_enabled) unlink(server.vm_swap_file);
4220 } else {
4221 /* Snapshotting. Perform a SYNC SAVE and exit */
4222 if (rdbSave(server.dbfilename) == REDIS_OK) {
4223 if (server.daemonize)
4224 unlink(server.pidfile);
4225 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4226 } else {
4227 /* Ooops.. error saving! The best we can do is to continue
4228 * operating. Note that if there was a background saving process,
4229 * in the next cron() Redis will be notified that the background
4230 * saving aborted, handling special stuff like slaves pending for
4231 * synchronization... */
4232 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4233 return REDIS_ERR;
4234 }
4235 }
4236 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4237 return REDIS_OK;
4238 }
4239
4240 /*================================== Commands =============================== */
4241
4242 static void authCommand(redisClient *c) {
4243 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4244 c->authenticated = 1;
4245 addReply(c,shared.ok);
4246 } else {
4247 c->authenticated = 0;
4248 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4249 }
4250 }
4251
4252 static void pingCommand(redisClient *c) {
4253 addReply(c,shared.pong);
4254 }
4255
4256 static void echoCommand(redisClient *c) {
4257 addReplyBulk(c,c->argv[1]);
4258 }
4259
4260 /*=================================== Strings =============================== */
4261
4262 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4263 int retval;
4264 long seconds = 0; /* initialized to avoid an harmness warning */
4265
4266 if (expire) {
4267 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4268 return;
4269 if (seconds <= 0) {
4270 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4271 return;
4272 }
4273 }
4274
4275 touchWatchedKey(c->db,key);
4276 if (nx) deleteIfVolatile(c->db,key);
4277 retval = dictAdd(c->db->dict,key,val);
4278 if (retval == DICT_ERR) {
4279 if (!nx) {
4280 /* If the key is about a swapped value, we want a new key object
4281 * to overwrite the old. So we delete the old key in the database.
4282 * This will also make sure that swap pages about the old object
4283 * will be marked as free. */
4284 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4285 incrRefCount(key);
4286 dictReplace(c->db->dict,key,val);
4287 incrRefCount(val);
4288 } else {
4289 addReply(c,shared.czero);
4290 return;
4291 }
4292 } else {
4293 incrRefCount(key);
4294 incrRefCount(val);
4295 }
4296 server.dirty++;
4297 removeExpire(c->db,key);
4298 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4299 addReply(c, nx ? shared.cone : shared.ok);
4300 }
4301
4302 static void setCommand(redisClient *c) {
4303 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4304 }
4305
4306 static void setnxCommand(redisClient *c) {
4307 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4308 }
4309
4310 static void setexCommand(redisClient *c) {
4311 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4312 }
4313
4314 static int getGenericCommand(redisClient *c) {
4315 robj *o;
4316
4317 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4318 return REDIS_OK;
4319
4320 if (o->type != REDIS_STRING) {
4321 addReply(c,shared.wrongtypeerr);
4322 return REDIS_ERR;
4323 } else {
4324 addReplyBulk(c,o);
4325 return REDIS_OK;
4326 }
4327 }
4328
4329 static void getCommand(redisClient *c) {
4330 getGenericCommand(c);
4331 }
4332
4333 static void getsetCommand(redisClient *c) {
4334 if (getGenericCommand(c) == REDIS_ERR) return;
4335 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4336 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4337 } else {
4338 incrRefCount(c->argv[1]);
4339 }
4340 incrRefCount(c->argv[2]);
4341 server.dirty++;
4342 removeExpire(c->db,c->argv[1]);
4343 }
4344
4345 static void mgetCommand(redisClient *c) {
4346 int j;
4347
4348 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4349 for (j = 1; j < c->argc; j++) {
4350 robj *o = lookupKeyRead(c->db,c->argv[j]);
4351 if (o == NULL) {
4352 addReply(c,shared.nullbulk);
4353 } else {
4354 if (o->type != REDIS_STRING) {
4355 addReply(c,shared.nullbulk);
4356 } else {
4357 addReplyBulk(c,o);
4358 }
4359 }
4360 }
4361 }
4362
4363 static void msetGenericCommand(redisClient *c, int nx) {
4364 int j, busykeys = 0;
4365
4366 if ((c->argc % 2) == 0) {
4367 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4368 return;
4369 }
4370 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4371 * set nothing at all if at least one already key exists. */
4372 if (nx) {
4373 for (j = 1; j < c->argc; j += 2) {
4374 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4375 busykeys++;
4376 }
4377 }
4378 }
4379 if (busykeys) {
4380 addReply(c, shared.czero);
4381 return;
4382 }
4383
4384 for (j = 1; j < c->argc; j += 2) {
4385 int retval;
4386
4387 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4388 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4389 if (retval == DICT_ERR) {
4390 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4391 incrRefCount(c->argv[j+1]);
4392 } else {
4393 incrRefCount(c->argv[j]);
4394 incrRefCount(c->argv[j+1]);
4395 }
4396 removeExpire(c->db,c->argv[j]);
4397 }
4398 server.dirty += (c->argc-1)/2;
4399 addReply(c, nx ? shared.cone : shared.ok);
4400 }
4401
4402 static void msetCommand(redisClient *c) {
4403 msetGenericCommand(c,0);
4404 }
4405
4406 static void msetnxCommand(redisClient *c) {
4407 msetGenericCommand(c,1);
4408 }
4409
4410 static void incrDecrCommand(redisClient *c, long long incr) {
4411 long long value;
4412 int retval;
4413 robj *o;
4414
4415 o = lookupKeyWrite(c->db,c->argv[1]);
4416 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4417 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4418
4419 value += incr;
4420 o = createStringObjectFromLongLong(value);
4421 retval = dictAdd(c->db->dict,c->argv[1],o);
4422 if (retval == DICT_ERR) {
4423 dictReplace(c->db->dict,c->argv[1],o);
4424 removeExpire(c->db,c->argv[1]);
4425 } else {
4426 incrRefCount(c->argv[1]);
4427 }
4428 server.dirty++;
4429 addReply(c,shared.colon);
4430 addReply(c,o);
4431 addReply(c,shared.crlf);
4432 }
4433
4434 static void incrCommand(redisClient *c) {
4435 incrDecrCommand(c,1);
4436 }
4437
4438 static void decrCommand(redisClient *c) {
4439 incrDecrCommand(c,-1);
4440 }
4441
4442 static void incrbyCommand(redisClient *c) {
4443 long long incr;
4444
4445 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4446 incrDecrCommand(c,incr);
4447 }
4448
4449 static void decrbyCommand(redisClient *c) {
4450 long long incr;
4451
4452 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4453 incrDecrCommand(c,-incr);
4454 }
4455
4456 static void appendCommand(redisClient *c) {
4457 int retval;
4458 size_t totlen;
4459 robj *o;
4460
4461 o = lookupKeyWrite(c->db,c->argv[1]);
4462 if (o == NULL) {
4463 /* Create the key */
4464 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4465 incrRefCount(c->argv[1]);
4466 incrRefCount(c->argv[2]);
4467 totlen = stringObjectLen(c->argv[2]);
4468 } else {
4469 dictEntry *de;
4470
4471 de = dictFind(c->db->dict,c->argv[1]);
4472 assert(de != NULL);
4473
4474 o = dictGetEntryVal(de);
4475 if (o->type != REDIS_STRING) {
4476 addReply(c,shared.wrongtypeerr);
4477 return;
4478 }
4479 /* If the object is specially encoded or shared we have to make
4480 * a copy */
4481 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4482 robj *decoded = getDecodedObject(o);
4483
4484 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4485 decrRefCount(decoded);
4486 dictReplace(c->db->dict,c->argv[1],o);
4487 }
4488 /* APPEND! */
4489 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4490 o->ptr = sdscatlen(o->ptr,
4491 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4492 } else {
4493 o->ptr = sdscatprintf(o->ptr, "%ld",
4494 (unsigned long) c->argv[2]->ptr);
4495 }
4496 totlen = sdslen(o->ptr);
4497 }
4498 server.dirty++;
4499 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4500 }
4501
4502 static void substrCommand(redisClient *c) {
4503 robj *o;
4504 long start = atoi(c->argv[2]->ptr);
4505 long end = atoi(c->argv[3]->ptr);
4506 size_t rangelen, strlen;
4507 sds range;
4508
4509 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4510 checkType(c,o,REDIS_STRING)) return;
4511
4512 o = getDecodedObject(o);
4513 strlen = sdslen(o->ptr);
4514
4515 /* convert negative indexes */
4516 if (start < 0) start = strlen+start;
4517 if (end < 0) end = strlen+end;
4518 if (start < 0) start = 0;
4519 if (end < 0) end = 0;
4520
4521 /* indexes sanity checks */
4522 if (start > end || (size_t)start >= strlen) {
4523 /* Out of range start or start > end result in null reply */
4524 addReply(c,shared.nullbulk);
4525 decrRefCount(o);
4526 return;
4527 }
4528 if ((size_t)end >= strlen) end = strlen-1;
4529 rangelen = (end-start)+1;
4530
4531 /* Return the result */
4532 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4533 range = sdsnewlen((char*)o->ptr+start,rangelen);
4534 addReplySds(c,range);
4535 addReply(c,shared.crlf);
4536 decrRefCount(o);
4537 }
4538
4539 /* ========================= Type agnostic commands ========================= */
4540
4541 static void delCommand(redisClient *c) {
4542 int deleted = 0, j;
4543
4544 for (j = 1; j < c->argc; j++) {
4545 if (deleteKey(c->db,c->argv[j])) {
4546 touchWatchedKey(c->db,c->argv[j]);
4547 server.dirty++;
4548 deleted++;
4549 }
4550 }
4551 addReplyLongLong(c,deleted);
4552 }
4553
4554 static void existsCommand(redisClient *c) {
4555 expireIfNeeded(c->db,c->argv[1]);
4556 if (dictFind(c->db->dict,c->argv[1])) {
4557 addReply(c, shared.cone);
4558 } else {
4559 addReply(c, shared.czero);
4560 }
4561 }
4562
4563 static void selectCommand(redisClient *c) {
4564 int id = atoi(c->argv[1]->ptr);
4565
4566 if (selectDb(c,id) == REDIS_ERR) {
4567 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4568 } else {
4569 addReply(c,shared.ok);
4570 }
4571 }
4572
4573 static void randomkeyCommand(redisClient *c) {
4574 dictEntry *de;
4575 robj *key;
4576
4577 while(1) {
4578 de = dictGetRandomKey(c->db->dict);
4579 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4580 }
4581
4582 if (de == NULL) {
4583 addReply(c,shared.nullbulk);
4584 return;
4585 }
4586
4587 key = dictGetEntryKey(de);
4588 if (server.vm_enabled) {
4589 key = dupStringObject(key);
4590 addReplyBulk(c,key);
4591 decrRefCount(key);
4592 } else {
4593 addReplyBulk(c,key);
4594 }
4595 }
4596
4597 static void keysCommand(redisClient *c) {
4598 dictIterator *di;
4599 dictEntry *de;
4600 sds pattern = c->argv[1]->ptr;
4601 int plen = sdslen(pattern);
4602 unsigned long numkeys = 0;
4603 robj *lenobj = createObject(REDIS_STRING,NULL);
4604
4605 di = dictGetIterator(c->db->dict);
4606 addReply(c,lenobj);
4607 decrRefCount(lenobj);
4608 while((de = dictNext(di)) != NULL) {
4609 robj *keyobj = dictGetEntryKey(de);
4610
4611 sds key = keyobj->ptr;
4612 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4613 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4614 if (expireIfNeeded(c->db,keyobj) == 0) {
4615 addReplyBulk(c,keyobj);
4616 numkeys++;
4617 }
4618 }
4619 }
4620 dictReleaseIterator(di);
4621 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4622 }
4623
4624 static void dbsizeCommand(redisClient *c) {
4625 addReplySds(c,
4626 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4627 }
4628
4629 static void lastsaveCommand(redisClient *c) {
4630 addReplySds(c,
4631 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4632 }
4633
4634 static void typeCommand(redisClient *c) {
4635 robj *o;
4636 char *type;
4637
4638 o = lookupKeyRead(c->db,c->argv[1]);
4639 if (o == NULL) {
4640 type = "+none";
4641 } else {
4642 switch(o->type) {
4643 case REDIS_STRING: type = "+string"; break;
4644 case REDIS_LIST: type = "+list"; break;
4645 case REDIS_SET: type = "+set"; break;
4646 case REDIS_ZSET: type = "+zset"; break;
4647 case REDIS_HASH: type = "+hash"; break;
4648 default: type = "+unknown"; break;
4649 }
4650 }
4651 addReplySds(c,sdsnew(type));
4652 addReply(c,shared.crlf);
4653 }
4654
4655 static void saveCommand(redisClient *c) {
4656 if (server.bgsavechildpid != -1) {
4657 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4658 return;
4659 }
4660 if (rdbSave(server.dbfilename) == REDIS_OK) {
4661 addReply(c,shared.ok);
4662 } else {
4663 addReply(c,shared.err);
4664 }
4665 }
4666
4667 static void bgsaveCommand(redisClient *c) {
4668 if (server.bgsavechildpid != -1) {
4669 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4670 return;
4671 }
4672 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4673 char *status = "+Background saving started\r\n";
4674 addReplySds(c,sdsnew(status));
4675 } else {
4676 addReply(c,shared.err);
4677 }
4678 }
4679
4680 static void shutdownCommand(redisClient *c) {
4681 if (prepareForShutdown() == REDIS_OK)
4682 exit(0);
4683 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4684 }
4685
4686 static void renameGenericCommand(redisClient *c, int nx) {
4687 robj *o;
4688
4689 /* To use the same key as src and dst is probably an error */
4690 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4691 addReply(c,shared.sameobjecterr);
4692 return;
4693 }
4694
4695 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4696 return;
4697
4698 incrRefCount(o);
4699 deleteIfVolatile(c->db,c->argv[2]);
4700 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4701 if (nx) {
4702 decrRefCount(o);
4703 addReply(c,shared.czero);
4704 return;
4705 }
4706 dictReplace(c->db->dict,c->argv[2],o);
4707 } else {
4708 incrRefCount(c->argv[2]);
4709 }
4710 deleteKey(c->db,c->argv[1]);
4711 touchWatchedKey(c->db,c->argv[2]);
4712 server.dirty++;
4713 addReply(c,nx ? shared.cone : shared.ok);
4714 }
4715
4716 static void renameCommand(redisClient *c) {
4717 renameGenericCommand(c,0);
4718 }
4719
4720 static void renamenxCommand(redisClient *c) {
4721 renameGenericCommand(c,1);
4722 }
4723
4724 static void moveCommand(redisClient *c) {
4725 robj *o;
4726 redisDb *src, *dst;
4727 int srcid;
4728
4729 /* Obtain source and target DB pointers */
4730 src = c->db;
4731 srcid = c->db->id;
4732 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4733 addReply(c,shared.outofrangeerr);
4734 return;
4735 }
4736 dst = c->db;
4737 selectDb(c,srcid); /* Back to the source DB */
4738
4739 /* If the user is moving using as target the same
4740 * DB as the source DB it is probably an error. */
4741 if (src == dst) {
4742 addReply(c,shared.sameobjecterr);
4743 return;
4744 }
4745
4746 /* Check if the element exists and get a reference */
4747 o = lookupKeyWrite(c->db,c->argv[1]);
4748 if (!o) {
4749 addReply(c,shared.czero);
4750 return;
4751 }
4752
4753 /* Try to add the element to the target DB */
4754 deleteIfVolatile(dst,c->argv[1]);
4755 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4756 addReply(c,shared.czero);
4757 return;
4758 }
4759 incrRefCount(c->argv[1]);
4760 incrRefCount(o);
4761
4762 /* OK! key moved, free the entry in the source DB */
4763 deleteKey(src,c->argv[1]);
4764 server.dirty++;
4765 addReply(c,shared.cone);
4766 }
4767
4768 /* =================================== Lists ================================ */
4769 static void pushGenericCommand(redisClient *c, int where) {
4770 robj *lobj;
4771 list *list;
4772
4773 lobj = lookupKeyWrite(c->db,c->argv[1]);
4774 if (lobj == NULL) {
4775 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4776 addReply(c,shared.cone);
4777 return;
4778 }
4779 lobj = createListObject();
4780 list = lobj->ptr;
4781 if (where == REDIS_HEAD) {
4782 listAddNodeHead(list,c->argv[2]);
4783 } else {
4784 listAddNodeTail(list,c->argv[2]);
4785 }
4786 dictAdd(c->db->dict,c->argv[1],lobj);
4787 incrRefCount(c->argv[1]);
4788 incrRefCount(c->argv[2]);
4789 } else {
4790 if (lobj->type != REDIS_LIST) {
4791 addReply(c,shared.wrongtypeerr);
4792 return;
4793 }
4794 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4795 addReply(c,shared.cone);
4796 return;
4797 }
4798 list = lobj->ptr;
4799 if (where == REDIS_HEAD) {
4800 listAddNodeHead(list,c->argv[2]);
4801 } else {
4802 listAddNodeTail(list,c->argv[2]);
4803 }
4804 incrRefCount(c->argv[2]);
4805 }
4806 server.dirty++;
4807 addReplyLongLong(c,listLength(list));
4808 }
4809
4810 static void lpushCommand(redisClient *c) {
4811 pushGenericCommand(c,REDIS_HEAD);
4812 }
4813
4814 static void rpushCommand(redisClient *c) {
4815 pushGenericCommand(c,REDIS_TAIL);
4816 }
4817
4818 static void llenCommand(redisClient *c) {
4819 robj *o;
4820 list *l;
4821
4822 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4823 checkType(c,o,REDIS_LIST)) return;
4824
4825 l = o->ptr;
4826 addReplyUlong(c,listLength(l));
4827 }
4828
4829 static void lindexCommand(redisClient *c) {
4830 robj *o;
4831 int index = atoi(c->argv[2]->ptr);
4832 list *list;
4833 listNode *ln;
4834
4835 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4836 checkType(c,o,REDIS_LIST)) return;
4837 list = o->ptr;
4838
4839 ln = listIndex(list, index);
4840 if (ln == NULL) {
4841 addReply(c,shared.nullbulk);
4842 } else {
4843 robj *ele = listNodeValue(ln);
4844 addReplyBulk(c,ele);
4845 }
4846 }
4847
4848 static void lsetCommand(redisClient *c) {
4849 robj *o;
4850 int index = atoi(c->argv[2]->ptr);
4851 list *list;
4852 listNode *ln;
4853
4854 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4855 checkType(c,o,REDIS_LIST)) return;
4856 list = o->ptr;
4857
4858 ln = listIndex(list, index);
4859 if (ln == NULL) {
4860 addReply(c,shared.outofrangeerr);
4861 } else {
4862 robj *ele = listNodeValue(ln);
4863
4864 decrRefCount(ele);
4865 listNodeValue(ln) = c->argv[3];
4866 incrRefCount(c->argv[3]);
4867 addReply(c,shared.ok);
4868 server.dirty++;
4869 }
4870 }
4871
4872 static void popGenericCommand(redisClient *c, int where) {
4873 robj *o;
4874 list *list;
4875 listNode *ln;
4876
4877 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4878 checkType(c,o,REDIS_LIST)) return;
4879 list = o->ptr;
4880
4881 if (where == REDIS_HEAD)
4882 ln = listFirst(list);
4883 else
4884 ln = listLast(list);
4885
4886 if (ln == NULL) {
4887 addReply(c,shared.nullbulk);
4888 } else {
4889 robj *ele = listNodeValue(ln);
4890 addReplyBulk(c,ele);
4891 listDelNode(list,ln);
4892 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4893 server.dirty++;
4894 }
4895 }
4896
4897 static void lpopCommand(redisClient *c) {
4898 popGenericCommand(c,REDIS_HEAD);
4899 }
4900
4901 static void rpopCommand(redisClient *c) {
4902 popGenericCommand(c,REDIS_TAIL);
4903 }
4904
4905 static void lrangeCommand(redisClient *c) {
4906 robj *o;
4907 int start = atoi(c->argv[2]->ptr);
4908 int end = atoi(c->argv[3]->ptr);
4909 int llen;
4910 int rangelen, j;
4911 list *list;
4912 listNode *ln;
4913 robj *ele;
4914
4915 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4916 || checkType(c,o,REDIS_LIST)) return;
4917 list = o->ptr;
4918 llen = listLength(list);
4919
4920 /* convert negative indexes */
4921 if (start < 0) start = llen+start;
4922 if (end < 0) end = llen+end;
4923 if (start < 0) start = 0;
4924 if (end < 0) end = 0;
4925
4926 /* indexes sanity checks */
4927 if (start > end || start >= llen) {
4928 /* Out of range start or start > end result in empty list */
4929 addReply(c,shared.emptymultibulk);
4930 return;
4931 }
4932 if (end >= llen) end = llen-1;
4933 rangelen = (end-start)+1;
4934
4935 /* Return the result in form of a multi-bulk reply */
4936 ln = listIndex(list, start);
4937 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4938 for (j = 0; j < rangelen; j++) {
4939 ele = listNodeValue(ln);
4940 addReplyBulk(c,ele);
4941 ln = ln->next;
4942 }
4943 }
4944
4945 static void ltrimCommand(redisClient *c) {
4946 robj *o;
4947 int start = atoi(c->argv[2]->ptr);
4948 int end = atoi(c->argv[3]->ptr);
4949 int llen;
4950 int j, ltrim, rtrim;
4951 list *list;
4952 listNode *ln;
4953
4954 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4955 checkType(c,o,REDIS_LIST)) return;
4956 list = o->ptr;
4957 llen = listLength(list);
4958
4959 /* convert negative indexes */
4960 if (start < 0) start = llen+start;
4961 if (end < 0) end = llen+end;
4962 if (start < 0) start = 0;
4963 if (end < 0) end = 0;
4964
4965 /* indexes sanity checks */
4966 if (start > end || start >= llen) {
4967 /* Out of range start or start > end result in empty list */
4968 ltrim = llen;
4969 rtrim = 0;
4970 } else {
4971 if (end >= llen) end = llen-1;
4972 ltrim = start;
4973 rtrim = llen-end-1;
4974 }
4975
4976 /* Remove list elements to perform the trim */
4977 for (j = 0; j < ltrim; j++) {
4978 ln = listFirst(list);
4979 listDelNode(list,ln);
4980 }
4981 for (j = 0; j < rtrim; j++) {
4982 ln = listLast(list);
4983 listDelNode(list,ln);
4984 }
4985 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4986 server.dirty++;
4987 addReply(c,shared.ok);
4988 }
4989
4990 static void lremCommand(redisClient *c) {
4991 robj *o;
4992 list *list;
4993 listNode *ln, *next;
4994 int toremove = atoi(c->argv[2]->ptr);
4995 int removed = 0;
4996 int fromtail = 0;
4997
4998 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4999 checkType(c,o,REDIS_LIST)) return;
5000 list = o->ptr;
5001
5002 if (toremove < 0) {
5003 toremove = -toremove;
5004 fromtail = 1;
5005 }
5006 ln = fromtail ? list->tail : list->head;
5007 while (ln) {
5008 robj *ele = listNodeValue(ln);
5009
5010 next = fromtail ? ln->prev : ln->next;
5011 if (equalStringObjects(ele,c->argv[3])) {
5012 listDelNode(list,ln);
5013 server.dirty++;
5014 removed++;
5015 if (toremove && removed == toremove) break;
5016 }
5017 ln = next;
5018 }
5019 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
5020 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5021 }
5022
5023 /* This is the semantic of this command:
5024 * RPOPLPUSH srclist dstlist:
5025 * IF LLEN(srclist) > 0
5026 * element = RPOP srclist
5027 * LPUSH dstlist element
5028 * RETURN element
5029 * ELSE
5030 * RETURN nil
5031 * END
5032 * END
5033 *
5034 * The idea is to be able to get an element from a list in a reliable way
5035 * since the element is not just returned but pushed against another list
5036 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5037 */
5038 static void rpoplpushcommand(redisClient *c) {
5039 robj *sobj;
5040 list *srclist;
5041 listNode *ln;
5042
5043 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5044 checkType(c,sobj,REDIS_LIST)) return;
5045 srclist = sobj->ptr;
5046 ln = listLast(srclist);
5047
5048 if (ln == NULL) {
5049 addReply(c,shared.nullbulk);
5050 } else {
5051 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5052 robj *ele = listNodeValue(ln);
5053 list *dstlist;
5054
5055 if (dobj && dobj->type != REDIS_LIST) {
5056 addReply(c,shared.wrongtypeerr);
5057 return;
5058 }
5059
5060 /* Add the element to the target list (unless it's directly
5061 * passed to some BLPOP-ing client */
5062 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5063 if (dobj == NULL) {
5064 /* Create the list if the key does not exist */
5065 dobj = createListObject();
5066 dictAdd(c->db->dict,c->argv[2],dobj);
5067 incrRefCount(c->argv[2]);
5068 }
5069 dstlist = dobj->ptr;
5070 listAddNodeHead(dstlist,ele);
5071 incrRefCount(ele);
5072 }
5073
5074 /* Send the element to the client as reply as well */
5075 addReplyBulk(c,ele);
5076
5077 /* Finally remove the element from the source list */
5078 listDelNode(srclist,ln);
5079 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
5080 server.dirty++;
5081 }
5082 }
5083
5084 /* ==================================== Sets ================================ */
5085
5086 static void saddCommand(redisClient *c) {
5087 robj *set;
5088
5089 set = lookupKeyWrite(c->db,c->argv[1]);
5090 if (set == NULL) {
5091 set = createSetObject();
5092 dictAdd(c->db->dict,c->argv[1],set);
5093 incrRefCount(c->argv[1]);
5094 } else {
5095 if (set->type != REDIS_SET) {
5096 addReply(c,shared.wrongtypeerr);
5097 return;
5098 }
5099 }
5100 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5101 incrRefCount(c->argv[2]);
5102 server.dirty++;
5103 addReply(c,shared.cone);
5104 } else {
5105 addReply(c,shared.czero);
5106 }
5107 }
5108
5109 static void sremCommand(redisClient *c) {
5110 robj *set;
5111
5112 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5113 checkType(c,set,REDIS_SET)) return;
5114
5115 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5116 server.dirty++;
5117 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5118 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5119 addReply(c,shared.cone);
5120 } else {
5121 addReply(c,shared.czero);
5122 }
5123 }
5124
5125 static void smoveCommand(redisClient *c) {
5126 robj *srcset, *dstset;
5127
5128 srcset = lookupKeyWrite(c->db,c->argv[1]);
5129 dstset = lookupKeyWrite(c->db,c->argv[2]);
5130
5131 /* If the source key does not exist return 0, if it's of the wrong type
5132 * raise an error */
5133 if (srcset == NULL || srcset->type != REDIS_SET) {
5134 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5135 return;
5136 }
5137 /* Error if the destination key is not a set as well */
5138 if (dstset && dstset->type != REDIS_SET) {
5139 addReply(c,shared.wrongtypeerr);
5140 return;
5141 }
5142 /* Remove the element from the source set */
5143 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5144 /* Key not found in the src set! return zero */
5145 addReply(c,shared.czero);
5146 return;
5147 }
5148 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5149 deleteKey(c->db,c->argv[1]);
5150 server.dirty++;
5151 /* Add the element to the destination set */
5152 if (!dstset) {
5153 dstset = createSetObject();
5154 dictAdd(c->db->dict,c->argv[2],dstset);
5155 incrRefCount(c->argv[2]);
5156 }
5157 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5158 incrRefCount(c->argv[3]);
5159 addReply(c,shared.cone);
5160 }
5161
5162 static void sismemberCommand(redisClient *c) {
5163 robj *set;
5164
5165 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5166 checkType(c,set,REDIS_SET)) return;
5167
5168 if (dictFind(set->ptr,c->argv[2]))
5169 addReply(c,shared.cone);
5170 else
5171 addReply(c,shared.czero);
5172 }
5173
5174 static void scardCommand(redisClient *c) {
5175 robj *o;
5176 dict *s;
5177
5178 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5179 checkType(c,o,REDIS_SET)) return;
5180
5181 s = o->ptr;
5182 addReplyUlong(c,dictSize(s));
5183 }
5184
5185 static void spopCommand(redisClient *c) {
5186 robj *set;
5187 dictEntry *de;
5188
5189 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5190 checkType(c,set,REDIS_SET)) return;
5191
5192 de = dictGetRandomKey(set->ptr);
5193 if (de == NULL) {
5194 addReply(c,shared.nullbulk);
5195 } else {
5196 robj *ele = dictGetEntryKey(de);
5197
5198 addReplyBulk(c,ele);
5199 dictDelete(set->ptr,ele);
5200 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5201 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5202 server.dirty++;
5203 }
5204 }
5205
5206 static void srandmemberCommand(redisClient *c) {
5207 robj *set;
5208 dictEntry *de;
5209
5210 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5211 checkType(c,set,REDIS_SET)) return;
5212
5213 de = dictGetRandomKey(set->ptr);
5214 if (de == NULL) {
5215 addReply(c,shared.nullbulk);
5216 } else {
5217 robj *ele = dictGetEntryKey(de);
5218
5219 addReplyBulk(c,ele);
5220 }
5221 }
5222
5223 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5224 dict **d1 = (void*) s1, **d2 = (void*) s2;
5225
5226 return dictSize(*d1)-dictSize(*d2);
5227 }
5228
5229 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5230 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5231 dictIterator *di;
5232 dictEntry *de;
5233 robj *lenobj = NULL, *dstset = NULL;
5234 unsigned long j, cardinality = 0;
5235
5236 for (j = 0; j < setsnum; j++) {
5237 robj *setobj;
5238
5239 setobj = dstkey ?
5240 lookupKeyWrite(c->db,setskeys[j]) :
5241 lookupKeyRead(c->db,setskeys[j]);
5242 if (!setobj) {
5243 zfree(dv);
5244 if (dstkey) {
5245 if (deleteKey(c->db,dstkey))
5246 server.dirty++;
5247 addReply(c,shared.czero);
5248 } else {
5249 addReply(c,shared.emptymultibulk);
5250 }
5251 return;
5252 }
5253 if (setobj->type != REDIS_SET) {
5254 zfree(dv);
5255 addReply(c,shared.wrongtypeerr);
5256 return;
5257 }
5258 dv[j] = setobj->ptr;
5259 }
5260 /* Sort sets from the smallest to largest, this will improve our
5261 * algorithm's performace */
5262 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5263
5264 /* The first thing we should output is the total number of elements...
5265 * since this is a multi-bulk write, but at this stage we don't know
5266 * the intersection set size, so we use a trick, append an empty object
5267 * to the output list and save the pointer to later modify it with the
5268 * right length */
5269 if (!dstkey) {
5270 lenobj = createObject(REDIS_STRING,NULL);
5271 addReply(c,lenobj);
5272 decrRefCount(lenobj);
5273 } else {
5274 /* If we have a target key where to store the resulting set
5275 * create this key with an empty set inside */
5276 dstset = createSetObject();
5277 }
5278
5279 /* Iterate all the elements of the first (smallest) set, and test
5280 * the element against all the other sets, if at least one set does
5281 * not include the element it is discarded */
5282 di = dictGetIterator(dv[0]);
5283
5284 while((de = dictNext(di)) != NULL) {
5285 robj *ele;
5286
5287 for (j = 1; j < setsnum; j++)
5288 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5289 if (j != setsnum)
5290 continue; /* at least one set does not contain the member */
5291 ele = dictGetEntryKey(de);
5292 if (!dstkey) {
5293 addReplyBulk(c,ele);
5294 cardinality++;
5295 } else {
5296 dictAdd(dstset->ptr,ele,NULL);
5297 incrRefCount(ele);
5298 }
5299 }
5300 dictReleaseIterator(di);
5301
5302 if (dstkey) {
5303 /* Store the resulting set into the target, if the intersection
5304 * is not an empty set. */
5305 deleteKey(c->db,dstkey);
5306 if (dictSize((dict*)dstset->ptr) > 0) {
5307 dictAdd(c->db->dict,dstkey,dstset);
5308 incrRefCount(dstkey);
5309 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5310 } else {
5311 decrRefCount(dstset);
5312 addReply(c,shared.czero);
5313 }
5314 server.dirty++;
5315 } else {
5316 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5317 }
5318 zfree(dv);
5319 }
5320
5321 static void sinterCommand(redisClient *c) {
5322 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5323 }
5324
5325 static void sinterstoreCommand(redisClient *c) {
5326 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5327 }
5328
5329 #define REDIS_OP_UNION 0
5330 #define REDIS_OP_DIFF 1
5331 #define REDIS_OP_INTER 2
5332
5333 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5334 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5335 dictIterator *di;
5336 dictEntry *de;
5337 robj *dstset = NULL;
5338 int j, cardinality = 0;
5339
5340 for (j = 0; j < setsnum; j++) {
5341 robj *setobj;
5342
5343 setobj = dstkey ?
5344 lookupKeyWrite(c->db,setskeys[j]) :
5345 lookupKeyRead(c->db,setskeys[j]);
5346 if (!setobj) {
5347 dv[j] = NULL;
5348 continue;
5349 }
5350 if (setobj->type != REDIS_SET) {
5351 zfree(dv);
5352 addReply(c,shared.wrongtypeerr);
5353 return;
5354 }
5355 dv[j] = setobj->ptr;
5356 }
5357
5358 /* We need a temp set object to store our union. If the dstkey
5359 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5360 * this set object will be the resulting object to set into the target key*/
5361 dstset = createSetObject();
5362
5363 /* Iterate all the elements of all the sets, add every element a single
5364 * time to the result set */
5365 for (j = 0; j < setsnum; j++) {
5366 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5367 if (!dv[j]) continue; /* non existing keys are like empty sets */
5368
5369 di = dictGetIterator(dv[j]);
5370
5371 while((de = dictNext(di)) != NULL) {
5372 robj *ele;
5373
5374 /* dictAdd will not add the same element multiple times */
5375 ele = dictGetEntryKey(de);
5376 if (op == REDIS_OP_UNION || j == 0) {
5377 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5378 incrRefCount(ele);
5379 cardinality++;
5380 }
5381 } else if (op == REDIS_OP_DIFF) {
5382 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5383 cardinality--;
5384 }
5385 }
5386 }
5387 dictReleaseIterator(di);
5388
5389 /* result set is empty? Exit asap. */
5390 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5391 }
5392
5393 /* Output the content of the resulting set, if not in STORE mode */
5394 if (!dstkey) {
5395 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5396 di = dictGetIterator(dstset->ptr);
5397 while((de = dictNext(di)) != NULL) {
5398 robj *ele;
5399
5400 ele = dictGetEntryKey(de);
5401 addReplyBulk(c,ele);
5402 }
5403 dictReleaseIterator(di);
5404 decrRefCount(dstset);
5405 } else {
5406 /* If we have a target key where to store the resulting set
5407 * create this key with the result set inside */
5408 deleteKey(c->db,dstkey);
5409 if (dictSize((dict*)dstset->ptr) > 0) {
5410 dictAdd(c->db->dict,dstkey,dstset);
5411 incrRefCount(dstkey);
5412 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5413 } else {
5414 decrRefCount(dstset);
5415 addReply(c,shared.czero);
5416 }
5417 server.dirty++;
5418 }
5419 zfree(dv);
5420 }
5421
5422 static void sunionCommand(redisClient *c) {
5423 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5424 }
5425
5426 static void sunionstoreCommand(redisClient *c) {
5427 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5428 }
5429
5430 static void sdiffCommand(redisClient *c) {
5431 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5432 }
5433
5434 static void sdiffstoreCommand(redisClient *c) {
5435 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5436 }
5437
5438 /* ==================================== ZSets =============================== */
5439
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5447
5448 /* This skiplist implementation is almost a C translation of the original
5449 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5450 * Alternative to Balanced Trees", modified in three ways:
5451 * a) this implementation allows for repeated values.
5452 * b) the comparison is not just by key (our 'score') but by satellite data.
5453 * c) there is a back pointer, so it's a doubly linked list with the back
5454 * pointers being only at "level 1". This allows to traverse the list
5455 * from tail to head, useful for ZREVRANGE. */
5456
5457 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5458 zskiplistNode *zn = zmalloc(sizeof(*zn));
5459
5460 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5461 if (level > 1)
5462 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5463 else
5464 zn->span = NULL;
5465 zn->score = score;
5466 zn->obj = obj;
5467 return zn;
5468 }
5469
5470 static zskiplist *zslCreate(void) {
5471 int j;
5472 zskiplist *zsl;
5473
5474 zsl = zmalloc(sizeof(*zsl));
5475 zsl->level = 1;
5476 zsl->length = 0;
5477 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5478 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5479 zsl->header->forward[j] = NULL;
5480
5481 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5482 if (j < ZSKIPLIST_MAXLEVEL-1)
5483 zsl->header->span[j] = 0;
5484 }
5485 zsl->header->backward = NULL;
5486 zsl->tail = NULL;
5487 return zsl;
5488 }
5489
5490 static void zslFreeNode(zskiplistNode *node) {
5491 decrRefCount(node->obj);
5492 zfree(node->forward);
5493 zfree(node->span);
5494 zfree(node);
5495 }
5496
5497 static void zslFree(zskiplist *zsl) {
5498 zskiplistNode *node = zsl->header->forward[0], *next;
5499
5500 zfree(zsl->header->forward);
5501 zfree(zsl->header->span);
5502 zfree(zsl->header);
5503 while(node) {
5504 next = node->forward[0];
5505 zslFreeNode(node);
5506 node = next;
5507 }
5508 zfree(zsl);
5509 }
5510
5511 static int zslRandomLevel(void) {
5512 int level = 1;
5513 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5514 level += 1;
5515 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5516 }
5517
5518 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5519 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5520 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5521 int i, level;
5522
5523 x = zsl->header;
5524 for (i = zsl->level-1; i >= 0; i--) {
5525 /* store rank that is crossed to reach the insert position */
5526 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5527
5528 while (x->forward[i] &&
5529 (x->forward[i]->score < score ||
5530 (x->forward[i]->score == score &&
5531 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5532 rank[i] += i > 0 ? x->span[i-1] : 1;
5533 x = x->forward[i];
5534 }
5535 update[i] = x;
5536 }
5537 /* we assume the key is not already inside, since we allow duplicated
5538 * scores, and the re-insertion of score and redis object should never
5539 * happpen since the caller of zslInsert() should test in the hash table
5540 * if the element is already inside or not. */
5541 level = zslRandomLevel();
5542 if (level > zsl->level) {
5543 for (i = zsl->level; i < level; i++) {
5544 rank[i] = 0;
5545 update[i] = zsl->header;
5546 update[i]->span[i-1] = zsl->length;
5547 }
5548 zsl->level = level;
5549 }
5550 x = zslCreateNode(level,score,obj);
5551 for (i = 0; i < level; i++) {
5552 x->forward[i] = update[i]->forward[i];
5553 update[i]->forward[i] = x;
5554
5555 /* update span covered by update[i] as x is inserted here */
5556 if (i > 0) {
5557 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5558 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5559 }
5560 }
5561
5562 /* increment span for untouched levels */
5563 for (i = level; i < zsl->level; i++) {
5564 update[i]->span[i-1]++;
5565 }
5566
5567 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5568 if (x->forward[0])
5569 x->forward[0]->backward = x;
5570 else
5571 zsl->tail = x;
5572 zsl->length++;
5573 }
5574
5575 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5576 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5577 int i;
5578 for (i = 0; i < zsl->level; i++) {
5579 if (update[i]->forward[i] == x) {
5580 if (i > 0) {
5581 update[i]->span[i-1] += x->span[i-1] - 1;
5582 }
5583 update[i]->forward[i] = x->forward[i];
5584 } else {
5585 /* invariant: i > 0, because update[0]->forward[0]
5586 * is always equal to x */
5587 update[i]->span[i-1] -= 1;
5588 }
5589 }
5590 if (x->forward[0]) {
5591 x->forward[0]->backward = x->backward;
5592 } else {
5593 zsl->tail = x->backward;
5594 }
5595 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5596 zsl->level--;
5597 zsl->length--;
5598 }
5599
5600 /* Delete an element with matching score/object from the skiplist. */
5601 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5602 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5603 int i;
5604
5605 x = zsl->header;
5606 for (i = zsl->level-1; i >= 0; i--) {
5607 while (x->forward[i] &&
5608 (x->forward[i]->score < score ||
5609 (x->forward[i]->score == score &&
5610 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5611 x = x->forward[i];
5612 update[i] = x;
5613 }
5614 /* We may have multiple elements with the same score, what we need
5615 * is to find the element with both the right score and object. */
5616 x = x->forward[0];
5617 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5618 zslDeleteNode(zsl, x, update);
5619 zslFreeNode(x);
5620 return 1;
5621 } else {
5622 return 0; /* not found */
5623 }
5624 return 0; /* not found */
5625 }
5626
5627 /* Delete all the elements with score between min and max from the skiplist.
5628 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5629 * Note that this function takes the reference to the hash table view of the
5630 * sorted set, in order to remove the elements from the hash table too. */
5631 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5632 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5633 unsigned long removed = 0;
5634 int i;
5635
5636 x = zsl->header;
5637 for (i = zsl->level-1; i >= 0; i--) {
5638 while (x->forward[i] && x->forward[i]->score < min)
5639 x = x->forward[i];
5640 update[i] = x;
5641 }
5642 /* We may have multiple elements with the same score, what we need
5643 * is to find the element with both the right score and object. */
5644 x = x->forward[0];
5645 while (x && x->score <= max) {
5646 zskiplistNode *next = x->forward[0];
5647 zslDeleteNode(zsl, x, update);
5648 dictDelete(dict,x->obj);
5649 zslFreeNode(x);
5650 removed++;
5651 x = next;
5652 }
5653 return removed; /* not found */
5654 }
5655
5656 /* Delete all the elements with rank between start and end from the skiplist.
5657 * Start and end are inclusive. Note that start and end need to be 1-based */
5658 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5659 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5660 unsigned long traversed = 0, removed = 0;
5661 int i;
5662
5663 x = zsl->header;
5664 for (i = zsl->level-1; i >= 0; i--) {
5665 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5666 traversed += i > 0 ? x->span[i-1] : 1;
5667 x = x->forward[i];
5668 }
5669 update[i] = x;
5670 }
5671
5672 traversed++;
5673 x = x->forward[0];
5674 while (x && traversed <= end) {
5675 zskiplistNode *next = x->forward[0];
5676 zslDeleteNode(zsl, x, update);
5677 dictDelete(dict,x->obj);
5678 zslFreeNode(x);
5679 removed++;
5680 traversed++;
5681 x = next;
5682 }
5683 return removed;
5684 }
5685
5686 /* Find the first node having a score equal or greater than the specified one.
5687 * Returns NULL if there is no match. */
5688 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5689 zskiplistNode *x;
5690 int i;
5691
5692 x = zsl->header;
5693 for (i = zsl->level-1; i >= 0; i--) {
5694 while (x->forward[i] && x->forward[i]->score < score)
5695 x = x->forward[i];
5696 }
5697 /* We may have multiple elements with the same score, what we need
5698 * is to find the element with both the right score and object. */
5699 return x->forward[0];
5700 }
5701
5702 /* Find the rank for an element by both score and key.
5703 * Returns 0 when the element cannot be found, rank otherwise.
5704 * Note that the rank is 1-based due to the span of zsl->header to the
5705 * first element. */
5706 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5707 zskiplistNode *x;
5708 unsigned long rank = 0;
5709 int i;
5710
5711 x = zsl->header;
5712 for (i = zsl->level-1; i >= 0; i--) {
5713 while (x->forward[i] &&
5714 (x->forward[i]->score < score ||
5715 (x->forward[i]->score == score &&
5716 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5717 rank += i > 0 ? x->span[i-1] : 1;
5718 x = x->forward[i];
5719 }
5720
5721 /* x might be equal to zsl->header, so test if obj is non-NULL */
5722 if (x->obj && equalStringObjects(x->obj,o)) {
5723 return rank;
5724 }
5725 }
5726 return 0;
5727 }
5728
5729 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5730 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5731 zskiplistNode *x;
5732 unsigned long traversed = 0;
5733 int i;
5734
5735 x = zsl->header;
5736 for (i = zsl->level-1; i >= 0; i--) {
5737 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5738 {
5739 traversed += i > 0 ? x->span[i-1] : 1;
5740 x = x->forward[i];
5741 }
5742 if (traversed == rank) {
5743 return x;
5744 }
5745 }
5746 return NULL;
5747 }
5748
5749 /* The actual Z-commands implementations */
5750
5751 /* This generic command implements both ZADD and ZINCRBY.
5752 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5753 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5754 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5755 robj *zsetobj;
5756 zset *zs;
5757 double *score;
5758
5759 if (isnan(scoreval)) {
5760 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5761 return;
5762 }
5763
5764 zsetobj = lookupKeyWrite(c->db,key);
5765 if (zsetobj == NULL) {
5766 zsetobj = createZsetObject();
5767 dictAdd(c->db->dict,key,zsetobj);
5768 incrRefCount(key);
5769 } else {
5770 if (zsetobj->type != REDIS_ZSET) {
5771 addReply(c,shared.wrongtypeerr);
5772 return;
5773 }
5774 }
5775 zs = zsetobj->ptr;
5776
5777 /* Ok now since we implement both ZADD and ZINCRBY here the code
5778 * needs to handle the two different conditions. It's all about setting
5779 * '*score', that is, the new score to set, to the right value. */
5780 score = zmalloc(sizeof(double));
5781 if (doincrement) {
5782 dictEntry *de;
5783
5784 /* Read the old score. If the element was not present starts from 0 */
5785 de = dictFind(zs->dict,ele);
5786 if (de) {
5787 double *oldscore = dictGetEntryVal(de);
5788 *score = *oldscore + scoreval;
5789 } else {
5790 *score = scoreval;
5791 }
5792 if (isnan(*score)) {
5793 addReplySds(c,
5794 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5795 zfree(score);
5796 /* Note that we don't need to check if the zset may be empty and
5797 * should be removed here, as we can only obtain Nan as score if
5798 * there was already an element in the sorted set. */
5799 return;
5800 }
5801 } else {
5802 *score = scoreval;
5803 }
5804
5805 /* What follows is a simple remove and re-insert operation that is common
5806 * to both ZADD and ZINCRBY... */
5807 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5808 /* case 1: New element */
5809 incrRefCount(ele); /* added to hash */
5810 zslInsert(zs->zsl,*score,ele);
5811 incrRefCount(ele); /* added to skiplist */
5812 server.dirty++;
5813 if (doincrement)
5814 addReplyDouble(c,*score);
5815 else
5816 addReply(c,shared.cone);
5817 } else {
5818 dictEntry *de;
5819 double *oldscore;
5820
5821 /* case 2: Score update operation */
5822 de = dictFind(zs->dict,ele);
5823 redisAssert(de != NULL);
5824 oldscore = dictGetEntryVal(de);
5825 if (*score != *oldscore) {
5826 int deleted;
5827
5828 /* Remove and insert the element in the skip list with new score */
5829 deleted = zslDelete(zs->zsl,*oldscore,ele);
5830 redisAssert(deleted != 0);
5831 zslInsert(zs->zsl,*score,ele);
5832 incrRefCount(ele);
5833 /* Update the score in the hash table */
5834 dictReplace(zs->dict,ele,score);
5835 server.dirty++;
5836 } else {
5837 zfree(score);
5838 }
5839 if (doincrement)
5840 addReplyDouble(c,*score);
5841 else
5842 addReply(c,shared.czero);
5843 }
5844 }
5845
5846 static void zaddCommand(redisClient *c) {
5847 double scoreval;
5848
5849 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5850 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5851 }
5852
5853 static void zincrbyCommand(redisClient *c) {
5854 double scoreval;
5855
5856 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5857 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5858 }
5859
5860 static void zremCommand(redisClient *c) {
5861 robj *zsetobj;
5862 zset *zs;
5863 dictEntry *de;
5864 double *oldscore;
5865 int deleted;
5866
5867 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5868 checkType(c,zsetobj,REDIS_ZSET)) return;
5869
5870 zs = zsetobj->ptr;
5871 de = dictFind(zs->dict,c->argv[2]);
5872 if (de == NULL) {
5873 addReply(c,shared.czero);
5874 return;
5875 }
5876 /* Delete from the skiplist */
5877 oldscore = dictGetEntryVal(de);
5878 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5879 redisAssert(deleted != 0);
5880
5881 /* Delete from the hash table */
5882 dictDelete(zs->dict,c->argv[2]);
5883 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5884 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5885 server.dirty++;
5886 addReply(c,shared.cone);
5887 }
5888
5889 static void zremrangebyscoreCommand(redisClient *c) {
5890 double min;
5891 double max;
5892 long deleted;
5893 robj *zsetobj;
5894 zset *zs;
5895
5896 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5897 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5898
5899 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5900 checkType(c,zsetobj,REDIS_ZSET)) return;
5901
5902 zs = zsetobj->ptr;
5903 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5904 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5905 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5906 server.dirty += deleted;
5907 addReplyLongLong(c,deleted);
5908 }
5909
5910 static void zremrangebyrankCommand(redisClient *c) {
5911 long start;
5912 long end;
5913 int llen;
5914 long deleted;
5915 robj *zsetobj;
5916 zset *zs;
5917
5918 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5919 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5920
5921 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5922 checkType(c,zsetobj,REDIS_ZSET)) return;
5923 zs = zsetobj->ptr;
5924 llen = zs->zsl->length;
5925
5926 /* convert negative indexes */
5927 if (start < 0) start = llen+start;
5928 if (end < 0) end = llen+end;
5929 if (start < 0) start = 0;
5930 if (end < 0) end = 0;
5931
5932 /* indexes sanity checks */
5933 if (start > end || start >= llen) {
5934 addReply(c,shared.czero);
5935 return;
5936 }
5937 if (end >= llen) end = llen-1;
5938
5939 /* increment start and end because zsl*Rank functions
5940 * use 1-based rank */
5941 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5942 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5943 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5944 server.dirty += deleted;
5945 addReplyLongLong(c, deleted);
5946 }
5947
5948 typedef struct {
5949 dict *dict;
5950 double weight;
5951 } zsetopsrc;
5952
5953 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5954 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5955 unsigned long size1, size2;
5956 size1 = d1->dict ? dictSize(d1->dict) : 0;
5957 size2 = d2->dict ? dictSize(d2->dict) : 0;
5958 return size1 - size2;
5959 }
5960
/* Aggregation modes accepted by ZUNIONSTORE / ZINTERSTORE. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score carried by a dict entry: entries with a NULL value count as 1.0.
 * Note that the argument is evaluated more than once. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) != NULL ? *(double*)dictGetEntryVal(_e) : 1.0)
5965
5966 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5967 if (aggregate == REDIS_AGGR_SUM) {
5968 *target = *target + val;
5969 } else if (aggregate == REDIS_AGGR_MIN) {
5970 *target = val < *target ? val : *target;
5971 } else if (aggregate == REDIS_AGGR_MAX) {
5972 *target = val > *target ? val : *target;
5973 } else {
5974 /* safety net */
5975 redisPanic("Unknown ZUNION/INTER aggregate type");
5976 }
5977 }
5978
5979 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5980 int i, j, setnum;
5981 int aggregate = REDIS_AGGR_SUM;
5982 zsetopsrc *src;
5983 robj *dstobj;
5984 zset *dstzset;
5985 dictIterator *di;
5986 dictEntry *de;
5987
5988 /* expect setnum input keys to be given */
5989 setnum = atoi(c->argv[2]->ptr);
5990 if (setnum < 1) {
5991 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
5992 return;
5993 }
5994
5995 /* test if the expected number of keys would overflow */
5996 if (3+setnum > c->argc) {
5997 addReply(c,shared.syntaxerr);
5998 return;
5999 }
6000
6001 /* read keys to be used for input */
6002 src = zmalloc(sizeof(zsetopsrc) * setnum);
6003 for (i = 0, j = 3; i < setnum; i++, j++) {
6004 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6005 if (!obj) {
6006 src[i].dict = NULL;
6007 } else {
6008 if (obj->type == REDIS_ZSET) {
6009 src[i].dict = ((zset*)obj->ptr)->dict;
6010 } else if (obj->type == REDIS_SET) {
6011 src[i].dict = (obj->ptr);
6012 } else {
6013 zfree(src);
6014 addReply(c,shared.wrongtypeerr);
6015 return;
6016 }
6017 }
6018
6019 /* default all weights to 1 */
6020 src[i].weight = 1.0;
6021 }
6022
6023 /* parse optional extra arguments */
6024 if (j < c->argc) {
6025 int remaining = c->argc - j;
6026
6027 while (remaining) {
6028 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6029 j++; remaining--;
6030 for (i = 0; i < setnum; i++, j++, remaining--) {
6031 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6032 return;
6033 }
6034 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6035 j++; remaining--;
6036 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6037 aggregate = REDIS_AGGR_SUM;
6038 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6039 aggregate = REDIS_AGGR_MIN;
6040 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6041 aggregate = REDIS_AGGR_MAX;
6042 } else {
6043 zfree(src);
6044 addReply(c,shared.syntaxerr);
6045 return;
6046 }
6047 j++; remaining--;
6048 } else {
6049 zfree(src);
6050 addReply(c,shared.syntaxerr);
6051 return;
6052 }
6053 }
6054 }
6055
6056 /* sort sets from the smallest to largest, this will improve our
6057 * algorithm's performance */
6058 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6059
6060 dstobj = createZsetObject();
6061 dstzset = dstobj->ptr;
6062
6063 if (op == REDIS_OP_INTER) {
6064 /* skip going over all entries if the smallest zset is NULL or empty */
6065 if (src[0].dict && dictSize(src[0].dict) > 0) {
6066 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6067 * from small to large, all src[i > 0].dict are non-empty too */
6068 di = dictGetIterator(src[0].dict);
6069 while((de = dictNext(di)) != NULL) {
6070 double *score = zmalloc(sizeof(double)), value;
6071 *score = src[0].weight * zunionInterDictValue(de);
6072
6073 for (j = 1; j < setnum; j++) {
6074 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6075 if (other) {
6076 value = src[j].weight * zunionInterDictValue(other);
6077 zunionInterAggregate(score, value, aggregate);
6078 } else {
6079 break;
6080 }
6081 }
6082
6083 /* skip entry when not present in every source dict */
6084 if (j != setnum) {
6085 zfree(score);
6086 } else {
6087 robj *o = dictGetEntryKey(de);
6088 dictAdd(dstzset->dict,o,score);
6089 incrRefCount(o); /* added to dictionary */
6090 zslInsert(dstzset->zsl,*score,o);
6091 incrRefCount(o); /* added to skiplist */
6092 }
6093 }
6094 dictReleaseIterator(di);
6095 }
6096 } else if (op == REDIS_OP_UNION) {
6097 for (i = 0; i < setnum; i++) {
6098 if (!src[i].dict) continue;
6099
6100 di = dictGetIterator(src[i].dict);
6101 while((de = dictNext(di)) != NULL) {
6102 /* skip key when already processed */
6103 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6104
6105 double *score = zmalloc(sizeof(double)), value;
6106 *score = src[i].weight * zunionInterDictValue(de);
6107
6108 /* because the zsets are sorted by size, its only possible
6109 * for sets at larger indices to hold this entry */
6110 for (j = (i+1); j < setnum; j++) {
6111 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6112 if (other) {
6113 value = src[j].weight * zunionInterDictValue(other);
6114 zunionInterAggregate(score, value, aggregate);
6115 }
6116 }
6117
6118 robj *o = dictGetEntryKey(de);
6119 dictAdd(dstzset->dict,o,score);
6120 incrRefCount(o); /* added to dictionary */
6121 zslInsert(dstzset->zsl,*score,o);
6122 incrRefCount(o); /* added to skiplist */
6123 }
6124 dictReleaseIterator(di);
6125 }
6126 } else {
6127 /* unknown operator */
6128 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6129 }
6130
6131 deleteKey(c->db,dstkey);
6132 if (dstzset->zsl->length) {
6133 dictAdd(c->db->dict,dstkey,dstobj);
6134 incrRefCount(dstkey);
6135 addReplyLongLong(c, dstzset->zsl->length);
6136 server.dirty++;
6137 } else {
6138 decrRefCount(dstobj);
6139 addReply(c, shared.czero);
6140 }
6141 zfree(src);
6142 }
6143
6144 static void zunionstoreCommand(redisClient *c) {
6145 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6146 }
6147
6148 static void zinterstoreCommand(redisClient *c) {
6149 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6150 }
6151
6152 static void zrangeGenericCommand(redisClient *c, int reverse) {
6153 robj *o;
6154 long start;
6155 long end;
6156 int withscores = 0;
6157 int llen;
6158 int rangelen, j;
6159 zset *zsetobj;
6160 zskiplist *zsl;
6161 zskiplistNode *ln;
6162 robj *ele;
6163
6164 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6165 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6166
6167 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6168 withscores = 1;
6169 } else if (c->argc >= 5) {
6170 addReply(c,shared.syntaxerr);
6171 return;
6172 }
6173
6174 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6175 || checkType(c,o,REDIS_ZSET)) return;
6176 zsetobj = o->ptr;
6177 zsl = zsetobj->zsl;
6178 llen = zsl->length;
6179
6180 /* convert negative indexes */
6181 if (start < 0) start = llen+start;
6182 if (end < 0) end = llen+end;
6183 if (start < 0) start = 0;
6184 if (end < 0) end = 0;
6185
6186 /* indexes sanity checks */
6187 if (start > end || start >= llen) {
6188 /* Out of range start or start > end result in empty list */
6189 addReply(c,shared.emptymultibulk);
6190 return;
6191 }
6192 if (end >= llen) end = llen-1;
6193 rangelen = (end-start)+1;
6194
6195 /* check if starting point is trivial, before searching
6196 * the element in log(N) time */
6197 if (reverse) {
6198 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6199 } else {
6200 ln = start == 0 ?
6201 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6202 }
6203
6204 /* Return the result in form of a multi-bulk reply */
6205 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6206 withscores ? (rangelen*2) : rangelen));
6207 for (j = 0; j < rangelen; j++) {
6208 ele = ln->obj;
6209 addReplyBulk(c,ele);
6210 if (withscores)
6211 addReplyDouble(c,ln->score);
6212 ln = reverse ? ln->backward : ln->forward[0];
6213 }
6214 }
6215
6216 static void zrangeCommand(redisClient *c) {
6217 zrangeGenericCommand(c,0);
6218 }
6219
6220 static void zrevrangeCommand(redisClient *c) {
6221 zrangeGenericCommand(c,1);
6222 }
6223
6224 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6225 * If justcount is non-zero, just the count is returned. */
6226 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6227 robj *o;
6228 double min, max;
6229 int minex = 0, maxex = 0; /* are min or max exclusive? */
6230 int offset = 0, limit = -1;
6231 int withscores = 0;
6232 int badsyntax = 0;
6233
6234 /* Parse the min-max interval. If one of the values is prefixed
6235 * by the "(" character, it's considered "open". For instance
6236 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6237 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6238 if (((char*)c->argv[2]->ptr)[0] == '(') {
6239 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6240 minex = 1;
6241 } else {
6242 min = strtod(c->argv[2]->ptr,NULL);
6243 }
6244 if (((char*)c->argv[3]->ptr)[0] == '(') {
6245 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6246 maxex = 1;
6247 } else {
6248 max = strtod(c->argv[3]->ptr,NULL);
6249 }
6250
6251 /* Parse "WITHSCORES": note that if the command was called with
6252 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6253 * enter the following paths to parse WITHSCORES and LIMIT. */
6254 if (c->argc == 5 || c->argc == 8) {
6255 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6256 withscores = 1;
6257 else
6258 badsyntax = 1;
6259 }
6260 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6261 badsyntax = 1;
6262 if (badsyntax) {
6263 addReplySds(c,
6264 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6265 return;
6266 }
6267
6268 /* Parse "LIMIT" */
6269 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6270 addReply(c,shared.syntaxerr);
6271 return;
6272 } else if (c->argc == (7 + withscores)) {
6273 offset = atoi(c->argv[5]->ptr);
6274 limit = atoi(c->argv[6]->ptr);
6275 if (offset < 0) offset = 0;
6276 }
6277
6278 /* Ok, lookup the key and get the range */
6279 o = lookupKeyRead(c->db,c->argv[1]);
6280 if (o == NULL) {
6281 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6282 } else {
6283 if (o->type != REDIS_ZSET) {
6284 addReply(c,shared.wrongtypeerr);
6285 } else {
6286 zset *zsetobj = o->ptr;
6287 zskiplist *zsl = zsetobj->zsl;
6288 zskiplistNode *ln;
6289 robj *ele, *lenobj = NULL;
6290 unsigned long rangelen = 0;
6291
6292 /* Get the first node with the score >= min, or with
6293 * score > min if 'minex' is true. */
6294 ln = zslFirstWithScore(zsl,min);
6295 while (minex && ln && ln->score == min) ln = ln->forward[0];
6296
6297 if (ln == NULL) {
6298 /* No element matching the speciifed interval */
6299 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6300 return;
6301 }
6302
6303 /* We don't know in advance how many matching elements there
6304 * are in the list, so we push this object that will represent
6305 * the multi-bulk length in the output buffer, and will "fix"
6306 * it later */
6307 if (!justcount) {
6308 lenobj = createObject(REDIS_STRING,NULL);
6309 addReply(c,lenobj);
6310 decrRefCount(lenobj);
6311 }
6312
6313 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6314 if (offset) {
6315 offset--;
6316 ln = ln->forward[0];
6317 continue;
6318 }
6319 if (limit == 0) break;
6320 if (!justcount) {
6321 ele = ln->obj;
6322 addReplyBulk(c,ele);
6323 if (withscores)
6324 addReplyDouble(c,ln->score);
6325 }
6326 ln = ln->forward[0];
6327 rangelen++;
6328 if (limit > 0) limit--;
6329 }
6330 if (justcount) {
6331 addReplyLongLong(c,(long)rangelen);
6332 } else {
6333 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6334 withscores ? (rangelen*2) : rangelen);
6335 }
6336 }
6337 }
6338 }
6339
6340 static void zrangebyscoreCommand(redisClient *c) {
6341 genericZrangebyscoreCommand(c,0);
6342 }
6343
6344 static void zcountCommand(redisClient *c) {
6345 genericZrangebyscoreCommand(c,1);
6346 }
6347
6348 static void zcardCommand(redisClient *c) {
6349 robj *o;
6350 zset *zs;
6351
6352 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6353 checkType(c,o,REDIS_ZSET)) return;
6354
6355 zs = o->ptr;
6356 addReplyUlong(c,zs->zsl->length);
6357 }
6358
6359 static void zscoreCommand(redisClient *c) {
6360 robj *o;
6361 zset *zs;
6362 dictEntry *de;
6363
6364 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6365 checkType(c,o,REDIS_ZSET)) return;
6366
6367 zs = o->ptr;
6368 de = dictFind(zs->dict,c->argv[2]);
6369 if (!de) {
6370 addReply(c,shared.nullbulk);
6371 } else {
6372 double *score = dictGetEntryVal(de);
6373
6374 addReplyDouble(c,*score);
6375 }
6376 }
6377
6378 static void zrankGenericCommand(redisClient *c, int reverse) {
6379 robj *o;
6380 zset *zs;
6381 zskiplist *zsl;
6382 dictEntry *de;
6383 unsigned long rank;
6384 double *score;
6385
6386 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6387 checkType(c,o,REDIS_ZSET)) return;
6388
6389 zs = o->ptr;
6390 zsl = zs->zsl;
6391 de = dictFind(zs->dict,c->argv[2]);
6392 if (!de) {
6393 addReply(c,shared.nullbulk);
6394 return;
6395 }
6396
6397 score = dictGetEntryVal(de);
6398 rank = zslGetRank(zsl, *score, c->argv[2]);
6399 if (rank) {
6400 if (reverse) {
6401 addReplyLongLong(c, zsl->length - rank);
6402 } else {
6403 addReplyLongLong(c, rank-1);
6404 }
6405 } else {
6406 addReply(c,shared.nullbulk);
6407 }
6408 }
6409
6410 static void zrankCommand(redisClient *c) {
6411 zrankGenericCommand(c, 0);
6412 }
6413
6414 static void zrevrankCommand(redisClient *c) {
6415 zrankGenericCommand(c, 1);
6416 }
6417
6418 /* ========================= Hashes utility functions ======================= */
6419 #define REDIS_HASH_KEY 1
6420 #define REDIS_HASH_VALUE 2
6421
6422 /* Check the length of a number of objects to see if we need to convert a
6423 * zipmap to a real hash. Note that we only check string encoded objects
6424 * as their string length can be queried in constant time. */
6425 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6426 int i;
6427 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6428
6429 for (i = start; i <= end; i++) {
6430 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6431 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6432 {
6433 convertToRealHash(subject);
6434 return;
6435 }
6436 }
6437 }
6438
6439 /* Encode given objects in-place when the hash uses a dict. */
6440 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6441 if (subject->encoding == REDIS_ENCODING_HT) {
6442 if (o1) *o1 = tryObjectEncoding(*o1);
6443 if (o2) *o2 = tryObjectEncoding(*o2);
6444 }
6445 }
6446
6447 /* Get the value from a hash identified by key. Returns either a string
6448 * object or NULL if the value cannot be found. The refcount of the object
6449 * is always increased by 1 when the value was found. */
6450 static robj *hashGet(robj *o, robj *key) {
6451 robj *value = NULL;
6452 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6453 unsigned char *v;
6454 unsigned int vlen;
6455 key = getDecodedObject(key);
6456 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6457 value = createStringObject((char*)v,vlen);
6458 }
6459 decrRefCount(key);
6460 } else {
6461 dictEntry *de = dictFind(o->ptr,key);
6462 if (de != NULL) {
6463 value = dictGetEntryVal(de);
6464 incrRefCount(value);
6465 }
6466 }
6467 return value;
6468 }
6469
6470 /* Test if the key exists in the given hash. Returns 1 if the key
6471 * exists and 0 when it doesn't. */
6472 static int hashExists(robj *o, robj *key) {
6473 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6474 key = getDecodedObject(key);
6475 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6476 decrRefCount(key);
6477 return 1;
6478 }
6479 decrRefCount(key);
6480 } else {
6481 if (dictFind(o->ptr,key) != NULL) {
6482 return 1;
6483 }
6484 }
6485 return 0;
6486 }
6487
6488 /* Add an element, discard the old if the key already exists.
6489 * Return 0 on insert and 1 on update. */
6490 static int hashSet(robj *o, robj *key, robj *value) {
6491 int update = 0;
6492 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6493 key = getDecodedObject(key);
6494 value = getDecodedObject(value);
6495 o->ptr = zipmapSet(o->ptr,
6496 key->ptr,sdslen(key->ptr),
6497 value->ptr,sdslen(value->ptr), &update);
6498 decrRefCount(key);
6499 decrRefCount(value);
6500
6501 /* Check if the zipmap needs to be upgraded to a real hash table */
6502 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6503 convertToRealHash(o);
6504 } else {
6505 if (dictReplace(o->ptr,key,value)) {
6506 /* Insert */
6507 incrRefCount(key);
6508 } else {
6509 /* Update */
6510 update = 1;
6511 }
6512 incrRefCount(value);
6513 }
6514 return update;
6515 }
6516
6517 /* Delete an element from a hash.
6518 * Return 1 on deleted and 0 on not found. */
6519 static int hashDelete(robj *o, robj *key) {
6520 int deleted = 0;
6521 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6522 key = getDecodedObject(key);
6523 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6524 decrRefCount(key);
6525 } else {
6526 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6527 /* Always check if the dictionary needs a resize after a delete. */
6528 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6529 }
6530 return deleted;
6531 }
6532
6533 /* Return the number of elements in a hash. */
6534 static unsigned long hashLength(robj *o) {
6535 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6536 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6537 }
6538
6539 /* Structure to hold hash iteration abstration. Note that iteration over
6540 * hashes involves both fields and values. Because it is possible that
6541 * not both are required, store pointers in the iterator to avoid
6542 * unnecessary memory allocation for fields/values. */
6543 typedef struct {
6544 int encoding;
6545 unsigned char *zi;
6546 unsigned char *zk, *zv;
6547 unsigned int zklen, zvlen;
6548
6549 dictIterator *di;
6550 dictEntry *de;
6551 } hashIterator;
6552
6553 static hashIterator *hashInitIterator(robj *subject) {
6554 hashIterator *hi = zmalloc(sizeof(hashIterator));
6555 hi->encoding = subject->encoding;
6556 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6557 hi->zi = zipmapRewind(subject->ptr);
6558 } else if (hi->encoding == REDIS_ENCODING_HT) {
6559 hi->di = dictGetIterator(subject->ptr);
6560 } else {
6561 redisAssert(NULL);
6562 }
6563 return hi;
6564 }
6565
6566 static void hashReleaseIterator(hashIterator *hi) {
6567 if (hi->encoding == REDIS_ENCODING_HT) {
6568 dictReleaseIterator(hi->di);
6569 }
6570 zfree(hi);
6571 }
6572
6573 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6574 * could be found and REDIS_ERR when the iterator reaches the end. */
6575 static int hashNext(hashIterator *hi) {
6576 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6577 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6578 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6579 } else {
6580 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6581 }
6582 return REDIS_OK;
6583 }
6584
6585 /* Get key or value object at current iteration position.
6586 * This increases the refcount of the field object by 1. */
6587 static robj *hashCurrent(hashIterator *hi, int what) {
6588 robj *o;
6589 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6590 if (what & REDIS_HASH_KEY) {
6591 o = createStringObject((char*)hi->zk,hi->zklen);
6592 } else {
6593 o = createStringObject((char*)hi->zv,hi->zvlen);
6594 }
6595 } else {
6596 if (what & REDIS_HASH_KEY) {
6597 o = dictGetEntryKey(hi->de);
6598 } else {
6599 o = dictGetEntryVal(hi->de);
6600 }
6601 incrRefCount(o);
6602 }
6603 return o;
6604 }
6605
6606 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6607 robj *o = lookupKeyWrite(c->db,key);
6608 if (o == NULL) {
6609 o = createHashObject();
6610 dictAdd(c->db->dict,key,o);
6611 incrRefCount(key);
6612 } else {
6613 if (o->type != REDIS_HASH) {
6614 addReply(c,shared.wrongtypeerr);
6615 return NULL;
6616 }
6617 }
6618 return o;
6619 }
6620
6621 /* ============================= Hash commands ============================== */
6622 static void hsetCommand(redisClient *c) {
6623 int update;
6624 robj *o;
6625
6626 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6627 hashTryConversion(o,c->argv,2,3);
6628 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6629 update = hashSet(o,c->argv[2],c->argv[3]);
6630 addReply(c, update ? shared.czero : shared.cone);
6631 server.dirty++;
6632 }
6633
6634 static void hsetnxCommand(redisClient *c) {
6635 robj *o;
6636 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6637 hashTryConversion(o,c->argv,2,3);
6638
6639 if (hashExists(o, c->argv[2])) {
6640 addReply(c, shared.czero);
6641 } else {
6642 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6643 hashSet(o,c->argv[2],c->argv[3]);
6644 addReply(c, shared.cone);
6645 server.dirty++;
6646 }
6647 }
6648
6649 static void hmsetCommand(redisClient *c) {
6650 int i;
6651 robj *o;
6652
6653 if ((c->argc % 2) == 1) {
6654 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6655 return;
6656 }
6657
6658 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6659 hashTryConversion(o,c->argv,2,c->argc-1);
6660 for (i = 2; i < c->argc; i += 2) {
6661 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6662 hashSet(o,c->argv[i],c->argv[i+1]);
6663 }
6664 addReply(c, shared.ok);
6665 server.dirty++;
6666 }
6667
6668 static void hincrbyCommand(redisClient *c) {
6669 long long value, incr;
6670 robj *o, *current, *new;
6671
6672 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6673 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6674 if ((current = hashGet(o,c->argv[2])) != NULL) {
6675 if (getLongLongFromObjectOrReply(c,current,&value,
6676 "hash value is not an integer") != REDIS_OK) {
6677 decrRefCount(current);
6678 return;
6679 }
6680 decrRefCount(current);
6681 } else {
6682 value = 0;
6683 }
6684
6685 value += incr;
6686 new = createStringObjectFromLongLong(value);
6687 hashTryObjectEncoding(o,&c->argv[2],NULL);
6688 hashSet(o,c->argv[2],new);
6689 decrRefCount(new);
6690 addReplyLongLong(c,value);
6691 server.dirty++;
6692 }
6693
6694 static void hgetCommand(redisClient *c) {
6695 robj *o, *value;
6696 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6697 checkType(c,o,REDIS_HASH)) return;
6698
6699 if ((value = hashGet(o,c->argv[2])) != NULL) {
6700 addReplyBulk(c,value);
6701 decrRefCount(value);
6702 } else {
6703 addReply(c,shared.nullbulk);
6704 }
6705 }
6706
6707 static void hmgetCommand(redisClient *c) {
6708 int i;
6709 robj *o, *value;
6710 o = lookupKeyRead(c->db,c->argv[1]);
6711 if (o != NULL && o->type != REDIS_HASH) {
6712 addReply(c,shared.wrongtypeerr);
6713 }
6714
6715 /* Note the check for o != NULL happens inside the loop. This is
6716 * done because objects that cannot be found are considered to be
6717 * an empty hash. The reply should then be a series of NULLs. */
6718 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6719 for (i = 2; i < c->argc; i++) {
6720 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6721 addReplyBulk(c,value);
6722 decrRefCount(value);
6723 } else {
6724 addReply(c,shared.nullbulk);
6725 }
6726 }
6727 }
6728
6729 static void hdelCommand(redisClient *c) {
6730 robj *o;
6731 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6732 checkType(c,o,REDIS_HASH)) return;
6733
6734 if (hashDelete(o,c->argv[2])) {
6735 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6736 addReply(c,shared.cone);
6737 server.dirty++;
6738 } else {
6739 addReply(c,shared.czero);
6740 }
6741 }
6742
6743 static void hlenCommand(redisClient *c) {
6744 robj *o;
6745 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6746 checkType(c,o,REDIS_HASH)) return;
6747
6748 addReplyUlong(c,hashLength(o));
6749 }
6750
6751 static void genericHgetallCommand(redisClient *c, int flags) {
6752 robj *o, *lenobj, *obj;
6753 unsigned long count = 0;
6754 hashIterator *hi;
6755
6756 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6757 || checkType(c,o,REDIS_HASH)) return;
6758
6759 lenobj = createObject(REDIS_STRING,NULL);
6760 addReply(c,lenobj);
6761 decrRefCount(lenobj);
6762
6763 hi = hashInitIterator(o);
6764 while (hashNext(hi) != REDIS_ERR) {
6765 if (flags & REDIS_HASH_KEY) {
6766 obj = hashCurrent(hi,REDIS_HASH_KEY);
6767 addReplyBulk(c,obj);
6768 decrRefCount(obj);
6769 count++;
6770 }
6771 if (flags & REDIS_HASH_VALUE) {
6772 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6773 addReplyBulk(c,obj);
6774 decrRefCount(obj);
6775 count++;
6776 }
6777 }
6778 hashReleaseIterator(hi);
6779
6780 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6781 }
6782
6783 static void hkeysCommand(redisClient *c) {
6784 genericHgetallCommand(c,REDIS_HASH_KEY);
6785 }
6786
6787 static void hvalsCommand(redisClient *c) {
6788 genericHgetallCommand(c,REDIS_HASH_VALUE);
6789 }
6790
6791 static void hgetallCommand(redisClient *c) {
6792 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6793 }
6794
6795 static void hexistsCommand(redisClient *c) {
6796 robj *o;
6797 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6798 checkType(c,o,REDIS_HASH)) return;
6799
6800 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6801 }
6802
6803 static void convertToRealHash(robj *o) {
6804 unsigned char *key, *val, *p, *zm = o->ptr;
6805 unsigned int klen, vlen;
6806 dict *dict = dictCreate(&hashDictType,NULL);
6807
6808 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6809 p = zipmapRewind(zm);
6810 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6811 robj *keyobj, *valobj;
6812
6813 keyobj = createStringObject((char*)key,klen);
6814 valobj = createStringObject((char*)val,vlen);
6815 keyobj = tryObjectEncoding(keyobj);
6816 valobj = tryObjectEncoding(valobj);
6817 dictAdd(dict,keyobj,valobj);
6818 }
6819 o->encoding = REDIS_ENCODING_HT;
6820 o->ptr = dict;
6821 zfree(zm);
6822 }
6823
6824 /* ========================= Non type-specific commands ==================== */
6825
6826 static void flushdbCommand(redisClient *c) {
6827 server.dirty += dictSize(c->db->dict);
6828 touchWatchedKeysOnFlush(c->db->id);
6829 dictEmpty(c->db->dict);
6830 dictEmpty(c->db->expires);
6831 addReply(c,shared.ok);
6832 }
6833
6834 static void flushallCommand(redisClient *c) {
6835 touchWatchedKeysOnFlush(-1);
6836 server.dirty += emptyDb();
6837 addReply(c,shared.ok);
6838 if (server.bgsavechildpid != -1) {
6839 kill(server.bgsavechildpid,SIGKILL);
6840 rdbRemoveTempFile(server.bgsavechildpid);
6841 }
6842 rdbSave(server.dbfilename);
6843 server.dirty++;
6844 }
6845
6846 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6847 redisSortOperation *so = zmalloc(sizeof(*so));
6848 so->type = type;
6849 so->pattern = pattern;
6850 return so;
6851 }
6852
6853 /* Return the value associated to the key with a name obtained
6854 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6855 * The returned object will always have its refcount increased by 1
6856 * when it is non-NULL. */
6857 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6858 char *p, *f;
6859 sds spat, ssub;
6860 robj keyobj, fieldobj, *o;
6861 int prefixlen, sublen, postfixlen, fieldlen;
6862 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6863 struct {
6864 long len;
6865 long free;
6866 char buf[REDIS_SORTKEY_MAX+1];
6867 } keyname, fieldname;
6868
6869 /* If the pattern is "#" return the substitution object itself in order
6870 * to implement the "SORT ... GET #" feature. */
6871 spat = pattern->ptr;
6872 if (spat[0] == '#' && spat[1] == '\0') {
6873 incrRefCount(subst);
6874 return subst;
6875 }
6876
6877 /* The substitution object may be specially encoded. If so we create
6878 * a decoded object on the fly. Otherwise getDecodedObject will just
6879 * increment the ref count, that we'll decrement later. */
6880 subst = getDecodedObject(subst);
6881
6882 ssub = subst->ptr;
6883 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6884 p = strchr(spat,'*');
6885 if (!p) {
6886 decrRefCount(subst);
6887 return NULL;
6888 }
6889
6890 /* Find out if we're dealing with a hash dereference. */
6891 if ((f = strstr(p+1, "->")) != NULL) {
6892 fieldlen = sdslen(spat)-(f-spat);
6893 /* this also copies \0 character */
6894 memcpy(fieldname.buf,f+2,fieldlen-1);
6895 fieldname.len = fieldlen-2;
6896 } else {
6897 fieldlen = 0;
6898 }
6899
6900 prefixlen = p-spat;
6901 sublen = sdslen(ssub);
6902 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6903 memcpy(keyname.buf,spat,prefixlen);
6904 memcpy(keyname.buf+prefixlen,ssub,sublen);
6905 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6906 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6907 keyname.len = prefixlen+sublen+postfixlen;
6908 decrRefCount(subst);
6909
6910 /* Lookup substituted key */
6911 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6912 o = lookupKeyRead(db,&keyobj);
6913 if (o == NULL) return NULL;
6914
6915 if (fieldlen > 0) {
6916 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6917
6918 /* Retrieve value from hash by the field name. This operation
6919 * already increases the refcount of the returned object. */
6920 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6921 o = hashGet(o, &fieldobj);
6922 } else {
6923 if (o->type != REDIS_STRING) return NULL;
6924
6925 /* Every object that this function returns needs to have its refcount
6926 * increased. sortCommand decreases it again. */
6927 incrRefCount(o);
6928 }
6929
6930 return o;
6931 }
6932
6933 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6934 * the additional parameter is not standard but a BSD-specific we have to
6935 * pass sorting parameters via the global 'server' structure */
6936 static int sortCompare(const void *s1, const void *s2) {
6937 const redisSortObject *so1 = s1, *so2 = s2;
6938 int cmp;
6939
6940 if (!server.sort_alpha) {
6941 /* Numeric sorting. Here it's trivial as we precomputed scores */
6942 if (so1->u.score > so2->u.score) {
6943 cmp = 1;
6944 } else if (so1->u.score < so2->u.score) {
6945 cmp = -1;
6946 } else {
6947 cmp = 0;
6948 }
6949 } else {
6950 /* Alphanumeric sorting */
6951 if (server.sort_bypattern) {
6952 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6953 /* At least one compare object is NULL */
6954 if (so1->u.cmpobj == so2->u.cmpobj)
6955 cmp = 0;
6956 else if (so1->u.cmpobj == NULL)
6957 cmp = -1;
6958 else
6959 cmp = 1;
6960 } else {
6961 /* We have both the objects, use strcoll */
6962 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6963 }
6964 } else {
6965 /* Compare elements directly. */
6966 cmp = compareStringObjects(so1->obj,so2->obj);
6967 }
6968 }
6969 return server.sort_desc ? -cmp : cmp;
6970 }
6971
6972 /* The SORT command is the most complex command in Redis. Warning: this code
6973 * is optimized for speed and a bit less for readability */
6974 static void sortCommand(redisClient *c) {
6975 list *operations;
6976 int outputlen = 0;
6977 int desc = 0, alpha = 0;
6978 int limit_start = 0, limit_count = -1, start, end;
6979 int j, dontsort = 0, vectorlen;
6980 int getop = 0; /* GET operation counter */
6981 robj *sortval, *sortby = NULL, *storekey = NULL;
6982 redisSortObject *vector; /* Resulting vector to sort */
6983
6984 /* Lookup the key to sort. It must be of the right types */
6985 sortval = lookupKeyRead(c->db,c->argv[1]);
6986 if (sortval == NULL) {
6987 addReply(c,shared.emptymultibulk);
6988 return;
6989 }
6990 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6991 sortval->type != REDIS_ZSET)
6992 {
6993 addReply(c,shared.wrongtypeerr);
6994 return;
6995 }
6996
6997 /* Create a list of operations to perform for every sorted element.
6998 * Operations can be GET/DEL/INCR/DECR */
6999 operations = listCreate();
7000 listSetFreeMethod(operations,zfree);
7001 j = 2;
7002
7003 /* Now we need to protect sortval incrementing its count, in the future
7004 * SORT may have options able to overwrite/delete keys during the sorting
7005 * and the sorted key itself may get destroied */
7006 incrRefCount(sortval);
7007
7008 /* The SORT command has an SQL-alike syntax, parse it */
7009 while(j < c->argc) {
7010 int leftargs = c->argc-j-1;
7011 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7012 desc = 0;
7013 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7014 desc = 1;
7015 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7016 alpha = 1;
7017 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7018 limit_start = atoi(c->argv[j+1]->ptr);
7019 limit_count = atoi(c->argv[j+2]->ptr);
7020 j+=2;
7021 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7022 storekey = c->argv[j+1];
7023 j++;
7024 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7025 sortby = c->argv[j+1];
7026 /* If the BY pattern does not contain '*', i.e. it is constant,
7027 * we don't need to sort nor to lookup the weight keys. */
7028 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7029 j++;
7030 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7031 listAddNodeTail(operations,createSortOperation(
7032 REDIS_SORT_GET,c->argv[j+1]));
7033 getop++;
7034 j++;
7035 } else {
7036 decrRefCount(sortval);
7037 listRelease(operations);
7038 addReply(c,shared.syntaxerr);
7039 return;
7040 }
7041 j++;
7042 }
7043
7044 /* Load the sorting vector with all the objects to sort */
7045 switch(sortval->type) {
7046 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7047 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7048 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7049 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7050 }
7051 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7052 j = 0;
7053
7054 if (sortval->type == REDIS_LIST) {
7055 list *list = sortval->ptr;
7056 listNode *ln;
7057 listIter li;
7058
7059 listRewind(list,&li);
7060 while((ln = listNext(&li))) {
7061 robj *ele = ln->value;
7062 vector[j].obj = ele;
7063 vector[j].u.score = 0;
7064 vector[j].u.cmpobj = NULL;
7065 j++;
7066 }
7067 } else {
7068 dict *set;
7069 dictIterator *di;
7070 dictEntry *setele;
7071
7072 if (sortval->type == REDIS_SET) {
7073 set = sortval->ptr;
7074 } else {
7075 zset *zs = sortval->ptr;
7076 set = zs->dict;
7077 }
7078
7079 di = dictGetIterator(set);
7080 while((setele = dictNext(di)) != NULL) {
7081 vector[j].obj = dictGetEntryKey(setele);
7082 vector[j].u.score = 0;
7083 vector[j].u.cmpobj = NULL;
7084 j++;
7085 }
7086 dictReleaseIterator(di);
7087 }
7088 redisAssert(j == vectorlen);
7089
7090 /* Now it's time to load the right scores in the sorting vector */
7091 if (dontsort == 0) {
7092 for (j = 0; j < vectorlen; j++) {
7093 robj *byval;
7094 if (sortby) {
7095 /* lookup value to sort by */
7096 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7097 if (!byval) continue;
7098 } else {
7099 /* use object itself to sort by */
7100 byval = vector[j].obj;
7101 }
7102
7103 if (alpha) {
7104 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7105 } else {
7106 if (byval->encoding == REDIS_ENCODING_RAW) {
7107 vector[j].u.score = strtod(byval->ptr,NULL);
7108 } else if (byval->encoding == REDIS_ENCODING_INT) {
7109 /* Don't need to decode the object if it's
7110 * integer-encoded (the only encoding supported) so
7111 * far. We can just cast it */
7112 vector[j].u.score = (long)byval->ptr;
7113 } else {
7114 redisAssert(1 != 1);
7115 }
7116 }
7117
7118 /* when the object was retrieved using lookupKeyByPattern,
7119 * its refcount needs to be decreased. */
7120 if (sortby) {
7121 decrRefCount(byval);
7122 }
7123 }
7124 }
7125
7126 /* We are ready to sort the vector... perform a bit of sanity check
7127 * on the LIMIT option too. We'll use a partial version of quicksort. */
7128 start = (limit_start < 0) ? 0 : limit_start;
7129 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7130 if (start >= vectorlen) {
7131 start = vectorlen-1;
7132 end = vectorlen-2;
7133 }
7134 if (end >= vectorlen) end = vectorlen-1;
7135
7136 if (dontsort == 0) {
7137 server.sort_desc = desc;
7138 server.sort_alpha = alpha;
7139 server.sort_bypattern = sortby ? 1 : 0;
7140 if (sortby && (start != 0 || end != vectorlen-1))
7141 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7142 else
7143 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7144 }
7145
7146 /* Send command output to the output buffer, performing the specified
7147 * GET/DEL/INCR/DECR operations if any. */
7148 outputlen = getop ? getop*(end-start+1) : end-start+1;
7149 if (storekey == NULL) {
7150 /* STORE option not specified, sent the sorting result to client */
7151 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7152 for (j = start; j <= end; j++) {
7153 listNode *ln;
7154 listIter li;
7155
7156 if (!getop) addReplyBulk(c,vector[j].obj);
7157 listRewind(operations,&li);
7158 while((ln = listNext(&li))) {
7159 redisSortOperation *sop = ln->value;
7160 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7161 vector[j].obj);
7162
7163 if (sop->type == REDIS_SORT_GET) {
7164 if (!val) {
7165 addReply(c,shared.nullbulk);
7166 } else {
7167 addReplyBulk(c,val);
7168 decrRefCount(val);
7169 }
7170 } else {
7171 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7172 }
7173 }
7174 }
7175 } else {
7176 robj *listObject = createListObject();
7177 list *listPtr = (list*) listObject->ptr;
7178
7179 /* STORE option specified, set the sorting result as a List object */
7180 for (j = start; j <= end; j++) {
7181 listNode *ln;
7182 listIter li;
7183
7184 if (!getop) {
7185 listAddNodeTail(listPtr,vector[j].obj);
7186 incrRefCount(vector[j].obj);
7187 }
7188 listRewind(operations,&li);
7189 while((ln = listNext(&li))) {
7190 redisSortOperation *sop = ln->value;
7191 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7192 vector[j].obj);
7193
7194 if (sop->type == REDIS_SORT_GET) {
7195 if (!val) {
7196 listAddNodeTail(listPtr,createStringObject("",0));
7197 } else {
7198 /* We should do a incrRefCount on val because it is
7199 * added to the list, but also a decrRefCount because
7200 * it is returned by lookupKeyByPattern. This results
7201 * in doing nothing at all. */
7202 listAddNodeTail(listPtr,val);
7203 }
7204 } else {
7205 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7206 }
7207 }
7208 }
7209 if (dictReplace(c->db->dict,storekey,listObject)) {
7210 incrRefCount(storekey);
7211 }
7212 /* Note: we add 1 because the DB is dirty anyway since even if the
7213 * SORT result is empty a new key is set and maybe the old content
7214 * replaced. */
7215 server.dirty += 1+outputlen;
7216 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7217 }
7218
7219 /* Cleanup */
7220 decrRefCount(sortval);
7221 listRelease(operations);
7222 for (j = 0; j < vectorlen; j++) {
7223 if (alpha && vector[j].u.cmpobj)
7224 decrRefCount(vector[j].u.cmpobj);
7225 }
7226 zfree(vector);
7227 }
7228
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2.00K, 100.00M, 4.00G, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the caller in
 * this file passes 64 bytes). The buffer is always written: values below
 * 1024 are printed as an integer number of bytes, larger values with two
 * decimals and a K/M/G/T/P suffix (powers of 1024), and anything of
 * 1024^6 bytes or more falls back to the plain byte count. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too large for the suffixes above: never leave 's' unset. */
        sprintf(s,"%lluB",n);
    }
}
7249
7250 /* Create the string returned by the INFO command. This is decoupled
7251 * by the INFO command itself as we need to report the same information
7252 * on memory corruption problems. */
7253 static sds genRedisInfoString(void) {
7254 sds info;
7255 time_t uptime = time(NULL)-server.stat_starttime;
7256 int j;
7257 char hmem[64];
7258
7259 bytesToHuman(hmem,zmalloc_used_memory());
7260 info = sdscatprintf(sdsempty(),
7261 "redis_version:%s\r\n"
7262 "redis_git_sha1:%s\r\n"
7263 "redis_git_dirty:%d\r\n"
7264 "arch_bits:%s\r\n"
7265 "multiplexing_api:%s\r\n"
7266 "process_id:%ld\r\n"
7267 "uptime_in_seconds:%ld\r\n"
7268 "uptime_in_days:%ld\r\n"
7269 "connected_clients:%d\r\n"
7270 "connected_slaves:%d\r\n"
7271 "blocked_clients:%d\r\n"
7272 "used_memory:%zu\r\n"
7273 "used_memory_human:%s\r\n"
7274 "changes_since_last_save:%lld\r\n"
7275 "bgsave_in_progress:%d\r\n"
7276 "last_save_time:%ld\r\n"
7277 "bgrewriteaof_in_progress:%d\r\n"
7278 "total_connections_received:%lld\r\n"
7279 "total_commands_processed:%lld\r\n"
7280 "expired_keys:%lld\r\n"
7281 "hash_max_zipmap_entries:%zu\r\n"
7282 "hash_max_zipmap_value:%zu\r\n"
7283 "pubsub_channels:%ld\r\n"
7284 "pubsub_patterns:%u\r\n"
7285 "vm_enabled:%d\r\n"
7286 "role:%s\r\n"
7287 ,REDIS_VERSION,
7288 REDIS_GIT_SHA1,
7289 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7290 (sizeof(long) == 8) ? "64" : "32",
7291 aeGetApiName(),
7292 (long) getpid(),
7293 uptime,
7294 uptime/(3600*24),
7295 listLength(server.clients)-listLength(server.slaves),
7296 listLength(server.slaves),
7297 server.blpop_blocked_clients,
7298 zmalloc_used_memory(),
7299 hmem,
7300 server.dirty,
7301 server.bgsavechildpid != -1,
7302 server.lastsave,
7303 server.bgrewritechildpid != -1,
7304 server.stat_numconnections,
7305 server.stat_numcommands,
7306 server.stat_expiredkeys,
7307 server.hash_max_zipmap_entries,
7308 server.hash_max_zipmap_value,
7309 dictSize(server.pubsub_channels),
7310 listLength(server.pubsub_patterns),
7311 server.vm_enabled != 0,
7312 server.masterhost == NULL ? "master" : "slave"
7313 );
7314 if (server.masterhost) {
7315 info = sdscatprintf(info,
7316 "master_host:%s\r\n"
7317 "master_port:%d\r\n"
7318 "master_link_status:%s\r\n"
7319 "master_last_io_seconds_ago:%d\r\n"
7320 ,server.masterhost,
7321 server.masterport,
7322 (server.replstate == REDIS_REPL_CONNECTED) ?
7323 "up" : "down",
7324 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7325 );
7326 }
7327 if (server.vm_enabled) {
7328 lockThreadedIO();
7329 info = sdscatprintf(info,
7330 "vm_conf_max_memory:%llu\r\n"
7331 "vm_conf_page_size:%llu\r\n"
7332 "vm_conf_pages:%llu\r\n"
7333 "vm_stats_used_pages:%llu\r\n"
7334 "vm_stats_swapped_objects:%llu\r\n"
7335 "vm_stats_swappin_count:%llu\r\n"
7336 "vm_stats_swappout_count:%llu\r\n"
7337 "vm_stats_io_newjobs_len:%lu\r\n"
7338 "vm_stats_io_processing_len:%lu\r\n"
7339 "vm_stats_io_processed_len:%lu\r\n"
7340 "vm_stats_io_active_threads:%lu\r\n"
7341 "vm_stats_blocked_clients:%lu\r\n"
7342 ,(unsigned long long) server.vm_max_memory,
7343 (unsigned long long) server.vm_page_size,
7344 (unsigned long long) server.vm_pages,
7345 (unsigned long long) server.vm_stats_used_pages,
7346 (unsigned long long) server.vm_stats_swapped_objects,
7347 (unsigned long long) server.vm_stats_swapins,
7348 (unsigned long long) server.vm_stats_swapouts,
7349 (unsigned long) listLength(server.io_newjobs),
7350 (unsigned long) listLength(server.io_processing),
7351 (unsigned long) listLength(server.io_processed),
7352 (unsigned long) server.io_active_threads,
7353 (unsigned long) server.vm_blocked_clients
7354 );
7355 unlockThreadedIO();
7356 }
7357 for (j = 0; j < server.dbnum; j++) {
7358 long long keys, vkeys;
7359
7360 keys = dictSize(server.db[j].dict);
7361 vkeys = dictSize(server.db[j].expires);
7362 if (keys || vkeys) {
7363 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7364 j, keys, vkeys);
7365 }
7366 }
7367 return info;
7368 }
7369
7370 static void infoCommand(redisClient *c) {
7371 sds info = genRedisInfoString();
7372 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7373 (unsigned long)sdslen(info)));
7374 addReplySds(c,info);
7375 addReply(c,shared.crlf);
7376 }
7377
7378 static void monitorCommand(redisClient *c) {
7379 /* ignore MONITOR if aleady slave or in monitor mode */
7380 if (c->flags & REDIS_SLAVE) return;
7381
7382 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7383 c->slaveseldb = 0;
7384 listAddNodeTail(server.monitors,c);
7385 addReply(c,shared.ok);
7386 }
7387
7388 /* ================================= Expire ================================= */
7389 static int removeExpire(redisDb *db, robj *key) {
7390 if (dictDelete(db->expires,key) == DICT_OK) {
7391 return 1;
7392 } else {
7393 return 0;
7394 }
7395 }
7396
7397 static int setExpire(redisDb *db, robj *key, time_t when) {
7398 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7399 return 0;
7400 } else {
7401 incrRefCount(key);
7402 return 1;
7403 }
7404 }
7405
7406 /* Return the expire time of the specified key, or -1 if no expire
7407 * is associated with this key (i.e. the key is non volatile) */
7408 static time_t getExpire(redisDb *db, robj *key) {
7409 dictEntry *de;
7410
7411 /* No expire? return ASAP */
7412 if (dictSize(db->expires) == 0 ||
7413 (de = dictFind(db->expires,key)) == NULL) return -1;
7414
7415 return (time_t) dictGetEntryVal(de);
7416 }
7417
7418 static int expireIfNeeded(redisDb *db, robj *key) {
7419 time_t when;
7420 dictEntry *de;
7421
7422 /* No expire? return ASAP */
7423 if (dictSize(db->expires) == 0 ||
7424 (de = dictFind(db->expires,key)) == NULL) return 0;
7425
7426 /* Lookup the expire */
7427 when = (time_t) dictGetEntryVal(de);
7428 if (time(NULL) <= when) return 0;
7429
7430 /* Delete the key */
7431 dictDelete(db->expires,key);
7432 server.stat_expiredkeys++;
7433 return dictDelete(db->dict,key) == DICT_OK;
7434 }
7435
7436 static int deleteIfVolatile(redisDb *db, robj *key) {
7437 dictEntry *de;
7438
7439 /* No expire? return ASAP */
7440 if (dictSize(db->expires) == 0 ||
7441 (de = dictFind(db->expires,key)) == NULL) return 0;
7442
7443 /* Delete the key */
7444 server.dirty++;
7445 server.stat_expiredkeys++;
7446 dictDelete(db->expires,key);
7447 return dictDelete(db->dict,key) == DICT_OK;
7448 }
7449
7450 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7451 dictEntry *de;
7452 time_t seconds;
7453
7454 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7455
7456 seconds -= offset;
7457
7458 de = dictFind(c->db->dict,key);
7459 if (de == NULL) {
7460 addReply(c,shared.czero);
7461 return;
7462 }
7463 if (seconds <= 0) {
7464 if (deleteKey(c->db,key)) server.dirty++;
7465 addReply(c, shared.cone);
7466 return;
7467 } else {
7468 time_t when = time(NULL)+seconds;
7469 if (setExpire(c->db,key,when)) {
7470 addReply(c,shared.cone);
7471 server.dirty++;
7472 } else {
7473 addReply(c,shared.czero);
7474 }
7475 return;
7476 }
7477 }
7478
7479 static void expireCommand(redisClient *c) {
7480 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7481 }
7482
7483 static void expireatCommand(redisClient *c) {
7484 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7485 }
7486
7487 static void ttlCommand(redisClient *c) {
7488 time_t expire;
7489 int ttl = -1;
7490
7491 expire = getExpire(c->db,c->argv[1]);
7492 if (expire != -1) {
7493 ttl = (int) (expire-time(NULL));
7494 if (ttl < 0) ttl = -1;
7495 }
7496 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7497 }
7498
7499 /* ================================ MULTI/EXEC ============================== */
7500
7501 /* Client state initialization for MULTI/EXEC */
7502 static void initClientMultiState(redisClient *c) {
7503 c->mstate.commands = NULL;
7504 c->mstate.count = 0;
7505 }
7506
7507 /* Release all the resources associated with MULTI/EXEC state */
7508 static void freeClientMultiState(redisClient *c) {
7509 int j;
7510
7511 for (j = 0; j < c->mstate.count; j++) {
7512 int i;
7513 multiCmd *mc = c->mstate.commands+j;
7514
7515 for (i = 0; i < mc->argc; i++)
7516 decrRefCount(mc->argv[i]);
7517 zfree(mc->argv);
7518 }
7519 zfree(c->mstate.commands);
7520 }
7521
7522 /* Add a new command into the MULTI commands queue */
7523 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7524 multiCmd *mc;
7525 int j;
7526
7527 c->mstate.commands = zrealloc(c->mstate.commands,
7528 sizeof(multiCmd)*(c->mstate.count+1));
7529 mc = c->mstate.commands+c->mstate.count;
7530 mc->cmd = cmd;
7531 mc->argc = c->argc;
7532 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7533 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7534 for (j = 0; j < c->argc; j++)
7535 incrRefCount(mc->argv[j]);
7536 c->mstate.count++;
7537 }
7538
7539 static void multiCommand(redisClient *c) {
7540 if (c->flags & REDIS_MULTI) {
7541 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7542 return;
7543 }
7544 c->flags |= REDIS_MULTI;
7545 addReply(c,shared.ok);
7546 }
7547
7548 static void discardCommand(redisClient *c) {
7549 if (!(c->flags & REDIS_MULTI)) {
7550 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7551 return;
7552 }
7553
7554 freeClientMultiState(c);
7555 initClientMultiState(c);
7556 c->flags &= (~REDIS_MULTI);
7557 addReply(c,shared.ok);
7558 }
7559
7560 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7561 * implememntation for more information. */
7562 static void execCommandReplicateMulti(redisClient *c) {
7563 struct redisCommand *cmd;
7564 robj *multistring = createStringObject("MULTI",5);
7565
7566 cmd = lookupCommand("multi");
7567 if (server.appendonly)
7568 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7569 if (listLength(server.slaves))
7570 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7571 decrRefCount(multistring);
7572 }
7573
7574 static void execCommand(redisClient *c) {
7575 int j;
7576 robj **orig_argv;
7577 int orig_argc;
7578
7579 if (!(c->flags & REDIS_MULTI)) {
7580 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7581 return;
7582 }
7583
7584 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7585 * A failed EXEC will return a multi bulk nil object. */
7586 if (c->flags & REDIS_DIRTY_CAS) {
7587 freeClientMultiState(c);
7588 initClientMultiState(c);
7589 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7590 unwatchAllKeys(c);
7591 addReply(c,shared.nullmultibulk);
7592 return;
7593 }
7594
7595 /* Replicate a MULTI request now that we are sure the block is executed.
7596 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7597 * both the AOF and the replication link will have the same consistency
7598 * and atomicity guarantees. */
7599 execCommandReplicateMulti(c);
7600
7601 /* Exec all the queued commands */
7602 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
7603 orig_argv = c->argv;
7604 orig_argc = c->argc;
7605 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7606 for (j = 0; j < c->mstate.count; j++) {
7607 c->argc = c->mstate.commands[j].argc;
7608 c->argv = c->mstate.commands[j].argv;
7609 call(c,c->mstate.commands[j].cmd);
7610 }
7611 c->argv = orig_argv;
7612 c->argc = orig_argc;
7613 freeClientMultiState(c);
7614 initClientMultiState(c);
7615 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7616 /* Make sure the EXEC command is always replicated / AOF, since we
7617 * always send the MULTI command (we can't know beforehand if the
7618 * next operations will contain at least a modification to the DB). */
7619 server.dirty++;
7620 }
7621
7622 /* =========================== Blocking Operations ========================= */
7623
7624 /* Currently Redis blocking operations support is limited to list POP ops,
7625 * so the current implementation is not fully generic, but it is also not
7626 * completely specific so it will not require a rewrite to support new
7627 * kind of blocking operations in the future.
7628 *
7629 * Still it's important to note that list blocking operations can be already
7630 * used as a notification mechanism in order to implement other blocking
7631 * operations at application level, so there must be a very strong evidence
7632 * of usefulness and generality before new blocking operations are implemented.
7633 *
7634 * This is how the current blocking POP works, we use BLPOP as example:
7635 * - If the user calls BLPOP and the key exists and contains a non empty list
7636 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7637 * if there is not to block.
7638 * - If instead BLPOP is called and the key does not exist or the list is
7639 * empty we need to block. In order to do so we remove the notification for
7640 * new data to read in the client socket (so that we'll not serve new
7641 * requests if the blocking request is not served). Also we put the client
7642 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
7643 * blocking for these keys.
7644 * - If a PUSH operation against a key with blocked clients waiting is
7645 * performed, we serve the first in the list: basically instead of pushing
7646 * the new element inside the list we return it to the (first / oldest)
7647 * blocking client, unblock the client, and remove it from the list.
7648 *
7649 * The above comment and the source code should be enough in order to understand
7650 * the implementation and modify / fix it later.
7651 */
7652
7653 /* Set a client in blocking mode for the specified key, with the specified
7654 * timeout */
7655 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7656 dictEntry *de;
7657 list *l;
7658 int j;
7659
7660 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7661 c->blocking_keys_num = numkeys;
7662 c->blockingto = timeout;
7663 for (j = 0; j < numkeys; j++) {
7664 /* Add the key in the client structure, to map clients -> keys */
7665 c->blocking_keys[j] = keys[j];
7666 incrRefCount(keys[j]);
7667
7668 /* And in the other "side", to map keys -> clients */
7669 de = dictFind(c->db->blocking_keys,keys[j]);
7670 if (de == NULL) {
7671 int retval;
7672
7673 /* For every key we take a list of clients blocked for it */
7674 l = listCreate();
7675 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7676 incrRefCount(keys[j]);
7677 assert(retval == DICT_OK);
7678 } else {
7679 l = dictGetEntryVal(de);
7680 }
7681 listAddNodeTail(l,c);
7682 }
7683 /* Mark the client as a blocked client */
7684 c->flags |= REDIS_BLOCKED;
7685 server.blpop_blocked_clients++;
7686 }
7687
7688 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7689 static void unblockClientWaitingData(redisClient *c) {
7690 dictEntry *de;
7691 list *l;
7692 int j;
7693
7694 assert(c->blocking_keys != NULL);
7695 /* The client may wait for multiple keys, so unblock it for every key. */
7696 for (j = 0; j < c->blocking_keys_num; j++) {
7697 /* Remove this client from the list of clients waiting for this key. */
7698 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7699 assert(de != NULL);
7700 l = dictGetEntryVal(de);
7701 listDelNode(l,listSearchKey(l,c));
7702 /* If the list is empty we need to remove it to avoid wasting memory */
7703 if (listLength(l) == 0)
7704 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7705 decrRefCount(c->blocking_keys[j]);
7706 }
7707 /* Cleanup the client structure */
7708 zfree(c->blocking_keys);
7709 c->blocking_keys = NULL;
7710 c->flags &= (~REDIS_BLOCKED);
7711 server.blpop_blocked_clients--;
7712 /* We want to process data if there is some command waiting
7713 * in the input buffer. Note that this is safe even if
7714 * unblockClientWaitingData() gets called from freeClient() because
7715 * freeClient() will be smart enough to call this function
7716 * *after* c->querybuf was set to NULL. */
7717 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7718 }
7719
7720 /* This should be called from any function PUSHing into lists.
7721 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7722 * 'ele' is the element pushed.
7723 *
7724 * If the function returns 0 there was no client waiting for a list push
7725 * against this key.
7726 *
7727 * If the function returns 1 there was a client waiting for a list push
7728 * against this key, the element was passed to this client thus it's not
7729 * needed to actually add it to the list and the caller should return asap. */
7730 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7731 struct dictEntry *de;
7732 redisClient *receiver;
7733 list *l;
7734 listNode *ln;
7735
7736 de = dictFind(c->db->blocking_keys,key);
7737 if (de == NULL) return 0;
7738 l = dictGetEntryVal(de);
7739 ln = listFirst(l);
7740 assert(ln != NULL);
7741 receiver = ln->value;
7742
7743 addReplySds(receiver,sdsnew("*2\r\n"));
7744 addReplyBulk(receiver,key);
7745 addReplyBulk(receiver,ele);
7746 unblockClientWaitingData(receiver);
7747 return 1;
7748 }
7749
7750 /* Blocking RPOP/LPOP */
7751 static void blockingPopGenericCommand(redisClient *c, int where) {
7752 robj *o;
7753 time_t timeout;
7754 int j;
7755
7756 for (j = 1; j < c->argc-1; j++) {
7757 o = lookupKeyWrite(c->db,c->argv[j]);
7758 if (o != NULL) {
7759 if (o->type != REDIS_LIST) {
7760 addReply(c,shared.wrongtypeerr);
7761 return;
7762 } else {
7763 list *list = o->ptr;
7764 if (listLength(list) != 0) {
7765 /* If the list contains elements fall back to the usual
7766 * non-blocking POP operation */
7767 robj *argv[2], **orig_argv;
7768 int orig_argc;
7769
7770 /* We need to alter the command arguments before to call
7771 * popGenericCommand() as the command takes a single key. */
7772 orig_argv = c->argv;
7773 orig_argc = c->argc;
7774 argv[1] = c->argv[j];
7775 c->argv = argv;
7776 c->argc = 2;
7777
7778 /* Also the return value is different, we need to output
7779 * the multi bulk reply header and the key name. The
7780 * "real" command will add the last element (the value)
7781 * for us. If this souds like an hack to you it's just
7782 * because it is... */
7783 addReplySds(c,sdsnew("*2\r\n"));
7784 addReplyBulk(c,argv[1]);
7785 popGenericCommand(c,where);
7786
7787 /* Fix the client structure with the original stuff */
7788 c->argv = orig_argv;
7789 c->argc = orig_argc;
7790 return;
7791 }
7792 }
7793 }
7794 }
7795 /* If the list is empty or the key does not exists we must block */
7796 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7797 if (timeout > 0) timeout += time(NULL);
7798 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7799 }
7800
7801 static void blpopCommand(redisClient *c) {
7802 blockingPopGenericCommand(c,REDIS_HEAD);
7803 }
7804
7805 static void brpopCommand(redisClient *c) {
7806 blockingPopGenericCommand(c,REDIS_TAIL);
7807 }
7808
7809 /* =============================== Replication ============================= */
7810
7811 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7812 ssize_t nwritten, ret = size;
7813 time_t start = time(NULL);
7814
7815 timeout++;
7816 while(size) {
7817 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7818 nwritten = write(fd,ptr,size);
7819 if (nwritten == -1) return -1;
7820 ptr += nwritten;
7821 size -= nwritten;
7822 }
7823 if ((time(NULL)-start) > timeout) {
7824 errno = ETIMEDOUT;
7825 return -1;
7826 }
7827 }
7828 return ret;
7829 }
7830
7831 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7832 ssize_t nread, totread = 0;
7833 time_t start = time(NULL);
7834
7835 timeout++;
7836 while(size) {
7837 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7838 nread = read(fd,ptr,size);
7839 if (nread == -1) return -1;
7840 ptr += nread;
7841 size -= nread;
7842 totread += nread;
7843 }
7844 if ((time(NULL)-start) > timeout) {
7845 errno = ETIMEDOUT;
7846 return -1;
7847 }
7848 }
7849 return totread;
7850 }
7851
/* Read a line from 'fd' one byte at a time (via syncRead()) into 'ptr',
 * a buffer of 'size' bytes.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right before
 * it is stripped as well. The result is always nul terminated, and at most
 * size-1 characters are stored: if the line is longer, reading stops when
 * the buffer is full and the truncated line is returned.
 *
 * Returns the number of characters stored (terminator excluded; for a CRLF
 * terminated line the count still includes the stripped '\r'), or -1 if
 * syncRead() fails. 'timeout' is applied to every single byte read. A
 * 'size' <= 0 leaves the buffer untouched and returns 0. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* reserve room for the nul terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of the buffer was consumed: without this the loop never
         * terminates on long lines and writes past the end of 'ptr'. */
        size--;
    }
    return nread;
}
7872
7873 static void syncCommand(redisClient *c) {
7874 /* ignore SYNC if aleady slave or in monitor mode */
7875 if (c->flags & REDIS_SLAVE) return;
7876
7877 /* SYNC can't be issued when the server has pending data to send to
7878 * the client about already issued commands. We need a fresh reply
7879 * buffer registering the differences between the BGSAVE and the current
7880 * dataset, so that we can copy to other slaves if needed. */
7881 if (listLength(c->reply) != 0) {
7882 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7883 return;
7884 }
7885
7886 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7887 /* Here we need to check if there is a background saving operation
7888 * in progress, or if it is required to start one */
7889 if (server.bgsavechildpid != -1) {
7890 /* Ok a background save is in progress. Let's check if it is a good
7891 * one for replication, i.e. if there is another slave that is
7892 * registering differences since the server forked to save */
7893 redisClient *slave;
7894 listNode *ln;
7895 listIter li;
7896
7897 listRewind(server.slaves,&li);
7898 while((ln = listNext(&li))) {
7899 slave = ln->value;
7900 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7901 }
7902 if (ln) {
7903 /* Perfect, the server is already registering differences for
7904 * another slave. Set the right state, and copy the buffer. */
7905 listRelease(c->reply);
7906 c->reply = listDup(slave->reply);
7907 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7908 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7909 } else {
7910 /* No way, we need to wait for the next BGSAVE in order to
7911 * register differences */
7912 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7913 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7914 }
7915 } else {
7916 /* Ok we don't have a BGSAVE in progress, let's start one */
7917 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7918 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7919 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7920 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7921 return;
7922 }
7923 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7924 }
7925 c->repldbfd = -1;
7926 c->flags |= REDIS_SLAVE;
7927 c->slaveseldb = 0;
7928 listAddNodeTail(server.slaves,c);
7929 return;
7930 }
7931
7932 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7933 redisClient *slave = privdata;
7934 REDIS_NOTUSED(el);
7935 REDIS_NOTUSED(mask);
7936 char buf[REDIS_IOBUF_LEN];
7937 ssize_t nwritten, buflen;
7938
7939 if (slave->repldboff == 0) {
7940 /* Write the bulk write count before to transfer the DB. In theory here
7941 * we don't know how much room there is in the output buffer of the
7942 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7943 * operations) will never be smaller than the few bytes we need. */
7944 sds bulkcount;
7945
7946 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7947 slave->repldbsize);
7948 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7949 {
7950 sdsfree(bulkcount);
7951 freeClient(slave);
7952 return;
7953 }
7954 sdsfree(bulkcount);
7955 }
7956 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7957 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7958 if (buflen <= 0) {
7959 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7960 (buflen == 0) ? "premature EOF" : strerror(errno));
7961 freeClient(slave);
7962 return;
7963 }
7964 if ((nwritten = write(fd,buf,buflen)) == -1) {
7965 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7966 strerror(errno));
7967 freeClient(slave);
7968 return;
7969 }
7970 slave->repldboff += nwritten;
7971 if (slave->repldboff == slave->repldbsize) {
7972 close(slave->repldbfd);
7973 slave->repldbfd = -1;
7974 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7975 slave->replstate = REDIS_REPL_ONLINE;
7976 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7977 sendReplyToClient, slave) == AE_ERR) {
7978 freeClient(slave);
7979 return;
7980 }
7981 addReplySds(slave,sdsempty());
7982 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7983 }
7984 }
7985
7986 /* This function is called at the end of every backgrond saving.
7987 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7988 * otherwise REDIS_ERR is passed to the function.
7989 *
7990 * The goal of this function is to handle slaves waiting for a successful
7991 * background saving in order to perform non-blocking synchronization. */
7992 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7993 listNode *ln;
7994 int startbgsave = 0;
7995 listIter li;
7996
7997 listRewind(server.slaves,&li);
7998 while((ln = listNext(&li))) {
7999 redisClient *slave = ln->value;
8000
8001 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8002 startbgsave = 1;
8003 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8004 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8005 struct redis_stat buf;
8006
8007 if (bgsaveerr != REDIS_OK) {
8008 freeClient(slave);
8009 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8010 continue;
8011 }
8012 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8013 redis_fstat(slave->repldbfd,&buf) == -1) {
8014 freeClient(slave);
8015 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8016 continue;
8017 }
8018 slave->repldboff = 0;
8019 slave->repldbsize = buf.st_size;
8020 slave->replstate = REDIS_REPL_SEND_BULK;
8021 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8022 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8023 freeClient(slave);
8024 continue;
8025 }
8026 }
8027 }
8028 if (startbgsave) {
8029 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8030 listIter li;
8031
8032 listRewind(server.slaves,&li);
8033 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8034 while((ln = listNext(&li))) {
8035 redisClient *slave = ln->value;
8036
8037 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8038 freeClient(slave);
8039 }
8040 }
8041 }
8042 }
8043
8044 static int syncWithMaster(void) {
8045 char buf[1024], tmpfile[256], authcmd[1024];
8046 long dumpsize;
8047 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8048 int dfd, maxtries = 5;
8049
8050 if (fd == -1) {
8051 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8052 strerror(errno));
8053 return REDIS_ERR;
8054 }
8055
8056 /* AUTH with the master if required. */
8057 if(server.masterauth) {
8058 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8059 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8060 close(fd);
8061 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8062 strerror(errno));
8063 return REDIS_ERR;
8064 }
8065 /* Read the AUTH result. */
8066 if (syncReadLine(fd,buf,1024,3600) == -1) {
8067 close(fd);
8068 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8069 strerror(errno));
8070 return REDIS_ERR;
8071 }
8072 if (buf[0] != '+') {
8073 close(fd);
8074 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8075 return REDIS_ERR;
8076 }
8077 }
8078
8079 /* Issue the SYNC command */
8080 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8081 close(fd);
8082 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8083 strerror(errno));
8084 return REDIS_ERR;
8085 }
8086 /* Read the bulk write count */
8087 if (syncReadLine(fd,buf,1024,3600) == -1) {
8088 close(fd);
8089 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8090 strerror(errno));
8091 return REDIS_ERR;
8092 }
8093 if (buf[0] != '$') {
8094 close(fd);
8095 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8096 return REDIS_ERR;
8097 }
8098 dumpsize = strtol(buf+1,NULL,10);
8099 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8100 /* Read the bulk write data on a temp file */
8101 while(maxtries--) {
8102 snprintf(tmpfile,256,
8103 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8104 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8105 if (dfd != -1) break;
8106 sleep(1);
8107 }
8108 if (dfd == -1) {
8109 close(fd);
8110 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8111 return REDIS_ERR;
8112 }
8113 while(dumpsize) {
8114 int nread, nwritten;
8115
8116 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8117 if (nread == -1) {
8118 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8119 strerror(errno));
8120 close(fd);
8121 close(dfd);
8122 return REDIS_ERR;
8123 }
8124 nwritten = write(dfd,buf,nread);
8125 if (nwritten == -1) {
8126 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8127 close(fd);
8128 close(dfd);
8129 return REDIS_ERR;
8130 }
8131 dumpsize -= nread;
8132 }
8133 close(dfd);
8134 if (rename(tmpfile,server.dbfilename) == -1) {
8135 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8136 unlink(tmpfile);
8137 close(fd);
8138 return REDIS_ERR;
8139 }
8140 emptyDb();
8141 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8142 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8143 close(fd);
8144 return REDIS_ERR;
8145 }
8146 server.master = createClient(fd);
8147 server.master->flags |= REDIS_MASTER;
8148 server.master->authenticated = 1;
8149 server.replstate = REDIS_REPL_CONNECTED;
8150 return REDIS_OK;
8151 }
8152
8153 static void slaveofCommand(redisClient *c) {
8154 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8155 !strcasecmp(c->argv[2]->ptr,"one")) {
8156 if (server.masterhost) {
8157 sdsfree(server.masterhost);
8158 server.masterhost = NULL;
8159 if (server.master) freeClient(server.master);
8160 server.replstate = REDIS_REPL_NONE;
8161 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8162 }
8163 } else {
8164 sdsfree(server.masterhost);
8165 server.masterhost = sdsdup(c->argv[1]->ptr);
8166 server.masterport = atoi(c->argv[2]->ptr);
8167 if (server.master) freeClient(server.master);
8168 server.replstate = REDIS_REPL_CONNECT;
8169 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8170 server.masterhost, server.masterport);
8171 }
8172 addReply(c,shared.ok);
8173 }
8174
8175 /* ============================ Maxmemory directive ======================== */
8176
8177 /* Try to free one object form the pre-allocated objects free list.
8178 * This is useful under low mem conditions as by default we take 1 million
8179 * free objects allocated. On success REDIS_OK is returned, otherwise
8180 * REDIS_ERR. */
8181 static int tryFreeOneObjectFromFreelist(void) {
8182 robj *o;
8183
8184 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8185 if (listLength(server.objfreelist)) {
8186 listNode *head = listFirst(server.objfreelist);
8187 o = listNodeValue(head);
8188 listDelNode(server.objfreelist,head);
8189 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8190 zfree(o);
8191 return REDIS_OK;
8192 } else {
8193 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8194 return REDIS_ERR;
8195 }
8196 }
8197
8198 /* This function gets called when 'maxmemory' is set on the config file to limit
8199 * the max memory used by the server, and we are out of memory.
8200 * This function will try to, in order:
8201 *
8202 * - Free objects from the free list
8203 * - Try to remove keys with an EXPIRE set
8204 *
8205 * It is not possible to free enough memory to reach used-memory < maxmemory
8206 * the server will start refusing commands that will enlarge even more the
8207 * memory usage.
8208 */
8209 static void freeMemoryIfNeeded(void) {
8210 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8211 int j, k, freed = 0;
8212
8213 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8214 for (j = 0; j < server.dbnum; j++) {
8215 int minttl = -1;
8216 robj *minkey = NULL;
8217 struct dictEntry *de;
8218
8219 if (dictSize(server.db[j].expires)) {
8220 freed = 1;
8221 /* From a sample of three keys drop the one nearest to
8222 * the natural expire */
8223 for (k = 0; k < 3; k++) {
8224 time_t t;
8225
8226 de = dictGetRandomKey(server.db[j].expires);
8227 t = (time_t) dictGetEntryVal(de);
8228 if (minttl == -1 || t < minttl) {
8229 minkey = dictGetEntryKey(de);
8230 minttl = t;
8231 }
8232 }
8233 deleteKey(server.db+j,minkey);
8234 }
8235 }
8236 if (!freed) return; /* nothing to free... */
8237 }
8238 }
8239
8240 /* ============================== Append Only file ========================== */
8241
8242 /* Write the append only file buffer on disk.
8243 *
8244 * Since we are required to write the AOF before replying to the client,
8245 * and the only way the client socket can get a write is entering when the
8246 * the event loop, we accumulate all the AOF writes in a memory
8247 * buffer and write it on disk using this function just before entering
8248 * the event loop again. */
8249 static void flushAppendOnlyFile(void) {
8250 time_t now;
8251 ssize_t nwritten;
8252
8253 if (sdslen(server.aofbuf) == 0) return;
8254
8255 /* We want to perform a single write. This should be guaranteed atomic
8256 * at least if the filesystem we are writing is a real physical one.
8257 * While this will save us against the server being killed I don't think
8258 * there is much to do about the whole server stopping for power problems
8259 * or alike */
8260 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8261 if (nwritten != (signed)sdslen(server.aofbuf)) {
8262 /* Ooops, we are in troubles. The best thing to do for now is
8263 * aborting instead of giving the illusion that everything is
8264 * working as expected. */
8265 if (nwritten == -1) {
8266 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8267 } else {
8268 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8269 }
8270 exit(1);
8271 }
8272 sdsfree(server.aofbuf);
8273 server.aofbuf = sdsempty();
8274
8275 /* Fsync if needed */
8276 now = time(NULL);
8277 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8278 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8279 now-server.lastfsync > 1))
8280 {
8281 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8282 * flushing metadata. */
8283 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8284 server.lastfsync = now;
8285 }
8286 }
8287
8288 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8289 int j;
8290 buf = sdscatprintf(buf,"*%d\r\n",argc);
8291 for (j = 0; j < argc; j++) {
8292 robj *o = getDecodedObject(argv[j]);
8293 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8294 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8295 buf = sdscatlen(buf,"\r\n",2);
8296 decrRefCount(o);
8297 }
8298 return buf;
8299 }
8300
8301 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8302 int argc = 3;
8303 long when;
8304 robj *argv[3];
8305
8306 /* Make sure we can use strtol */
8307 seconds = getDecodedObject(seconds);
8308 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8309 decrRefCount(seconds);
8310
8311 argv[0] = createStringObject("EXPIREAT",8);
8312 argv[1] = key;
8313 argv[2] = createObject(REDIS_STRING,
8314 sdscatprintf(sdsempty(),"%ld",when));
8315 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8316 decrRefCount(argv[0]);
8317 decrRefCount(argv[2]);
8318 return buf;
8319 }
8320
8321 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8322 sds buf = sdsempty();
8323 robj *tmpargv[3];
8324
8325 /* The DB this command was targetting is not the same as the last command
8326 * we appendend. To issue a SELECT command is needed. */
8327 if (dictid != server.appendseldb) {
8328 char seldb[64];
8329
8330 snprintf(seldb,sizeof(seldb),"%d",dictid);
8331 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8332 (unsigned long)strlen(seldb),seldb);
8333 server.appendseldb = dictid;
8334 }
8335
8336 if (cmd->proc == expireCommand) {
8337 /* Translate EXPIRE into EXPIREAT */
8338 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8339 } else if (cmd->proc == setexCommand) {
8340 /* Translate SETEX to SET and EXPIREAT */
8341 tmpargv[0] = createStringObject("SET",3);
8342 tmpargv[1] = argv[1];
8343 tmpargv[2] = argv[3];
8344 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8345 decrRefCount(tmpargv[0]);
8346 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8347 } else {
8348 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8349 }
8350
8351 /* Append to the AOF buffer. This will be flushed on disk just before
8352 * of re-entering the event loop, so before the client will get a
8353 * positive reply about the operation performed. */
8354 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8355
8356 /* If a background append only file rewriting is in progress we want to
8357 * accumulate the differences between the child DB and the current one
8358 * in a buffer, so that when the child process will do its work we
8359 * can append the differences to the new append only file. */
8360 if (server.bgrewritechildpid != -1)
8361 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8362
8363 sdsfree(buf);
8364 }
8365
8366 /* In Redis commands are always executed in the context of a client, so in
8367 * order to load the append only file we need to create a fake client. */
8368 static struct redisClient *createFakeClient(void) {
8369 struct redisClient *c = zmalloc(sizeof(*c));
8370
8371 selectDb(c,0);
8372 c->fd = -1;
8373 c->querybuf = sdsempty();
8374 c->argc = 0;
8375 c->argv = NULL;
8376 c->flags = 0;
8377 /* We set the fake client as a slave waiting for the synchronization
8378 * so that Redis will not try to send replies to this client. */
8379 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8380 c->reply = listCreate();
8381 listSetFreeMethod(c->reply,decrRefCount);
8382 listSetDupMethod(c->reply,dupClientReplyValue);
8383 initClientMultiState(c);
8384 return c;
8385 }
8386
8387 static void freeFakeClient(struct redisClient *c) {
8388 sdsfree(c->querybuf);
8389 listRelease(c->reply);
8390 freeClientMultiState(c);
8391 zfree(c);
8392 }
8393
8394 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8395 * error (the append only file is zero-length) REDIS_ERR is returned. On
8396 * fatal error an error message is logged and the program exists. */
8397 int loadAppendOnlyFile(char *filename) {
8398 struct redisClient *fakeClient;
8399 FILE *fp = fopen(filename,"r");
8400 struct redis_stat sb;
8401 unsigned long long loadedkeys = 0;
8402 int appendonly = server.appendonly;
8403
8404 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8405 return REDIS_ERR;
8406
8407 if (fp == NULL) {
8408 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8409 exit(1);
8410 }
8411
8412 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8413 * to the same file we're about to read. */
8414 server.appendonly = 0;
8415
8416 fakeClient = createFakeClient();
8417 while(1) {
8418 int argc, j;
8419 unsigned long len;
8420 robj **argv;
8421 char buf[128];
8422 sds argsds;
8423 struct redisCommand *cmd;
8424
8425 if (fgets(buf,sizeof(buf),fp) == NULL) {
8426 if (feof(fp))
8427 break;
8428 else
8429 goto readerr;
8430 }
8431 if (buf[0] != '*') goto fmterr;
8432 argc = atoi(buf+1);
8433 argv = zmalloc(sizeof(robj*)*argc);
8434 for (j = 0; j < argc; j++) {
8435 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8436 if (buf[0] != '$') goto fmterr;
8437 len = strtol(buf+1,NULL,10);
8438 argsds = sdsnewlen(NULL,len);
8439 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8440 argv[j] = createObject(REDIS_STRING,argsds);
8441 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8442 }
8443
8444 /* Command lookup */
8445 cmd = lookupCommand(argv[0]->ptr);
8446 if (!cmd) {
8447 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8448 exit(1);
8449 }
8450 /* Try object encoding */
8451 if (cmd->flags & REDIS_CMD_BULK)
8452 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8453 /* Run the command in the context of a fake client */
8454 fakeClient->argc = argc;
8455 fakeClient->argv = argv;
8456 cmd->proc(fakeClient);
8457 /* Discard the reply objects list from the fake client */
8458 while(listLength(fakeClient->reply))
8459 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8460 /* Clean up, ready for the next command */
8461 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8462 zfree(argv);
8463 /* Handle swapping while loading big datasets when VM is on */
8464 loadedkeys++;
8465 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8466 while (zmalloc_used_memory() > server.vm_max_memory) {
8467 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8468 }
8469 }
8470 }
8471
8472 /* This point can only be reached when EOF is reached without errors.
8473 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8474 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8475
8476 fclose(fp);
8477 freeFakeClient(fakeClient);
8478 server.appendonly = appendonly;
8479 return REDIS_OK;
8480
8481 readerr:
8482 if (feof(fp)) {
8483 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8484 } else {
8485 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8486 }
8487 exit(1);
8488 fmterr:
8489 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8490 exit(1);
8491 }
8492
8493 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8494 static int fwriteBulkObject(FILE *fp, robj *obj) {
8495 char buf[128];
8496 int decrrc = 0;
8497
8498 /* Avoid the incr/decr ref count business if possible to help
8499 * copy-on-write (we are often in a child process when this function
8500 * is called).
8501 * Also makes sure that key objects don't get incrRefCount-ed when VM
8502 * is enabled */
8503 if (obj->encoding != REDIS_ENCODING_RAW) {
8504 obj = getDecodedObject(obj);
8505 decrrc = 1;
8506 }
8507 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8508 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8509 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8510 goto err;
8511 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8512 if (decrrc) decrRefCount(obj);
8513 return 1;
8514 err:
8515 if (decrrc) decrRefCount(obj);
8516 return 0;
8517 }
8518
/* Write the binary-safe string 's' of 'len' bytes into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n.
 *
 * Returns 1 on success, 0 if any of the writes fails. An empty string is
 * written as "$0\r\n\r\n". */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* %lu: the conversion specifier must match the unsigned argument */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8530
/* Write the double 'd', formatted with "%.17g", into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already includes the trailing CRLF, which is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8541
/* Write the decimal representation of the long 'l' into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already includes the trailing CRLF, which is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8552
8553 /* Write a sequence of commands able to fully rebuild the dataset into
8554 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8555 static int rewriteAppendOnlyFile(char *filename) {
8556 dictIterator *di = NULL;
8557 dictEntry *de;
8558 FILE *fp;
8559 char tmpfile[256];
8560 int j;
8561 time_t now = time(NULL);
8562
8563 /* Note that we have to use a different temp name here compared to the
8564 * one used by rewriteAppendOnlyFileBackground() function. */
8565 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8566 fp = fopen(tmpfile,"w");
8567 if (!fp) {
8568 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8569 return REDIS_ERR;
8570 }
8571 for (j = 0; j < server.dbnum; j++) {
8572 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8573 redisDb *db = server.db+j;
8574 dict *d = db->dict;
8575 if (dictSize(d) == 0) continue;
8576 di = dictGetIterator(d);
8577 if (!di) {
8578 fclose(fp);
8579 return REDIS_ERR;
8580 }
8581
8582 /* SELECT the new DB */
8583 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8584 if (fwriteBulkLong(fp,j) == 0) goto werr;
8585
8586 /* Iterate this DB writing every entry */
8587 while((de = dictNext(di)) != NULL) {
8588 robj *key, *o;
8589 time_t expiretime;
8590 int swapped;
8591
8592 key = dictGetEntryKey(de);
8593 /* If the value for this key is swapped, load a preview in memory.
8594 * We use a "swapped" flag to remember if we need to free the
8595 * value object instead to just increment the ref count anyway
8596 * in order to avoid copy-on-write of pages if we are forked() */
8597 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8598 key->storage == REDIS_VM_SWAPPING) {
8599 o = dictGetEntryVal(de);
8600 swapped = 0;
8601 } else {
8602 o = vmPreviewObject(key);
8603 swapped = 1;
8604 }
8605 expiretime = getExpire(db,key);
8606
8607 /* Save the key and associated value */
8608 if (o->type == REDIS_STRING) {
8609 /* Emit a SET command */
8610 char cmd[]="*3\r\n$3\r\nSET\r\n";
8611 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8612 /* Key and value */
8613 if (fwriteBulkObject(fp,key) == 0) goto werr;
8614 if (fwriteBulkObject(fp,o) == 0) goto werr;
8615 } else if (o->type == REDIS_LIST) {
8616 /* Emit the RPUSHes needed to rebuild the list */
8617 list *list = o->ptr;
8618 listNode *ln;
8619 listIter li;
8620
8621 listRewind(list,&li);
8622 while((ln = listNext(&li))) {
8623 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8624 robj *eleobj = listNodeValue(ln);
8625
8626 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8627 if (fwriteBulkObject(fp,key) == 0) goto werr;
8628 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8629 }
8630 } else if (o->type == REDIS_SET) {
8631 /* Emit the SADDs needed to rebuild the set */
8632 dict *set = o->ptr;
8633 dictIterator *di = dictGetIterator(set);
8634 dictEntry *de;
8635
8636 while((de = dictNext(di)) != NULL) {
8637 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8638 robj *eleobj = dictGetEntryKey(de);
8639
8640 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8641 if (fwriteBulkObject(fp,key) == 0) goto werr;
8642 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8643 }
8644 dictReleaseIterator(di);
8645 } else if (o->type == REDIS_ZSET) {
8646 /* Emit the ZADDs needed to rebuild the sorted set */
8647 zset *zs = o->ptr;
8648 dictIterator *di = dictGetIterator(zs->dict);
8649 dictEntry *de;
8650
8651 while((de = dictNext(di)) != NULL) {
8652 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8653 robj *eleobj = dictGetEntryKey(de);
8654 double *score = dictGetEntryVal(de);
8655
8656 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8657 if (fwriteBulkObject(fp,key) == 0) goto werr;
8658 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8659 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8660 }
8661 dictReleaseIterator(di);
8662 } else if (o->type == REDIS_HASH) {
8663 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8664
8665 /* Emit the HSETs needed to rebuild the hash */
8666 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8667 unsigned char *p = zipmapRewind(o->ptr);
8668 unsigned char *field, *val;
8669 unsigned int flen, vlen;
8670
8671 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8672 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8673 if (fwriteBulkObject(fp,key) == 0) goto werr;
8674 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8675 return -1;
8676 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8677 return -1;
8678 }
8679 } else {
8680 dictIterator *di = dictGetIterator(o->ptr);
8681 dictEntry *de;
8682
8683 while((de = dictNext(di)) != NULL) {
8684 robj *field = dictGetEntryKey(de);
8685 robj *val = dictGetEntryVal(de);
8686
8687 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8688 if (fwriteBulkObject(fp,key) == 0) goto werr;
8689 if (fwriteBulkObject(fp,field) == -1) return -1;
8690 if (fwriteBulkObject(fp,val) == -1) return -1;
8691 }
8692 dictReleaseIterator(di);
8693 }
8694 } else {
8695 redisPanic("Unknown object type");
8696 }
8697 /* Save the expire time */
8698 if (expiretime != -1) {
8699 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8700 /* If this key is already expired skip it */
8701 if (expiretime < now) continue;
8702 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8703 if (fwriteBulkObject(fp,key) == 0) goto werr;
8704 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8705 }
8706 if (swapped) decrRefCount(o);
8707 }
8708 dictReleaseIterator(di);
8709 }
8710
8711 /* Make sure data will not remain on the OS's output buffers */
8712 fflush(fp);
8713 fsync(fileno(fp));
8714 fclose(fp);
8715
8716 /* Use RENAME to make sure the DB file is changed atomically only
8717 * if the generate DB file is ok. */
8718 if (rename(tmpfile,filename) == -1) {
8719 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8720 unlink(tmpfile);
8721 return REDIS_ERR;
8722 }
8723 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8724 return REDIS_OK;
8725
8726 werr:
8727 fclose(fp);
8728 unlink(tmpfile);
8729 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8730 if (di) dictReleaseIterator(di);
8731 return REDIS_ERR;
8732 }
8733
8734 /* This is how rewriting of the append only file in background works:
8735 *
8736 * 1) The user calls BGREWRITEAOF
8737 * 2) Redis calls this function, that forks():
8738 * 2a) the child rewrite the append only file in a temp file.
8739 * 2b) the parent accumulates differences in server.bgrewritebuf.
8740 * 3) When the child finished '2a' exists.
8741 * 4) The parent will trap the exit code, if it's OK, will append the
8742 * data accumulated into server.bgrewritebuf into the temp file, and
8743 * finally will rename(2) the temp file in the actual file name.
8744 * The the new file is reopened as the new append only file. Profit!
8745 */
8746 static int rewriteAppendOnlyFileBackground(void) {
8747 pid_t childpid;
8748
8749 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8750 if (server.vm_enabled) waitEmptyIOJobsQueue();
8751 if ((childpid = fork()) == 0) {
8752 /* Child */
8753 char tmpfile[256];
8754
8755 if (server.vm_enabled) vmReopenSwapFile();
8756 close(server.fd);
8757 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8758 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8759 _exit(0);
8760 } else {
8761 _exit(1);
8762 }
8763 } else {
8764 /* Parent */
8765 if (childpid == -1) {
8766 redisLog(REDIS_WARNING,
8767 "Can't rewrite append only file in background: fork: %s",
8768 strerror(errno));
8769 return REDIS_ERR;
8770 }
8771 redisLog(REDIS_NOTICE,
8772 "Background append only file rewriting started by pid %d",childpid);
8773 server.bgrewritechildpid = childpid;
8774 updateDictResizePolicy();
8775 /* We set appendseldb to -1 in order to force the next call to the
8776 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8777 * accumulated by the parent into server.bgrewritebuf will start
8778 * with a SELECT statement and it will be safe to merge. */
8779 server.appendseldb = -1;
8780 return REDIS_OK;
8781 }
8782 return REDIS_OK; /* unreached */
8783 }
8784
8785 static void bgrewriteaofCommand(redisClient *c) {
8786 if (server.bgrewritechildpid != -1) {
8787 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8788 return;
8789 }
8790 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8791 char *status = "+Background append only file rewriting started\r\n";
8792 addReplySds(c,sdsnew(status));
8793 } else {
8794 addReply(c,shared.err);
8795 }
8796 }
8797
/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof" from the current
 * directory. The result of unlink() is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8804
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8825
8826 /* Called when the user switches from "appendonly yes" to "appendonly no"
8827 * at runtime using the CONFIG command. */
8828 static void stopAppendOnly(void) {
8829 flushAppendOnlyFile();
8830 fsync(server.appendfd);
8831 close(server.appendfd);
8832
8833 server.appendfd = -1;
8834 server.appendseldb = -1;
8835 server.appendonly = 0;
8836 /* rewrite operation in progress? kill it, wait child exit */
8837 if (server.bgsavechildpid != -1) {
8838 int statloc;
8839
8840 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8841 wait3(&statloc,0,NULL);
8842 /* reset the buffer accumulating changes while the child saves */
8843 sdsfree(server.bgrewritebuf);
8844 server.bgrewritebuf = sdsempty();
8845 server.bgsavechildpid = -1;
8846 }
8847 }
8848
8849 /* Called when the user switches from "appendonly no" to "appendonly yes"
8850 * at runtime using the CONFIG command. */
8851 static int startAppendOnly(void) {
8852 server.appendonly = 1;
8853 server.lastfsync = time(NULL);
8854 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8855 if (server.appendfd == -1) {
8856 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8857 return REDIS_ERR;
8858 }
8859 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8860 server.appendonly = 0;
8861 close(server.appendfd);
8862 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8863 return REDIS_ERR;
8864 }
8865 return REDIS_OK;
8866 }
8867
8868 /* =================== Virtual Memory - Blocking Side ====================== */
8869
8870 static void vmInit(void) {
8871 off_t totsize;
8872 int pipefds[2];
8873 size_t stacksize;
8874 struct flock fl;
8875
8876 if (server.vm_max_threads != 0)
8877 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8878
8879 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8880 /* Try to open the old swap file, otherwise create it */
8881 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8882 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8883 }
8884 if (server.vm_fp == NULL) {
8885 redisLog(REDIS_WARNING,
8886 "Can't open the swap file: %s. Exiting.",
8887 strerror(errno));
8888 exit(1);
8889 }
8890 server.vm_fd = fileno(server.vm_fp);
8891 /* Lock the swap file for writing, this is useful in order to avoid
8892 * another instance to use the same swap file for a config error. */
8893 fl.l_type = F_WRLCK;
8894 fl.l_whence = SEEK_SET;
8895 fl.l_start = fl.l_len = 0;
8896 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8897 redisLog(REDIS_WARNING,
8898 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8899 exit(1);
8900 }
8901 /* Initialize */
8902 server.vm_next_page = 0;
8903 server.vm_near_pages = 0;
8904 server.vm_stats_used_pages = 0;
8905 server.vm_stats_swapped_objects = 0;
8906 server.vm_stats_swapouts = 0;
8907 server.vm_stats_swapins = 0;
8908 totsize = server.vm_pages*server.vm_page_size;
8909 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8910 if (ftruncate(server.vm_fd,totsize) == -1) {
8911 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8912 strerror(errno));
8913 exit(1);
8914 } else {
8915 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8916 }
8917 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8918 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8919 (long long) (server.vm_pages+7)/8, server.vm_pages);
8920 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8921
8922 /* Initialize threaded I/O (used by Virtual Memory) */
8923 server.io_newjobs = listCreate();
8924 server.io_processing = listCreate();
8925 server.io_processed = listCreate();
8926 server.io_ready_clients = listCreate();
8927 pthread_mutex_init(&server.io_mutex,NULL);
8928 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8929 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8930 server.io_active_threads = 0;
8931 if (pipe(pipefds) == -1) {
8932 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8933 ,strerror(errno));
8934 exit(1);
8935 }
8936 server.io_ready_pipe_read = pipefds[0];
8937 server.io_ready_pipe_write = pipefds[1];
8938 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8939 /* LZF requires a lot of stack */
8940 pthread_attr_init(&server.io_threads_attr);
8941 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8942 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8943 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8944 /* Listen for events in the threaded I/O pipe */
8945 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8946 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8947 oom("creating file event");
8948 }
8949
8950 /* Mark the page as used */
8951 static void vmMarkPageUsed(off_t page) {
8952 off_t byte = page/8;
8953 int bit = page&7;
8954 redisAssert(vmFreePage(page) == 1);
8955 server.vm_bitmap[byte] |= 1<<bit;
8956 }
8957
8958 /* Mark N contiguous pages as used, with 'page' being the first. */
8959 static void vmMarkPagesUsed(off_t page, off_t count) {
8960 off_t j;
8961
8962 for (j = 0; j < count; j++)
8963 vmMarkPageUsed(page+j);
8964 server.vm_stats_used_pages += count;
8965 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8966 (long long)count, (long long)page);
8967 }
8968
8969 /* Mark the page as free */
8970 static void vmMarkPageFree(off_t page) {
8971 off_t byte = page/8;
8972 int bit = page&7;
8973 redisAssert(vmFreePage(page) == 0);
8974 server.vm_bitmap[byte] &= ~(1<<bit);
8975 }
8976
8977 /* Mark N contiguous pages as free, with 'page' being the first. */
8978 static void vmMarkPagesFree(off_t page, off_t count) {
8979 off_t j;
8980
8981 for (j = 0; j < count; j++)
8982 vmMarkPageFree(page+j);
8983 server.vm_stats_used_pages -= count;
8984 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8985 (long long)count, (long long)page);
8986 }
8987
8988 /* Test if the page is free */
8989 static int vmFreePage(off_t page) {
8990 off_t byte = page/8;
8991 int bit = page&7;
8992 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8993 }
8994
8995 /* Find N contiguous free pages storing the first page of the cluster in *first.
8996 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8997 * REDIS_ERR is returned.
8998 *
8999 * This function uses a simple algorithm: we try to allocate
9000 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9001 * again from the start of the swap file searching for free spaces.
9002 *
9003 * If it looks pretty clear that there are no free pages near our offset
9004 * we try to find less populated places doing a forward jump of
9005 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9006 * without hurry, and then we jump again and so forth...
9007 *
9008 * This function can be improved using a free list to avoid to guess
9009 * too much, since we could collect data about freed pages.
9010 *
9011 * note: I implemented this function just after watching an episode of
9012 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9013 */
9014 static int vmFindContiguousPages(off_t *first, off_t n) {
9015 off_t base, offset = 0, since_jump = 0, numfree = 0;
9016
9017 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9018 server.vm_near_pages = 0;
9019 server.vm_next_page = 0;
9020 }
9021 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9022 base = server.vm_next_page;
9023
9024 while(offset < server.vm_pages) {
9025 off_t this = base+offset;
9026
9027 /* If we overflow, restart from page zero */
9028 if (this >= server.vm_pages) {
9029 this -= server.vm_pages;
9030 if (this == 0) {
9031 /* Just overflowed, what we found on tail is no longer
9032 * interesting, as it's no longer contiguous. */
9033 numfree = 0;
9034 }
9035 }
9036 if (vmFreePage(this)) {
9037 /* This is a free page */
9038 numfree++;
9039 /* Already got N free pages? Return to the caller, with success */
9040 if (numfree == n) {
9041 *first = this-(n-1);
9042 server.vm_next_page = this+1;
9043 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9044 return REDIS_OK;
9045 }
9046 } else {
9047 /* The current one is not a free page */
9048 numfree = 0;
9049 }
9050
9051 /* Fast-forward if the current page is not free and we already
9052 * searched enough near this place. */
9053 since_jump++;
9054 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9055 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9056 since_jump = 0;
9057 /* Note that even if we rewind after the jump, we are don't need
9058 * to make sure numfree is set to zero as we only jump *if* it
9059 * is set to zero. */
9060 } else {
9061 /* Otherwise just check the next page */
9062 offset++;
9063 }
9064 }
9065 return REDIS_ERR;
9066 }
9067
9068 /* Write the specified object at the specified page of the swap file */
9069 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9070 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9071 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9072 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9073 redisLog(REDIS_WARNING,
9074 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9075 strerror(errno));
9076 return REDIS_ERR;
9077 }
9078 rdbSaveObject(server.vm_fp,o);
9079 fflush(server.vm_fp);
9080 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9081 return REDIS_OK;
9082 }
9083
9084 /* Swap the 'val' object relative to 'key' into disk. Store all the information
9085 * needed to later retrieve the object into the key object.
9086 * If we can't find enough contiguous empty pages to swap the object on disk
9087 * REDIS_ERR is returned. */
9088 static int vmSwapObjectBlocking(robj *key, robj *val) {
9089 off_t pages = rdbSavedObjectPages(val,NULL);
9090 off_t page;
9091
9092 assert(key->storage == REDIS_VM_MEMORY);
9093 assert(key->refcount == 1);
9094 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
9095 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
9096 key->vm.page = page;
9097 key->vm.usedpages = pages;
9098 key->storage = REDIS_VM_SWAPPED;
9099 key->vtype = val->type;
9100 decrRefCount(val); /* Deallocate the object from memory. */
9101 vmMarkPagesUsed(page,pages);
9102 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9103 (unsigned char*) key->ptr,
9104 (unsigned long long) page, (unsigned long long) pages);
9105 server.vm_stats_swapped_objects++;
9106 server.vm_stats_swapouts++;
9107 return REDIS_OK;
9108 }
9109
9110 static robj *vmReadObjectFromSwap(off_t page, int type) {
9111 robj *o;
9112
9113 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9114 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9115 redisLog(REDIS_WARNING,
9116 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9117 strerror(errno));
9118 _exit(1);
9119 }
9120 o = rdbLoadObject(type,server.vm_fp);
9121 if (o == NULL) {
9122 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9123 _exit(1);
9124 }
9125 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9126 return o;
9127 }
9128
9129 /* Load the value object relative to the 'key' object from swap to memory.
9130 * The newly allocated object is returned.
9131 *
9132 * If preview is true the unserialized object is returned to the caller but
9133 * no changes are made to the key object, nor the pages are marked as freed */
9134 static robj *vmGenericLoadObject(robj *key, int preview) {
9135 robj *val;
9136
9137 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
9138 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
9139 if (!preview) {
9140 key->storage = REDIS_VM_MEMORY;
9141 key->vm.atime = server.unixtime;
9142 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9143 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9144 (unsigned char*) key->ptr);
9145 server.vm_stats_swapped_objects--;
9146 } else {
9147 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9148 (unsigned char*) key->ptr);
9149 }
9150 server.vm_stats_swapins++;
9151 return val;
9152 }
9153
9154 /* Plain object loading, from swap to memory */
9155 static robj *vmLoadObject(robj *key) {
9156 /* If we are loading the object in background, stop it, we
9157 * need to load this object synchronously ASAP. */
9158 if (key->storage == REDIS_VM_LOADING)
9159 vmCancelThreadedIOJob(key);
9160 return vmGenericLoadObject(key,0);
9161 }
9162
9163 /* Just load the value on disk, without to modify the key.
9164 * This is useful when we want to perform some operation on the value
9165 * without to really bring it from swap to memory, like while saving the
9166 * dataset or rewriting the append only log. */
9167 static robj *vmPreviewObject(robj *key) {
9168 return vmGenericLoadObject(key,1);
9169 }
9170
9171 /* How a good candidate is this object for swapping?
9172 * The better candidate it is, the greater the returned value.
9173 *
9174 * Currently we try to perform a fast estimation of the object size in
9175 * memory, and combine it with aging informations.
9176 *
9177 * Basically swappability = idle-time * log(estimated size)
9178 *
9179 * Bigger objects are preferred over smaller objects, but not
9180 * proportionally, this is why we use the logarithm. This algorithm is
9181 * just a first try and will probably be tuned later. */
9182 static double computeObjectSwappability(robj *o) {
9183 time_t age = server.unixtime - o->vm.atime;
9184 long asize = 0;
9185 list *l;
9186 dict *d;
9187 struct dictEntry *de;
9188 int z;
9189
9190 if (age <= 0) return 0;
9191 switch(o->type) {
9192 case REDIS_STRING:
9193 if (o->encoding != REDIS_ENCODING_RAW) {
9194 asize = sizeof(*o);
9195 } else {
9196 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9197 }
9198 break;
9199 case REDIS_LIST:
9200 l = o->ptr;
9201 listNode *ln = listFirst(l);
9202
9203 asize = sizeof(list);
9204 if (ln) {
9205 robj *ele = ln->value;
9206 long elesize;
9207
9208 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9209 (sizeof(*o)+sdslen(ele->ptr)) :
9210 sizeof(*o);
9211 asize += (sizeof(listNode)+elesize)*listLength(l);
9212 }
9213 break;
9214 case REDIS_SET:
9215 case REDIS_ZSET:
9216 z = (o->type == REDIS_ZSET);
9217 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9218
9219 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9220 if (z) asize += sizeof(zset)-sizeof(dict);
9221 if (dictSize(d)) {
9222 long elesize;
9223 robj *ele;
9224
9225 de = dictGetRandomKey(d);
9226 ele = dictGetEntryKey(de);
9227 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9228 (sizeof(*o)+sdslen(ele->ptr)) :
9229 sizeof(*o);
9230 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9231 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9232 }
9233 break;
9234 case REDIS_HASH:
9235 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9236 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9237 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9238 unsigned int klen, vlen;
9239 unsigned char *key, *val;
9240
9241 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9242 klen = 0;
9243 vlen = 0;
9244 }
9245 asize = len*(klen+vlen+3);
9246 } else if (o->encoding == REDIS_ENCODING_HT) {
9247 d = o->ptr;
9248 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9249 if (dictSize(d)) {
9250 long elesize;
9251 robj *ele;
9252
9253 de = dictGetRandomKey(d);
9254 ele = dictGetEntryKey(de);
9255 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9256 (sizeof(*o)+sdslen(ele->ptr)) :
9257 sizeof(*o);
9258 ele = dictGetEntryVal(de);
9259 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9260 (sizeof(*o)+sdslen(ele->ptr)) :
9261 sizeof(*o);
9262 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9263 }
9264 }
9265 break;
9266 }
9267 return (double)age*log(1+asize);
9268 }
9269
9270 /* Try to swap an object that's a good candidate for swapping.
9271 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9272 * to swap any object at all.
9273 *
9274 * If 'usethreaded' is true, Redis will try to swap the object in background
9275 * using I/O threads. */
9276 static int vmSwapOneObject(int usethreads) {
9277 int j, i;
9278 struct dictEntry *best = NULL;
9279 double best_swappability = 0;
9280 redisDb *best_db = NULL;
9281 robj *key, *val;
9282
9283 for (j = 0; j < server.dbnum; j++) {
9284 redisDb *db = server.db+j;
9285 /* Why maxtries is set to 100?
9286 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9287 * are swappable objects */
9288 int maxtries = 100;
9289
9290 if (dictSize(db->dict) == 0) continue;
9291 for (i = 0; i < 5; i++) {
9292 dictEntry *de;
9293 double swappability;
9294
9295 if (maxtries) maxtries--;
9296 de = dictGetRandomKey(db->dict);
9297 key = dictGetEntryKey(de);
9298 val = dictGetEntryVal(de);
9299 /* Only swap objects that are currently in memory.
9300 *
9301 * Also don't swap shared objects if threaded VM is on, as we
9302 * try to ensure that the main thread does not touch the
9303 * object while the I/O thread is using it, but we can't
9304 * control other keys without adding additional mutex. */
9305 if (key->storage != REDIS_VM_MEMORY ||
9306 (server.vm_max_threads != 0 && val->refcount != 1)) {
9307 if (maxtries) i--; /* don't count this try */
9308 continue;
9309 }
9310 swappability = computeObjectSwappability(val);
9311 if (!best || swappability > best_swappability) {
9312 best = de;
9313 best_swappability = swappability;
9314 best_db = db;
9315 }
9316 }
9317 }
9318 if (best == NULL) return REDIS_ERR;
9319 key = dictGetEntryKey(best);
9320 val = dictGetEntryVal(best);
9321
9322 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9323 key->ptr, best_swappability);
9324
9325 /* Unshare the key if needed */
9326 if (key->refcount > 1) {
9327 robj *newkey = dupStringObject(key);
9328 decrRefCount(key);
9329 key = dictGetEntryKey(best) = newkey;
9330 }
9331 /* Swap it */
9332 if (usethreads) {
9333 vmSwapObjectThreaded(key,val,best_db);
9334 return REDIS_OK;
9335 } else {
9336 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9337 dictGetEntryVal(best) = NULL;
9338 return REDIS_OK;
9339 } else {
9340 return REDIS_ERR;
9341 }
9342 }
9343 }
9344
/* Swap one object out synchronously. Returns what vmSwapOneObject(0)
 * returns. Declared with (void) so that calls with arguments are rejected
 * by the compiler. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9348
/* Swap one object out using the threaded path. Returns what
 * vmSwapOneObject(1) returns. Declared with (void) so that calls with
 * arguments are rejected by the compiler. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9352
9353 /* Return true if it's safe to swap out objects in a given moment.
9354 * Basically we don't want to swap objects out while there is a BGSAVE
9355 * or a BGAEOREWRITE running in backgroud. */
9356 static int vmCanSwapOut(void) {
9357 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9358 }
9359
9360 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9361 * and was deleted. Otherwise 0 is returned. */
9362 static int deleteIfSwapped(redisDb *db, robj *key) {
9363 dictEntry *de;
9364 robj *foundkey;
9365
9366 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9367 foundkey = dictGetEntryKey(de);
9368 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9369 deleteKey(db,key);
9370 return 1;
9371 }
9372
9373 /* =================== Virtual Memory - Threaded I/O ======================= */
9374
9375 static void freeIOJob(iojob *j) {
9376 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9377 j->type == REDIS_IOJOB_DO_SWAP ||
9378 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9379 decrRefCount(j->val);
9380 /* We don't decrRefCount the j->key field as we did't incremented
9381 * the count creating IO Jobs. This is because the key field here is
9382 * just used as an indentifier and if a key is removed the Job should
9383 * never be touched again. */
9384 zfree(j);
9385 }
9386
9387 /* Every time a thread finished a Job, it writes a byte into the write side
9388 * of an unix pipe in order to "awake" the main thread, and this function
9389 * is called. */
9390 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9391 int mask)
9392 {
9393 char buf[1];
9394 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9395 REDIS_NOTUSED(el);
9396 REDIS_NOTUSED(mask);
9397 REDIS_NOTUSED(privdata);
9398
9399 /* For every byte we read in the read side of the pipe, there is one
9400 * I/O job completed to process. */
9401 while((retval = read(fd,buf,1)) == 1) {
9402 iojob *j;
9403 listNode *ln;
9404 robj *key;
9405 struct dictEntry *de;
9406
9407 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9408
9409 /* Get the processed element (the oldest one) */
9410 lockThreadedIO();
9411 assert(listLength(server.io_processed) != 0);
9412 if (toprocess == -1) {
9413 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9414 if (toprocess <= 0) toprocess = 1;
9415 }
9416 ln = listFirst(server.io_processed);
9417 j = ln->value;
9418 listDelNode(server.io_processed,ln);
9419 unlockThreadedIO();
9420 /* If this job is marked as canceled, just ignore it */
9421 if (j->canceled) {
9422 freeIOJob(j);
9423 continue;
9424 }
9425 /* Post process it in the main thread, as there are things we
9426 * can do just here to avoid race conditions and/or invasive locks */
9427 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9428 de = dictFind(j->db->dict,j->key);
9429 assert(de != NULL);
9430 key = dictGetEntryKey(de);
9431 if (j->type == REDIS_IOJOB_LOAD) {
9432 redisDb *db;
9433
9434 /* Key loaded, bring it at home */
9435 key->storage = REDIS_VM_MEMORY;
9436 key->vm.atime = server.unixtime;
9437 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9438 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9439 (unsigned char*) key->ptr);
9440 server.vm_stats_swapped_objects--;
9441 server.vm_stats_swapins++;
9442 dictGetEntryVal(de) = j->val;
9443 incrRefCount(j->val);
9444 db = j->db;
9445 freeIOJob(j);
9446 /* Handle clients waiting for this key to be loaded. */
9447 handleClientsBlockedOnSwappedKey(db,key);
9448 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9449 /* Now we know the amount of pages required to swap this object.
9450 * Let's find some space for it, and queue this task again
9451 * rebranded as REDIS_IOJOB_DO_SWAP. */
9452 if (!vmCanSwapOut() ||
9453 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9454 {
9455 /* Ooops... no space or we can't swap as there is
9456 * a fork()ed Redis trying to save stuff on disk. */
9457 freeIOJob(j);
9458 key->storage = REDIS_VM_MEMORY; /* undo operation */
9459 } else {
9460 /* Note that we need to mark this pages as used now,
9461 * if the job will be canceled, we'll mark them as freed
9462 * again. */
9463 vmMarkPagesUsed(j->page,j->pages);
9464 j->type = REDIS_IOJOB_DO_SWAP;
9465 lockThreadedIO();
9466 queueIOJob(j);
9467 unlockThreadedIO();
9468 }
9469 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9470 robj *val;
9471
9472 /* Key swapped. We can finally free some memory. */
9473 if (key->storage != REDIS_VM_SWAPPING) {
9474 printf("key->storage: %d\n",key->storage);
9475 printf("key->name: %s\n",(char*)key->ptr);
9476 printf("key->refcount: %d\n",key->refcount);
9477 printf("val: %p\n",(void*)j->val);
9478 printf("val->type: %d\n",j->val->type);
9479 printf("val->ptr: %s\n",(char*)j->val->ptr);
9480 }
9481 redisAssert(key->storage == REDIS_VM_SWAPPING);
9482 val = dictGetEntryVal(de);
9483 key->vm.page = j->page;
9484 key->vm.usedpages = j->pages;
9485 key->storage = REDIS_VM_SWAPPED;
9486 key->vtype = j->val->type;
9487 decrRefCount(val); /* Deallocate the object from memory. */
9488 dictGetEntryVal(de) = NULL;
9489 redisLog(REDIS_DEBUG,
9490 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9491 (unsigned char*) key->ptr,
9492 (unsigned long long) j->page, (unsigned long long) j->pages);
9493 server.vm_stats_swapped_objects++;
9494 server.vm_stats_swapouts++;
9495 freeIOJob(j);
9496 /* Put a few more swap requests in queue if we are still
9497 * out of memory */
9498 if (trytoswap && vmCanSwapOut() &&
9499 zmalloc_used_memory() > server.vm_max_memory)
9500 {
9501 int more = 1;
9502 while(more) {
9503 lockThreadedIO();
9504 more = listLength(server.io_newjobs) <
9505 (unsigned) server.vm_max_threads;
9506 unlockThreadedIO();
9507 /* Don't waste CPU time if swappable objects are rare. */
9508 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9509 trytoswap = 0;
9510 break;
9511 }
9512 }
9513 }
9514 }
9515 processed++;
9516 if (processed == toprocess) return;
9517 }
9518 if (retval < 0 && errno != EAGAIN) {
9519 redisLog(REDIS_WARNING,
9520 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9521 strerror(errno));
9522 }
9523 }
9524
9525 static void lockThreadedIO(void) {
9526 pthread_mutex_lock(&server.io_mutex);
9527 }
9528
9529 static void unlockThreadedIO(void) {
9530 pthread_mutex_unlock(&server.io_mutex);
9531 }
9532
9533 /* Remove the specified object from the threaded I/O queue if still not
9534 * processed, otherwise make sure to flag it as canceled. */
9535 static void vmCancelThreadedIOJob(robj *o) {
9536 list *lists[3] = {
9537 server.io_newjobs, /* 0 */
9538 server.io_processing, /* 1 */
9539 server.io_processed /* 2 */
9540 };
9541 int i;
9542
9543 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9544 again:
9545 lockThreadedIO();
9546 /* Search for a matching key in one of the queues */
9547 for (i = 0; i < 3; i++) {
9548 listNode *ln;
9549 listIter li;
9550
9551 listRewind(lists[i],&li);
9552 while ((ln = listNext(&li)) != NULL) {
9553 iojob *job = ln->value;
9554
9555 if (job->canceled) continue; /* Skip this, already canceled. */
9556 if (job->key == o) {
9557 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9558 (void*)job, (char*)o->ptr, job->type, i);
9559 /* Mark the pages as free since the swap didn't happened
9560 * or happened but is now discarded. */
9561 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9562 vmMarkPagesFree(job->page,job->pages);
9563 /* Cancel the job. It depends on the list the job is
9564 * living in. */
9565 switch(i) {
9566 case 0: /* io_newjobs */
9567 /* If the job was yet not processed the best thing to do
9568 * is to remove it from the queue at all */
9569 freeIOJob(job);
9570 listDelNode(lists[i],ln);
9571 break;
9572 case 1: /* io_processing */
9573 /* Oh Shi- the thread is messing with the Job:
9574 *
9575 * Probably it's accessing the object if this is a
9576 * PREPARE_SWAP or DO_SWAP job.
9577 * If it's a LOAD job it may be reading from disk and
9578 * if we don't wait for the job to terminate before to
9579 * cancel it, maybe in a few microseconds data can be
9580 * corrupted in this pages. So the short story is:
9581 *
9582 * Better to wait for the job to move into the
9583 * next queue (processed)... */
9584
9585 /* We try again and again until the job is completed. */
9586 unlockThreadedIO();
9587 /* But let's wait some time for the I/O thread
9588 * to finish with this job. After all this condition
9589 * should be very rare. */
9590 usleep(1);
9591 goto again;
9592 case 2: /* io_processed */
9593 /* The job was already processed, that's easy...
9594 * just mark it as canceled so that we'll ignore it
9595 * when processing completed jobs. */
9596 job->canceled = 1;
9597 break;
9598 }
9599 /* Finally we have to adjust the storage type of the object
9600 * in order to "UNDO" the operaiton. */
9601 if (o->storage == REDIS_VM_LOADING)
9602 o->storage = REDIS_VM_SWAPPED;
9603 else if (o->storage == REDIS_VM_SWAPPING)
9604 o->storage = REDIS_VM_MEMORY;
9605 unlockThreadedIO();
9606 return;
9607 }
9608 }
9609 }
9610 unlockThreadedIO();
9611 assert(1 != 1); /* We should never reach this */
9612 }
9613
9614 static void *IOThreadEntryPoint(void *arg) {
9615 iojob *j;
9616 listNode *ln;
9617 REDIS_NOTUSED(arg);
9618
9619 pthread_detach(pthread_self());
9620 while(1) {
9621 /* Get a new job to process */
9622 lockThreadedIO();
9623 if (listLength(server.io_newjobs) == 0) {
9624 /* No new jobs in queue, exit. */
9625 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9626 (long) pthread_self());
9627 server.io_active_threads--;
9628 unlockThreadedIO();
9629 return NULL;
9630 }
9631 ln = listFirst(server.io_newjobs);
9632 j = ln->value;
9633 listDelNode(server.io_newjobs,ln);
9634 /* Add the job in the processing queue */
9635 j->thread = pthread_self();
9636 listAddNodeTail(server.io_processing,j);
9637 ln = listLast(server.io_processing); /* We use ln later to remove it */
9638 unlockThreadedIO();
9639 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9640 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9641
9642 /* Process the Job */
9643 if (j->type == REDIS_IOJOB_LOAD) {
9644 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9645 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9646 FILE *fp = fopen("/dev/null","w+");
9647 j->pages = rdbSavedObjectPages(j->val,fp);
9648 fclose(fp);
9649 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9650 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9651 j->canceled = 1;
9652 }
9653
9654 /* Done: insert the job into the processed queue */
9655 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9656 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9657 lockThreadedIO();
9658 listDelNode(server.io_processing,ln);
9659 listAddNodeTail(server.io_processed,j);
9660 unlockThreadedIO();
9661
9662 /* Signal the main thread there is new stuff to process */
9663 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9664 }
9665 return NULL; /* never reached */
9666 }
9667
9668 static void spawnIOThread(void) {
9669 pthread_t thread;
9670 sigset_t mask, omask;
9671 int err;
9672
9673 sigemptyset(&mask);
9674 sigaddset(&mask,SIGCHLD);
9675 sigaddset(&mask,SIGHUP);
9676 sigaddset(&mask,SIGPIPE);
9677 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9678 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9679 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9680 strerror(err));
9681 usleep(1000000);
9682 }
9683 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9684 server.io_active_threads++;
9685 }
9686
9687 /* We need to wait for the last thread to exit before we are able to
9688 * fork() in order to BGSAVE or BGREWRITEAOF. */
9689 static void waitEmptyIOJobsQueue(void) {
9690 while(1) {
9691 int io_processed_len;
9692
9693 lockThreadedIO();
9694 if (listLength(server.io_newjobs) == 0 &&
9695 listLength(server.io_processing) == 0 &&
9696 server.io_active_threads == 0)
9697 {
9698 unlockThreadedIO();
9699 return;
9700 }
9701 /* While waiting for empty jobs queue condition we post-process some
9702 * finshed job, as I/O threads may be hanging trying to write against
9703 * the io_ready_pipe_write FD but there are so much pending jobs that
9704 * it's blocking. */
9705 io_processed_len = listLength(server.io_processed);
9706 unlockThreadedIO();
9707 if (io_processed_len) {
9708 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9709 usleep(1000); /* 1 millisecond */
9710 } else {
9711 usleep(10000); /* 10 milliseconds */
9712 }
9713 }
9714 }
9715
9716 static void vmReopenSwapFile(void) {
9717 /* Note: we don't close the old one as we are in the child process
9718 * and don't want to mess at all with the original file object. */
9719 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9720 if (server.vm_fp == NULL) {
9721 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9722 server.vm_swap_file);
9723 _exit(1);
9724 }
9725 server.vm_fd = fileno(server.vm_fp);
9726 }
9727
9728 /* This function must be called while with threaded IO locked */
9729 static void queueIOJob(iojob *j) {
9730 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9731 (void*)j, j->type, (char*)j->key->ptr);
9732 listAddNodeTail(server.io_newjobs,j);
9733 if (server.io_active_threads < server.vm_max_threads)
9734 spawnIOThread();
9735 }
9736
9737 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9738 iojob *j;
9739
9740 assert(key->storage == REDIS_VM_MEMORY);
9741 assert(key->refcount == 1);
9742
9743 j = zmalloc(sizeof(*j));
9744 j->type = REDIS_IOJOB_PREPARE_SWAP;
9745 j->db = db;
9746 j->key = key;
9747 j->val = val;
9748 incrRefCount(val);
9749 j->canceled = 0;
9750 j->thread = (pthread_t) -1;
9751 key->storage = REDIS_VM_SWAPPING;
9752
9753 lockThreadedIO();
9754 queueIOJob(j);
9755 unlockThreadedIO();
9756 return REDIS_OK;
9757 }
9758
9759 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9760
9761 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9762 * If there is not already a job loading the key, it is craeted.
9763 * The key is added to the io_keys list in the client structure, and also
9764 * in the hash table mapping swapped keys to waiting clients, that is,
9765 * server.io_waited_keys. */
9766 static int waitForSwappedKey(redisClient *c, robj *key) {
9767 struct dictEntry *de;
9768 robj *o;
9769 list *l;
9770
9771 /* If the key does not exist or is already in RAM we don't need to
9772 * block the client at all. */
9773 de = dictFind(c->db->dict,key);
9774 if (de == NULL) return 0;
9775 o = dictGetEntryKey(de);
9776 if (o->storage == REDIS_VM_MEMORY) {
9777 return 0;
9778 } else if (o->storage == REDIS_VM_SWAPPING) {
9779 /* We were swapping the key, undo it! */
9780 vmCancelThreadedIOJob(o);
9781 return 0;
9782 }
9783
9784 /* OK: the key is either swapped, or being loaded just now. */
9785
9786 /* Add the key to the list of keys this client is waiting for.
9787 * This maps clients to keys they are waiting for. */
9788 listAddNodeTail(c->io_keys,key);
9789 incrRefCount(key);
9790
9791 /* Add the client to the swapped keys => clients waiting map. */
9792 de = dictFind(c->db->io_keys,key);
9793 if (de == NULL) {
9794 int retval;
9795
9796 /* For every key we take a list of clients blocked for it */
9797 l = listCreate();
9798 retval = dictAdd(c->db->io_keys,key,l);
9799 incrRefCount(key);
9800 assert(retval == DICT_OK);
9801 } else {
9802 l = dictGetEntryVal(de);
9803 }
9804 listAddNodeTail(l,c);
9805
9806 /* Are we already loading the key from disk? If not create a job */
9807 if (o->storage == REDIS_VM_SWAPPED) {
9808 iojob *j;
9809
9810 o->storage = REDIS_VM_LOADING;
9811 j = zmalloc(sizeof(*j));
9812 j->type = REDIS_IOJOB_LOAD;
9813 j->db = c->db;
9814 j->key = o;
9815 j->key->vtype = o->vtype;
9816 j->page = o->vm.page;
9817 j->val = NULL;
9818 j->canceled = 0;
9819 j->thread = (pthread_t) -1;
9820 lockThreadedIO();
9821 queueIOJob(j);
9822 unlockThreadedIO();
9823 }
9824 return 1;
9825 }
9826
9827 /* Preload keys for any command with first, last and step values for
9828 * the command keys prototype, as defined in the command table. */
9829 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9830 int j, last;
9831 if (cmd->vm_firstkey == 0) return;
9832 last = cmd->vm_lastkey;
9833 if (last < 0) last = argc+last;
9834 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9835 redisAssert(j < argc);
9836 waitForSwappedKey(c,argv[j]);
9837 }
9838 }
9839
9840 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9841 * Note that the number of keys to preload is user-defined, so we need to
9842 * apply a sanity check against argc. */
9843 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9844 int i, num;
9845 REDIS_NOTUSED(cmd);
9846
9847 num = atoi(argv[2]->ptr);
9848 if (num > (argc-3)) return;
9849 for (i = 0; i < num; i++) {
9850 waitForSwappedKey(c,argv[3+i]);
9851 }
9852 }
9853
9854 /* Preload keys needed to execute the entire MULTI/EXEC block.
9855 *
9856 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9857 * and will block the client when any command requires a swapped out value. */
9858 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9859 int i, margc;
9860 struct redisCommand *mcmd;
9861 robj **margv;
9862 REDIS_NOTUSED(cmd);
9863 REDIS_NOTUSED(argc);
9864 REDIS_NOTUSED(argv);
9865
9866 if (!(c->flags & REDIS_MULTI)) return;
9867 for (i = 0; i < c->mstate.count; i++) {
9868 mcmd = c->mstate.commands[i].cmd;
9869 margc = c->mstate.commands[i].argc;
9870 margv = c->mstate.commands[i].argv;
9871
9872 if (mcmd->vm_preload_proc != NULL) {
9873 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9874 } else {
9875 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9876 }
9877 }
9878 }
9879
9880 /* Is this client attempting to run a command against swapped keys?
9881 * If so, block it ASAP, load the keys in background, then resume it.
9882 *
9883 * The important idea about this function is that it can fail! If keys will
9884 * still be swapped when the client is resumed, this key lookups will
9885 * just block loading keys from disk. In practical terms this should only
9886 * happen with SORT BY command or if there is a bug in this function.
9887 *
9888 * Return 1 if the client is marked as blocked, 0 if the client can
9889 * continue as the keys it is going to access appear to be in memory. */
9890 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9891 if (cmd->vm_preload_proc != NULL) {
9892 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9893 } else {
9894 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9895 }
9896
9897 /* If the client was blocked for at least one key, mark it as blocked. */
9898 if (listLength(c->io_keys)) {
9899 c->flags |= REDIS_IO_WAIT;
9900 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9901 server.vm_blocked_clients++;
9902 return 1;
9903 } else {
9904 return 0;
9905 }
9906 }
9907
9908 /* Remove the 'key' from the list of blocked keys for a given client.
9909 *
9910 * The function returns 1 when there are no longer blocking keys after
9911 * the current one was removed (and the client can be unblocked). */
9912 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9913 list *l;
9914 listNode *ln;
9915 listIter li;
9916 struct dictEntry *de;
9917
9918 /* Remove the key from the list of keys this client is waiting for. */
9919 listRewind(c->io_keys,&li);
9920 while ((ln = listNext(&li)) != NULL) {
9921 if (equalStringObjects(ln->value,key)) {
9922 listDelNode(c->io_keys,ln);
9923 break;
9924 }
9925 }
9926 assert(ln != NULL);
9927
9928 /* Remove the client form the key => waiting clients map. */
9929 de = dictFind(c->db->io_keys,key);
9930 assert(de != NULL);
9931 l = dictGetEntryVal(de);
9932 ln = listSearchKey(l,c);
9933 assert(ln != NULL);
9934 listDelNode(l,ln);
9935 if (listLength(l) == 0)
9936 dictDelete(c->db->io_keys,key);
9937
9938 return listLength(c->io_keys) == 0;
9939 }
9940
9941 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9942 struct dictEntry *de;
9943 list *l;
9944 listNode *ln;
9945 int len;
9946
9947 de = dictFind(db->io_keys,key);
9948 if (!de) return;
9949
9950 l = dictGetEntryVal(de);
9951 len = listLength(l);
9952 /* Note: we can't use something like while(listLength(l)) as the list
9953 * can be freed by the calling function when we remove the last element. */
9954 while (len--) {
9955 ln = listFirst(l);
9956 redisClient *c = ln->value;
9957
9958 if (dontWaitForSwappedKey(c,key)) {
9959 /* Put the client in the list of clients ready to go as we
9960 * loaded all the keys about it. */
9961 listAddNodeTail(server.io_ready_clients,c);
9962 }
9963 }
9964 }
9965
9966 /* =========================== Remote Configuration ========================= */
9967
9968 static void configSetCommand(redisClient *c) {
9969 robj *o = getDecodedObject(c->argv[3]);
9970 long long ll;
9971
9972 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9973 zfree(server.dbfilename);
9974 server.dbfilename = zstrdup(o->ptr);
9975 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9976 zfree(server.requirepass);
9977 server.requirepass = zstrdup(o->ptr);
9978 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9979 zfree(server.masterauth);
9980 server.masterauth = zstrdup(o->ptr);
9981 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9982 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9983 ll < 0) goto badfmt;
9984 server.maxmemory = ll;
9985 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9986 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9987 ll < 0 || ll > LONG_MAX) goto badfmt;
9988 server.maxidletime = ll;
9989 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9990 if (!strcasecmp(o->ptr,"no")) {
9991 server.appendfsync = APPENDFSYNC_NO;
9992 } else if (!strcasecmp(o->ptr,"everysec")) {
9993 server.appendfsync = APPENDFSYNC_EVERYSEC;
9994 } else if (!strcasecmp(o->ptr,"always")) {
9995 server.appendfsync = APPENDFSYNC_ALWAYS;
9996 } else {
9997 goto badfmt;
9998 }
9999 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10000 int old = server.appendonly;
10001 int new = yesnotoi(o->ptr);
10002
10003 if (new == -1) goto badfmt;
10004 if (old != new) {
10005 if (new == 0) {
10006 stopAppendOnly();
10007 } else {
10008 if (startAppendOnly() == REDIS_ERR) {
10009 addReplySds(c,sdscatprintf(sdsempty(),
10010 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10011 decrRefCount(o);
10012 return;
10013 }
10014 }
10015 }
10016 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10017 int vlen, j;
10018 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10019
10020 /* Perform sanity check before setting the new config:
10021 * - Even number of args
10022 * - Seconds >= 1, changes >= 0 */
10023 if (vlen & 1) {
10024 sdsfreesplitres(v,vlen);
10025 goto badfmt;
10026 }
10027 for (j = 0; j < vlen; j++) {
10028 char *eptr;
10029 long val;
10030
10031 val = strtoll(v[j], &eptr, 10);
10032 if (eptr[0] != '\0' ||
10033 ((j & 1) == 0 && val < 1) ||
10034 ((j & 1) == 1 && val < 0)) {
10035 sdsfreesplitres(v,vlen);
10036 goto badfmt;
10037 }
10038 }
10039 /* Finally set the new config */
10040 resetServerSaveParams();
10041 for (j = 0; j < vlen; j += 2) {
10042 time_t seconds;
10043 int changes;
10044
10045 seconds = strtoll(v[j],NULL,10);
10046 changes = strtoll(v[j+1],NULL,10);
10047 appendServerSaveParams(seconds, changes);
10048 }
10049 sdsfreesplitres(v,vlen);
10050 } else {
10051 addReplySds(c,sdscatprintf(sdsempty(),
10052 "-ERR not supported CONFIG parameter %s\r\n",
10053 (char*)c->argv[2]->ptr));
10054 decrRefCount(o);
10055 return;
10056 }
10057 decrRefCount(o);
10058 addReply(c,shared.ok);
10059 return;
10060
10061 badfmt: /* Bad format errors */
10062 addReplySds(c,sdscatprintf(sdsempty(),
10063 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10064 (char*)o->ptr,
10065 (char*)c->argv[2]->ptr));
10066 decrRefCount(o);
10067 }
10068
10069 static void configGetCommand(redisClient *c) {
10070 robj *o = getDecodedObject(c->argv[2]);
10071 robj *lenobj = createObject(REDIS_STRING,NULL);
10072 char *pattern = o->ptr;
10073 int matches = 0;
10074
10075 addReply(c,lenobj);
10076 decrRefCount(lenobj);
10077
10078 if (stringmatch(pattern,"dbfilename",0)) {
10079 addReplyBulkCString(c,"dbfilename");
10080 addReplyBulkCString(c,server.dbfilename);
10081 matches++;
10082 }
10083 if (stringmatch(pattern,"requirepass",0)) {
10084 addReplyBulkCString(c,"requirepass");
10085 addReplyBulkCString(c,server.requirepass);
10086 matches++;
10087 }
10088 if (stringmatch(pattern,"masterauth",0)) {
10089 addReplyBulkCString(c,"masterauth");
10090 addReplyBulkCString(c,server.masterauth);
10091 matches++;
10092 }
10093 if (stringmatch(pattern,"maxmemory",0)) {
10094 char buf[128];
10095
10096 ll2string(buf,128,server.maxmemory);
10097 addReplyBulkCString(c,"maxmemory");
10098 addReplyBulkCString(c,buf);
10099 matches++;
10100 }
10101 if (stringmatch(pattern,"timeout",0)) {
10102 char buf[128];
10103
10104 ll2string(buf,128,server.maxidletime);
10105 addReplyBulkCString(c,"timeout");
10106 addReplyBulkCString(c,buf);
10107 matches++;
10108 }
10109 if (stringmatch(pattern,"appendonly",0)) {
10110 addReplyBulkCString(c,"appendonly");
10111 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10112 matches++;
10113 }
10114 if (stringmatch(pattern,"appendfsync",0)) {
10115 char *policy;
10116
10117 switch(server.appendfsync) {
10118 case APPENDFSYNC_NO: policy = "no"; break;
10119 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10120 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10121 default: policy = "unknown"; break; /* too harmless to panic */
10122 }
10123 addReplyBulkCString(c,"appendfsync");
10124 addReplyBulkCString(c,policy);
10125 matches++;
10126 }
10127 if (stringmatch(pattern,"save",0)) {
10128 sds buf = sdsempty();
10129 int j;
10130
10131 for (j = 0; j < server.saveparamslen; j++) {
10132 buf = sdscatprintf(buf,"%ld %d",
10133 server.saveparams[j].seconds,
10134 server.saveparams[j].changes);
10135 if (j != server.saveparamslen-1)
10136 buf = sdscatlen(buf," ",1);
10137 }
10138 addReplyBulkCString(c,"save");
10139 addReplyBulkCString(c,buf);
10140 sdsfree(buf);
10141 matches++;
10142 }
10143 decrRefCount(o);
10144 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10145 }
10146
10147 static void configCommand(redisClient *c) {
10148 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10149 if (c->argc != 4) goto badarity;
10150 configSetCommand(c);
10151 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10152 if (c->argc != 3) goto badarity;
10153 configGetCommand(c);
10154 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10155 if (c->argc != 2) goto badarity;
10156 server.stat_numcommands = 0;
10157 server.stat_numconnections = 0;
10158 server.stat_expiredkeys = 0;
10159 server.stat_starttime = time(NULL);
10160 addReply(c,shared.ok);
10161 } else {
10162 addReplySds(c,sdscatprintf(sdsempty(),
10163 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10164 }
10165 return;
10166
10167 badarity:
10168 addReplySds(c,sdscatprintf(sdsempty(),
10169 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10170 (char*) c->argv[1]->ptr));
10171 }
10172
10173 /* =========================== Pubsub implementation ======================== */
10174
10175 static void freePubsubPattern(void *p) {
10176 pubsubPattern *pat = p;
10177
10178 decrRefCount(pat->pattern);
10179 zfree(pat);
10180 }
10181
10182 static int listMatchPubsubPattern(void *a, void *b) {
10183 pubsubPattern *pa = a, *pb = b;
10184
10185 return (pa->client == pb->client) &&
10186 (equalStringObjects(pa->pattern,pb->pattern));
10187 }
10188
10189 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10190 * 0 if the client was already subscribed to that channel. */
10191 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10192 struct dictEntry *de;
10193 list *clients = NULL;
10194 int retval = 0;
10195
10196 /* Add the channel to the client -> channels hash table */
10197 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10198 retval = 1;
10199 incrRefCount(channel);
10200 /* Add the client to the channel -> list of clients hash table */
10201 de = dictFind(server.pubsub_channels,channel);
10202 if (de == NULL) {
10203 clients = listCreate();
10204 dictAdd(server.pubsub_channels,channel,clients);
10205 incrRefCount(channel);
10206 } else {
10207 clients = dictGetEntryVal(de);
10208 }
10209 listAddNodeTail(clients,c);
10210 }
10211 /* Notify the client */
10212 addReply(c,shared.mbulk3);
10213 addReply(c,shared.subscribebulk);
10214 addReplyBulk(c,channel);
10215 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10216 return retval;
10217 }
10218
10219 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10220 * 0 if the client was not subscribed to the specified channel. */
10221 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10222 struct dictEntry *de;
10223 list *clients;
10224 listNode *ln;
10225 int retval = 0;
10226
10227 /* Remove the channel from the client -> channels hash table */
10228 incrRefCount(channel); /* channel may be just a pointer to the same object
10229 we have in the hash tables. Protect it... */
10230 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10231 retval = 1;
10232 /* Remove the client from the channel -> clients list hash table */
10233 de = dictFind(server.pubsub_channels,channel);
10234 assert(de != NULL);
10235 clients = dictGetEntryVal(de);
10236 ln = listSearchKey(clients,c);
10237 assert(ln != NULL);
10238 listDelNode(clients,ln);
10239 if (listLength(clients) == 0) {
10240 /* Free the list and associated hash entry at all if this was
10241 * the latest client, so that it will be possible to abuse
10242 * Redis PUBSUB creating millions of channels. */
10243 dictDelete(server.pubsub_channels,channel);
10244 }
10245 }
10246 /* Notify the client */
10247 if (notify) {
10248 addReply(c,shared.mbulk3);
10249 addReply(c,shared.unsubscribebulk);
10250 addReplyBulk(c,channel);
10251 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10252 listLength(c->pubsub_patterns));
10253
10254 }
10255 decrRefCount(channel); /* it is finally safe to release it */
10256 return retval;
10257 }
10258
10259 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10260 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10261 int retval = 0;
10262
10263 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10264 retval = 1;
10265 pubsubPattern *pat;
10266 listAddNodeTail(c->pubsub_patterns,pattern);
10267 incrRefCount(pattern);
10268 pat = zmalloc(sizeof(*pat));
10269 pat->pattern = getDecodedObject(pattern);
10270 pat->client = c;
10271 listAddNodeTail(server.pubsub_patterns,pat);
10272 }
10273 /* Notify the client */
10274 addReply(c,shared.mbulk3);
10275 addReply(c,shared.psubscribebulk);
10276 addReplyBulk(c,pattern);
10277 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10278 return retval;
10279 }
10280
10281 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10282 * 0 if the client was not subscribed to the specified channel. */
10283 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10284 listNode *ln;
10285 pubsubPattern pat;
10286 int retval = 0;
10287
10288 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10289 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10290 retval = 1;
10291 listDelNode(c->pubsub_patterns,ln);
10292 pat.client = c;
10293 pat.pattern = pattern;
10294 ln = listSearchKey(server.pubsub_patterns,&pat);
10295 listDelNode(server.pubsub_patterns,ln);
10296 }
10297 /* Notify the client */
10298 if (notify) {
10299 addReply(c,shared.mbulk3);
10300 addReply(c,shared.punsubscribebulk);
10301 addReplyBulk(c,pattern);
10302 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10303 listLength(c->pubsub_patterns));
10304 }
10305 decrRefCount(pattern);
10306 return retval;
10307 }
10308
10309 /* Unsubscribe from all the channels. Return the number of channels the
10310 * client was subscribed from. */
10311 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10312 dictIterator *di = dictGetIterator(c->pubsub_channels);
10313 dictEntry *de;
10314 int count = 0;
10315
10316 while((de = dictNext(di)) != NULL) {
10317 robj *channel = dictGetEntryKey(de);
10318
10319 count += pubsubUnsubscribeChannel(c,channel,notify);
10320 }
10321 dictReleaseIterator(di);
10322 return count;
10323 }
10324
10325 /* Unsubscribe from all the patterns. Return the number of patterns the
10326 * client was subscribed from. */
10327 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10328 listNode *ln;
10329 listIter li;
10330 int count = 0;
10331
10332 listRewind(c->pubsub_patterns,&li);
10333 while ((ln = listNext(&li)) != NULL) {
10334 robj *pattern = ln->value;
10335
10336 count += pubsubUnsubscribePattern(c,pattern,notify);
10337 }
10338 return count;
10339 }
10340
10341 /* Publish a message */
10342 static int pubsubPublishMessage(robj *channel, robj *message) {
10343 int receivers = 0;
10344 struct dictEntry *de;
10345 listNode *ln;
10346 listIter li;
10347
10348 /* Send to clients listening for that channel */
10349 de = dictFind(server.pubsub_channels,channel);
10350 if (de) {
10351 list *list = dictGetEntryVal(de);
10352 listNode *ln;
10353 listIter li;
10354
10355 listRewind(list,&li);
10356 while ((ln = listNext(&li)) != NULL) {
10357 redisClient *c = ln->value;
10358
10359 addReply(c,shared.mbulk3);
10360 addReply(c,shared.messagebulk);
10361 addReplyBulk(c,channel);
10362 addReplyBulk(c,message);
10363 receivers++;
10364 }
10365 }
10366 /* Send to clients listening to matching channels */
10367 if (listLength(server.pubsub_patterns)) {
10368 listRewind(server.pubsub_patterns,&li);
10369 channel = getDecodedObject(channel);
10370 while ((ln = listNext(&li)) != NULL) {
10371 pubsubPattern *pat = ln->value;
10372
10373 if (stringmatchlen((char*)pat->pattern->ptr,
10374 sdslen(pat->pattern->ptr),
10375 (char*)channel->ptr,
10376 sdslen(channel->ptr),0)) {
10377 addReply(pat->client,shared.mbulk4);
10378 addReply(pat->client,shared.pmessagebulk);
10379 addReplyBulk(pat->client,pat->pattern);
10380 addReplyBulk(pat->client,channel);
10381 addReplyBulk(pat->client,message);
10382 receivers++;
10383 }
10384 }
10385 decrRefCount(channel);
10386 }
10387 return receivers;
10388 }
10389
10390 static void subscribeCommand(redisClient *c) {
10391 int j;
10392
10393 for (j = 1; j < c->argc; j++)
10394 pubsubSubscribeChannel(c,c->argv[j]);
10395 }
10396
10397 static void unsubscribeCommand(redisClient *c) {
10398 if (c->argc == 1) {
10399 pubsubUnsubscribeAllChannels(c,1);
10400 return;
10401 } else {
10402 int j;
10403
10404 for (j = 1; j < c->argc; j++)
10405 pubsubUnsubscribeChannel(c,c->argv[j],1);
10406 }
10407 }
10408
10409 static void psubscribeCommand(redisClient *c) {
10410 int j;
10411
10412 for (j = 1; j < c->argc; j++)
10413 pubsubSubscribePattern(c,c->argv[j]);
10414 }
10415
10416 static void punsubscribeCommand(redisClient *c) {
10417 if (c->argc == 1) {
10418 pubsubUnsubscribeAllPatterns(c,1);
10419 return;
10420 } else {
10421 int j;
10422
10423 for (j = 1; j < c->argc; j++)
10424 pubsubUnsubscribePattern(c,c->argv[j],1);
10425 }
10426 }
10427
10428 static void publishCommand(redisClient *c) {
10429 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10430 addReplyLongLong(c,receivers);
10431 }
10432
/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
 *
 * The implementation uses a per-DB hash table mapping keys to the list of
 * clients WATCHing those keys, so that given a key that is going to be
 * modified we can mark all the associated clients as dirty.
 *
 * Every client also holds a list of the keys it WATCHes, so that it is
 * possible to un-watch them when the client is freed or when UNWATCH is
 * called. */
10441
10442 /* In the client->watched_keys list we need to use watchedKey structures
10443 * as in order to identify a key in Redis we need both the key name and the
10444 * DB */
10445 typedef struct watchedKey {
10446 robj *key;
10447 redisDb *db;
10448 } watchedKey;
10449
10450 /* Watch for the specified key */
10451 static void watchForKey(redisClient *c, robj *key) {
10452 list *clients = NULL;
10453 listIter li;
10454 listNode *ln;
10455 watchedKey *wk;
10456
10457 /* Check if we are already watching for this key */
10458 listRewind(c->watched_keys,&li);
10459 while((ln = listNext(&li))) {
10460 wk = listNodeValue(ln);
10461 if (wk->db == c->db && equalStringObjects(key,wk->key))
10462 return; /* Key already watched */
10463 }
10464 /* This key is not already watched in this DB. Let's add it */
10465 clients = dictFetchValue(c->db->watched_keys,key);
10466 if (!clients) {
10467 clients = listCreate();
10468 dictAdd(c->db->watched_keys,key,clients);
10469 incrRefCount(key);
10470 }
10471 listAddNodeTail(clients,c);
10472 /* Add the new key to the lits of keys watched by this client */
10473 wk = zmalloc(sizeof(*wk));
10474 wk->key = key;
10475 wk->db = c->db;
10476 incrRefCount(key);
10477 listAddNodeTail(c->watched_keys,wk);
10478 }
10479
10480 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10481 * flag is up to the caller. */
10482 static void unwatchAllKeys(redisClient *c) {
10483 listIter li;
10484 listNode *ln;
10485
10486 if (listLength(c->watched_keys) == 0) return;
10487 listRewind(c->watched_keys,&li);
10488 while((ln = listNext(&li))) {
10489 list *clients;
10490 watchedKey *wk;
10491
10492 /* Lookup the watched key -> clients list and remove the client
10493 * from the list */
10494 wk = listNodeValue(ln);
10495 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10496 assert(clients != NULL);
10497 listDelNode(clients,listSearchKey(clients,c));
10498 /* Kill the entry at all if this was the only client */
10499 if (listLength(clients) == 0)
10500 dictDelete(wk->db->watched_keys, wk->key);
10501 /* Remove this watched key from the client->watched list */
10502 listDelNode(c->watched_keys,ln);
10503 decrRefCount(wk->key);
10504 zfree(wk);
10505 }
10506 }
10507
10508 /* "Touch" a key, so that if this key is being WATCHed by some client the
10509 * next EXEC will fail. */
10510 static void touchWatchedKey(redisDb *db, robj *key) {
10511 list *clients;
10512 listIter li;
10513 listNode *ln;
10514
10515 if (dictSize(db->watched_keys) == 0) return;
10516 clients = dictFetchValue(db->watched_keys, key);
10517 if (!clients) return;
10518
10519 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10520 /* Check if we are already watching for this key */
10521 listRewind(clients,&li);
10522 while((ln = listNext(&li))) {
10523 redisClient *c = listNodeValue(ln);
10524
10525 c->flags |= REDIS_DIRTY_CAS;
10526 }
10527 }
10528
10529 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10530 * flush but will be deleted as effect of the flushing operation should
10531 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10532 * a FLUSHALL operation (all the DBs flushed). */
10533 static void touchWatchedKeysOnFlush(int dbid) {
10534 listIter li1, li2;
10535 listNode *ln;
10536
10537 /* For every client, check all the waited keys */
10538 listRewind(server.clients,&li1);
10539 while((ln = listNext(&li1))) {
10540 redisClient *c = listNodeValue(ln);
10541 listRewind(c->watched_keys,&li2);
10542 while((ln = listNext(&li2))) {
10543 watchedKey *wk = listNodeValue(ln);
10544
10545 /* For every watched key matching the specified DB, if the
10546 * key exists, mark the client as dirty, as the key will be
10547 * removed. */
10548 if (dbid == -1 || wk->db->id == dbid) {
10549 if (dictFind(wk->db->dict, wk->key) != NULL)
10550 c->flags |= REDIS_DIRTY_CAS;
10551 }
10552 }
10553 }
10554 }
10555
10556 static void watchCommand(redisClient *c) {
10557 int j;
10558
10559 if (c->flags & REDIS_MULTI) {
10560 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10561 return;
10562 }
10563 for (j = 1; j < c->argc; j++)
10564 watchForKey(c,c->argv[j]);
10565 addReply(c,shared.ok);
10566 }
10567
10568 static void unwatchCommand(redisClient *c) {
10569 unwatchAllKeys(c);
10570 c->flags &= (~REDIS_DIRTY_CAS);
10571 addReply(c,shared.ok);
10572 }
10573
10574 /* ================================= Debugging ============================== */
10575
10576 /* Compute the sha1 of string at 's' with 'len' bytes long.
10577 * The SHA1 is then xored againt the string pointed by digest.
10578 * Since xor is commutative, this operation is used in order to
10579 * "add" digests relative to unordered elements.
10580 *
10581 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10582 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10583 SHA1_CTX ctx;
10584 unsigned char hash[20], *s = ptr;
10585 int j;
10586
10587 SHA1Init(&ctx);
10588 SHA1Update(&ctx,s,len);
10589 SHA1Final(hash,&ctx);
10590
10591 for (j = 0; j < 20; j++)
10592 digest[j] ^= hash[j];
10593 }
10594
10595 static void xorObjectDigest(unsigned char *digest, robj *o) {
10596 o = getDecodedObject(o);
10597 xorDigest(digest,o->ptr,sdslen(o->ptr));
10598 decrRefCount(o);
10599 }
10600
10601 /* This function instead of just computing the SHA1 and xoring it
10602 * against diget, also perform the digest of "digest" itself and
10603 * replace the old value with the new one.
10604 *
10605 * So the final digest will be:
10606 *
10607 * digest = SHA1(digest xor SHA1(data))
10608 *
10609 * This function is used every time we want to preserve the order so
10610 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10611 *
10612 * Also note that mixdigest("foo") followed by mixdigest("bar")
10613 * will lead to a different digest compared to "fo", "obar".
10614 */
10615 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10616 SHA1_CTX ctx;
10617 char *s = ptr;
10618
10619 xorDigest(digest,s,len);
10620 SHA1Init(&ctx);
10621 SHA1Update(&ctx,digest,20);
10622 SHA1Final(digest,&ctx);
10623 }
10624
10625 static void mixObjectDigest(unsigned char *digest, robj *o) {
10626 o = getDecodedObject(o);
10627 mixDigest(digest,o->ptr,sdslen(o->ptr));
10628 decrRefCount(o);
10629 }
10630
10631 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10632 * are not ordered, we use a trick: every aggregate digest is the xor
10633 * of the digests of their elements. This way the order will not change
10634 * the result. For list instead we use a feedback entering the output digest
10635 * as input in order to ensure that a different ordered list will result in
10636 * a different digest. */
10637 static void computeDatasetDigest(unsigned char *final) {
10638 unsigned char digest[20];
10639 char buf[128];
10640 dictIterator *di = NULL;
10641 dictEntry *de;
10642 int j;
10643 uint32_t aux;
10644
10645 memset(final,0,20); /* Start with a clean result */
10646
10647 for (j = 0; j < server.dbnum; j++) {
10648 redisDb *db = server.db+j;
10649
10650 if (dictSize(db->dict) == 0) continue;
10651 di = dictGetIterator(db->dict);
10652
10653 /* hash the DB id, so the same dataset moved in a different
10654 * DB will lead to a different digest */
10655 aux = htonl(j);
10656 mixDigest(final,&aux,sizeof(aux));
10657
10658 /* Iterate this DB writing every entry */
10659 while((de = dictNext(di)) != NULL) {
10660 robj *key, *o, *kcopy;
10661 time_t expiretime;
10662
10663 memset(digest,0,20); /* This key-val digest */
10664 key = dictGetEntryKey(de);
10665
10666 if (!server.vm_enabled) {
10667 mixObjectDigest(digest,key);
10668 o = dictGetEntryVal(de);
10669 } else {
10670 /* Don't work with the key directly as when VM is active
10671 * this is unsafe: TODO: fix decrRefCount to check if the
10672 * count really reached 0 to avoid this mess */
10673 kcopy = dupStringObject(key);
10674 mixObjectDigest(digest,kcopy);
10675 o = lookupKeyRead(db,kcopy);
10676 decrRefCount(kcopy);
10677 }
10678 aux = htonl(o->type);
10679 mixDigest(digest,&aux,sizeof(aux));
10680 expiretime = getExpire(db,key);
10681
10682 /* Save the key and associated value */
10683 if (o->type == REDIS_STRING) {
10684 mixObjectDigest(digest,o);
10685 } else if (o->type == REDIS_LIST) {
10686 list *list = o->ptr;
10687 listNode *ln;
10688 listIter li;
10689
10690 listRewind(list,&li);
10691 while((ln = listNext(&li))) {
10692 robj *eleobj = listNodeValue(ln);
10693
10694 mixObjectDigest(digest,eleobj);
10695 }
10696 } else if (o->type == REDIS_SET) {
10697 dict *set = o->ptr;
10698 dictIterator *di = dictGetIterator(set);
10699 dictEntry *de;
10700
10701 while((de = dictNext(di)) != NULL) {
10702 robj *eleobj = dictGetEntryKey(de);
10703
10704 xorObjectDigest(digest,eleobj);
10705 }
10706 dictReleaseIterator(di);
10707 } else if (o->type == REDIS_ZSET) {
10708 zset *zs = o->ptr;
10709 dictIterator *di = dictGetIterator(zs->dict);
10710 dictEntry *de;
10711
10712 while((de = dictNext(di)) != NULL) {
10713 robj *eleobj = dictGetEntryKey(de);
10714 double *score = dictGetEntryVal(de);
10715 unsigned char eledigest[20];
10716
10717 snprintf(buf,sizeof(buf),"%.17g",*score);
10718 memset(eledigest,0,20);
10719 mixObjectDigest(eledigest,eleobj);
10720 mixDigest(eledigest,buf,strlen(buf));
10721 xorDigest(digest,eledigest,20);
10722 }
10723 dictReleaseIterator(di);
10724 } else if (o->type == REDIS_HASH) {
10725 hashIterator *hi;
10726 robj *obj;
10727
10728 hi = hashInitIterator(o);
10729 while (hashNext(hi) != REDIS_ERR) {
10730 unsigned char eledigest[20];
10731
10732 memset(eledigest,0,20);
10733 obj = hashCurrent(hi,REDIS_HASH_KEY);
10734 mixObjectDigest(eledigest,obj);
10735 decrRefCount(obj);
10736 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10737 mixObjectDigest(eledigest,obj);
10738 decrRefCount(obj);
10739 xorDigest(digest,eledigest,20);
10740 }
10741 hashReleaseIterator(hi);
10742 } else {
10743 redisPanic("Unknown object type");
10744 }
10745 /* If the key has an expire, add it to the mix */
10746 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10747 /* We can finally xor the key-val digest to the final digest */
10748 xorDigest(final,digest,20);
10749 }
10750 dictReleaseIterator(di);
10751 }
10752 }
10753
10754 static void debugCommand(redisClient *c) {
10755 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10756 *((char*)-1) = 'x';
10757 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10758 if (rdbSave(server.dbfilename) != REDIS_OK) {
10759 addReply(c,shared.err);
10760 return;
10761 }
10762 emptyDb();
10763 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10764 addReply(c,shared.err);
10765 return;
10766 }
10767 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10768 addReply(c,shared.ok);
10769 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10770 emptyDb();
10771 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10772 addReply(c,shared.err);
10773 return;
10774 }
10775 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10776 addReply(c,shared.ok);
10777 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10778 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10779 robj *key, *val;
10780
10781 if (!de) {
10782 addReply(c,shared.nokeyerr);
10783 return;
10784 }
10785 key = dictGetEntryKey(de);
10786 val = dictGetEntryVal(de);
10787 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10788 key->storage == REDIS_VM_SWAPPING)) {
10789 char *strenc;
10790 char buf[128];
10791
10792 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10793 strenc = strencoding[val->encoding];
10794 } else {
10795 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10796 strenc = buf;
10797 }
10798 addReplySds(c,sdscatprintf(sdsempty(),
10799 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10800 "encoding:%s serializedlength:%lld\r\n",
10801 (void*)key, key->refcount, (void*)val, val->refcount,
10802 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10803 } else {
10804 addReplySds(c,sdscatprintf(sdsempty(),
10805 "+Key at:%p refcount:%d, value swapped at: page %llu "
10806 "using %llu pages\r\n",
10807 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10808 (unsigned long long) key->vm.usedpages));
10809 }
10810 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10811 lookupKeyRead(c->db,c->argv[2]);
10812 addReply(c,shared.ok);
10813 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10814 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10815 robj *key, *val;
10816
10817 if (!server.vm_enabled) {
10818 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10819 return;
10820 }
10821 if (!de) {
10822 addReply(c,shared.nokeyerr);
10823 return;
10824 }
10825 key = dictGetEntryKey(de);
10826 val = dictGetEntryVal(de);
10827 /* If the key is shared we want to create a copy */
10828 if (key->refcount > 1) {
10829 robj *newkey = dupStringObject(key);
10830 decrRefCount(key);
10831 key = dictGetEntryKey(de) = newkey;
10832 }
10833 /* Swap it */
10834 if (key->storage != REDIS_VM_MEMORY) {
10835 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10836 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10837 dictGetEntryVal(de) = NULL;
10838 addReply(c,shared.ok);
10839 } else {
10840 addReply(c,shared.err);
10841 }
10842 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10843 long keys, j;
10844 robj *key, *val;
10845 char buf[128];
10846
10847 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10848 return;
10849 for (j = 0; j < keys; j++) {
10850 snprintf(buf,sizeof(buf),"key:%lu",j);
10851 key = createStringObject(buf,strlen(buf));
10852 if (lookupKeyRead(c->db,key) != NULL) {
10853 decrRefCount(key);
10854 continue;
10855 }
10856 snprintf(buf,sizeof(buf),"value:%lu",j);
10857 val = createStringObject(buf,strlen(buf));
10858 dictAdd(c->db->dict,key,val);
10859 }
10860 addReply(c,shared.ok);
10861 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10862 unsigned char digest[20];
10863 sds d = sdsnew("+");
10864 int j;
10865
10866 computeDatasetDigest(digest);
10867 for (j = 0; j < 20; j++)
10868 d = sdscatprintf(d, "%02x",digest[j]);
10869
10870 d = sdscatlen(d,"\r\n",2);
10871 addReplySds(c,d);
10872 } else {
10873 addReplySds(c,sdsnew(
10874 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10875 }
10876 }
10877
10878 static void _redisAssert(char *estr, char *file, int line) {
10879 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10880 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
10881 #ifdef HAVE_BACKTRACE
10882 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10883 *((char*)-1) = 'x';
10884 #endif
10885 }
10886
10887 static void _redisPanic(char *msg, char *file, int line) {
10888 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10889 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10890 #ifdef HAVE_BACKTRACE
10891 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10892 *((char*)-1) = 'x';
10893 #endif
10894 }
10895
10896 /* =================================== Main! ================================ */
10897
10898 #ifdef __linux__
/* Return the integer found on the first line of
 * /proc/sys/vm/overcommit_memory, or -1 if the file cannot be opened or
 * read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return -1;
    if (fgets(line,64,procfile) != NULL) value = atoi(line);
    fclose(procfile);
    return value;
}
10912
10913 void linuxOvercommitMemoryWarning(void) {
10914 if (linuxOvercommitMemoryValue() == 0) {
10915 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10916 }
10917 }
10918 #endif /* __linux__ */
10919
10920 static void daemonize(void) {
10921 int fd;
10922 FILE *fp;
10923
10924 if (fork() != 0) exit(0); /* parent exits */
10925 setsid(); /* create a new session */
10926
10927 /* Every output goes to /dev/null. If Redis is daemonized but
10928 * the 'logfile' is set to 'stdout' in the configuration file
10929 * it will not log at all. */
10930 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10931 dup2(fd, STDIN_FILENO);
10932 dup2(fd, STDOUT_FILENO);
10933 dup2(fd, STDERR_FILENO);
10934 if (fd > STDERR_FILENO) close(fd);
10935 }
10936 /* Try to write the pid file */
10937 fp = fopen(server.pidfile,"w");
10938 if (fp) {
10939 fprintf(fp,"%d\n",getpid());
10940 fclose(fp);
10941 }
10942 }
10943
10944 static void version() {
10945 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
10946 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
10947 exit(0);
10948 }
10949
/* Print the command line synopsis on stderr, then terminate the process
 * with exit status 1. Never returns. */
static void usage(void) {
    fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
    fprintf(stderr,"       ./redis-server - (read config from stdin)\n");
    exit(1);
}
10955
10956 int main(int argc, char **argv) {
10957 time_t start;
10958
10959 initServerConfig();
10960 sortCommandTable();
10961 if (argc == 2) {
10962 if (strcmp(argv[1], "-v") == 0 ||
10963 strcmp(argv[1], "--version") == 0) version();
10964 if (strcmp(argv[1], "--help") == 0) usage();
10965 resetServerSaveParams();
10966 loadServerConfig(argv[1]);
10967 } else if ((argc > 2)) {
10968 usage();
10969 } else {
10970 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10971 }
10972 if (server.daemonize) daemonize();
10973 initServer();
10974 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10975 #ifdef __linux__
10976 linuxOvercommitMemoryWarning();
10977 #endif
10978 start = time(NULL);
10979 if (server.appendonly) {
10980 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10981 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10982 } else {
10983 if (rdbLoad(server.dbfilename) == REDIS_OK)
10984 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10985 }
10986 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10987 aeSetBeforeSleepProc(server.el,beforeSleep);
10988 aeMain(server.el);
10989 aeDeleteEventLoop(server.el);
10990 return 0;
10991 }
10992
10993 /* ============================= Backtrace support ========================= */
10994
10995 #ifdef HAVE_BACKTRACE
10996 static char *findFuncName(void *pointer, unsigned long *offset);
10997
10998 static void *getMcontextEip(ucontext_t *uc) {
10999 #if defined(__FreeBSD__)
11000 return (void*) uc->uc_mcontext.mc_eip;
11001 #elif defined(__dietlibc__)
11002 return (void*) uc->uc_mcontext.eip;
11003 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11004 #if __x86_64__
11005 return (void*) uc->uc_mcontext->__ss.__rip;
11006 #else
11007 return (void*) uc->uc_mcontext->__ss.__eip;
11008 #endif
11009 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11010 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11011 return (void*) uc->uc_mcontext->__ss.__rip;
11012 #else
11013 return (void*) uc->uc_mcontext->__ss.__eip;
11014 #endif
11015 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11016 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11017 #elif defined(__ia64__) /* Linux IA64 */
11018 return (void*) uc->uc_mcontext.sc_ip;
11019 #else
11020 return NULL;
11021 #endif
11022 }
11023
11024 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11025 void *trace[100];
11026 char **messages = NULL;
11027 int i, trace_size = 0;
11028 unsigned long offset=0;
11029 ucontext_t *uc = (ucontext_t*) secret;
11030 sds infostring;
11031 REDIS_NOTUSED(info);
11032
11033 redisLog(REDIS_WARNING,
11034 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11035 infostring = genRedisInfoString();
11036 redisLog(REDIS_WARNING, "%s",infostring);
11037 /* It's not safe to sdsfree() the returned string under memory
11038 * corruption conditions. Let it leak as we are going to abort */
11039
11040 trace_size = backtrace(trace, 100);
11041 /* overwrite sigaction with caller's address */
11042 if (getMcontextEip(uc) != NULL) {
11043 trace[1] = getMcontextEip(uc);
11044 }
11045 messages = backtrace_symbols(trace, trace_size);
11046
11047 for (i=1; i<trace_size; ++i) {
11048 char *fn = findFuncName(trace[i], &offset), *p;
11049
11050 p = strchr(messages[i],'+');
11051 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11052 redisLog(REDIS_WARNING,"%s", messages[i]);
11053 } else {
11054 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11055 }
11056 }
11057 /* free(messages); Don't call free() with possibly corrupted memory. */
11058 _exit(0);
11059 }
11060
11061 static void sigtermHandler(int sig) {
11062 REDIS_NOTUSED(sig);
11063
11064 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11065 server.shutdown_asap = 1;
11066 }
11067
11068 static void setupSigSegvAction(void) {
11069 struct sigaction act;
11070
11071 sigemptyset (&act.sa_mask);
11072 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11073 * is used. Otherwise, sa_handler is used */
11074 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11075 act.sa_sigaction = segvHandler;
11076 sigaction (SIGSEGV, &act, NULL);
11077 sigaction (SIGBUS, &act, NULL);
11078 sigaction (SIGFPE, &act, NULL);
11079 sigaction (SIGILL, &act, NULL);
11080 sigaction (SIGBUS, &act, NULL);
11081
11082 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11083 act.sa_handler = sigtermHandler;
11084 sigaction (SIGTERM, &act, NULL);
11085 return;
11086 }
11087
11088 #include "staticsymbols.h"
11089 /* This function try to convert a pointer into a function name. It's used in
11090 * oreder to provide a backtrace under segmentation fault that's able to
11091 * display functions declared as static (otherwise the backtrace is useless). */
11092 static char *findFuncName(void *pointer, unsigned long *offset){
11093 int i, ret = -1;
11094 unsigned long off, minoff = 0;
11095
11096 /* Try to match against the Symbol with the smallest offset */
11097 for (i=0; symsTable[i].pointer; i++) {
11098 unsigned long lp = (unsigned long) pointer;
11099
11100 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11101 off=lp-symsTable[i].pointer;
11102 if (ret < 0 || off < minoff) {
11103 minoff=off;
11104 ret=i;
11105 }
11106 }
11107 }
11108 if (ret == -1) return NULL;
11109 *offset = minoff;
11110 return symsTable[ret].name;
11111 }
11112 #else /* HAVE_BACKTRACE */
11113 static void setupSigSegvAction(void) {
11114 }
11115 #endif /* HAVE_BACKTRACE */
11116
11117
11118
11119 /* The End */
11120
11121
11122