]> git.saurik.com Git - redis.git/blob - redis.c
allow regular sets to be passed to zunionstore/zinterstore
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
/* Error codes */
#define REDIS_OK                0
#define REDIS_ERR               -1

/* Static server configuration */
#define REDIS_SERVERPORT        6379    /* TCP port */
#define REDIS_MAXIDLETIME       (60*5)  /* default client timeout */
#define REDIS_IOBUF_LEN         1024
#define REDIS_LOADBUF_LEN       1024
#define REDIS_STATIC_ARGS       8
#define REDIS_DEFAULT_DBNUM     16
#define REDIS_CONFIGLINE_MAX    1024
#define REDIS_OBJFREELIST_MAX   1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME     60      /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    10 /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256

/* Hash table parameters */
#define REDIS_HT_MINFILL        10      /* Minimal hash table fill 10% */

/* Command flags */
#define REDIS_CMD_BULK          1       /* Bulk write command */
#define REDIS_CMD_INLINE        2       /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
   this flag will return an error when the 'maxmemory' option is set in the
   config file and the server is using more than maxmemory bytes of memory.
   In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM       4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
116
/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4

/* Objects encoding. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values. */
#define REDIS_ENCODING_RAW 0    /* Raw representation */
#define REDIS_ENCODING_INT 1    /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3     /* Encoded as a hash table */

/* Human readable encoding names, indexed by the REDIS_ENCODING_* value. */
static char* strencoding[] = {
    "raw", "int", "zipmap", "hashtable"
};

/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255
140
/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8 0        /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1       /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2       /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3         /* string compressed with FASTLZ */
167
/* Virtual memory object->where field. */
#define REDIS_VM_MEMORY 0       /* The object is on memory */
#define REDIS_VM_SWAPPED 1      /* The object is on disk */
#define REDIS_VM_SWAPPING 2     /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3      /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185
/* Client flags */
#define REDIS_SLAVE 1       /* This client is a slave server */
#define REDIS_MASTER 2      /* This client is a master server */
#define REDIS_MONITOR 4     /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8       /* This client is in a MULTI context */
#define REDIS_BLOCKED 16    /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32    /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS 64  /* Watched keys modified. EXEC will fail. */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0       /* No active replication */
#define REDIS_REPL_CONNECT 1    /* Must connect to master */
#define REDIS_REPL_CONNECTED 2  /* Connected to master */

/* Slave replication state - from the point of view of master
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */

/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3
224
/* Anti-warning macro: evaluates V and discards the result. The argument is
 * parenthesized so that any expression can be passed safely. */
#define REDIS_NOTUSED(V) ((void) (V))

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25      /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when the expression is false, calls _redisAssert() with
 * the stringified expression, the file and the line, then _exit(1).
 * redisPanic(_e): calls _redisPanic() with the stringified argument, the file
 * and the line, then _exit(1). The expansion is fully parenthesized so the
 * comma expression cannot be split by the surrounding context. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
245
246 /*================================= Data types ============================== */
247
248 /* A redis object, that is a type able to hold a string / list / set */
249
/* The VM object structure.
 *
 * Only the type is declared here: the original declaration also carried a
 * trailing 'vm' declarator, which defined an unintended global variable. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
256
257 /* The actual Redis Object */
258 typedef struct redisObject {
259 void *ptr;
260 unsigned char type;
261 unsigned char encoding;
262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
266 int refcount;
267 /* VM fields, this are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
272 } robj;
273
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj variable (not a pointer) to initialize, _ptr the value
 * stored in its 'ptr' field. The 'storage' field is set only when
 * server.vm_enabled is true.
 *
 * The expansion deliberately ends without a semicolon: the caller supplies
 * it, so the macro behaves as a single statement (e.g. inside if/else). */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
286 typedef struct redisDb {
287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
290 dict *io_keys; /* Keys with clients waiting for VM I/O */
291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
292 int id;
293 } redisDb;
294
295 /* Client MULTI/EXEC state */
296 typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300 } multiCmd;
301
302 typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305 } multiState;
306
307 /* With multiplexing we need to take per-clinet state.
308 * Clients are taken in a liked list. */
309 typedef struct redisClient {
310 int fd;
311 redisDb *db;
312 int dictid;
313 sds querybuf;
314 robj **argv, **mbargv;
315 int argc, mbargc;
316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
317 int multibulk; /* multi bulk command format active */
318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
326 long repldboff; /* replication DB file offset */
327 off_t repldbsize; /* replication DB file size */
328 multiState mstate; /* MULTI/EXEC state */
329 robj **blocking_keys; /* The key we are waiting to terminate a blocking
330 * operation such as BLPOP. Otherwise NULL. */
331 int blocking_keys_num; /* Number of blocking keys */
332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
339 } redisClient;
340
/* One save point: a number of seconds paired with a number of changes.
 * NOTE(review): presumably "save after 'seconds' if at least 'changes'
 * writes happened" -- confirm against the code that reads saveparams. */
struct saveparam {
    time_t seconds;
    int changes;
};
345
346 /* Global server state structure */
347 struct redisServer {
348 int port;
349 int fd;
350 redisDb *db;
351 long long dirty; /* changes to DB from the last save */
352 list *clients;
353 list *slaves, *monitors;
354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function run */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of last save succeeede */
359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
363 long long stat_expiredkeys; /* number of expired keys */
364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
370 int appendonly;
371 int appendfsync;
372 int shutdown_asap;
373 time_t lastfsync;
374 int appendfd;
375 int appendseldb;
376 char *pidfile;
377 pid_t bgsavechildpid;
378 pid_t bgrewritechildpid;
379 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
380 sds aofbuf; /* AOF buffer, written before entering the event loop */
381 struct saveparam *saveparams;
382 int saveparamslen;
383 char *logfile;
384 char *bindaddr;
385 char *dbfilename;
386 char *appendfilename;
387 char *requirepass;
388 int rdbcompression;
389 int activerehashing;
390 /* Replication related */
391 int isslave;
392 char *masterauth;
393 char *masterhost;
394 int masterport;
395 redisClient *master; /* client that is master for this slave */
396 int replstate;
397 unsigned int maxclients;
398 unsigned long long maxmemory;
399 unsigned int blpop_blocked_clients;
400 unsigned int vm_blocked_clients;
401 /* Sort parameters - qsort_r() is only available under BSD so we
402 * have to take this state global, in order to pass it to sortCompare() */
403 int sort_desc;
404 int sort_alpha;
405 int sort_bypattern;
406 /* Virtual memory configuration */
407 int vm_enabled;
408 char *vm_swap_file;
409 off_t vm_page_size;
410 off_t vm_pages;
411 unsigned long long vm_max_memory;
412 /* Hashes config */
413 size_t hash_max_zipmap_entries;
414 size_t hash_max_zipmap_value;
415 /* Virtual memory state */
416 FILE *vm_fp;
417 int vm_fd;
418 off_t vm_next_page; /* Next probably empty page */
419 off_t vm_near_pages; /* Number of pages allocated sequentially */
420 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
421 time_t unixtime; /* Unix time sampled every second. */
422 /* Virtual memory I/O threads stuff */
423 /* An I/O thread process an element taken from the io_jobs queue and
424 * put the result of the operation in the io_done list. While the
425 * job is being processed, it's put on io_processing queue. */
426 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
427 list *io_processing; /* List of VM I/O jobs being processed */
428 list *io_processed; /* List of VM I/O jobs already processed */
429 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
430 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
431 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
432 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
433 pthread_attr_t io_threads_attr; /* attributes for threads creation */
434 int io_active_threads; /* Number of running I/O threads */
435 int vm_max_threads; /* Max number of I/O threads running at the same time */
436 /* Our main thread is blocked on the event loop, locking for sockets ready
437 * to be read or written, so when a threaded I/O operation is ready to be
438 * processed by the main thread, the I/O thread will use a unix pipe to
439 * awake the main thread. The followings are the two pipe FDs. */
440 int io_ready_pipe_read;
441 int io_ready_pipe_write;
442 /* Virtual memory stats */
443 unsigned long long vm_stats_used_pages;
444 unsigned long long vm_stats_swapped_objects;
445 unsigned long long vm_stats_swapouts;
446 unsigned long long vm_stats_swapins;
447 /* Pubsub */
448 dict *pubsub_channels; /* Map channels to list of subscribed clients */
449 list *pubsub_patterns; /* A list of pubsub_patterns */
450 /* Misc */
451 FILE *devnull;
452 };
453
454 typedef struct pubsubPattern {
455 redisClient *client;
456 robj *pattern;
457 } pubsubPattern;
458
459 typedef void redisCommandProc(redisClient *c);
460 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
461 struct redisCommand {
462 char *name;
463 redisCommandProc *proc;
464 int arity;
465 int flags;
466 /* Use a function to determine which keys need to be loaded
467 * in the background prior to executing this command. Takes precedence
468 * over vm_firstkey and others, ignored when NULL */
469 redisVmPreloadProc *vm_preload_proc;
470 /* What keys should be loaded in background when calling this command? */
471 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
472 int vm_lastkey; /* THe last argument that's a key */
473 int vm_keystep; /* The step between first and last key */
474 };
475
/* A function name paired with an unsigned long 'pointer' value. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
480
481 typedef struct _redisSortObject {
482 robj *obj;
483 union {
484 double score;
485 robj *cmpobj;
486 } u;
487 } redisSortObject;
488
489 typedef struct _redisSortOperation {
490 int type;
491 robj *pattern;
492 } redisSortOperation;
493
494 /* ZSETs use a specialized version of Skiplists */
495
496 typedef struct zskiplistNode {
497 struct zskiplistNode **forward;
498 struct zskiplistNode *backward;
499 unsigned int *span;
500 double score;
501 robj *obj;
502 } zskiplistNode;
503
504 typedef struct zskiplist {
505 struct zskiplistNode *header, *tail;
506 unsigned long length;
507 int level;
508 } zskiplist;
509
510 typedef struct zset {
511 dict *dict;
512 zskiplist *zsl;
513 } zset;
514
515 /* Our shared "common" objects */
516
517 #define REDIS_SHARED_INTEGERS 10000
518 struct sharedObjectsStruct {
519 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
520 *colon, *nullbulk, *nullmultibulk, *queued,
521 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
522 *outofrangeerr, *plus,
523 *select0, *select1, *select2, *select3, *select4,
524 *select5, *select6, *select7, *select8, *select9,
525 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
526 *mbulk4, *psubscribebulk, *punsubscribebulk,
527 *integers[REDIS_SHARED_INTEGERS];
528 } shared;
529
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
535
536 /* VM threaded I/O request message */
537 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
538 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
539 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
540 typedef struct iojob {
541 int type; /* Request type, REDIS_IOJOB_* */
542 redisDb *db;/* Redis database */
543 robj *key; /* This I/O request is about swapping this key */
544 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
545 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
546 off_t page; /* Swap page where to read/write the object */
547 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
548 int canceled; /* True if this command was canceled by blocking side of VM */
549 pthread_t thread; /* ID of the thread processing this entry */
550 } iojob;
551
552 /*================================ Prototypes =============================== */
553
554 static void freeStringObject(robj *o);
555 static void freeListObject(robj *o);
556 static void freeSetObject(robj *o);
557 static void decrRefCount(void *o);
558 static robj *createObject(int type, void *ptr);
559 static void freeClient(redisClient *c);
560 static int rdbLoad(char *filename);
561 static void addReply(redisClient *c, robj *obj);
562 static void addReplySds(redisClient *c, sds s);
563 static void incrRefCount(robj *o);
564 static int rdbSaveBackground(char *filename);
565 static robj *createStringObject(char *ptr, size_t len);
566 static robj *dupStringObject(robj *o);
567 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
568 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
569 static void flushAppendOnlyFile(void);
570 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
571 static int syncWithMaster(void);
572 static robj *tryObjectEncoding(robj *o);
573 static robj *getDecodedObject(robj *o);
574 static int removeExpire(redisDb *db, robj *key);
575 static int expireIfNeeded(redisDb *db, robj *key);
576 static int deleteIfVolatile(redisDb *db, robj *key);
577 static int deleteIfSwapped(redisDb *db, robj *key);
578 static int deleteKey(redisDb *db, robj *key);
579 static time_t getExpire(redisDb *db, robj *key);
580 static int setExpire(redisDb *db, robj *key, time_t when);
581 static void updateSlavesWaitingBgsave(int bgsaveerr);
582 static void freeMemoryIfNeeded(void);
583 static int processCommand(redisClient *c);
584 static void setupSigSegvAction(void);
585 static void rdbRemoveTempFile(pid_t childpid);
586 static void aofRemoveTempFile(pid_t childpid);
587 static size_t stringObjectLen(robj *o);
588 static void processInputBuffer(redisClient *c);
589 static zskiplist *zslCreate(void);
590 static void zslFree(zskiplist *zsl);
591 static void zslInsert(zskiplist *zsl, double score, robj *obj);
592 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
593 static void initClientMultiState(redisClient *c);
594 static void freeClientMultiState(redisClient *c);
595 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
596 static void unblockClientWaitingData(redisClient *c);
597 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
598 static void vmInit(void);
599 static void vmMarkPagesFree(off_t page, off_t count);
600 static robj *vmLoadObject(robj *key);
601 static robj *vmPreviewObject(robj *key);
602 static int vmSwapOneObjectBlocking(void);
603 static int vmSwapOneObjectThreaded(void);
604 static int vmCanSwapOut(void);
605 static int tryFreeOneObjectFromFreelist(void);
606 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
607 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
608 static void vmCancelThreadedIOJob(robj *o);
609 static void lockThreadedIO(void);
610 static void unlockThreadedIO(void);
611 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
612 static void freeIOJob(iojob *j);
613 static void queueIOJob(iojob *j);
614 static int vmWriteObjectOnSwap(robj *o, off_t page);
615 static robj *vmReadObjectFromSwap(off_t page, int type);
616 static void waitEmptyIOJobsQueue(void);
617 static void vmReopenSwapFile(void);
618 static int vmFreePage(off_t page);
619 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
620 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
621 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
622 static int dontWaitForSwappedKey(redisClient *c, robj *key);
623 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
624 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
625 static struct redisCommand *lookupCommand(char *name);
626 static void call(redisClient *c, struct redisCommand *cmd);
627 static void resetClient(redisClient *c);
628 static void convertToRealHash(robj *o);
629 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
630 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
631 static void freePubsubPattern(void *p);
632 static int listMatchPubsubPattern(void *a, void *b);
633 static int compareStringObjects(robj *a, robj *b);
634 static int equalStringObjects(robj *a, robj *b);
635 static void usage();
636 static int rewriteAppendOnlyFileBackground(void);
637 static int vmSwapObjectBlocking(robj *key, robj *val);
638 static int prepareForShutdown();
639 static void touchWatchedKey(redisDb *db, robj *key);
640 static void touchWatchedKeysOnFlush(int dbid);
641 static void unwatchAllKeys(redisClient *c);
642
643 static void authCommand(redisClient *c);
644 static void pingCommand(redisClient *c);
645 static void echoCommand(redisClient *c);
646 static void setCommand(redisClient *c);
647 static void setnxCommand(redisClient *c);
648 static void setexCommand(redisClient *c);
649 static void getCommand(redisClient *c);
650 static void delCommand(redisClient *c);
651 static void existsCommand(redisClient *c);
652 static void incrCommand(redisClient *c);
653 static void decrCommand(redisClient *c);
654 static void incrbyCommand(redisClient *c);
655 static void decrbyCommand(redisClient *c);
656 static void selectCommand(redisClient *c);
657 static void randomkeyCommand(redisClient *c);
658 static void keysCommand(redisClient *c);
659 static void dbsizeCommand(redisClient *c);
660 static void lastsaveCommand(redisClient *c);
661 static void saveCommand(redisClient *c);
662 static void bgsaveCommand(redisClient *c);
663 static void bgrewriteaofCommand(redisClient *c);
664 static void shutdownCommand(redisClient *c);
665 static void moveCommand(redisClient *c);
666 static void renameCommand(redisClient *c);
667 static void renamenxCommand(redisClient *c);
668 static void lpushCommand(redisClient *c);
669 static void rpushCommand(redisClient *c);
670 static void lpopCommand(redisClient *c);
671 static void rpopCommand(redisClient *c);
672 static void llenCommand(redisClient *c);
673 static void lindexCommand(redisClient *c);
674 static void lrangeCommand(redisClient *c);
675 static void ltrimCommand(redisClient *c);
676 static void typeCommand(redisClient *c);
677 static void lsetCommand(redisClient *c);
678 static void saddCommand(redisClient *c);
679 static void sremCommand(redisClient *c);
680 static void smoveCommand(redisClient *c);
681 static void sismemberCommand(redisClient *c);
682 static void scardCommand(redisClient *c);
683 static void spopCommand(redisClient *c);
684 static void srandmemberCommand(redisClient *c);
685 static void sinterCommand(redisClient *c);
686 static void sinterstoreCommand(redisClient *c);
687 static void sunionCommand(redisClient *c);
688 static void sunionstoreCommand(redisClient *c);
689 static void sdiffCommand(redisClient *c);
690 static void sdiffstoreCommand(redisClient *c);
691 static void syncCommand(redisClient *c);
692 static void flushdbCommand(redisClient *c);
693 static void flushallCommand(redisClient *c);
694 static void sortCommand(redisClient *c);
695 static void lremCommand(redisClient *c);
696 static void rpoplpushcommand(redisClient *c);
697 static void infoCommand(redisClient *c);
698 static void mgetCommand(redisClient *c);
699 static void monitorCommand(redisClient *c);
700 static void expireCommand(redisClient *c);
701 static void expireatCommand(redisClient *c);
702 static void getsetCommand(redisClient *c);
703 static void ttlCommand(redisClient *c);
704 static void slaveofCommand(redisClient *c);
705 static void debugCommand(redisClient *c);
706 static void msetCommand(redisClient *c);
707 static void msetnxCommand(redisClient *c);
708 static void zaddCommand(redisClient *c);
709 static void zincrbyCommand(redisClient *c);
710 static void zrangeCommand(redisClient *c);
711 static void zrangebyscoreCommand(redisClient *c);
712 static void zcountCommand(redisClient *c);
713 static void zrevrangeCommand(redisClient *c);
714 static void zcardCommand(redisClient *c);
715 static void zremCommand(redisClient *c);
716 static void zscoreCommand(redisClient *c);
717 static void zremrangebyscoreCommand(redisClient *c);
718 static void multiCommand(redisClient *c);
719 static void execCommand(redisClient *c);
720 static void discardCommand(redisClient *c);
721 static void blpopCommand(redisClient *c);
722 static void brpopCommand(redisClient *c);
723 static void appendCommand(redisClient *c);
724 static void substrCommand(redisClient *c);
725 static void zrankCommand(redisClient *c);
726 static void zrevrankCommand(redisClient *c);
727 static void hsetCommand(redisClient *c);
728 static void hsetnxCommand(redisClient *c);
729 static void hgetCommand(redisClient *c);
730 static void hmsetCommand(redisClient *c);
731 static void hmgetCommand(redisClient *c);
732 static void hdelCommand(redisClient *c);
733 static void hlenCommand(redisClient *c);
734 static void zremrangebyrankCommand(redisClient *c);
735 static void zunionstoreCommand(redisClient *c);
736 static void zinterstoreCommand(redisClient *c);
737 static void hkeysCommand(redisClient *c);
738 static void hvalsCommand(redisClient *c);
739 static void hgetallCommand(redisClient *c);
740 static void hexistsCommand(redisClient *c);
741 static void configCommand(redisClient *c);
742 static void hincrbyCommand(redisClient *c);
743 static void subscribeCommand(redisClient *c);
744 static void unsubscribeCommand(redisClient *c);
745 static void psubscribeCommand(redisClient *c);
746 static void punsubscribeCommand(redisClient *c);
747 static void publishCommand(redisClient *c);
748 static void watchCommand(redisClient *c);
749 static void unwatchCommand(redisClient *c);
750
751 /*================================= Globals ================================= */
752
753 /* Global vars */
754 static struct redisServer server; /* server global state */
755 static struct redisCommand cmdTable[] = {
756 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
758 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
759 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
760 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
761 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
763 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
765 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
766 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
767 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
770 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
776 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
779 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
780 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
781 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
782 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
783 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
788 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
789 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
790 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
791 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
792 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
793 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
797 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
800 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
801 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
808 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
809 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
810 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
811 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
812 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
813 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
814 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
815 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
816 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
821 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
822 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
823 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
824 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
825 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
826 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
838 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
844 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
846 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
848 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
851 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
854 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
856 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
857 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
862 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
864 {NULL,NULL,0,0,NULL,0,0,0}
865 };
866
867 /*============================ Utility functions ============================ */
868
/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first patternLen
 * bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; a leading '^' negates the set, "a-z" is
 *           a range and "\x" takes x literally inside the set
 *   \x      the byte x taken literally
 *
 * A set without the closing ']' is handled as if it was closed by the end of
 * the pattern. When nocase is non zero, literals, set members and ranges are
 * compared after tolower() (escaped set members are compared exactly).
 *
 * Returns 1 on match, 0 otherwise. No byte past patternLen / stringLen is
 * ever read, so the buffers do not need to be nul terminated. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        /* Every construct but '*' consumes exactly one byte of the string:
         * bail out here instead of peeking at string[0] past the end. */
        if (pattern[0] != '*' && stringLen <= 0)
            return 0; /* no match */

        switch (pattern[0]) {
        case '*':
            /* Collapse runs of stars, never looking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match: a trailing star takes what is left */
            while (stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match = 0;

            pattern++;
            patternLen--;
            negate = patternLen > 0 && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back one byte so that the
                     * common advance below stops exactly at the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    /* A backslash escapes only if something follows it,
                     * otherwise it is an ordinary set member. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String exhausted: only trailing stars may still match. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
991
/* Glob-style match of two nul terminated strings: measures both with
 * strlen() and delegates to stringmatchlen(). Returns its result. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    return stringmatchlen(pattern, strlen(pattern),
                          string, strlen(string),
                          nocase);
}
995
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted format is an optional '-', a run of decimal digits and an
 * optional unit matched case insensitively: "b" (1), "k" (1000), "kb" (1024),
 * "m" (1000*1000), "mb" (1024*1024), "g" (1000^3), "gb" (1024^3).
 *
 * If err is not NULL, *err is set to 0 on success and to 1 on error:
 *  - unknown unit: the numeric part is returned as if no unit was given;
 *  - numeric part of 128 characters or more: LLONG_MAX is returned;
 *  - value not representable as long long: the result saturates to
 *    LLONG_MAX or LLONG_MIN instead of overflowing. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is needed because
     * isdigit() is only defined for unsigned char values (and EOF). */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';

    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already saturated the value for us. */
        if (err) *err = 1;
        return val;
    }
    /* mul is always positive: check the product before computing it, as
     * signed overflow is undefined behavior. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1042
/* Convert a long long into a string.
 *
 * At most len-1 characters are stored into 's', followed by a nul term: if
 * the buffer is too small the textual representation is truncated on the
 * right. Returns the number of characters stored (nul term excluded), or 0
 * without touching 's' when len is 0. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value using unsigned arithmetic: -value would
     * overflow when value is LLONG_MIN. */
    v = (value < 0) ? 0ULL - (unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1066
1067 static void redisLog(int level, const char *fmt, ...) {
1068 va_list ap;
1069 FILE *fp;
1070
1071 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1072 if (!fp) return;
1073
1074 va_start(ap, fmt);
1075 if (level >= server.verbosity) {
1076 char *c = ".-*#";
1077 char buf[64];
1078 time_t now;
1079
1080 now = time(NULL);
1081 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1082 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1083 vfprintf(fp, fmt, ap);
1084 fprintf(fp,"\n");
1085 fflush(fp);
1086 }
1087 va_end(ap);
1088
1089 if (server.logfile) fclose(fp);
1090 }
1091
1092 /*====================== Hash table type implementation ==================== */
1093
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1097
/* Value destructor releasing the value with a plain zfree() call.
 * The privdata argument is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1103
1104 static void dictListDestructor(void *privdata, void *val)
1105 {
1106 DICT_NOTUSED(privdata);
1107 listRelease((list*)val);
1108 }
1109
1110 static int sdsDictKeyCompare(void *privdata, const void *key1,
1111 const void *key2)
1112 {
1113 int l1,l2;
1114 DICT_NOTUSED(privdata);
1115
1116 l1 = sdslen((sds)key1);
1117 l2 = sdslen((sds)key2);
1118 if (l1 != l2) return 0;
1119 return memcmp(key1, key2, l1) == 0;
1120 }
1121
/* Destructor for redis objects stored in a dictionary: drops one reference
 * with decrRefCount(). NULL is accepted and ignored, as the values of
 * swapped out keys are set to NULL. privdata is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1129
1130 static int dictObjKeyCompare(void *privdata, const void *key1,
1131 const void *key2)
1132 {
1133 const robj *o1 = key1, *o2 = key2;
1134 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1135 }
1136
1137 static unsigned int dictObjHash(const void *key) {
1138 const robj *o = key;
1139 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1140 }
1141
1142 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1143 const void *key2)
1144 {
1145 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1146 int cmp;
1147
1148 if (o1->encoding == REDIS_ENCODING_INT &&
1149 o2->encoding == REDIS_ENCODING_INT)
1150 return o1->ptr == o2->ptr;
1151
1152 o1 = getDecodedObject(o1);
1153 o2 = getDecodedObject(o2);
1154 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1155 decrRefCount(o1);
1156 decrRefCount(o2);
1157 return cmp;
1158 }
1159
1160 static unsigned int dictEncObjHash(const void *key) {
1161 robj *o = (robj*) key;
1162
1163 if (o->encoding == REDIS_ENCODING_RAW) {
1164 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1165 } else {
1166 if (o->encoding == REDIS_ENCODING_INT) {
1167 char buf[32];
1168 int len;
1169
1170 len = ll2string(buf,32,(long)o->ptr);
1171 return dictGenHashFunction((unsigned char*)buf, len);
1172 } else {
1173 unsigned int hash;
1174
1175 o = getDecodedObject(o);
1176 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1177 decrRefCount(o);
1178 return hash;
1179 }
1180 }
1181 }
1182
1183 /* Sets type and expires */
1184 static dictType setDictType = {
1185 dictEncObjHash, /* hash function */
1186 NULL, /* key dup */
1187 NULL, /* val dup */
1188 dictEncObjKeyCompare, /* key compare */
1189 dictRedisObjectDestructor, /* key destructor */
1190 NULL /* val destructor */
1191 };
1192
1193 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1194 static dictType zsetDictType = {
1195 dictEncObjHash, /* hash function */
1196 NULL, /* key dup */
1197 NULL, /* val dup */
1198 dictEncObjKeyCompare, /* key compare */
1199 dictRedisObjectDestructor, /* key destructor */
1200 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1201 };
1202
1203 /* Db->dict */
1204 static dictType dbDictType = {
1205 dictObjHash, /* hash function */
1206 NULL, /* key dup */
1207 NULL, /* val dup */
1208 dictObjKeyCompare, /* key compare */
1209 dictRedisObjectDestructor, /* key destructor */
1210 dictRedisObjectDestructor /* val destructor */
1211 };
1212
1213 /* Db->expires */
1214 static dictType keyptrDictType = {
1215 dictObjHash, /* hash function */
1216 NULL, /* key dup */
1217 NULL, /* val dup */
1218 dictObjKeyCompare, /* key compare */
1219 dictRedisObjectDestructor, /* key destructor */
1220 NULL /* val destructor */
1221 };
1222
1223 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1224 static dictType hashDictType = {
1225 dictEncObjHash, /* hash function */
1226 NULL, /* key dup */
1227 NULL, /* val dup */
1228 dictEncObjKeyCompare, /* key compare */
1229 dictRedisObjectDestructor, /* key destructor */
1230 dictRedisObjectDestructor /* val destructor */
1231 };
1232
1233 /* Keylist hash table type has unencoded redis objects as keys and
1234 * lists as values. It's used for blocking operations (BLPOP) and to
1235 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1236 static dictType keylistDictType = {
1237 dictObjHash, /* hash function */
1238 NULL, /* key dup */
1239 NULL, /* val dup */
1240 dictObjKeyCompare, /* key compare */
1241 dictRedisObjectDestructor, /* key destructor */
1242 dictListDestructor /* val destructor */
1243 };
1244
1245 static void version();
1246
1247 /* ========================= Random utility functions ======================= */
1248
1249 /* Redis generally does not try to recover from out of memory conditions
1250 * when allocating objects or strings, it is not clear if it will be possible
1251 * to report this condition to the client since the networking layer itself
1252 * is based on heap allocation for send buffers, so we simply abort.
1253 * At least the code will be simpler to read... */
1254 static void oom(const char *msg) {
1255 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1256 sleep(1);
1257 abort();
1258 }
1259
1260 /* ====================== Redis server networking stuff ===================== */
1261 static void closeTimedoutClients(void) {
1262 redisClient *c;
1263 listNode *ln;
1264 time_t now = time(NULL);
1265 listIter li;
1266
1267 listRewind(server.clients,&li);
1268 while ((ln = listNext(&li)) != NULL) {
1269 c = listNodeValue(ln);
1270 if (server.maxidletime &&
1271 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1272 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1273 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1274 listLength(c->pubsub_patterns) == 0 &&
1275 (now - c->lastinteraction > server.maxidletime))
1276 {
1277 redisLog(REDIS_VERBOSE,"Closing idle client");
1278 freeClient(c);
1279 } else if (c->flags & REDIS_BLOCKED) {
1280 if (c->blockingto != 0 && c->blockingto < now) {
1281 addReply(c,shared.nullmultibulk);
1282 unblockClientWaitingData(c);
1283 }
1284 }
1285 }
1286 }
1287
1288 static int htNeedsResize(dict *dict) {
1289 long long size, used;
1290
1291 size = dictSlots(dict);
1292 used = dictSize(dict);
1293 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1294 (used*100/size < REDIS_HT_MINFILL));
1295 }
1296
1297 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1298 * we resize the hash table to save memory */
1299 static void tryResizeHashTables(void) {
1300 int j;
1301
1302 for (j = 0; j < server.dbnum; j++) {
1303 if (htNeedsResize(server.db[j].dict))
1304 dictResize(server.db[j].dict);
1305 if (htNeedsResize(server.db[j].expires))
1306 dictResize(server.db[j].expires);
1307 }
1308 }
1309
1310 /* Our hash table implementation performs rehashing incrementally while
1311 * we write/read from the hash table. Still if the server is idle, the hash
1312 * table will use two tables for a long time. So we try to use 1 millisecond
1313 * of CPU time at every serverCron() loop in order to rehash some key. */
1314 static void incrementallyRehash(void) {
1315 int j;
1316
1317 for (j = 0; j < server.dbnum; j++) {
1318 if (dictIsRehashing(server.db[j].dict)) {
1319 dictRehashMilliseconds(server.db[j].dict,1);
1320 break; /* already used our millisecond for this loop... */
1321 }
1322 }
1323 }
1324
1325 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1326 void backgroundSaveDoneHandler(int statloc) {
1327 int exitcode = WEXITSTATUS(statloc);
1328 int bysignal = WIFSIGNALED(statloc);
1329
1330 if (!bysignal && exitcode == 0) {
1331 redisLog(REDIS_NOTICE,
1332 "Background saving terminated with success");
1333 server.dirty = 0;
1334 server.lastsave = time(NULL);
1335 } else if (!bysignal && exitcode != 0) {
1336 redisLog(REDIS_WARNING, "Background saving error");
1337 } else {
1338 redisLog(REDIS_WARNING,
1339 "Background saving terminated by signal %d", WTERMSIG(statloc));
1340 rdbRemoveTempFile(server.bgsavechildpid);
1341 }
1342 server.bgsavechildpid = -1;
1343 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1344 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1345 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1346 }
1347
1348 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1349 * Handle this. */
1350 void backgroundRewriteDoneHandler(int statloc) {
1351 int exitcode = WEXITSTATUS(statloc);
1352 int bysignal = WIFSIGNALED(statloc);
1353
1354 if (!bysignal && exitcode == 0) {
1355 int fd;
1356 char tmpfile[256];
1357
1358 redisLog(REDIS_NOTICE,
1359 "Background append only file rewriting terminated with success");
1360 /* Now it's time to flush the differences accumulated by the parent */
1361 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1362 fd = open(tmpfile,O_WRONLY|O_APPEND);
1363 if (fd == -1) {
1364 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1365 goto cleanup;
1366 }
1367 /* Flush our data... */
1368 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1369 (signed) sdslen(server.bgrewritebuf)) {
1370 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1371 close(fd);
1372 goto cleanup;
1373 }
1374 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1375 /* Now our work is to rename the temp file into the stable file. And
1376 * switch the file descriptor used by the server for append only. */
1377 if (rename(tmpfile,server.appendfilename) == -1) {
1378 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1379 close(fd);
1380 goto cleanup;
1381 }
1382 /* Mission completed... almost */
1383 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1384 if (server.appendfd != -1) {
1385 /* If append only is actually enabled... */
1386 close(server.appendfd);
1387 server.appendfd = fd;
1388 fsync(fd);
1389 server.appendseldb = -1; /* Make sure it will issue SELECT */
1390 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1391 } else {
1392 /* If append only is disabled we just generate a dump in this
1393 * format. Why not? */
1394 close(fd);
1395 }
1396 } else if (!bysignal && exitcode != 0) {
1397 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1398 } else {
1399 redisLog(REDIS_WARNING,
1400 "Background append only file rewriting terminated by signal %d",
1401 WTERMSIG(statloc));
1402 }
1403 cleanup:
1404 sdsfree(server.bgrewritebuf);
1405 server.bgrewritebuf = sdsempty();
1406 aofRemoveTempFile(server.bgrewritechildpid);
1407 server.bgrewritechildpid = -1;
1408 }
1409
1410 /* This function is called once a background process of some kind terminates,
1411 * as we want to avoid resizing the hash tables when there is a child in order
1412 * to play well with copy-on-write (otherwise when a resize happens lots of
1413 * memory pages are copied). The goal of this function is to update the ability
1414 * for dict.c to resize the hash tables accordingly to the fact we have o not
1415 * running childs. */
1416 static void updateDictResizePolicy(void) {
1417 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1418 dictEnableResize();
1419 else
1420 dictDisableResize();
1421 }
1422
1423 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1424 int j, loops = server.cronloops++;
1425 REDIS_NOTUSED(eventLoop);
1426 REDIS_NOTUSED(id);
1427 REDIS_NOTUSED(clientData);
1428
1429 /* We take a cached value of the unix time in the global state because
1430 * with virtual memory and aging there is to store the current time
1431 * in objects at every object access, and accuracy is not needed.
1432 * To access a global var is faster than calling time(NULL) */
1433 server.unixtime = time(NULL);
1434
1435 /* We received a SIGTERM, shutting down here in a safe way, as it is
1436 * not ok doing so inside the signal handler. */
1437 if (server.shutdown_asap) {
1438 if (prepareForShutdown() == REDIS_OK) exit(0);
1439 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1440 }
1441
1442 /* Show some info about non-empty databases */
1443 for (j = 0; j < server.dbnum; j++) {
1444 long long size, used, vkeys;
1445
1446 size = dictSlots(server.db[j].dict);
1447 used = dictSize(server.db[j].dict);
1448 vkeys = dictSize(server.db[j].expires);
1449 if (!(loops % 50) && (used || vkeys)) {
1450 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1451 /* dictPrintStats(server.dict); */
1452 }
1453 }
1454
1455 /* We don't want to resize the hash tables while a bacground saving
1456 * is in progress: the saving child is created using fork() that is
1457 * implemented with a copy-on-write semantic in most modern systems, so
1458 * if we resize the HT while there is the saving child at work actually
1459 * a lot of memory movements in the parent will cause a lot of pages
1460 * copied. */
1461 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1462 if (!(loops % 10)) tryResizeHashTables();
1463 if (server.activerehashing) incrementallyRehash();
1464 }
1465
1466 /* Show information about connected clients */
1467 if (!(loops % 50)) {
1468 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1469 listLength(server.clients)-listLength(server.slaves),
1470 listLength(server.slaves),
1471 zmalloc_used_memory());
1472 }
1473
1474 /* Close connections of timedout clients */
1475 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1476 closeTimedoutClients();
1477
1478 /* Check if a background saving or AOF rewrite in progress terminated */
1479 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1480 int statloc;
1481 pid_t pid;
1482
1483 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1484 if (pid == server.bgsavechildpid) {
1485 backgroundSaveDoneHandler(statloc);
1486 } else {
1487 backgroundRewriteDoneHandler(statloc);
1488 }
1489 updateDictResizePolicy();
1490 }
1491 } else {
1492 /* If there is not a background saving in progress check if
1493 * we have to save now */
1494 time_t now = time(NULL);
1495 for (j = 0; j < server.saveparamslen; j++) {
1496 struct saveparam *sp = server.saveparams+j;
1497
1498 if (server.dirty >= sp->changes &&
1499 now-server.lastsave > sp->seconds) {
1500 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1501 sp->changes, sp->seconds);
1502 rdbSaveBackground(server.dbfilename);
1503 break;
1504 }
1505 }
1506 }
1507
1508 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1509 * will use few CPU cycles if there are few expiring keys, otherwise
1510 * it will get more aggressive to avoid that too much memory is used by
1511 * keys that can be removed from the keyspace. */
1512 for (j = 0; j < server.dbnum; j++) {
1513 int expired;
1514 redisDb *db = server.db+j;
1515
1516 /* Continue to expire if at the end of the cycle more than 25%
1517 * of the keys were expired. */
1518 do {
1519 long num = dictSize(db->expires);
1520 time_t now = time(NULL);
1521
1522 expired = 0;
1523 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1524 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1525 while (num--) {
1526 dictEntry *de;
1527 time_t t;
1528
1529 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1530 t = (time_t) dictGetEntryVal(de);
1531 if (now > t) {
1532 deleteKey(db,dictGetEntryKey(de));
1533 expired++;
1534 server.stat_expiredkeys++;
1535 }
1536 }
1537 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1538 }
1539
1540 /* Swap a few keys on disk if we are over the memory limit and VM
1541 * is enbled. Try to free objects from the free list first. */
1542 if (vmCanSwapOut()) {
1543 while (server.vm_enabled && zmalloc_used_memory() >
1544 server.vm_max_memory)
1545 {
1546 int retval;
1547
1548 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1549 retval = (server.vm_max_threads == 0) ?
1550 vmSwapOneObjectBlocking() :
1551 vmSwapOneObjectThreaded();
1552 if (retval == REDIS_ERR && !(loops % 300) &&
1553 zmalloc_used_memory() >
1554 (server.vm_max_memory+server.vm_max_memory/10))
1555 {
1556 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1557 }
1558 /* Note that when using threade I/O we free just one object,
1559 * because anyway when the I/O thread in charge to swap this
1560 * object out will finish, the handler of completed jobs
1561 * will try to swap more objects if we are still out of memory. */
1562 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1563 }
1564 }
1565
1566 /* Check if we should connect to a MASTER */
1567 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1568 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1569 if (syncWithMaster() == REDIS_OK) {
1570 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1571 if (server.appendonly) rewriteAppendOnlyFileBackground();
1572 }
1573 }
1574 return 100;
1575 }
1576
1577 /* This function gets called every time Redis is entering the
1578 * main loop of the event driven library, that is, before to sleep
1579 * for ready file descriptors. */
1580 static void beforeSleep(struct aeEventLoop *eventLoop) {
1581 REDIS_NOTUSED(eventLoop);
1582
1583 /* Awake clients that got all the swapped keys they requested */
1584 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1585 listIter li;
1586 listNode *ln;
1587
1588 listRewind(server.io_ready_clients,&li);
1589 while((ln = listNext(&li))) {
1590 redisClient *c = ln->value;
1591 struct redisCommand *cmd;
1592
1593 /* Resume the client. */
1594 listDelNode(server.io_ready_clients,ln);
1595 c->flags &= (~REDIS_IO_WAIT);
1596 server.vm_blocked_clients--;
1597 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1598 readQueryFromClient, c);
1599 cmd = lookupCommand(c->argv[0]->ptr);
1600 assert(cmd != NULL);
1601 call(c,cmd);
1602 resetClient(c);
1603 /* There may be more data to process in the input buffer. */
1604 if (c->querybuf && sdslen(c->querybuf) > 0)
1605 processInputBuffer(c);
1606 }
1607 }
1608 /* Write the AOF buffer on disk */
1609 flushAppendOnlyFile();
1610 }
1611
1612 static void createSharedObjects(void) {
1613 int j;
1614
1615 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1616 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1617 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1618 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1619 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1620 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1621 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1622 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1623 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1624 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1625 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1626 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1627 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1628 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1629 "-ERR no such key\r\n"));
1630 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1631 "-ERR syntax error\r\n"));
1632 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1633 "-ERR source and destination objects are the same\r\n"));
1634 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1635 "-ERR index out of range\r\n"));
1636 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1637 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1638 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1639 shared.select0 = createStringObject("select 0\r\n",10);
1640 shared.select1 = createStringObject("select 1\r\n",10);
1641 shared.select2 = createStringObject("select 2\r\n",10);
1642 shared.select3 = createStringObject("select 3\r\n",10);
1643 shared.select4 = createStringObject("select 4\r\n",10);
1644 shared.select5 = createStringObject("select 5\r\n",10);
1645 shared.select6 = createStringObject("select 6\r\n",10);
1646 shared.select7 = createStringObject("select 7\r\n",10);
1647 shared.select8 = createStringObject("select 8\r\n",10);
1648 shared.select9 = createStringObject("select 9\r\n",10);
1649 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1650 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1651 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1652 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1653 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1654 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1655 shared.mbulk3 = createStringObject("*3\r\n",4);
1656 shared.mbulk4 = createStringObject("*4\r\n",4);
1657 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1658 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1659 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1660 }
1661 }
1662
1663 static void appendServerSaveParams(time_t seconds, int changes) {
1664 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1665 server.saveparams[server.saveparamslen].seconds = seconds;
1666 server.saveparams[server.saveparamslen].changes = changes;
1667 server.saveparamslen++;
1668 }
1669
1670 static void resetServerSaveParams() {
1671 zfree(server.saveparams);
1672 server.saveparams = NULL;
1673 server.saveparamslen = 0;
1674 }
1675
1676 static void initServerConfig() {
1677 server.dbnum = REDIS_DEFAULT_DBNUM;
1678 server.port = REDIS_SERVERPORT;
1679 server.verbosity = REDIS_VERBOSE;
1680 server.maxidletime = REDIS_MAXIDLETIME;
1681 server.saveparams = NULL;
1682 server.logfile = NULL; /* NULL = log on standard output */
1683 server.bindaddr = NULL;
1684 server.glueoutputbuf = 1;
1685 server.daemonize = 0;
1686 server.appendonly = 0;
1687 server.appendfsync = APPENDFSYNC_EVERYSEC;
1688 server.lastfsync = time(NULL);
1689 server.appendfd = -1;
1690 server.appendseldb = -1; /* Make sure the first time will not match */
1691 server.pidfile = zstrdup("/var/run/redis.pid");
1692 server.dbfilename = zstrdup("dump.rdb");
1693 server.appendfilename = zstrdup("appendonly.aof");
1694 server.requirepass = NULL;
1695 server.rdbcompression = 1;
1696 server.activerehashing = 1;
1697 server.maxclients = 0;
1698 server.blpop_blocked_clients = 0;
1699 server.maxmemory = 0;
1700 server.vm_enabled = 0;
1701 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1702 server.vm_page_size = 256; /* 256 bytes per page */
1703 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1704 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1705 server.vm_max_threads = 4;
1706 server.vm_blocked_clients = 0;
1707 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1708 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1709 server.shutdown_asap = 0;
1710
1711 resetServerSaveParams();
1712
1713 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1714 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1715 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1716 /* Replication related */
1717 server.isslave = 0;
1718 server.masterauth = NULL;
1719 server.masterhost = NULL;
1720 server.masterport = 6379;
1721 server.master = NULL;
1722 server.replstate = REDIS_REPL_NONE;
1723
1724 /* Double constants initialization */
1725 R_Zero = 0.0;
1726 R_PosInf = 1.0/R_Zero;
1727 R_NegInf = -1.0/R_Zero;
1728 R_Nan = R_Zero/R_Zero;
1729 }
1730
1731 static void initServer() {
1732 int j;
1733
1734 signal(SIGHUP, SIG_IGN);
1735 signal(SIGPIPE, SIG_IGN);
1736 setupSigSegvAction();
1737
1738 server.devnull = fopen("/dev/null","w");
1739 if (server.devnull == NULL) {
1740 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1741 exit(1);
1742 }
1743 server.clients = listCreate();
1744 server.slaves = listCreate();
1745 server.monitors = listCreate();
1746 server.objfreelist = listCreate();
1747 createSharedObjects();
1748 server.el = aeCreateEventLoop();
1749 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1750 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1751 if (server.fd == -1) {
1752 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1753 exit(1);
1754 }
1755 for (j = 0; j < server.dbnum; j++) {
1756 server.db[j].dict = dictCreate(&dbDictType,NULL);
1757 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1758 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1759 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1760 if (server.vm_enabled)
1761 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1762 server.db[j].id = j;
1763 }
1764 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1765 server.pubsub_patterns = listCreate();
1766 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1767 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1768 server.cronloops = 0;
1769 server.bgsavechildpid = -1;
1770 server.bgrewritechildpid = -1;
1771 server.bgrewritebuf = sdsempty();
1772 server.aofbuf = sdsempty();
1773 server.lastsave = time(NULL);
1774 server.dirty = 0;
1775 server.stat_numcommands = 0;
1776 server.stat_numconnections = 0;
1777 server.stat_expiredkeys = 0;
1778 server.stat_starttime = time(NULL);
1779 server.unixtime = time(NULL);
1780 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1781 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1782 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1783
1784 if (server.appendonly) {
1785 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1786 if (server.appendfd == -1) {
1787 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1788 strerror(errno));
1789 exit(1);
1790 }
1791 }
1792
1793 if (server.vm_enabled) vmInit();
1794 }
1795
1796 /* Empty the whole database */
1797 static long long emptyDb() {
1798 int j;
1799 long long removed = 0;
1800
1801 for (j = 0; j < server.dbnum; j++) {
1802 removed += dictSize(server.db[j].dict);
1803 dictEmpty(server.db[j].dict);
1804 dictEmpty(server.db[j].expires);
1805 }
1806 return removed;
1807 }
1808
/* Convert a case insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1814
1815 /* I agree, this is a very rudimental way to load a configuration...
1816 will improve later if the config gets more complex */
1817 static void loadServerConfig(char *filename) {
1818 FILE *fp;
1819 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1820 int linenum = 0;
1821 sds line = NULL;
1822
1823 if (filename[0] == '-' && filename[1] == '\0')
1824 fp = stdin;
1825 else {
1826 if ((fp = fopen(filename,"r")) == NULL) {
1827 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1828 exit(1);
1829 }
1830 }
1831
1832 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1833 sds *argv;
1834 int argc, j;
1835
1836 linenum++;
1837 line = sdsnew(buf);
1838 line = sdstrim(line," \t\r\n");
1839
1840 /* Skip comments and blank lines*/
1841 if (line[0] == '#' || line[0] == '\0') {
1842 sdsfree(line);
1843 continue;
1844 }
1845
1846 /* Split into arguments */
1847 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1848 sdstolower(argv[0]);
1849
1850 /* Execute config directives */
1851 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1852 server.maxidletime = atoi(argv[1]);
1853 if (server.maxidletime < 0) {
1854 err = "Invalid timeout value"; goto loaderr;
1855 }
1856 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1857 server.port = atoi(argv[1]);
1858 if (server.port < 1 || server.port > 65535) {
1859 err = "Invalid port"; goto loaderr;
1860 }
1861 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1862 server.bindaddr = zstrdup(argv[1]);
1863 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1864 int seconds = atoi(argv[1]);
1865 int changes = atoi(argv[2]);
1866 if (seconds < 1 || changes < 0) {
1867 err = "Invalid save parameters"; goto loaderr;
1868 }
1869 appendServerSaveParams(seconds,changes);
1870 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1871 if (chdir(argv[1]) == -1) {
1872 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1873 argv[1], strerror(errno));
1874 exit(1);
1875 }
1876 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1877 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1878 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1879 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1880 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1881 else {
1882 err = "Invalid log level. Must be one of debug, notice, warning";
1883 goto loaderr;
1884 }
1885 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1886 FILE *logfp;
1887
1888 server.logfile = zstrdup(argv[1]);
1889 if (!strcasecmp(server.logfile,"stdout")) {
1890 zfree(server.logfile);
1891 server.logfile = NULL;
1892 }
1893 if (server.logfile) {
1894 /* Test if we are able to open the file. The server will not
1895 * be able to abort just for this problem later... */
1896 logfp = fopen(server.logfile,"a");
1897 if (logfp == NULL) {
1898 err = sdscatprintf(sdsempty(),
1899 "Can't open the log file: %s", strerror(errno));
1900 goto loaderr;
1901 }
1902 fclose(logfp);
1903 }
1904 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1905 server.dbnum = atoi(argv[1]);
1906 if (server.dbnum < 1) {
1907 err = "Invalid number of databases"; goto loaderr;
1908 }
1909 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1910 loadServerConfig(argv[1]);
1911 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1912 server.maxclients = atoi(argv[1]);
1913 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1914 server.maxmemory = memtoll(argv[1],NULL);
1915 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1916 server.masterhost = sdsnew(argv[1]);
1917 server.masterport = atoi(argv[2]);
1918 server.replstate = REDIS_REPL_CONNECT;
1919 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1920 server.masterauth = zstrdup(argv[1]);
1921 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1922 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1923 err = "argument must be 'yes' or 'no'"; goto loaderr;
1924 }
1925 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1926 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1927 err = "argument must be 'yes' or 'no'"; goto loaderr;
1928 }
1929 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1930 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1931 err = "argument must be 'yes' or 'no'"; goto loaderr;
1932 }
1933 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1934 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1935 err = "argument must be 'yes' or 'no'"; goto loaderr;
1936 }
1937 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1938 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1939 err = "argument must be 'yes' or 'no'"; goto loaderr;
1940 }
1941 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1942 zfree(server.appendfilename);
1943 server.appendfilename = zstrdup(argv[1]);
1944 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1945 if (!strcasecmp(argv[1],"no")) {
1946 server.appendfsync = APPENDFSYNC_NO;
1947 } else if (!strcasecmp(argv[1],"always")) {
1948 server.appendfsync = APPENDFSYNC_ALWAYS;
1949 } else if (!strcasecmp(argv[1],"everysec")) {
1950 server.appendfsync = APPENDFSYNC_EVERYSEC;
1951 } else {
1952 err = "argument must be 'no', 'always' or 'everysec'";
1953 goto loaderr;
1954 }
1955 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1956 server.requirepass = zstrdup(argv[1]);
1957 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1958 zfree(server.pidfile);
1959 server.pidfile = zstrdup(argv[1]);
1960 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1961 zfree(server.dbfilename);
1962 server.dbfilename = zstrdup(argv[1]);
1963 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1964 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1965 err = "argument must be 'yes' or 'no'"; goto loaderr;
1966 }
1967 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1968 zfree(server.vm_swap_file);
1969 server.vm_swap_file = zstrdup(argv[1]);
1970 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1971 server.vm_max_memory = memtoll(argv[1],NULL);
1972 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1973 server.vm_page_size = memtoll(argv[1], NULL);
1974 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1975 server.vm_pages = memtoll(argv[1], NULL);
1976 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1977 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1978 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1979 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1980 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1981 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1982 } else {
1983 err = "Bad directive or wrong number of arguments"; goto loaderr;
1984 }
1985 for (j = 0; j < argc; j++)
1986 sdsfree(argv[j]);
1987 zfree(argv);
1988 sdsfree(line);
1989 }
1990 if (fp != stdin) fclose(fp);
1991 return;
1992
1993 loaderr:
1994 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1995 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1996 fprintf(stderr, ">>> '%s'\n", line);
1997 fprintf(stderr, "%s\n", err);
1998 exit(1);
1999 }
2000
2001 static void freeClientArgv(redisClient *c) {
2002 int j;
2003
2004 for (j = 0; j < c->argc; j++)
2005 decrRefCount(c->argv[j]);
2006 for (j = 0; j < c->mbargc; j++)
2007 decrRefCount(c->mbargv[j]);
2008 c->argc = 0;
2009 c->mbargc = 0;
2010 }
2011
2012 static void freeClient(redisClient *c) {
2013 listNode *ln;
2014
2015 /* Note that if the client we are freeing is blocked into a blocking
2016 * call, we have to set querybuf to NULL *before* to call
2017 * unblockClientWaitingData() to avoid processInputBuffer() will get
2018 * called. Also it is important to remove the file events after
2019 * this, because this call adds the READABLE event. */
2020 sdsfree(c->querybuf);
2021 c->querybuf = NULL;
2022 if (c->flags & REDIS_BLOCKED)
2023 unblockClientWaitingData(c);
2024
2025 /* UNWATCH all the keys */
2026 unwatchAllKeys(c);
2027 listRelease(c->watched_keys);
2028 /* Unsubscribe from all the pubsub channels */
2029 pubsubUnsubscribeAllChannels(c,0);
2030 pubsubUnsubscribeAllPatterns(c,0);
2031 dictRelease(c->pubsub_channels);
2032 listRelease(c->pubsub_patterns);
2033 /* Obvious cleanup */
2034 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2035 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2036 listRelease(c->reply);
2037 freeClientArgv(c);
2038 close(c->fd);
2039 /* Remove from the list of clients */
2040 ln = listSearchKey(server.clients,c);
2041 redisAssert(ln != NULL);
2042 listDelNode(server.clients,ln);
2043 /* Remove from the list of clients that are now ready to be restarted
2044 * after waiting for swapped keys */
2045 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2046 ln = listSearchKey(server.io_ready_clients,c);
2047 if (ln) {
2048 listDelNode(server.io_ready_clients,ln);
2049 server.vm_blocked_clients--;
2050 }
2051 }
2052 /* Remove from the list of clients waiting for swapped keys */
2053 while (server.vm_enabled && listLength(c->io_keys)) {
2054 ln = listFirst(c->io_keys);
2055 dontWaitForSwappedKey(c,ln->value);
2056 }
2057 listRelease(c->io_keys);
2058 /* Master/slave cleanup */
2059 if (c->flags & REDIS_SLAVE) {
2060 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2061 close(c->repldbfd);
2062 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2063 ln = listSearchKey(l,c);
2064 redisAssert(ln != NULL);
2065 listDelNode(l,ln);
2066 }
2067 if (c->flags & REDIS_MASTER) {
2068 server.master = NULL;
2069 server.replstate = REDIS_REPL_CONNECT;
2070 }
2071 /* Release memory */
2072 zfree(c->argv);
2073 zfree(c->mbargv);
2074 freeClientMultiState(c);
2075 zfree(c);
2076 }
2077
2078 #define GLUEREPLY_UP_TO (1024)
2079 static void glueReplyBuffersIfNeeded(redisClient *c) {
2080 int copylen = 0;
2081 char buf[GLUEREPLY_UP_TO];
2082 listNode *ln;
2083 listIter li;
2084 robj *o;
2085
2086 listRewind(c->reply,&li);
2087 while((ln = listNext(&li))) {
2088 int objlen;
2089
2090 o = ln->value;
2091 objlen = sdslen(o->ptr);
2092 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2093 memcpy(buf+copylen,o->ptr,objlen);
2094 copylen += objlen;
2095 listDelNode(c->reply,ln);
2096 } else {
2097 if (copylen == 0) return;
2098 break;
2099 }
2100 }
2101 /* Now the output buffer is empty, add the new single element */
2102 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2103 listAddNodeHead(c->reply,o);
2104 }
2105
2106 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2107 redisClient *c = privdata;
2108 int nwritten = 0, totwritten = 0, objlen;
2109 robj *o;
2110 REDIS_NOTUSED(el);
2111 REDIS_NOTUSED(mask);
2112
2113 /* Use writev() if we have enough buffers to send */
2114 if (!server.glueoutputbuf &&
2115 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2116 !(c->flags & REDIS_MASTER))
2117 {
2118 sendReplyToClientWritev(el, fd, privdata, mask);
2119 return;
2120 }
2121
2122 while(listLength(c->reply)) {
2123 if (server.glueoutputbuf && listLength(c->reply) > 1)
2124 glueReplyBuffersIfNeeded(c);
2125
2126 o = listNodeValue(listFirst(c->reply));
2127 objlen = sdslen(o->ptr);
2128
2129 if (objlen == 0) {
2130 listDelNode(c->reply,listFirst(c->reply));
2131 continue;
2132 }
2133
2134 if (c->flags & REDIS_MASTER) {
2135 /* Don't reply to a master */
2136 nwritten = objlen - c->sentlen;
2137 } else {
2138 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2139 if (nwritten <= 0) break;
2140 }
2141 c->sentlen += nwritten;
2142 totwritten += nwritten;
2143 /* If we fully sent the object on head go to the next one */
2144 if (c->sentlen == objlen) {
2145 listDelNode(c->reply,listFirst(c->reply));
2146 c->sentlen = 0;
2147 }
2148 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2149 * bytes, in a single threaded server it's a good idea to serve
2150 * other clients as well, even if a very large request comes from
2151 * super fast link that is always able to accept data (in real world
2152 * scenario think about 'KEYS *' against the loopback interfae) */
2153 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2154 }
2155 if (nwritten == -1) {
2156 if (errno == EAGAIN) {
2157 nwritten = 0;
2158 } else {
2159 redisLog(REDIS_VERBOSE,
2160 "Error writing to client: %s", strerror(errno));
2161 freeClient(c);
2162 return;
2163 }
2164 }
2165 if (totwritten > 0) c->lastinteraction = time(NULL);
2166 if (listLength(c->reply) == 0) {
2167 c->sentlen = 0;
2168 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2169 }
2170 }
2171
2172 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2173 {
2174 redisClient *c = privdata;
2175 int nwritten = 0, totwritten = 0, objlen, willwrite;
2176 robj *o;
2177 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2178 int offset, ion = 0;
2179 REDIS_NOTUSED(el);
2180 REDIS_NOTUSED(mask);
2181
2182 listNode *node;
2183 while (listLength(c->reply)) {
2184 offset = c->sentlen;
2185 ion = 0;
2186 willwrite = 0;
2187
2188 /* fill-in the iov[] array */
2189 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2190 o = listNodeValue(node);
2191 objlen = sdslen(o->ptr);
2192
2193 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2194 break;
2195
2196 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2197 break; /* no more iovecs */
2198
2199 iov[ion].iov_base = ((char*)o->ptr) + offset;
2200 iov[ion].iov_len = objlen - offset;
2201 willwrite += objlen - offset;
2202 offset = 0; /* just for the first item */
2203 ion++;
2204 }
2205
2206 if(willwrite == 0)
2207 break;
2208
2209 /* write all collected blocks at once */
2210 if((nwritten = writev(fd, iov, ion)) < 0) {
2211 if (errno != EAGAIN) {
2212 redisLog(REDIS_VERBOSE,
2213 "Error writing to client: %s", strerror(errno));
2214 freeClient(c);
2215 return;
2216 }
2217 break;
2218 }
2219
2220 totwritten += nwritten;
2221 offset = c->sentlen;
2222
2223 /* remove written robjs from c->reply */
2224 while (nwritten && listLength(c->reply)) {
2225 o = listNodeValue(listFirst(c->reply));
2226 objlen = sdslen(o->ptr);
2227
2228 if(nwritten >= objlen - offset) {
2229 listDelNode(c->reply, listFirst(c->reply));
2230 nwritten -= objlen - offset;
2231 c->sentlen = 0;
2232 } else {
2233 /* partial write */
2234 c->sentlen += nwritten;
2235 break;
2236 }
2237 offset = 0;
2238 }
2239 }
2240
2241 if (totwritten > 0)
2242 c->lastinteraction = time(NULL);
2243
2244 if (listLength(c->reply) == 0) {
2245 c->sentlen = 0;
2246 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2247 }
2248 }
2249
2250 static struct redisCommand *lookupCommand(char *name) {
2251 int j = 0;
2252 while(cmdTable[j].name != NULL) {
2253 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2254 j++;
2255 }
2256 return NULL;
2257 }
2258
2259 /* resetClient prepare the client to process the next command */
2260 static void resetClient(redisClient *c) {
2261 freeClientArgv(c);
2262 c->bulklen = -1;
2263 c->multibulk = 0;
2264 }
2265
2266 /* Call() is the core of Redis execution of a command */
2267 static void call(redisClient *c, struct redisCommand *cmd) {
2268 long long dirty;
2269
2270 dirty = server.dirty;
2271 cmd->proc(c);
2272 dirty = server.dirty-dirty;
2273
2274 if (server.appendonly && dirty)
2275 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2276 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2277 listLength(server.slaves))
2278 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2279 if (listLength(server.monitors))
2280 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2281 server.stat_numcommands++;
2282 }
2283
2284 /* If this function gets called we already read a whole
2285 * command, argments are in the client argv/argc fields.
2286 * processCommand() execute the command or prepare the
2287 * server for a bulk read from the client.
2288 *
2289 * If 1 is returned the client is still alive and valid and
2290 * and other operations can be performed by the caller. Otherwise
2291 * if 0 is returned the client was destroied (i.e. after QUIT). */
2292 static int processCommand(redisClient *c) {
2293 struct redisCommand *cmd;
2294
2295 /* Free some memory if needed (maxmemory setting) */
2296 if (server.maxmemory) freeMemoryIfNeeded();
2297
2298 /* Handle the multi bulk command type. This is an alternative protocol
2299 * supported by Redis in order to receive commands that are composed of
2300 * multiple binary-safe "bulk" arguments. The latency of processing is
2301 * a bit higher but this allows things like multi-sets, so if this
2302 * protocol is used only for MSET and similar commands this is a big win. */
2303 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2304 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2305 if (c->multibulk <= 0) {
2306 resetClient(c);
2307 return 1;
2308 } else {
2309 decrRefCount(c->argv[c->argc-1]);
2310 c->argc--;
2311 return 1;
2312 }
2313 } else if (c->multibulk) {
2314 if (c->bulklen == -1) {
2315 if (((char*)c->argv[0]->ptr)[0] != '$') {
2316 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2317 resetClient(c);
2318 return 1;
2319 } else {
2320 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2321 decrRefCount(c->argv[0]);
2322 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2323 c->argc--;
2324 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2325 resetClient(c);
2326 return 1;
2327 }
2328 c->argc--;
2329 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2330 return 1;
2331 }
2332 } else {
2333 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2334 c->mbargv[c->mbargc] = c->argv[0];
2335 c->mbargc++;
2336 c->argc--;
2337 c->multibulk--;
2338 if (c->multibulk == 0) {
2339 robj **auxargv;
2340 int auxargc;
2341
2342 /* Here we need to swap the multi-bulk argc/argv with the
2343 * normal argc/argv of the client structure. */
2344 auxargv = c->argv;
2345 c->argv = c->mbargv;
2346 c->mbargv = auxargv;
2347
2348 auxargc = c->argc;
2349 c->argc = c->mbargc;
2350 c->mbargc = auxargc;
2351
2352 /* We need to set bulklen to something different than -1
2353 * in order for the code below to process the command without
2354 * to try to read the last argument of a bulk command as
2355 * a special argument. */
2356 c->bulklen = 0;
2357 /* continue below and process the command */
2358 } else {
2359 c->bulklen = -1;
2360 return 1;
2361 }
2362 }
2363 }
2364 /* -- end of multi bulk commands processing -- */
2365
2366 /* The QUIT command is handled as a special case. Normal command
2367 * procs are unable to close the client connection safely */
2368 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2369 freeClient(c);
2370 return 0;
2371 }
2372
2373 /* Now lookup the command and check ASAP about trivial error conditions
2374 * such wrong arity, bad command name and so forth. */
2375 cmd = lookupCommand(c->argv[0]->ptr);
2376 if (!cmd) {
2377 addReplySds(c,
2378 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2379 (char*)c->argv[0]->ptr));
2380 resetClient(c);
2381 return 1;
2382 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2383 (c->argc < -cmd->arity)) {
2384 addReplySds(c,
2385 sdscatprintf(sdsempty(),
2386 "-ERR wrong number of arguments for '%s' command\r\n",
2387 cmd->name));
2388 resetClient(c);
2389 return 1;
2390 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2391 /* This is a bulk command, we have to read the last argument yet. */
2392 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2393
2394 decrRefCount(c->argv[c->argc-1]);
2395 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2396 c->argc--;
2397 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2398 resetClient(c);
2399 return 1;
2400 }
2401 c->argc--;
2402 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2403 /* It is possible that the bulk read is already in the
2404 * buffer. Check this condition and handle it accordingly.
2405 * This is just a fast path, alternative to call processInputBuffer().
2406 * It's a good idea since the code is small and this condition
2407 * happens most of the times. */
2408 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2409 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2410 c->argc++;
2411 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2412 } else {
2413 /* Otherwise return... there is to read the last argument
2414 * from the socket. */
2415 return 1;
2416 }
2417 }
2418 /* Let's try to encode the bulk object to save space. */
2419 if (cmd->flags & REDIS_CMD_BULK)
2420 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2421
2422 /* Check if the user is authenticated */
2423 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2424 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2425 resetClient(c);
2426 return 1;
2427 }
2428
2429 /* Handle the maxmemory directive */
2430 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2431 zmalloc_used_memory() > server.maxmemory)
2432 {
2433 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2434 resetClient(c);
2435 return 1;
2436 }
2437
2438 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2439 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2440 &&
2441 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2442 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2443 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2444 resetClient(c);
2445 return 1;
2446 }
2447
2448 /* Exec the command */
2449 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2450 queueMultiCommand(c,cmd);
2451 addReply(c,shared.queued);
2452 } else {
2453 if (server.vm_enabled && server.vm_max_threads > 0 &&
2454 blockClientOnSwappedKeys(c,cmd)) return 1;
2455 call(c,cmd);
2456 }
2457
2458 /* Prepare the client for the next command */
2459 resetClient(c);
2460 return 1;
2461 }
2462
2463 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2464 listNode *ln;
2465 listIter li;
2466 int outc = 0, j;
2467 robj **outv;
2468 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2469 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2470 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2471 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2472 robj *lenobj;
2473
2474 if (argc <= REDIS_STATIC_ARGS) {
2475 outv = static_outv;
2476 } else {
2477 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2478 }
2479
2480 lenobj = createObject(REDIS_STRING,
2481 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2482 lenobj->refcount = 0;
2483 outv[outc++] = lenobj;
2484 for (j = 0; j < argc; j++) {
2485 lenobj = createObject(REDIS_STRING,
2486 sdscatprintf(sdsempty(),"$%lu\r\n",
2487 (unsigned long) stringObjectLen(argv[j])));
2488 lenobj->refcount = 0;
2489 outv[outc++] = lenobj;
2490 outv[outc++] = argv[j];
2491 outv[outc++] = shared.crlf;
2492 }
2493
2494 /* Increment all the refcounts at start and decrement at end in order to
2495 * be sure to free objects if there is no slave in a replication state
2496 * able to be feed with commands */
2497 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2498 listRewind(slaves,&li);
2499 while((ln = listNext(&li))) {
2500 redisClient *slave = ln->value;
2501
2502 /* Don't feed slaves that are still waiting for BGSAVE to start */
2503 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2504
2505 /* Feed all the other slaves, MONITORs and so on */
2506 if (slave->slaveseldb != dictid) {
2507 robj *selectcmd;
2508
2509 switch(dictid) {
2510 case 0: selectcmd = shared.select0; break;
2511 case 1: selectcmd = shared.select1; break;
2512 case 2: selectcmd = shared.select2; break;
2513 case 3: selectcmd = shared.select3; break;
2514 case 4: selectcmd = shared.select4; break;
2515 case 5: selectcmd = shared.select5; break;
2516 case 6: selectcmd = shared.select6; break;
2517 case 7: selectcmd = shared.select7; break;
2518 case 8: selectcmd = shared.select8; break;
2519 case 9: selectcmd = shared.select9; break;
2520 default:
2521 selectcmd = createObject(REDIS_STRING,
2522 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2523 selectcmd->refcount = 0;
2524 break;
2525 }
2526 addReply(slave,selectcmd);
2527 slave->slaveseldb = dictid;
2528 }
2529 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2530 }
2531 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2532 if (outv != static_outv) zfree(outv);
2533 }
2534
2535 static sds sdscatrepr(sds s, char *p, size_t len) {
2536 s = sdscatlen(s,"\"",1);
2537 while(len--) {
2538 switch(*p) {
2539 case '\\':
2540 case '"':
2541 s = sdscatprintf(s,"\\%c",*p);
2542 break;
2543 case '\n': s = sdscatlen(s,"\\n",1); break;
2544 case '\r': s = sdscatlen(s,"\\r",1); break;
2545 case '\t': s = sdscatlen(s,"\\t",1); break;
2546 case '\a': s = sdscatlen(s,"\\a",1); break;
2547 case '\b': s = sdscatlen(s,"\\b",1); break;
2548 default:
2549 if (isprint(*p))
2550 s = sdscatprintf(s,"%c",*p);
2551 else
2552 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2553 break;
2554 }
2555 p++;
2556 }
2557 return sdscatlen(s,"\"",1);
2558 }
2559
2560 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2561 listNode *ln;
2562 listIter li;
2563 int j;
2564 sds cmdrepr = sdsnew("+");
2565 robj *cmdobj;
2566 struct timeval tv;
2567
2568 gettimeofday(&tv,NULL);
2569 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2570 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2571
2572 for (j = 0; j < argc; j++) {
2573 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2574 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2575 } else {
2576 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2577 sdslen(argv[j]->ptr));
2578 }
2579 if (j != argc-1)
2580 cmdrepr = sdscatlen(cmdrepr," ",1);
2581 }
2582 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2583 cmdobj = createObject(REDIS_STRING,cmdrepr);
2584
2585 listRewind(monitors,&li);
2586 while((ln = listNext(&li))) {
2587 redisClient *monitor = ln->value;
2588 addReply(monitor,cmdobj);
2589 }
2590 decrRefCount(cmdobj);
2591 }
2592
2593 static void processInputBuffer(redisClient *c) {
2594 again:
2595 /* Before to process the input buffer, make sure the client is not
2596 * waitig for a blocking operation such as BLPOP. Note that the first
2597 * iteration the client is never blocked, otherwise the processInputBuffer
2598 * would not be called at all, but after the execution of the first commands
2599 * in the input buffer the client may be blocked, and the "goto again"
2600 * will try to reiterate. The following line will make it return asap. */
2601 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2602 if (c->bulklen == -1) {
2603 /* Read the first line of the query */
2604 char *p = strchr(c->querybuf,'\n');
2605 size_t querylen;
2606
2607 if (p) {
2608 sds query, *argv;
2609 int argc, j;
2610
2611 query = c->querybuf;
2612 c->querybuf = sdsempty();
2613 querylen = 1+(p-(query));
2614 if (sdslen(query) > querylen) {
2615 /* leave data after the first line of the query in the buffer */
2616 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2617 }
2618 *p = '\0'; /* remove "\n" */
2619 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2620 sdsupdatelen(query);
2621
2622 /* Now we can split the query in arguments */
2623 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2624 sdsfree(query);
2625
2626 if (c->argv) zfree(c->argv);
2627 c->argv = zmalloc(sizeof(robj*)*argc);
2628
2629 for (j = 0; j < argc; j++) {
2630 if (sdslen(argv[j])) {
2631 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2632 c->argc++;
2633 } else {
2634 sdsfree(argv[j]);
2635 }
2636 }
2637 zfree(argv);
2638 if (c->argc) {
2639 /* Execute the command. If the client is still valid
2640 * after processCommand() return and there is something
2641 * on the query buffer try to process the next command. */
2642 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2643 } else {
2644 /* Nothing to process, argc == 0. Just process the query
2645 * buffer if it's not empty or return to the caller */
2646 if (sdslen(c->querybuf)) goto again;
2647 }
2648 return;
2649 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2650 redisLog(REDIS_VERBOSE, "Client protocol error");
2651 freeClient(c);
2652 return;
2653 }
2654 } else {
2655 /* Bulk read handling. Note that if we are at this point
2656 the client already sent a command terminated with a newline,
2657 we are reading the bulk data that is actually the last
2658 argument of the command. */
2659 int qbl = sdslen(c->querybuf);
2660
2661 if (c->bulklen <= qbl) {
2662 /* Copy everything but the final CRLF as final argument */
2663 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2664 c->argc++;
2665 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2666 /* Process the command. If the client is still valid after
2667 * the processing and there is more data in the buffer
2668 * try to parse it. */
2669 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2670 return;
2671 }
2672 }
2673 }
2674
2675 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2676 redisClient *c = (redisClient*) privdata;
2677 char buf[REDIS_IOBUF_LEN];
2678 int nread;
2679 REDIS_NOTUSED(el);
2680 REDIS_NOTUSED(mask);
2681
2682 nread = read(fd, buf, REDIS_IOBUF_LEN);
2683 if (nread == -1) {
2684 if (errno == EAGAIN) {
2685 nread = 0;
2686 } else {
2687 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2688 freeClient(c);
2689 return;
2690 }
2691 } else if (nread == 0) {
2692 redisLog(REDIS_VERBOSE, "Client closed connection");
2693 freeClient(c);
2694 return;
2695 }
2696 if (nread) {
2697 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2698 c->lastinteraction = time(NULL);
2699 } else {
2700 return;
2701 }
2702 processInputBuffer(c);
2703 }
2704
2705 static int selectDb(redisClient *c, int id) {
2706 if (id < 0 || id >= server.dbnum)
2707 return REDIS_ERR;
2708 c->db = &server.db[id];
2709 return REDIS_OK;
2710 }
2711
2712 static void *dupClientReplyValue(void *o) {
2713 incrRefCount((robj*)o);
2714 return o;
2715 }
2716
/* List match method comparing two elements with equalStringObjects(). */
static int listMatchObjects(void *a, void *b) {
    int matched = equalStringObjects(a,b);

    return matched;
}
2720
2721 static redisClient *createClient(int fd) {
2722 redisClient *c = zmalloc(sizeof(*c));
2723
2724 anetNonBlock(NULL,fd);
2725 anetTcpNoDelay(NULL,fd);
2726 if (!c) return NULL;
2727 selectDb(c,0);
2728 c->fd = fd;
2729 c->querybuf = sdsempty();
2730 c->argc = 0;
2731 c->argv = NULL;
2732 c->bulklen = -1;
2733 c->multibulk = 0;
2734 c->mbargc = 0;
2735 c->mbargv = NULL;
2736 c->sentlen = 0;
2737 c->flags = 0;
2738 c->lastinteraction = time(NULL);
2739 c->authenticated = 0;
2740 c->replstate = REDIS_REPL_NONE;
2741 c->reply = listCreate();
2742 listSetFreeMethod(c->reply,decrRefCount);
2743 listSetDupMethod(c->reply,dupClientReplyValue);
2744 c->blocking_keys = NULL;
2745 c->blocking_keys_num = 0;
2746 c->io_keys = listCreate();
2747 c->watched_keys = listCreate();
2748 listSetFreeMethod(c->io_keys,decrRefCount);
2749 c->pubsub_channels = dictCreate(&setDictType,NULL);
2750 c->pubsub_patterns = listCreate();
2751 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2752 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2753 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2754 readQueryFromClient, c) == AE_ERR) {
2755 freeClient(c);
2756 return NULL;
2757 }
2758 listAddNodeTail(server.clients,c);
2759 initClientMultiState(c);
2760 return c;
2761 }
2762
2763 static void addReply(redisClient *c, robj *obj) {
2764 if (listLength(c->reply) == 0 &&
2765 (c->replstate == REDIS_REPL_NONE ||
2766 c->replstate == REDIS_REPL_ONLINE) &&
2767 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2768 sendReplyToClient, c) == AE_ERR) return;
2769
2770 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2771 obj = dupStringObject(obj);
2772 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2773 }
2774 listAddNodeTail(c->reply,getDecodedObject(obj));
2775 }
2776
2777 static void addReplySds(redisClient *c, sds s) {
2778 robj *o = createObject(REDIS_STRING,s);
2779 addReply(c,o);
2780 decrRefCount(o);
2781 }
2782
2783 static void addReplyDouble(redisClient *c, double d) {
2784 char buf[128];
2785
2786 snprintf(buf,sizeof(buf),"%.17g",d);
2787 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2788 (unsigned long) strlen(buf),buf));
2789 }
2790
2791 static void addReplyLongLong(redisClient *c, long long ll) {
2792 char buf[128];
2793 size_t len;
2794
2795 if (ll == 0) {
2796 addReply(c,shared.czero);
2797 return;
2798 } else if (ll == 1) {
2799 addReply(c,shared.cone);
2800 return;
2801 }
2802 buf[0] = ':';
2803 len = ll2string(buf+1,sizeof(buf)-1,ll);
2804 buf[len+1] = '\r';
2805 buf[len+2] = '\n';
2806 addReplySds(c,sdsnewlen(buf,len+3));
2807 }
2808
2809 static void addReplyUlong(redisClient *c, unsigned long ul) {
2810 char buf[128];
2811 size_t len;
2812
2813 if (ul == 0) {
2814 addReply(c,shared.czero);
2815 return;
2816 } else if (ul == 1) {
2817 addReply(c,shared.cone);
2818 return;
2819 }
2820 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2821 addReplySds(c,sdsnewlen(buf,len));
2822 }
2823
2824 static void addReplyBulkLen(redisClient *c, robj *obj) {
2825 size_t len, intlen;
2826 char buf[128];
2827
2828 if (obj->encoding == REDIS_ENCODING_RAW) {
2829 len = sdslen(obj->ptr);
2830 } else {
2831 long n = (long)obj->ptr;
2832
2833 /* Compute how many bytes will take this integer as a radix 10 string */
2834 len = 1;
2835 if (n < 0) {
2836 len++;
2837 n = -n;
2838 }
2839 while((n = n/10) != 0) {
2840 len++;
2841 }
2842 }
2843 buf[0] = '$';
2844 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2845 buf[intlen+1] = '\r';
2846 buf[intlen+2] = '\n';
2847 addReplySds(c,sdsnewlen(buf,intlen+3));
2848 }
2849
2850 static void addReplyBulk(redisClient *c, robj *obj) {
2851 addReplyBulkLen(c,obj);
2852 addReply(c,obj);
2853 addReply(c,shared.crlf);
2854 }
2855
2856 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2857 static void addReplyBulkCString(redisClient *c, char *s) {
2858 if (s == NULL) {
2859 addReply(c,shared.nullbulk);
2860 } else {
2861 robj *o = createStringObject(s,strlen(s));
2862 addReplyBulk(c,o);
2863 decrRefCount(o);
2864 }
2865 }
2866
2867 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2868 int cport, cfd;
2869 char cip[128];
2870 redisClient *c;
2871 REDIS_NOTUSED(el);
2872 REDIS_NOTUSED(mask);
2873 REDIS_NOTUSED(privdata);
2874
2875 cfd = anetAccept(server.neterr, fd, cip, &cport);
2876 if (cfd == AE_ERR) {
2877 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2878 return;
2879 }
2880 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2881 if ((c = createClient(cfd)) == NULL) {
2882 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2883 close(cfd); /* May be already closed, just ingore errors */
2884 return;
2885 }
2886 /* If maxclient directive is set and this is one client more... close the
2887 * connection. Note that we create the client instead to check before
2888 * for this condition, since now the socket is already set in nonblocking
2889 * mode and we can send an error for free using the Kernel I/O */
2890 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2891 char *err = "-ERR max number of clients reached\r\n";
2892
2893 /* That's a best effort error message, don't check write errors */
2894 if (write(c->fd,err,strlen(err)) == -1) {
2895 /* Nothing to do, Just to avoid the warning... */
2896 }
2897 freeClient(c);
2898 return;
2899 }
2900 server.stat_numconnections++;
2901 }
2902
2903 /* ======================= Redis objects implementation ===================== */
2904
2905 static robj *createObject(int type, void *ptr) {
2906 robj *o;
2907
2908 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2909 if (listLength(server.objfreelist)) {
2910 listNode *head = listFirst(server.objfreelist);
2911 o = listNodeValue(head);
2912 listDelNode(server.objfreelist,head);
2913 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2914 } else {
2915 if (server.vm_enabled) {
2916 pthread_mutex_unlock(&server.obj_freelist_mutex);
2917 o = zmalloc(sizeof(*o));
2918 } else {
2919 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2920 }
2921 }
2922 o->type = type;
2923 o->encoding = REDIS_ENCODING_RAW;
2924 o->ptr = ptr;
2925 o->refcount = 1;
2926 if (server.vm_enabled) {
2927 /* Note that this code may run in the context of an I/O thread
2928 * and accessing to server.unixtime in theory is an error
2929 * (no locks). But in practice this is safe, and even if we read
2930 * garbage Redis will not fail, as it's just a statistical info */
2931 o->vm.atime = server.unixtime;
2932 o->storage = REDIS_VM_MEMORY;
2933 }
2934 return o;
2935 }
2936
2937 static robj *createStringObject(char *ptr, size_t len) {
2938 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2939 }
2940
2941 static robj *createStringObjectFromLongLong(long long value) {
2942 robj *o;
2943 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2944 incrRefCount(shared.integers[value]);
2945 o = shared.integers[value];
2946 } else {
2947 if (value >= LONG_MIN && value <= LONG_MAX) {
2948 o = createObject(REDIS_STRING, NULL);
2949 o->encoding = REDIS_ENCODING_INT;
2950 o->ptr = (void*)((long)value);
2951 } else {
2952 o = createObject(REDIS_STRING,sdsfromlonglong(value));
2953 }
2954 }
2955 return o;
2956 }
2957
2958 static robj *dupStringObject(robj *o) {
2959 assert(o->encoding == REDIS_ENCODING_RAW);
2960 return createStringObject(o->ptr,sdslen(o->ptr));
2961 }
2962
2963 static robj *createListObject(void) {
2964 list *l = listCreate();
2965
2966 listSetFreeMethod(l,decrRefCount);
2967 return createObject(REDIS_LIST,l);
2968 }
2969
2970 static robj *createSetObject(void) {
2971 dict *d = dictCreate(&setDictType,NULL);
2972 return createObject(REDIS_SET,d);
2973 }
2974
2975 static robj *createHashObject(void) {
2976 /* All the Hashes start as zipmaps. Will be automatically converted
2977 * into hash tables if there are enough elements or big elements
2978 * inside. */
2979 unsigned char *zm = zipmapNew();
2980 robj *o = createObject(REDIS_HASH,zm);
2981 o->encoding = REDIS_ENCODING_ZIPMAP;
2982 return o;
2983 }
2984
2985 static robj *createZsetObject(void) {
2986 zset *zs = zmalloc(sizeof(*zs));
2987
2988 zs->dict = dictCreate(&zsetDictType,NULL);
2989 zs->zsl = zslCreate();
2990 return createObject(REDIS_ZSET,zs);
2991 }
2992
2993 static void freeStringObject(robj *o) {
2994 if (o->encoding == REDIS_ENCODING_RAW) {
2995 sdsfree(o->ptr);
2996 }
2997 }
2998
2999 static void freeListObject(robj *o) {
3000 listRelease((list*) o->ptr);
3001 }
3002
3003 static void freeSetObject(robj *o) {
3004 dictRelease((dict*) o->ptr);
3005 }
3006
3007 static void freeZsetObject(robj *o) {
3008 zset *zs = o->ptr;
3009
3010 dictRelease(zs->dict);
3011 zslFree(zs->zsl);
3012 zfree(zs);
3013 }
3014
3015 static void freeHashObject(robj *o) {
3016 switch (o->encoding) {
3017 case REDIS_ENCODING_HT:
3018 dictRelease((dict*) o->ptr);
3019 break;
3020 case REDIS_ENCODING_ZIPMAP:
3021 zfree(o->ptr);
3022 break;
3023 default:
3024 redisPanic("Unknown hash encoding type");
3025 break;
3026 }
3027 }
3028
3029 static void incrRefCount(robj *o) {
3030 o->refcount++;
3031 }
3032
3033 static void decrRefCount(void *obj) {
3034 robj *o = obj;
3035
3036 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3037 /* Object is a key of a swapped out value, or in the process of being
3038 * loaded. */
3039 if (server.vm_enabled &&
3040 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3041 {
3042 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
3043 redisAssert(o->type == REDIS_STRING);
3044 freeStringObject(o);
3045 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
3046 pthread_mutex_lock(&server.obj_freelist_mutex);
3047 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3048 !listAddNodeHead(server.objfreelist,o))
3049 zfree(o);
3050 pthread_mutex_unlock(&server.obj_freelist_mutex);
3051 server.vm_stats_swapped_objects--;
3052 return;
3053 }
3054 /* Object is in memory, or in the process of being swapped out. */
3055 if (--(o->refcount) == 0) {
3056 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3057 vmCancelThreadedIOJob(obj);
3058 switch(o->type) {
3059 case REDIS_STRING: freeStringObject(o); break;
3060 case REDIS_LIST: freeListObject(o); break;
3061 case REDIS_SET: freeSetObject(o); break;
3062 case REDIS_ZSET: freeZsetObject(o); break;
3063 case REDIS_HASH: freeHashObject(o); break;
3064 default: redisPanic("Unknown object type"); break;
3065 }
3066 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3067 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3068 !listAddNodeHead(server.objfreelist,o))
3069 zfree(o);
3070 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3071 }
3072 }
3073
3074 static robj *lookupKey(redisDb *db, robj *key) {
3075 dictEntry *de = dictFind(db->dict,key);
3076 if (de) {
3077 robj *key = dictGetEntryKey(de);
3078 robj *val = dictGetEntryVal(de);
3079
3080 if (server.vm_enabled) {
3081 if (key->storage == REDIS_VM_MEMORY ||
3082 key->storage == REDIS_VM_SWAPPING)
3083 {
3084 /* If we were swapping the object out, stop it, this key
3085 * was requested. */
3086 if (key->storage == REDIS_VM_SWAPPING)
3087 vmCancelThreadedIOJob(key);
3088 /* Update the access time of the key for the aging algorithm. */
3089 key->vm.atime = server.unixtime;
3090 } else {
3091 int notify = (key->storage == REDIS_VM_LOADING);
3092
3093 /* Our value was swapped on disk. Bring it at home. */
3094 redisAssert(val == NULL);
3095 val = vmLoadObject(key);
3096 dictGetEntryVal(de) = val;
3097
3098 /* Clients blocked by the VM subsystem may be waiting for
3099 * this key... */
3100 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3101 }
3102 }
3103 return val;
3104 } else {
3105 return NULL;
3106 }
3107 }
3108
3109 static robj *lookupKeyRead(redisDb *db, robj *key) {
3110 expireIfNeeded(db,key);
3111 return lookupKey(db,key);
3112 }
3113
3114 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3115 deleteIfVolatile(db,key);
3116 touchWatchedKey(db,key);
3117 return lookupKey(db,key);
3118 }
3119
3120 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3121 robj *o = lookupKeyRead(c->db, key);
3122 if (!o) addReply(c,reply);
3123 return o;
3124 }
3125
3126 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3127 robj *o = lookupKeyWrite(c->db, key);
3128 if (!o) addReply(c,reply);
3129 return o;
3130 }
3131
3132 static int checkType(redisClient *c, robj *o, int type) {
3133 if (o->type != type) {
3134 addReply(c,shared.wrongtypeerr);
3135 return 1;
3136 }
3137 return 0;
3138 }
3139
3140 static int deleteKey(redisDb *db, robj *key) {
3141 int retval;
3142
3143 /* We need to protect key from destruction: after the first dictDelete()
3144 * it may happen that 'key' is no longer valid if we don't increment
3145 * it's count. This may happen when we get the object reference directly
3146 * from the hash table with dictRandomKey() or dict iterators */
3147 incrRefCount(key);
3148 if (dictSize(db->expires)) dictDelete(db->expires,key);
3149 retval = dictDelete(db->dict,key);
3150 decrRefCount(key);
3151
3152 return retval == DICT_OK;
3153 }
3154
3155 /* Check if the nul-terminated string 's' can be represented by a long
3156 * (that is, is a number that fits into long without any other space or
3157 * character before or after the digits).
3158 *
3159 * If so, the function returns REDIS_OK and *longval is set to the value
3160 * of the number. Otherwise REDIS_ERR is returned */
3161 static int isStringRepresentableAsLong(sds s, long *longval) {
3162 char buf[32], *endptr;
3163 long value;
3164 int slen;
3165
3166 value = strtol(s, &endptr, 10);
3167 if (endptr[0] != '\0') return REDIS_ERR;
3168 slen = ll2string(buf,32,value);
3169
3170 /* If the number converted back into a string is not identical
3171 * then it's not possible to encode the string as integer */
3172 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3173 if (longval) *longval = value;
3174 return REDIS_OK;
3175 }
3176
3177 /* Try to encode a string object in order to save space */
3178 static robj *tryObjectEncoding(robj *o) {
3179 long value;
3180 sds s = o->ptr;
3181
3182 if (o->encoding != REDIS_ENCODING_RAW)
3183 return o; /* Already encoded */
3184
3185 /* It's not safe to encode shared objects: shared objects can be shared
3186 * everywhere in the "object space" of Redis. Encoded objects can only
3187 * appear as "values" (and not, for instance, as keys) */
3188 if (o->refcount > 1) return o;
3189
3190 /* Currently we try to encode only strings */
3191 redisAssert(o->type == REDIS_STRING);
3192
3193 /* Check if we can represent this string as a long integer */
3194 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3195
3196 /* Ok, this object can be encoded */
3197 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3198 decrRefCount(o);
3199 incrRefCount(shared.integers[value]);
3200 return shared.integers[value];
3201 } else {
3202 o->encoding = REDIS_ENCODING_INT;
3203 sdsfree(o->ptr);
3204 o->ptr = (void*) value;
3205 return o;
3206 }
3207 }
3208
3209 /* Get a decoded version of an encoded object (returned as a new object).
3210 * If the object is already raw-encoded just increment the ref count. */
3211 static robj *getDecodedObject(robj *o) {
3212 robj *dec;
3213
3214 if (o->encoding == REDIS_ENCODING_RAW) {
3215 incrRefCount(o);
3216 return o;
3217 }
3218 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3219 char buf[32];
3220
3221 ll2string(buf,32,(long)o->ptr);
3222 dec = createStringObject(buf,strlen(buf));
3223 return dec;
3224 } else {
3225 redisPanic("Unknown encoding type");
3226 }
3227 }
3228
3229 /* Compare two string objects via strcmp() or alike.
3230 * Note that the objects may be integer-encoded. In such a case we
3231 * use ll2string() to get a string representation of the numbers on the stack
3232 * and compare the strings, it's much faster than calling getDecodedObject().
3233 *
3234 * Important note: if objects are not integer encoded, but binary-safe strings,
3235 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3236 * binary safe. */
3237 static int compareStringObjects(robj *a, robj *b) {
3238 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3239 char bufa[128], bufb[128], *astr, *bstr;
3240 int bothsds = 1;
3241
3242 if (a == b) return 0;
3243 if (a->encoding != REDIS_ENCODING_RAW) {
3244 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3245 astr = bufa;
3246 bothsds = 0;
3247 } else {
3248 astr = a->ptr;
3249 }
3250 if (b->encoding != REDIS_ENCODING_RAW) {
3251 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3252 bstr = bufb;
3253 bothsds = 0;
3254 } else {
3255 bstr = b->ptr;
3256 }
3257 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3258 }
3259
3260 /* Equal string objects return 1 if the two objects are the same from the
3261 * point of view of a string comparison, otherwise 0 is returned. Note that
3262 * this function is faster then checking for (compareStringObject(a,b) == 0)
3263 * because it can perform some more optimization. */
3264 static int equalStringObjects(robj *a, robj *b) {
3265 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3266 return a->ptr == b->ptr;
3267 } else {
3268 return compareStringObjects(a,b) == 0;
3269 }
3270 }
3271
3272 static size_t stringObjectLen(robj *o) {
3273 redisAssert(o->type == REDIS_STRING);
3274 if (o->encoding == REDIS_ENCODING_RAW) {
3275 return sdslen(o->ptr);
3276 } else {
3277 char buf[32];
3278
3279 return ll2string(buf,32,(long)o->ptr);
3280 }
3281 }
3282
3283 static int getDoubleFromObject(robj *o, double *target) {
3284 double value;
3285 char *eptr;
3286
3287 if (o == NULL) {
3288 value = 0;
3289 } else {
3290 redisAssert(o->type == REDIS_STRING);
3291 if (o->encoding == REDIS_ENCODING_RAW) {
3292 value = strtod(o->ptr, &eptr);
3293 if (eptr[0] != '\0') return REDIS_ERR;
3294 } else if (o->encoding == REDIS_ENCODING_INT) {
3295 value = (long)o->ptr;
3296 } else {
3297 redisPanic("Unknown string encoding");
3298 }
3299 }
3300
3301 *target = value;
3302 return REDIS_OK;
3303 }
3304
3305 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3306 double value;
3307 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3308 if (msg != NULL) {
3309 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3310 } else {
3311 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3312 }
3313 return REDIS_ERR;
3314 }
3315
3316 *target = value;
3317 return REDIS_OK;
3318 }
3319
3320 static int getLongLongFromObject(robj *o, long long *target) {
3321 long long value;
3322 char *eptr;
3323
3324 if (o == NULL) {
3325 value = 0;
3326 } else {
3327 redisAssert(o->type == REDIS_STRING);
3328 if (o->encoding == REDIS_ENCODING_RAW) {
3329 value = strtoll(o->ptr, &eptr, 10);
3330 if (eptr[0] != '\0') return REDIS_ERR;
3331 } else if (o->encoding == REDIS_ENCODING_INT) {
3332 value = (long)o->ptr;
3333 } else {
3334 redisPanic("Unknown string encoding");
3335 }
3336 }
3337
3338 *target = value;
3339 return REDIS_OK;
3340 }
3341
3342 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3343 long long value;
3344 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3345 if (msg != NULL) {
3346 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3347 } else {
3348 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3349 }
3350 return REDIS_ERR;
3351 }
3352
3353 *target = value;
3354 return REDIS_OK;
3355 }
3356
3357 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3358 long long value;
3359
3360 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3361 if (value < LONG_MIN || value > LONG_MAX) {
3362 if (msg != NULL) {
3363 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3364 } else {
3365 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3366 }
3367 return REDIS_ERR;
3368 }
3369
3370 *target = value;
3371 return REDIS_OK;
3372 }
3373
3374 /*============================ RDB saving/loading =========================== */
3375
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if
 * nothing was written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3380
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte
 * order. Returns 0 on success, -1 if nothing was written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;
    size_t written = fwrite(&stamp,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3386
3387 /* check rdbLoadLen() comments for more info */
3388 static int rdbSaveLen(FILE *fp, uint32_t len) {
3389 unsigned char buf[2];
3390
3391 if (len < (1<<6)) {
3392 /* Save a 6 bit len */
3393 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3394 if (fwrite(buf,1,1,fp) == 0) return -1;
3395 } else if (len < (1<<14)) {
3396 /* Save a 14 bit len */
3397 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3398 buf[1] = len&0xFF;
3399 if (fwrite(buf,2,1,fp) == 0) return -1;
3400 } else {
3401 /* Save a 32 bit len */
3402 buf[0] = (REDIS_RDB_32BITLEN<<6);
3403 if (fwrite(buf,1,1,fp) == 0) return -1;
3404 len = htonl(len);
3405 if (fwrite(&len,4,1,fp) == 0) return -1;
3406 }
3407 return 0;
3408 }
3409
3410 /* Encode 'value' as an integer if possible (if integer will fit the
3411 * supported range). If the function sucessful encoded the integer
3412 * then the (up to 5 bytes) encoded representation is written in the
3413 * string pointed by 'enc' and the length is returned. Otherwise
3414 * 0 is returned. */
3415 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3416 /* Finally check if it fits in our ranges */
3417 if (value >= -(1<<7) && value <= (1<<7)-1) {
3418 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3419 enc[1] = value&0xFF;
3420 return 2;
3421 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3422 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3423 enc[1] = value&0xFF;
3424 enc[2] = (value>>8)&0xFF;
3425 return 3;
3426 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3427 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3428 enc[1] = value&0xFF;
3429 enc[2] = (value>>8)&0xFF;
3430 enc[3] = (value>>16)&0xFF;
3431 enc[4] = (value>>24)&0xFF;
3432 return 5;
3433 } else {
3434 return 0;
3435 }
3436 }
3437
/* Strings in the form "2391" or "-100", without any space and with a value
 * fitting an 8, 16 or 32 bit signed integer, can be encoded as integers to
 * save space. Returns the length of the encoding written to 'enc', or 0 if
 * the string can't be encoded this way. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    char repr[32], *end;
    long long parsed;

    /* The whole string must parse as a number. */
    parsed = strtoll(s, &end, 10);
    if (*end != '\0') return 0;

    /* Converting the number back must give exactly the original string. */
    ll2string(repr,32,parsed);
    if (strlen(repr) != len) return 0;
    if (memcmp(repr,s,len) != 0) return 0;

    return rdbEncodeInteger(parsed,enc);
}
3456
3457 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3458 size_t comprlen, outlen;
3459 unsigned char byte;
3460 void *out;
3461
3462 /* We require at least four bytes compression for this to be worth it */
3463 if (len <= 4) return 0;
3464 outlen = len-4;
3465 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3466 comprlen = lzf_compress(s, len, out, outlen);
3467 if (comprlen == 0) {
3468 zfree(out);
3469 return 0;
3470 }
3471 /* Data compressed! Let's save it on disk */
3472 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3473 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3474 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3475 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3476 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3477 zfree(out);
3478 return comprlen;
3479
3480 writeerr:
3481 zfree(out);
3482 return -1;
3483 }
3484
3485 /* Save a string objet as [len][data] on disk. If the object is a string
3486 * representation of an integer value we try to safe it in a special form */
3487 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3488 int enclen;
3489
3490 /* Try integer encoding */
3491 if (len <= 11) {
3492 unsigned char buf[5];
3493 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3494 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3495 return 0;
3496 }
3497 }
3498
3499 /* Try LZF compression - under 20 bytes it's unable to compress even
3500 * aaaaaaaaaaaaaaaaaa so skip it */
3501 if (server.rdbcompression && len > 20) {
3502 int retval;
3503
3504 retval = rdbSaveLzfStringObject(fp,s,len);
3505 if (retval == -1) return -1;
3506 if (retval > 0) return 0;
3507 /* retval == 0 means data can't be compressed, save the old way */
3508 }
3509
3510 /* Store verbatim */
3511 if (rdbSaveLen(fp,len) == -1) return -1;
3512 if (len && fwrite(s,len,1,fp) == 0) return -1;
3513 return 0;
3514 }
3515
3516 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3517 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3518 int retval;
3519
3520 /* Avoid to decode the object, then encode it again, if the
3521 * object is alrady integer encoded. */
3522 if (obj->encoding == REDIS_ENCODING_INT) {
3523 long val = (long) obj->ptr;
3524 unsigned char buf[5];
3525 int enclen;
3526
3527 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3528 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3529 return 0;
3530 }
3531 /* otherwise... fall throught and continue with the usual
3532 * code path. */
3533 }
3534
3535 /* Avoid incr/decr ref count business when possible.
3536 * This plays well with copy-on-write given that we are probably
3537 * in a child process (BGSAVE). Also this makes sure key objects
3538 * of swapped objects are not incRefCount-ed (an assert does not allow
3539 * this in order to avoid bugs) */
3540 if (obj->encoding != REDIS_ENCODING_RAW) {
3541 obj = getDecodedObject(obj);
3542 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3543 decrRefCount(obj);
3544 } else {
3545 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3546 }
3547 return retval;
3548 }
3549
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under these assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * the integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The string starts at buf+1 (buf[0] is the length prefix), so
             * only sizeof(buf)-1 bytes are available to ll2string(). */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3592
3593 /* Save a Redis object. */
3594 static int rdbSaveObject(FILE *fp, robj *o) {
3595 if (o->type == REDIS_STRING) {
3596 /* Save a string value */
3597 if (rdbSaveStringObject(fp,o) == -1) return -1;
3598 } else if (o->type == REDIS_LIST) {
3599 /* Save a list value */
3600 list *list = o->ptr;
3601 listIter li;
3602 listNode *ln;
3603
3604 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3605 listRewind(list,&li);
3606 while((ln = listNext(&li))) {
3607 robj *eleobj = listNodeValue(ln);
3608
3609 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3610 }
3611 } else if (o->type == REDIS_SET) {
3612 /* Save a set value */
3613 dict *set = o->ptr;
3614 dictIterator *di = dictGetIterator(set);
3615 dictEntry *de;
3616
3617 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3618 while((de = dictNext(di)) != NULL) {
3619 robj *eleobj = dictGetEntryKey(de);
3620
3621 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3622 }
3623 dictReleaseIterator(di);
3624 } else if (o->type == REDIS_ZSET) {
3625 /* Save a set value */
3626 zset *zs = o->ptr;
3627 dictIterator *di = dictGetIterator(zs->dict);
3628 dictEntry *de;
3629
3630 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3631 while((de = dictNext(di)) != NULL) {
3632 robj *eleobj = dictGetEntryKey(de);
3633 double *score = dictGetEntryVal(de);
3634
3635 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3636 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3637 }
3638 dictReleaseIterator(di);
3639 } else if (o->type == REDIS_HASH) {
3640 /* Save a hash value */
3641 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3642 unsigned char *p = zipmapRewind(o->ptr);
3643 unsigned int count = zipmapLen(o->ptr);
3644 unsigned char *key, *val;
3645 unsigned int klen, vlen;
3646
3647 if (rdbSaveLen(fp,count) == -1) return -1;
3648 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3649 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3650 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3651 }
3652 } else {
3653 dictIterator *di = dictGetIterator(o->ptr);
3654 dictEntry *de;
3655
3656 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3657 while((de = dictNext(di)) != NULL) {
3658 robj *key = dictGetEntryKey(de);
3659 robj *val = dictGetEntryVal(de);
3660
3661 if (rdbSaveStringObject(fp,key) == -1) return -1;
3662 if (rdbSaveStringObject(fp,val) == -1) return -1;
3663 }
3664 dictReleaseIterator(di);
3665 }
3666 } else {
3667 redisPanic("Unknown object type");
3668 }
3669 return 0;
3670 }
3671
3672 /* Return the length the object will have on disk if saved with
3673 * the rdbSaveObject() function. Currently we use a trick to get
3674 * this length with very little changes to the code. In the future
3675 * we could switch to a faster solution. */
3676 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3677 if (fp == NULL) fp = server.devnull;
3678 rewind(fp);
3679 assert(rdbSaveObject(fp,o) != 1);
3680 return ftello(fp);
3681 }
3682
3683 /* Return the number of pages required to save this object in the swap file */
3684 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3685 off_t bytes = rdbSavedObjectLen(o,fp);
3686
3687 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3688 }
3689
3690 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3691 static int rdbSave(char *filename) {
3692 dictIterator *di = NULL;
3693 dictEntry *de;
3694 FILE *fp;
3695 char tmpfile[256];
3696 int j;
3697 time_t now = time(NULL);
3698
3699 /* Wait for I/O therads to terminate, just in case this is a
3700 * foreground-saving, to avoid seeking the swap file descriptor at the
3701 * same time. */
3702 if (server.vm_enabled)
3703 waitEmptyIOJobsQueue();
3704
3705 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3706 fp = fopen(tmpfile,"w");
3707 if (!fp) {
3708 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3709 return REDIS_ERR;
3710 }
3711 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3712 for (j = 0; j < server.dbnum; j++) {
3713 redisDb *db = server.db+j;
3714 dict *d = db->dict;
3715 if (dictSize(d) == 0) continue;
3716 di = dictGetIterator(d);
3717 if (!di) {
3718 fclose(fp);
3719 return REDIS_ERR;
3720 }
3721
3722 /* Write the SELECT DB opcode */
3723 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3724 if (rdbSaveLen(fp,j) == -1) goto werr;
3725
3726 /* Iterate this DB writing every entry */
3727 while((de = dictNext(di)) != NULL) {
3728 robj *key = dictGetEntryKey(de);
3729 robj *o = dictGetEntryVal(de);
3730 time_t expiretime = getExpire(db,key);
3731
3732 /* Save the expire time */
3733 if (expiretime != -1) {
3734 /* If this key is already expired skip it */
3735 if (expiretime < now) continue;
3736 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3737 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3738 }
3739 /* Save the key and associated value. This requires special
3740 * handling if the value is swapped out. */
3741 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3742 key->storage == REDIS_VM_SWAPPING) {
3743 /* Save type, key, value */
3744 if (rdbSaveType(fp,o->type) == -1) goto werr;
3745 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3746 if (rdbSaveObject(fp,o) == -1) goto werr;
3747 } else {
3748 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3749 robj *po;
3750 /* Get a preview of the object in memory */
3751 po = vmPreviewObject(key);
3752 /* Save type, key, value */
3753 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3754 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3755 if (rdbSaveObject(fp,po) == -1) goto werr;
3756 /* Remove the loaded object from memory */
3757 decrRefCount(po);
3758 }
3759 }
3760 dictReleaseIterator(di);
3761 }
3762 /* EOF opcode */
3763 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3764
3765 /* Make sure data will not remain on the OS's output buffers */
3766 fflush(fp);
3767 fsync(fileno(fp));
3768 fclose(fp);
3769
3770 /* Use RENAME to make sure the DB file is changed atomically only
3771 * if the generate DB file is ok. */
3772 if (rename(tmpfile,filename) == -1) {
3773 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3774 unlink(tmpfile);
3775 return REDIS_ERR;
3776 }
3777 redisLog(REDIS_NOTICE,"DB saved on disk");
3778 server.dirty = 0;
3779 server.lastsave = time(NULL);
3780 return REDIS_OK;
3781
3782 werr:
3783 fclose(fp);
3784 unlink(tmpfile);
3785 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3786 if (di) dictReleaseIterator(di);
3787 return REDIS_ERR;
3788 }
3789
3790 static int rdbSaveBackground(char *filename) {
3791 pid_t childpid;
3792
3793 if (server.bgsavechildpid != -1) return REDIS_ERR;
3794 if (server.vm_enabled) waitEmptyIOJobsQueue();
3795 if ((childpid = fork()) == 0) {
3796 /* Child */
3797 if (server.vm_enabled) vmReopenSwapFile();
3798 close(server.fd);
3799 if (rdbSave(filename) == REDIS_OK) {
3800 _exit(0);
3801 } else {
3802 _exit(1);
3803 }
3804 } else {
3805 /* Parent */
3806 if (childpid == -1) {
3807 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3808 strerror(errno));
3809 return REDIS_ERR;
3810 }
3811 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3812 server.bgsavechildpid = childpid;
3813 updateDictResizePolicy();
3814 return REDIS_OK;
3815 }
3816 return REDIS_OK; /* unreached */
3817 }
3818
/* Remove the temp file "temp-<childpid>.rdb", that is the name rdbSave()
 * uses when run by the process with the given pid. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3825
/* Read a one byte type from 'fp'. Returns the byte as a non negative
 * integer, or -1 if it was not possible to read it. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3831
/* Read a time stored as a 4 byte signed integer from 'fp'. Returns -1 if
 * it was not possible to read it. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) == 0) return -1;
    return (time_t) raw;
}
3837
3838 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3839 * of this file for a description of how this are stored on disk.
3840 *
3841 * isencoded is set to 1 if the readed length is not actually a length but
3842 * an "encoding type", check the above comments for more info */
3843 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3844 unsigned char buf[2];
3845 uint32_t len;
3846 int type;
3847
3848 if (isencoded) *isencoded = 0;
3849 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3850 type = (buf[0]&0xC0)>>6;
3851 if (type == REDIS_RDB_6BITLEN) {
3852 /* Read a 6 bit len */
3853 return buf[0]&0x3F;
3854 } else if (type == REDIS_RDB_ENCVAL) {
3855 /* Read a 6 bit len encoding type */
3856 if (isencoded) *isencoded = 1;
3857 return buf[0]&0x3F;
3858 } else if (type == REDIS_RDB_14BITLEN) {
3859 /* Read a 14 bit len */
3860 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3861 return ((buf[0]&0x3F)<<8)|buf[1];
3862 } else {
3863 /* Read a 32 bit len */
3864 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3865 return ntohl(len);
3866 }
3867 }
3868
3869 /* Load an integer-encoded object from file 'fp', with the specified
3870 * encoding type 'enctype'. If encode is true the function may return
3871 * an integer-encoded object as reply, otherwise the returned object
3872 * will always be encoded as a raw string. */
3873 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
3874 unsigned char enc[4];
3875 long long val;
3876
3877 if (enctype == REDIS_RDB_ENC_INT8) {
3878 if (fread(enc,1,1,fp) == 0) return NULL;
3879 val = (signed char)enc[0];
3880 } else if (enctype == REDIS_RDB_ENC_INT16) {
3881 uint16_t v;
3882 if (fread(enc,2,1,fp) == 0) return NULL;
3883 v = enc[0]|(enc[1]<<8);
3884 val = (int16_t)v;
3885 } else if (enctype == REDIS_RDB_ENC_INT32) {
3886 uint32_t v;
3887 if (fread(enc,4,1,fp) == 0) return NULL;
3888 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3889 val = (int32_t)v;
3890 } else {
3891 val = 0; /* anti-warning */
3892 redisPanic("Unknown RDB integer encoding type");
3893 }
3894 if (encode)
3895 return createStringObjectFromLongLong(val);
3896 else
3897 return createObject(REDIS_STRING,sdsfromlonglong(val));
3898 }
3899
3900 static robj *rdbLoadLzfStringObject(FILE*fp) {
3901 unsigned int len, clen;
3902 unsigned char *c = NULL;
3903 sds val = NULL;
3904
3905 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3906 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3907 if ((c = zmalloc(clen)) == NULL) goto err;
3908 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3909 if (fread(c,clen,1,fp) == 0) goto err;
3910 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3911 zfree(c);
3912 return createObject(REDIS_STRING,val);
3913 err:
3914 zfree(c);
3915 sdsfree(val);
3916 return NULL;
3917 }
3918
3919 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
3920 int isencoded;
3921 uint32_t len;
3922 sds val;
3923
3924 len = rdbLoadLen(fp,&isencoded);
3925 if (isencoded) {
3926 switch(len) {
3927 case REDIS_RDB_ENC_INT8:
3928 case REDIS_RDB_ENC_INT16:
3929 case REDIS_RDB_ENC_INT32:
3930 return rdbLoadIntegerObject(fp,len,encode);
3931 case REDIS_RDB_ENC_LZF:
3932 return rdbLoadLzfStringObject(fp);
3933 default:
3934 redisPanic("Unknown RDB encoding type");
3935 }
3936 }
3937
3938 if (len == REDIS_RDB_LENERR) return NULL;
3939 val = sdsnewlen(NULL,len);
3940 if (len && fread(val,len,1,fp) == 0) {
3941 sdsfree(val);
3942 return NULL;
3943 }
3944 return createObject(REDIS_STRING,val);
3945 }
3946
3947 static robj *rdbLoadStringObject(FILE *fp) {
3948 return rdbGenericLoadStringObject(fp,0);
3949 }
3950
3951 static robj *rdbLoadEncodedStringObject(FILE *fp) {
3952 return rdbGenericLoadStringObject(fp,1);
3953 }
3954
3955 /* For information about double serialization check rdbSaveDoubleValue() */
3956 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3957 char buf[128];
3958 unsigned char len;
3959
3960 if (fread(&len,1,1,fp) == 0) return -1;
3961 switch(len) {
3962 case 255: *val = R_NegInf; return 0;
3963 case 254: *val = R_PosInf; return 0;
3964 case 253: *val = R_Nan; return 0;
3965 default:
3966 if (fread(buf,len,1,fp) == 0) return -1;
3967 buf[len] = '\0';
3968 sscanf(buf, "%lg", val);
3969 return 0;
3970 }
3971 }
3972
3973 /* Load a Redis object of the specified type from the specified file.
3974 * On success a newly allocated object is returned, otherwise NULL. */
3975 static robj *rdbLoadObject(int type, FILE *fp) {
3976 robj *o;
3977
3978 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3979 if (type == REDIS_STRING) {
3980 /* Read string value */
3981 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
3982 o = tryObjectEncoding(o);
3983 } else if (type == REDIS_LIST || type == REDIS_SET) {
3984 /* Read list/set value */
3985 uint32_t listlen;
3986
3987 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3988 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3989 /* It's faster to expand the dict to the right size asap in order
3990 * to avoid rehashing */
3991 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3992 dictExpand(o->ptr,listlen);
3993 /* Load every single element of the list/set */
3994 while(listlen--) {
3995 robj *ele;
3996
3997 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
3998 ele = tryObjectEncoding(ele);
3999 if (type == REDIS_LIST) {
4000 listAddNodeTail((list*)o->ptr,ele);
4001 } else {
4002 dictAdd((dict*)o->ptr,ele,NULL);
4003 }
4004 }
4005 } else if (type == REDIS_ZSET) {
4006 /* Read list/set value */
4007 size_t zsetlen;
4008 zset *zs;
4009
4010 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4011 o = createZsetObject();
4012 zs = o->ptr;
4013 /* Load every single element of the list/set */
4014 while(zsetlen--) {
4015 robj *ele;
4016 double *score = zmalloc(sizeof(double));
4017
4018 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4019 ele = tryObjectEncoding(ele);
4020 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4021 dictAdd(zs->dict,ele,score);
4022 zslInsert(zs->zsl,*score,ele);
4023 incrRefCount(ele); /* added to skiplist */
4024 }
4025 } else if (type == REDIS_HASH) {
4026 size_t hashlen;
4027
4028 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4029 o = createHashObject();
4030 /* Too many entries? Use an hash table. */
4031 if (hashlen > server.hash_max_zipmap_entries)
4032 convertToRealHash(o);
4033 /* Load every key/value, then set it into the zipmap or hash
4034 * table, as needed. */
4035 while(hashlen--) {
4036 robj *key, *val;
4037
4038 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4039 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4040 /* If we are using a zipmap and there are too big values
4041 * the object is converted to real hash table encoding. */
4042 if (o->encoding != REDIS_ENCODING_HT &&
4043 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4044 sdslen(val->ptr) > server.hash_max_zipmap_value))
4045 {
4046 convertToRealHash(o);
4047 }
4048
4049 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4050 unsigned char *zm = o->ptr;
4051
4052 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4053 val->ptr,sdslen(val->ptr),NULL);
4054 o->ptr = zm;
4055 decrRefCount(key);
4056 decrRefCount(val);
4057 } else {
4058 key = tryObjectEncoding(key);
4059 val = tryObjectEncoding(val);
4060 dictAdd((dict*)o->ptr,key,val);
4061 }
4062 }
4063 } else {
4064 redisPanic("Unknown object type");
4065 }
4066 return o;
4067 }
4068
4069 static int rdbLoad(char *filename) {
4070 FILE *fp;
4071 uint32_t dbid;
4072 int type, retval, rdbver;
4073 int swap_all_values = 0;
4074 dict *d = server.db[0].dict;
4075 redisDb *db = server.db+0;
4076 char buf[1024];
4077 time_t expiretime, now = time(NULL);
4078 long long loadedkeys = 0;
4079
4080 fp = fopen(filename,"r");
4081 if (!fp) return REDIS_ERR;
4082 if (fread(buf,9,1,fp) == 0) goto eoferr;
4083 buf[9] = '\0';
4084 if (memcmp(buf,"REDIS",5) != 0) {
4085 fclose(fp);
4086 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4087 return REDIS_ERR;
4088 }
4089 rdbver = atoi(buf+5);
4090 if (rdbver != 1) {
4091 fclose(fp);
4092 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4093 return REDIS_ERR;
4094 }
4095 while(1) {
4096 robj *key, *val;
4097
4098 expiretime = -1;
4099 /* Read type. */
4100 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4101 if (type == REDIS_EXPIRETIME) {
4102 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4103 /* We read the time so we need to read the object type again */
4104 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4105 }
4106 if (type == REDIS_EOF) break;
4107 /* Handle SELECT DB opcode as a special case */
4108 if (type == REDIS_SELECTDB) {
4109 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4110 goto eoferr;
4111 if (dbid >= (unsigned)server.dbnum) {
4112 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4113 exit(1);
4114 }
4115 db = server.db+dbid;
4116 d = db->dict;
4117 continue;
4118 }
4119 /* Read key */
4120 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4121 /* Read value */
4122 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4123 /* Check if the key already expired */
4124 if (expiretime != -1 && expiretime < now) {
4125 decrRefCount(key);
4126 decrRefCount(val);
4127 continue;
4128 }
4129 /* Add the new object in the hash table */
4130 retval = dictAdd(d,key,val);
4131 if (retval == DICT_ERR) {
4132 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4133 exit(1);
4134 }
4135 loadedkeys++;
4136 /* Set the expire time if needed */
4137 if (expiretime != -1) setExpire(db,key,expiretime);
4138
4139 /* Handle swapping while loading big datasets when VM is on */
4140
4141 /* If we detecter we are hopeless about fitting something in memory
4142 * we just swap every new key on disk. Directly...
4143 * Note that's important to check for this condition before resorting
4144 * to random sampling, otherwise we may try to swap already
4145 * swapped keys. */
4146 if (swap_all_values) {
4147 dictEntry *de = dictFind(d,key);
4148
4149 /* de may be NULL since the key already expired */
4150 if (de) {
4151 key = dictGetEntryKey(de);
4152 val = dictGetEntryVal(de);
4153
4154 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
4155 dictGetEntryVal(de) = NULL;
4156 }
4157 }
4158 continue;
4159 }
4160
4161 /* If we have still some hope of having some value fitting memory
4162 * then we try random sampling. */
4163 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
4164 while (zmalloc_used_memory() > server.vm_max_memory) {
4165 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4166 }
4167 if (zmalloc_used_memory() > server.vm_max_memory)
4168 swap_all_values = 1; /* We are already using too much mem */
4169 }
4170 }
4171 fclose(fp);
4172 return REDIS_OK;
4173
4174 eoferr: /* unexpected end of file is handled here with a fatal exit */
4175 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4176 exit(1);
4177 return REDIS_ERR; /* Just to avoid warning */
4178 }
4179
4180 /*================================== Shutdown =============================== */
4181 static int prepareForShutdown() {
4182 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4183 /* Kill the saving child if there is a background saving in progress.
4184 We want to avoid race conditions, for instance our saving child may
4185 overwrite the synchronous saving did by SHUTDOWN. */
4186 if (server.bgsavechildpid != -1) {
4187 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4188 kill(server.bgsavechildpid,SIGKILL);
4189 rdbRemoveTempFile(server.bgsavechildpid);
4190 }
4191 if (server.appendonly) {
4192 /* Append only file: fsync() the AOF and exit */
4193 fsync(server.appendfd);
4194 if (server.vm_enabled) unlink(server.vm_swap_file);
4195 } else {
4196 /* Snapshotting. Perform a SYNC SAVE and exit */
4197 if (rdbSave(server.dbfilename) == REDIS_OK) {
4198 if (server.daemonize)
4199 unlink(server.pidfile);
4200 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4201 } else {
4202 /* Ooops.. error saving! The best we can do is to continue
4203 * operating. Note that if there was a background saving process,
4204 * in the next cron() Redis will be notified that the background
4205 * saving aborted, handling special stuff like slaves pending for
4206 * synchronization... */
4207 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4208 return REDIS_ERR;
4209 }
4210 }
4211 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4212 return REDIS_OK;
4213 }
4214
4215 /*================================== Commands =============================== */
4216
4217 static void authCommand(redisClient *c) {
4218 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4219 c->authenticated = 1;
4220 addReply(c,shared.ok);
4221 } else {
4222 c->authenticated = 0;
4223 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4224 }
4225 }
4226
4227 static void pingCommand(redisClient *c) {
4228 addReply(c,shared.pong);
4229 }
4230
4231 static void echoCommand(redisClient *c) {
4232 addReplyBulk(c,c->argv[1]);
4233 }
4234
4235 /*=================================== Strings =============================== */
4236
4237 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4238 int retval;
4239 long seconds = 0; /* initialized to avoid an harmness warning */
4240
4241 if (expire) {
4242 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4243 return;
4244 if (seconds <= 0) {
4245 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4246 return;
4247 }
4248 }
4249
4250 touchWatchedKey(c->db,key);
4251 if (nx) deleteIfVolatile(c->db,key);
4252 retval = dictAdd(c->db->dict,key,val);
4253 if (retval == DICT_ERR) {
4254 if (!nx) {
4255 /* If the key is about a swapped value, we want a new key object
4256 * to overwrite the old. So we delete the old key in the database.
4257 * This will also make sure that swap pages about the old object
4258 * will be marked as free. */
4259 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4260 incrRefCount(key);
4261 dictReplace(c->db->dict,key,val);
4262 incrRefCount(val);
4263 } else {
4264 addReply(c,shared.czero);
4265 return;
4266 }
4267 } else {
4268 incrRefCount(key);
4269 incrRefCount(val);
4270 }
4271 server.dirty++;
4272 removeExpire(c->db,key);
4273 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4274 addReply(c, nx ? shared.cone : shared.ok);
4275 }
4276
4277 static void setCommand(redisClient *c) {
4278 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4279 }
4280
4281 static void setnxCommand(redisClient *c) {
4282 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4283 }
4284
4285 static void setexCommand(redisClient *c) {
4286 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4287 }
4288
4289 static int getGenericCommand(redisClient *c) {
4290 robj *o;
4291
4292 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4293 return REDIS_OK;
4294
4295 if (o->type != REDIS_STRING) {
4296 addReply(c,shared.wrongtypeerr);
4297 return REDIS_ERR;
4298 } else {
4299 addReplyBulk(c,o);
4300 return REDIS_OK;
4301 }
4302 }
4303
4304 static void getCommand(redisClient *c) {
4305 getGenericCommand(c);
4306 }
4307
4308 static void getsetCommand(redisClient *c) {
4309 if (getGenericCommand(c) == REDIS_ERR) return;
4310 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4311 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4312 } else {
4313 incrRefCount(c->argv[1]);
4314 }
4315 incrRefCount(c->argv[2]);
4316 server.dirty++;
4317 removeExpire(c->db,c->argv[1]);
4318 }
4319
4320 static void mgetCommand(redisClient *c) {
4321 int j;
4322
4323 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4324 for (j = 1; j < c->argc; j++) {
4325 robj *o = lookupKeyRead(c->db,c->argv[j]);
4326 if (o == NULL) {
4327 addReply(c,shared.nullbulk);
4328 } else {
4329 if (o->type != REDIS_STRING) {
4330 addReply(c,shared.nullbulk);
4331 } else {
4332 addReplyBulk(c,o);
4333 }
4334 }
4335 }
4336 }
4337
4338 static void msetGenericCommand(redisClient *c, int nx) {
4339 int j, busykeys = 0;
4340
4341 if ((c->argc % 2) == 0) {
4342 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4343 return;
4344 }
4345 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4346 * set nothing at all if at least one already key exists. */
4347 if (nx) {
4348 for (j = 1; j < c->argc; j += 2) {
4349 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4350 busykeys++;
4351 }
4352 }
4353 }
4354 if (busykeys) {
4355 addReply(c, shared.czero);
4356 return;
4357 }
4358
4359 for (j = 1; j < c->argc; j += 2) {
4360 int retval;
4361
4362 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4363 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4364 if (retval == DICT_ERR) {
4365 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4366 incrRefCount(c->argv[j+1]);
4367 } else {
4368 incrRefCount(c->argv[j]);
4369 incrRefCount(c->argv[j+1]);
4370 }
4371 removeExpire(c->db,c->argv[j]);
4372 }
4373 server.dirty += (c->argc-1)/2;
4374 addReply(c, nx ? shared.cone : shared.ok);
4375 }
4376
4377 static void msetCommand(redisClient *c) {
4378 msetGenericCommand(c,0);
4379 }
4380
4381 static void msetnxCommand(redisClient *c) {
4382 msetGenericCommand(c,1);
4383 }
4384
4385 static void incrDecrCommand(redisClient *c, long long incr) {
4386 long long value;
4387 int retval;
4388 robj *o;
4389
4390 o = lookupKeyWrite(c->db,c->argv[1]);
4391 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4392 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4393
4394 value += incr;
4395 o = createStringObjectFromLongLong(value);
4396 retval = dictAdd(c->db->dict,c->argv[1],o);
4397 if (retval == DICT_ERR) {
4398 dictReplace(c->db->dict,c->argv[1],o);
4399 removeExpire(c->db,c->argv[1]);
4400 } else {
4401 incrRefCount(c->argv[1]);
4402 }
4403 server.dirty++;
4404 addReply(c,shared.colon);
4405 addReply(c,o);
4406 addReply(c,shared.crlf);
4407 }
4408
4409 static void incrCommand(redisClient *c) {
4410 incrDecrCommand(c,1);
4411 }
4412
4413 static void decrCommand(redisClient *c) {
4414 incrDecrCommand(c,-1);
4415 }
4416
4417 static void incrbyCommand(redisClient *c) {
4418 long long incr;
4419
4420 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4421 incrDecrCommand(c,incr);
4422 }
4423
4424 static void decrbyCommand(redisClient *c) {
4425 long long incr;
4426
4427 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4428 incrDecrCommand(c,-incr);
4429 }
4430
4431 static void appendCommand(redisClient *c) {
4432 int retval;
4433 size_t totlen;
4434 robj *o;
4435
4436 o = lookupKeyWrite(c->db,c->argv[1]);
4437 if (o == NULL) {
4438 /* Create the key */
4439 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4440 incrRefCount(c->argv[1]);
4441 incrRefCount(c->argv[2]);
4442 totlen = stringObjectLen(c->argv[2]);
4443 } else {
4444 dictEntry *de;
4445
4446 de = dictFind(c->db->dict,c->argv[1]);
4447 assert(de != NULL);
4448
4449 o = dictGetEntryVal(de);
4450 if (o->type != REDIS_STRING) {
4451 addReply(c,shared.wrongtypeerr);
4452 return;
4453 }
4454 /* If the object is specially encoded or shared we have to make
4455 * a copy */
4456 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4457 robj *decoded = getDecodedObject(o);
4458
4459 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4460 decrRefCount(decoded);
4461 dictReplace(c->db->dict,c->argv[1],o);
4462 }
4463 /* APPEND! */
4464 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4465 o->ptr = sdscatlen(o->ptr,
4466 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4467 } else {
4468 o->ptr = sdscatprintf(o->ptr, "%ld",
4469 (unsigned long) c->argv[2]->ptr);
4470 }
4471 totlen = sdslen(o->ptr);
4472 }
4473 server.dirty++;
4474 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4475 }
4476
4477 static void substrCommand(redisClient *c) {
4478 robj *o;
4479 long start = atoi(c->argv[2]->ptr);
4480 long end = atoi(c->argv[3]->ptr);
4481 size_t rangelen, strlen;
4482 sds range;
4483
4484 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4485 checkType(c,o,REDIS_STRING)) return;
4486
4487 o = getDecodedObject(o);
4488 strlen = sdslen(o->ptr);
4489
4490 /* convert negative indexes */
4491 if (start < 0) start = strlen+start;
4492 if (end < 0) end = strlen+end;
4493 if (start < 0) start = 0;
4494 if (end < 0) end = 0;
4495
4496 /* indexes sanity checks */
4497 if (start > end || (size_t)start >= strlen) {
4498 /* Out of range start or start > end result in null reply */
4499 addReply(c,shared.nullbulk);
4500 decrRefCount(o);
4501 return;
4502 }
4503 if ((size_t)end >= strlen) end = strlen-1;
4504 rangelen = (end-start)+1;
4505
4506 /* Return the result */
4507 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4508 range = sdsnewlen((char*)o->ptr+start,rangelen);
4509 addReplySds(c,range);
4510 addReply(c,shared.crlf);
4511 decrRefCount(o);
4512 }
4513
4514 /* ========================= Type agnostic commands ========================= */
4515
4516 static void delCommand(redisClient *c) {
4517 int deleted = 0, j;
4518
4519 for (j = 1; j < c->argc; j++) {
4520 if (deleteKey(c->db,c->argv[j])) {
4521 touchWatchedKey(c->db,c->argv[j]);
4522 server.dirty++;
4523 deleted++;
4524 }
4525 }
4526 addReplyLongLong(c,deleted);
4527 }
4528
4529 static void existsCommand(redisClient *c) {
4530 expireIfNeeded(c->db,c->argv[1]);
4531 if (dictFind(c->db->dict,c->argv[1])) {
4532 addReply(c, shared.cone);
4533 } else {
4534 addReply(c, shared.czero);
4535 }
4536 }
4537
4538 static void selectCommand(redisClient *c) {
4539 int id = atoi(c->argv[1]->ptr);
4540
4541 if (selectDb(c,id) == REDIS_ERR) {
4542 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4543 } else {
4544 addReply(c,shared.ok);
4545 }
4546 }
4547
4548 static void randomkeyCommand(redisClient *c) {
4549 dictEntry *de;
4550 robj *key;
4551
4552 while(1) {
4553 de = dictGetRandomKey(c->db->dict);
4554 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4555 }
4556
4557 if (de == NULL) {
4558 addReply(c,shared.nullbulk);
4559 return;
4560 }
4561
4562 key = dictGetEntryKey(de);
4563 if (server.vm_enabled) {
4564 key = dupStringObject(key);
4565 addReplyBulk(c,key);
4566 decrRefCount(key);
4567 } else {
4568 addReplyBulk(c,key);
4569 }
4570 }
4571
4572 static void keysCommand(redisClient *c) {
4573 dictIterator *di;
4574 dictEntry *de;
4575 sds pattern = c->argv[1]->ptr;
4576 int plen = sdslen(pattern);
4577 unsigned long numkeys = 0;
4578 robj *lenobj = createObject(REDIS_STRING,NULL);
4579
4580 di = dictGetIterator(c->db->dict);
4581 addReply(c,lenobj);
4582 decrRefCount(lenobj);
4583 while((de = dictNext(di)) != NULL) {
4584 robj *keyobj = dictGetEntryKey(de);
4585
4586 sds key = keyobj->ptr;
4587 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4588 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4589 if (expireIfNeeded(c->db,keyobj) == 0) {
4590 addReplyBulk(c,keyobj);
4591 numkeys++;
4592 }
4593 }
4594 }
4595 dictReleaseIterator(di);
4596 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4597 }
4598
4599 static void dbsizeCommand(redisClient *c) {
4600 addReplySds(c,
4601 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4602 }
4603
4604 static void lastsaveCommand(redisClient *c) {
4605 addReplySds(c,
4606 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4607 }
4608
4609 static void typeCommand(redisClient *c) {
4610 robj *o;
4611 char *type;
4612
4613 o = lookupKeyRead(c->db,c->argv[1]);
4614 if (o == NULL) {
4615 type = "+none";
4616 } else {
4617 switch(o->type) {
4618 case REDIS_STRING: type = "+string"; break;
4619 case REDIS_LIST: type = "+list"; break;
4620 case REDIS_SET: type = "+set"; break;
4621 case REDIS_ZSET: type = "+zset"; break;
4622 case REDIS_HASH: type = "+hash"; break;
4623 default: type = "+unknown"; break;
4624 }
4625 }
4626 addReplySds(c,sdsnew(type));
4627 addReply(c,shared.crlf);
4628 }
4629
4630 static void saveCommand(redisClient *c) {
4631 if (server.bgsavechildpid != -1) {
4632 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4633 return;
4634 }
4635 if (rdbSave(server.dbfilename) == REDIS_OK) {
4636 addReply(c,shared.ok);
4637 } else {
4638 addReply(c,shared.err);
4639 }
4640 }
4641
4642 static void bgsaveCommand(redisClient *c) {
4643 if (server.bgsavechildpid != -1) {
4644 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4645 return;
4646 }
4647 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4648 char *status = "+Background saving started\r\n";
4649 addReplySds(c,sdsnew(status));
4650 } else {
4651 addReply(c,shared.err);
4652 }
4653 }
4654
4655 static void shutdownCommand(redisClient *c) {
4656 if (prepareForShutdown() == REDIS_OK)
4657 exit(0);
4658 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4659 }
4660
4661 static void renameGenericCommand(redisClient *c, int nx) {
4662 robj *o;
4663
4664 /* To use the same key as src and dst is probably an error */
4665 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4666 addReply(c,shared.sameobjecterr);
4667 return;
4668 }
4669
4670 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4671 return;
4672
4673 incrRefCount(o);
4674 deleteIfVolatile(c->db,c->argv[2]);
4675 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4676 if (nx) {
4677 decrRefCount(o);
4678 addReply(c,shared.czero);
4679 return;
4680 }
4681 dictReplace(c->db->dict,c->argv[2],o);
4682 } else {
4683 incrRefCount(c->argv[2]);
4684 }
4685 deleteKey(c->db,c->argv[1]);
4686 touchWatchedKey(c->db,c->argv[2]);
4687 server.dirty++;
4688 addReply(c,nx ? shared.cone : shared.ok);
4689 }
4690
4691 static void renameCommand(redisClient *c) {
4692 renameGenericCommand(c,0);
4693 }
4694
4695 static void renamenxCommand(redisClient *c) {
4696 renameGenericCommand(c,1);
4697 }
4698
4699 static void moveCommand(redisClient *c) {
4700 robj *o;
4701 redisDb *src, *dst;
4702 int srcid;
4703
4704 /* Obtain source and target DB pointers */
4705 src = c->db;
4706 srcid = c->db->id;
4707 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4708 addReply(c,shared.outofrangeerr);
4709 return;
4710 }
4711 dst = c->db;
4712 selectDb(c,srcid); /* Back to the source DB */
4713
4714 /* If the user is moving using as target the same
4715 * DB as the source DB it is probably an error. */
4716 if (src == dst) {
4717 addReply(c,shared.sameobjecterr);
4718 return;
4719 }
4720
4721 /* Check if the element exists and get a reference */
4722 o = lookupKeyWrite(c->db,c->argv[1]);
4723 if (!o) {
4724 addReply(c,shared.czero);
4725 return;
4726 }
4727
4728 /* Try to add the element to the target DB */
4729 deleteIfVolatile(dst,c->argv[1]);
4730 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4731 addReply(c,shared.czero);
4732 return;
4733 }
4734 incrRefCount(c->argv[1]);
4735 incrRefCount(o);
4736
4737 /* OK! key moved, free the entry in the source DB */
4738 deleteKey(src,c->argv[1]);
4739 server.dirty++;
4740 addReply(c,shared.cone);
4741 }
4742
4743 /* =================================== Lists ================================ */
4744 static void pushGenericCommand(redisClient *c, int where) {
4745 robj *lobj;
4746 list *list;
4747
4748 lobj = lookupKeyWrite(c->db,c->argv[1]);
4749 if (lobj == NULL) {
4750 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4751 addReply(c,shared.cone);
4752 return;
4753 }
4754 lobj = createListObject();
4755 list = lobj->ptr;
4756 if (where == REDIS_HEAD) {
4757 listAddNodeHead(list,c->argv[2]);
4758 } else {
4759 listAddNodeTail(list,c->argv[2]);
4760 }
4761 dictAdd(c->db->dict,c->argv[1],lobj);
4762 incrRefCount(c->argv[1]);
4763 incrRefCount(c->argv[2]);
4764 } else {
4765 if (lobj->type != REDIS_LIST) {
4766 addReply(c,shared.wrongtypeerr);
4767 return;
4768 }
4769 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4770 addReply(c,shared.cone);
4771 return;
4772 }
4773 list = lobj->ptr;
4774 if (where == REDIS_HEAD) {
4775 listAddNodeHead(list,c->argv[2]);
4776 } else {
4777 listAddNodeTail(list,c->argv[2]);
4778 }
4779 incrRefCount(c->argv[2]);
4780 }
4781 server.dirty++;
4782 addReplyLongLong(c,listLength(list));
4783 }
4784
4785 static void lpushCommand(redisClient *c) {
4786 pushGenericCommand(c,REDIS_HEAD);
4787 }
4788
4789 static void rpushCommand(redisClient *c) {
4790 pushGenericCommand(c,REDIS_TAIL);
4791 }
4792
4793 static void llenCommand(redisClient *c) {
4794 robj *o;
4795 list *l;
4796
4797 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4798 checkType(c,o,REDIS_LIST)) return;
4799
4800 l = o->ptr;
4801 addReplyUlong(c,listLength(l));
4802 }
4803
4804 static void lindexCommand(redisClient *c) {
4805 robj *o;
4806 int index = atoi(c->argv[2]->ptr);
4807 list *list;
4808 listNode *ln;
4809
4810 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4811 checkType(c,o,REDIS_LIST)) return;
4812 list = o->ptr;
4813
4814 ln = listIndex(list, index);
4815 if (ln == NULL) {
4816 addReply(c,shared.nullbulk);
4817 } else {
4818 robj *ele = listNodeValue(ln);
4819 addReplyBulk(c,ele);
4820 }
4821 }
4822
4823 static void lsetCommand(redisClient *c) {
4824 robj *o;
4825 int index = atoi(c->argv[2]->ptr);
4826 list *list;
4827 listNode *ln;
4828
4829 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4830 checkType(c,o,REDIS_LIST)) return;
4831 list = o->ptr;
4832
4833 ln = listIndex(list, index);
4834 if (ln == NULL) {
4835 addReply(c,shared.outofrangeerr);
4836 } else {
4837 robj *ele = listNodeValue(ln);
4838
4839 decrRefCount(ele);
4840 listNodeValue(ln) = c->argv[3];
4841 incrRefCount(c->argv[3]);
4842 addReply(c,shared.ok);
4843 server.dirty++;
4844 }
4845 }
4846
4847 static void popGenericCommand(redisClient *c, int where) {
4848 robj *o;
4849 list *list;
4850 listNode *ln;
4851
4852 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4853 checkType(c,o,REDIS_LIST)) return;
4854 list = o->ptr;
4855
4856 if (where == REDIS_HEAD)
4857 ln = listFirst(list);
4858 else
4859 ln = listLast(list);
4860
4861 if (ln == NULL) {
4862 addReply(c,shared.nullbulk);
4863 } else {
4864 robj *ele = listNodeValue(ln);
4865 addReplyBulk(c,ele);
4866 listDelNode(list,ln);
4867 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4868 server.dirty++;
4869 }
4870 }
4871
4872 static void lpopCommand(redisClient *c) {
4873 popGenericCommand(c,REDIS_HEAD);
4874 }
4875
4876 static void rpopCommand(redisClient *c) {
4877 popGenericCommand(c,REDIS_TAIL);
4878 }
4879
4880 static void lrangeCommand(redisClient *c) {
4881 robj *o;
4882 int start = atoi(c->argv[2]->ptr);
4883 int end = atoi(c->argv[3]->ptr);
4884 int llen;
4885 int rangelen, j;
4886 list *list;
4887 listNode *ln;
4888 robj *ele;
4889
4890 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4891 || checkType(c,o,REDIS_LIST)) return;
4892 list = o->ptr;
4893 llen = listLength(list);
4894
4895 /* convert negative indexes */
4896 if (start < 0) start = llen+start;
4897 if (end < 0) end = llen+end;
4898 if (start < 0) start = 0;
4899 if (end < 0) end = 0;
4900
4901 /* indexes sanity checks */
4902 if (start > end || start >= llen) {
4903 /* Out of range start or start > end result in empty list */
4904 addReply(c,shared.emptymultibulk);
4905 return;
4906 }
4907 if (end >= llen) end = llen-1;
4908 rangelen = (end-start)+1;
4909
4910 /* Return the result in form of a multi-bulk reply */
4911 ln = listIndex(list, start);
4912 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4913 for (j = 0; j < rangelen; j++) {
4914 ele = listNodeValue(ln);
4915 addReplyBulk(c,ele);
4916 ln = ln->next;
4917 }
4918 }
4919
4920 static void ltrimCommand(redisClient *c) {
4921 robj *o;
4922 int start = atoi(c->argv[2]->ptr);
4923 int end = atoi(c->argv[3]->ptr);
4924 int llen;
4925 int j, ltrim, rtrim;
4926 list *list;
4927 listNode *ln;
4928
4929 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4930 checkType(c,o,REDIS_LIST)) return;
4931 list = o->ptr;
4932 llen = listLength(list);
4933
4934 /* convert negative indexes */
4935 if (start < 0) start = llen+start;
4936 if (end < 0) end = llen+end;
4937 if (start < 0) start = 0;
4938 if (end < 0) end = 0;
4939
4940 /* indexes sanity checks */
4941 if (start > end || start >= llen) {
4942 /* Out of range start or start > end result in empty list */
4943 ltrim = llen;
4944 rtrim = 0;
4945 } else {
4946 if (end >= llen) end = llen-1;
4947 ltrim = start;
4948 rtrim = llen-end-1;
4949 }
4950
4951 /* Remove list elements to perform the trim */
4952 for (j = 0; j < ltrim; j++) {
4953 ln = listFirst(list);
4954 listDelNode(list,ln);
4955 }
4956 for (j = 0; j < rtrim; j++) {
4957 ln = listLast(list);
4958 listDelNode(list,ln);
4959 }
4960 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4961 server.dirty++;
4962 addReply(c,shared.ok);
4963 }
4964
4965 static void lremCommand(redisClient *c) {
4966 robj *o;
4967 list *list;
4968 listNode *ln, *next;
4969 int toremove = atoi(c->argv[2]->ptr);
4970 int removed = 0;
4971 int fromtail = 0;
4972
4973 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4974 checkType(c,o,REDIS_LIST)) return;
4975 list = o->ptr;
4976
4977 if (toremove < 0) {
4978 toremove = -toremove;
4979 fromtail = 1;
4980 }
4981 ln = fromtail ? list->tail : list->head;
4982 while (ln) {
4983 robj *ele = listNodeValue(ln);
4984
4985 next = fromtail ? ln->prev : ln->next;
4986 if (equalStringObjects(ele,c->argv[3])) {
4987 listDelNode(list,ln);
4988 server.dirty++;
4989 removed++;
4990 if (toremove && removed == toremove) break;
4991 }
4992 ln = next;
4993 }
4994 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4995 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4996 }
4997
4998 /* This is the semantic of this command:
4999 * RPOPLPUSH srclist dstlist:
5000 * IF LLEN(srclist) > 0
5001 * element = RPOP srclist
5002 * LPUSH dstlist element
5003 * RETURN element
5004 * ELSE
5005 * RETURN nil
5006 * END
5007 * END
5008 *
5009 * The idea is to be able to get an element from a list in a reliable way
5010 * since the element is not just returned but pushed against another list
5011 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5012 */
5013 static void rpoplpushcommand(redisClient *c) {
5014 robj *sobj;
5015 list *srclist;
5016 listNode *ln;
5017
5018 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5019 checkType(c,sobj,REDIS_LIST)) return;
5020 srclist = sobj->ptr;
5021 ln = listLast(srclist);
5022
5023 if (ln == NULL) {
5024 addReply(c,shared.nullbulk);
5025 } else {
5026 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5027 robj *ele = listNodeValue(ln);
5028 list *dstlist;
5029
5030 if (dobj && dobj->type != REDIS_LIST) {
5031 addReply(c,shared.wrongtypeerr);
5032 return;
5033 }
5034
5035 /* Add the element to the target list (unless it's directly
5036 * passed to some BLPOP-ing client */
5037 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5038 if (dobj == NULL) {
5039 /* Create the list if the key does not exist */
5040 dobj = createListObject();
5041 dictAdd(c->db->dict,c->argv[2],dobj);
5042 incrRefCount(c->argv[2]);
5043 }
5044 dstlist = dobj->ptr;
5045 listAddNodeHead(dstlist,ele);
5046 incrRefCount(ele);
5047 }
5048
5049 /* Send the element to the client as reply as well */
5050 addReplyBulk(c,ele);
5051
5052 /* Finally remove the element from the source list */
5053 listDelNode(srclist,ln);
5054 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
5055 server.dirty++;
5056 }
5057 }
5058
5059 /* ==================================== Sets ================================ */
5060
5061 static void saddCommand(redisClient *c) {
5062 robj *set;
5063
5064 set = lookupKeyWrite(c->db,c->argv[1]);
5065 if (set == NULL) {
5066 set = createSetObject();
5067 dictAdd(c->db->dict,c->argv[1],set);
5068 incrRefCount(c->argv[1]);
5069 } else {
5070 if (set->type != REDIS_SET) {
5071 addReply(c,shared.wrongtypeerr);
5072 return;
5073 }
5074 }
5075 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5076 incrRefCount(c->argv[2]);
5077 server.dirty++;
5078 addReply(c,shared.cone);
5079 } else {
5080 addReply(c,shared.czero);
5081 }
5082 }
5083
5084 static void sremCommand(redisClient *c) {
5085 robj *set;
5086
5087 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5088 checkType(c,set,REDIS_SET)) return;
5089
5090 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5091 server.dirty++;
5092 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5093 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5094 addReply(c,shared.cone);
5095 } else {
5096 addReply(c,shared.czero);
5097 }
5098 }
5099
5100 static void smoveCommand(redisClient *c) {
5101 robj *srcset, *dstset;
5102
5103 srcset = lookupKeyWrite(c->db,c->argv[1]);
5104 dstset = lookupKeyWrite(c->db,c->argv[2]);
5105
5106 /* If the source key does not exist return 0, if it's of the wrong type
5107 * raise an error */
5108 if (srcset == NULL || srcset->type != REDIS_SET) {
5109 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5110 return;
5111 }
5112 /* Error if the destination key is not a set as well */
5113 if (dstset && dstset->type != REDIS_SET) {
5114 addReply(c,shared.wrongtypeerr);
5115 return;
5116 }
5117 /* Remove the element from the source set */
5118 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5119 /* Key not found in the src set! return zero */
5120 addReply(c,shared.czero);
5121 return;
5122 }
5123 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5124 deleteKey(c->db,c->argv[1]);
5125 server.dirty++;
5126 /* Add the element to the destination set */
5127 if (!dstset) {
5128 dstset = createSetObject();
5129 dictAdd(c->db->dict,c->argv[2],dstset);
5130 incrRefCount(c->argv[2]);
5131 }
5132 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5133 incrRefCount(c->argv[3]);
5134 addReply(c,shared.cone);
5135 }
5136
5137 static void sismemberCommand(redisClient *c) {
5138 robj *set;
5139
5140 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5141 checkType(c,set,REDIS_SET)) return;
5142
5143 if (dictFind(set->ptr,c->argv[2]))
5144 addReply(c,shared.cone);
5145 else
5146 addReply(c,shared.czero);
5147 }
5148
5149 static void scardCommand(redisClient *c) {
5150 robj *o;
5151 dict *s;
5152
5153 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5154 checkType(c,o,REDIS_SET)) return;
5155
5156 s = o->ptr;
5157 addReplyUlong(c,dictSize(s));
5158 }
5159
5160 static void spopCommand(redisClient *c) {
5161 robj *set;
5162 dictEntry *de;
5163
5164 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5165 checkType(c,set,REDIS_SET)) return;
5166
5167 de = dictGetRandomKey(set->ptr);
5168 if (de == NULL) {
5169 addReply(c,shared.nullbulk);
5170 } else {
5171 robj *ele = dictGetEntryKey(de);
5172
5173 addReplyBulk(c,ele);
5174 dictDelete(set->ptr,ele);
5175 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5176 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5177 server.dirty++;
5178 }
5179 }
5180
5181 static void srandmemberCommand(redisClient *c) {
5182 robj *set;
5183 dictEntry *de;
5184
5185 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5186 checkType(c,set,REDIS_SET)) return;
5187
5188 de = dictGetRandomKey(set->ptr);
5189 if (de == NULL) {
5190 addReply(c,shared.nullbulk);
5191 } else {
5192 robj *ele = dictGetEntryKey(de);
5193
5194 addReplyBulk(c,ele);
5195 }
5196 }
5197
5198 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5199 dict **d1 = (void*) s1, **d2 = (void*) s2;
5200
5201 return dictSize(*d1)-dictSize(*d2);
5202 }
5203
5204 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5205 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5206 dictIterator *di;
5207 dictEntry *de;
5208 robj *lenobj = NULL, *dstset = NULL;
5209 unsigned long j, cardinality = 0;
5210
5211 for (j = 0; j < setsnum; j++) {
5212 robj *setobj;
5213
5214 setobj = dstkey ?
5215 lookupKeyWrite(c->db,setskeys[j]) :
5216 lookupKeyRead(c->db,setskeys[j]);
5217 if (!setobj) {
5218 zfree(dv);
5219 if (dstkey) {
5220 if (deleteKey(c->db,dstkey))
5221 server.dirty++;
5222 addReply(c,shared.czero);
5223 } else {
5224 addReply(c,shared.emptymultibulk);
5225 }
5226 return;
5227 }
5228 if (setobj->type != REDIS_SET) {
5229 zfree(dv);
5230 addReply(c,shared.wrongtypeerr);
5231 return;
5232 }
5233 dv[j] = setobj->ptr;
5234 }
5235 /* Sort sets from the smallest to largest, this will improve our
5236 * algorithm's performace */
5237 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5238
5239 /* The first thing we should output is the total number of elements...
5240 * since this is a multi-bulk write, but at this stage we don't know
5241 * the intersection set size, so we use a trick, append an empty object
5242 * to the output list and save the pointer to later modify it with the
5243 * right length */
5244 if (!dstkey) {
5245 lenobj = createObject(REDIS_STRING,NULL);
5246 addReply(c,lenobj);
5247 decrRefCount(lenobj);
5248 } else {
5249 /* If we have a target key where to store the resulting set
5250 * create this key with an empty set inside */
5251 dstset = createSetObject();
5252 }
5253
5254 /* Iterate all the elements of the first (smallest) set, and test
5255 * the element against all the other sets, if at least one set does
5256 * not include the element it is discarded */
5257 di = dictGetIterator(dv[0]);
5258
5259 while((de = dictNext(di)) != NULL) {
5260 robj *ele;
5261
5262 for (j = 1; j < setsnum; j++)
5263 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5264 if (j != setsnum)
5265 continue; /* at least one set does not contain the member */
5266 ele = dictGetEntryKey(de);
5267 if (!dstkey) {
5268 addReplyBulk(c,ele);
5269 cardinality++;
5270 } else {
5271 dictAdd(dstset->ptr,ele,NULL);
5272 incrRefCount(ele);
5273 }
5274 }
5275 dictReleaseIterator(di);
5276
5277 if (dstkey) {
5278 /* Store the resulting set into the target, if the intersection
5279 * is not an empty set. */
5280 deleteKey(c->db,dstkey);
5281 if (dictSize((dict*)dstset->ptr) > 0) {
5282 dictAdd(c->db->dict,dstkey,dstset);
5283 incrRefCount(dstkey);
5284 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5285 } else {
5286 decrRefCount(dstset);
5287 addReply(c,shared.czero);
5288 }
5289 server.dirty++;
5290 } else {
5291 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5292 }
5293 zfree(dv);
5294 }
5295
5296 static void sinterCommand(redisClient *c) {
5297 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5298 }
5299
5300 static void sinterstoreCommand(redisClient *c) {
5301 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5302 }
5303
5304 #define REDIS_OP_UNION 0
5305 #define REDIS_OP_DIFF 1
5306 #define REDIS_OP_INTER 2
5307
5308 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5309 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5310 dictIterator *di;
5311 dictEntry *de;
5312 robj *dstset = NULL;
5313 int j, cardinality = 0;
5314
5315 for (j = 0; j < setsnum; j++) {
5316 robj *setobj;
5317
5318 setobj = dstkey ?
5319 lookupKeyWrite(c->db,setskeys[j]) :
5320 lookupKeyRead(c->db,setskeys[j]);
5321 if (!setobj) {
5322 dv[j] = NULL;
5323 continue;
5324 }
5325 if (setobj->type != REDIS_SET) {
5326 zfree(dv);
5327 addReply(c,shared.wrongtypeerr);
5328 return;
5329 }
5330 dv[j] = setobj->ptr;
5331 }
5332
5333 /* We need a temp set object to store our union. If the dstkey
5334 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5335 * this set object will be the resulting object to set into the target key*/
5336 dstset = createSetObject();
5337
5338 /* Iterate all the elements of all the sets, add every element a single
5339 * time to the result set */
5340 for (j = 0; j < setsnum; j++) {
5341 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5342 if (!dv[j]) continue; /* non existing keys are like empty sets */
5343
5344 di = dictGetIterator(dv[j]);
5345
5346 while((de = dictNext(di)) != NULL) {
5347 robj *ele;
5348
5349 /* dictAdd will not add the same element multiple times */
5350 ele = dictGetEntryKey(de);
5351 if (op == REDIS_OP_UNION || j == 0) {
5352 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5353 incrRefCount(ele);
5354 cardinality++;
5355 }
5356 } else if (op == REDIS_OP_DIFF) {
5357 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5358 cardinality--;
5359 }
5360 }
5361 }
5362 dictReleaseIterator(di);
5363
5364 /* result set is empty? Exit asap. */
5365 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5366 }
5367
5368 /* Output the content of the resulting set, if not in STORE mode */
5369 if (!dstkey) {
5370 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5371 di = dictGetIterator(dstset->ptr);
5372 while((de = dictNext(di)) != NULL) {
5373 robj *ele;
5374
5375 ele = dictGetEntryKey(de);
5376 addReplyBulk(c,ele);
5377 }
5378 dictReleaseIterator(di);
5379 decrRefCount(dstset);
5380 } else {
5381 /* If we have a target key where to store the resulting set
5382 * create this key with the result set inside */
5383 deleteKey(c->db,dstkey);
5384 if (dictSize((dict*)dstset->ptr) > 0) {
5385 dictAdd(c->db->dict,dstkey,dstset);
5386 incrRefCount(dstkey);
5387 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5388 } else {
5389 decrRefCount(dstset);
5390 addReply(c,shared.czero);
5391 }
5392 server.dirty++;
5393 }
5394 zfree(dv);
5395 }
5396
5397 static void sunionCommand(redisClient *c) {
5398 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5399 }
5400
5401 static void sunionstoreCommand(redisClient *c) {
5402 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5403 }
5404
5405 static void sdiffCommand(redisClient *c) {
5406 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5407 }
5408
5409 static void sdiffstoreCommand(redisClient *c) {
5410 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5411 }
5412
5413 /* ==================================== ZSets =============================== */
5414
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5422
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This allows traversing the list
 * from tail to head, useful for ZREVRANGE. */
5431
5432 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5433 zskiplistNode *zn = zmalloc(sizeof(*zn));
5434
5435 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5436 if (level > 1)
5437 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5438 else
5439 zn->span = NULL;
5440 zn->score = score;
5441 zn->obj = obj;
5442 return zn;
5443 }
5444
5445 static zskiplist *zslCreate(void) {
5446 int j;
5447 zskiplist *zsl;
5448
5449 zsl = zmalloc(sizeof(*zsl));
5450 zsl->level = 1;
5451 zsl->length = 0;
5452 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5453 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5454 zsl->header->forward[j] = NULL;
5455
5456 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5457 if (j < ZSKIPLIST_MAXLEVEL-1)
5458 zsl->header->span[j] = 0;
5459 }
5460 zsl->header->backward = NULL;
5461 zsl->tail = NULL;
5462 return zsl;
5463 }
5464
5465 static void zslFreeNode(zskiplistNode *node) {
5466 decrRefCount(node->obj);
5467 zfree(node->forward);
5468 zfree(node->span);
5469 zfree(node);
5470 }
5471
5472 static void zslFree(zskiplist *zsl) {
5473 zskiplistNode *node = zsl->header->forward[0], *next;
5474
5475 zfree(zsl->header->forward);
5476 zfree(zsl->header->span);
5477 zfree(zsl->header);
5478 while(node) {
5479 next = node->forward[0];
5480 zslFreeNode(node);
5481 node = next;
5482 }
5483 zfree(zsl);
5484 }
5485
5486 static int zslRandomLevel(void) {
5487 int level = 1;
5488 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5489 level += 1;
5490 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5491 }
5492
5493 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5494 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5495 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5496 int i, level;
5497
5498 x = zsl->header;
5499 for (i = zsl->level-1; i >= 0; i--) {
5500 /* store rank that is crossed to reach the insert position */
5501 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5502
5503 while (x->forward[i] &&
5504 (x->forward[i]->score < score ||
5505 (x->forward[i]->score == score &&
5506 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5507 rank[i] += i > 0 ? x->span[i-1] : 1;
5508 x = x->forward[i];
5509 }
5510 update[i] = x;
5511 }
5512 /* we assume the key is not already inside, since we allow duplicated
5513 * scores, and the re-insertion of score and redis object should never
5514 * happpen since the caller of zslInsert() should test in the hash table
5515 * if the element is already inside or not. */
5516 level = zslRandomLevel();
5517 if (level > zsl->level) {
5518 for (i = zsl->level; i < level; i++) {
5519 rank[i] = 0;
5520 update[i] = zsl->header;
5521 update[i]->span[i-1] = zsl->length;
5522 }
5523 zsl->level = level;
5524 }
5525 x = zslCreateNode(level,score,obj);
5526 for (i = 0; i < level; i++) {
5527 x->forward[i] = update[i]->forward[i];
5528 update[i]->forward[i] = x;
5529
5530 /* update span covered by update[i] as x is inserted here */
5531 if (i > 0) {
5532 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5533 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5534 }
5535 }
5536
5537 /* increment span for untouched levels */
5538 for (i = level; i < zsl->level; i++) {
5539 update[i]->span[i-1]++;
5540 }
5541
5542 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5543 if (x->forward[0])
5544 x->forward[0]->backward = x;
5545 else
5546 zsl->tail = x;
5547 zsl->length++;
5548 }
5549
5550 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5551 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5552 int i;
5553 for (i = 0; i < zsl->level; i++) {
5554 if (update[i]->forward[i] == x) {
5555 if (i > 0) {
5556 update[i]->span[i-1] += x->span[i-1] - 1;
5557 }
5558 update[i]->forward[i] = x->forward[i];
5559 } else {
5560 /* invariant: i > 0, because update[0]->forward[0]
5561 * is always equal to x */
5562 update[i]->span[i-1] -= 1;
5563 }
5564 }
5565 if (x->forward[0]) {
5566 x->forward[0]->backward = x->backward;
5567 } else {
5568 zsl->tail = x->backward;
5569 }
5570 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5571 zsl->level--;
5572 zsl->length--;
5573 }
5574
5575 /* Delete an element with matching score/object from the skiplist. */
5576 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5577 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5578 int i;
5579
5580 x = zsl->header;
5581 for (i = zsl->level-1; i >= 0; i--) {
5582 while (x->forward[i] &&
5583 (x->forward[i]->score < score ||
5584 (x->forward[i]->score == score &&
5585 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5586 x = x->forward[i];
5587 update[i] = x;
5588 }
5589 /* We may have multiple elements with the same score, what we need
5590 * is to find the element with both the right score and object. */
5591 x = x->forward[0];
5592 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5593 zslDeleteNode(zsl, x, update);
5594 zslFreeNode(x);
5595 return 1;
5596 } else {
5597 return 0; /* not found */
5598 }
5599 return 0; /* not found */
5600 }
5601
5602 /* Delete all the elements with score between min and max from the skiplist.
5603 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5604 * Note that this function takes the reference to the hash table view of the
5605 * sorted set, in order to remove the elements from the hash table too. */
5606 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5607 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5608 unsigned long removed = 0;
5609 int i;
5610
5611 x = zsl->header;
5612 for (i = zsl->level-1; i >= 0; i--) {
5613 while (x->forward[i] && x->forward[i]->score < min)
5614 x = x->forward[i];
5615 update[i] = x;
5616 }
5617 /* We may have multiple elements with the same score, what we need
5618 * is to find the element with both the right score and object. */
5619 x = x->forward[0];
5620 while (x && x->score <= max) {
5621 zskiplistNode *next = x->forward[0];
5622 zslDeleteNode(zsl, x, update);
5623 dictDelete(dict,x->obj);
5624 zslFreeNode(x);
5625 removed++;
5626 x = next;
5627 }
5628 return removed; /* not found */
5629 }
5630
5631 /* Delete all the elements with rank between start and end from the skiplist.
5632 * Start and end are inclusive. Note that start and end need to be 1-based */
5633 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5634 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5635 unsigned long traversed = 0, removed = 0;
5636 int i;
5637
5638 x = zsl->header;
5639 for (i = zsl->level-1; i >= 0; i--) {
5640 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5641 traversed += i > 0 ? x->span[i-1] : 1;
5642 x = x->forward[i];
5643 }
5644 update[i] = x;
5645 }
5646
5647 traversed++;
5648 x = x->forward[0];
5649 while (x && traversed <= end) {
5650 zskiplistNode *next = x->forward[0];
5651 zslDeleteNode(zsl, x, update);
5652 dictDelete(dict,x->obj);
5653 zslFreeNode(x);
5654 removed++;
5655 traversed++;
5656 x = next;
5657 }
5658 return removed;
5659 }
5660
5661 /* Find the first node having a score equal or greater than the specified one.
5662 * Returns NULL if there is no match. */
5663 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5664 zskiplistNode *x;
5665 int i;
5666
5667 x = zsl->header;
5668 for (i = zsl->level-1; i >= 0; i--) {
5669 while (x->forward[i] && x->forward[i]->score < score)
5670 x = x->forward[i];
5671 }
5672 /* We may have multiple elements with the same score, what we need
5673 * is to find the element with both the right score and object. */
5674 return x->forward[0];
5675 }
5676
5677 /* Find the rank for an element by both score and key.
5678 * Returns 0 when the element cannot be found, rank otherwise.
5679 * Note that the rank is 1-based due to the span of zsl->header to the
5680 * first element. */
5681 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5682 zskiplistNode *x;
5683 unsigned long rank = 0;
5684 int i;
5685
5686 x = zsl->header;
5687 for (i = zsl->level-1; i >= 0; i--) {
5688 while (x->forward[i] &&
5689 (x->forward[i]->score < score ||
5690 (x->forward[i]->score == score &&
5691 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5692 rank += i > 0 ? x->span[i-1] : 1;
5693 x = x->forward[i];
5694 }
5695
5696 /* x might be equal to zsl->header, so test if obj is non-NULL */
5697 if (x->obj && equalStringObjects(x->obj,o)) {
5698 return rank;
5699 }
5700 }
5701 return 0;
5702 }
5703
5704 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5705 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5706 zskiplistNode *x;
5707 unsigned long traversed = 0;
5708 int i;
5709
5710 x = zsl->header;
5711 for (i = zsl->level-1; i >= 0; i--) {
5712 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5713 {
5714 traversed += i > 0 ? x->span[i-1] : 1;
5715 x = x->forward[i];
5716 }
5717 if (traversed == rank) {
5718 return x;
5719 }
5720 }
5721 return NULL;
5722 }
5723
5724 /* The actual Z-commands implementations */
5725
5726 /* This generic command implements both ZADD and ZINCRBY.
5727 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5728 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5729 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5730 robj *zsetobj;
5731 zset *zs;
5732 double *score;
5733
5734 zsetobj = lookupKeyWrite(c->db,key);
5735 if (zsetobj == NULL) {
5736 zsetobj = createZsetObject();
5737 dictAdd(c->db->dict,key,zsetobj);
5738 incrRefCount(key);
5739 } else {
5740 if (zsetobj->type != REDIS_ZSET) {
5741 addReply(c,shared.wrongtypeerr);
5742 return;
5743 }
5744 }
5745 zs = zsetobj->ptr;
5746
5747 /* Ok now since we implement both ZADD and ZINCRBY here the code
5748 * needs to handle the two different conditions. It's all about setting
5749 * '*score', that is, the new score to set, to the right value. */
5750 score = zmalloc(sizeof(double));
5751 if (doincrement) {
5752 dictEntry *de;
5753
5754 /* Read the old score. If the element was not present starts from 0 */
5755 de = dictFind(zs->dict,ele);
5756 if (de) {
5757 double *oldscore = dictGetEntryVal(de);
5758 *score = *oldscore + scoreval;
5759 } else {
5760 *score = scoreval;
5761 }
5762 } else {
5763 *score = scoreval;
5764 }
5765
5766 /* What follows is a simple remove and re-insert operation that is common
5767 * to both ZADD and ZINCRBY... */
5768 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5769 /* case 1: New element */
5770 incrRefCount(ele); /* added to hash */
5771 zslInsert(zs->zsl,*score,ele);
5772 incrRefCount(ele); /* added to skiplist */
5773 server.dirty++;
5774 if (doincrement)
5775 addReplyDouble(c,*score);
5776 else
5777 addReply(c,shared.cone);
5778 } else {
5779 dictEntry *de;
5780 double *oldscore;
5781
5782 /* case 2: Score update operation */
5783 de = dictFind(zs->dict,ele);
5784 redisAssert(de != NULL);
5785 oldscore = dictGetEntryVal(de);
5786 if (*score != *oldscore) {
5787 int deleted;
5788
5789 /* Remove and insert the element in the skip list with new score */
5790 deleted = zslDelete(zs->zsl,*oldscore,ele);
5791 redisAssert(deleted != 0);
5792 zslInsert(zs->zsl,*score,ele);
5793 incrRefCount(ele);
5794 /* Update the score in the hash table */
5795 dictReplace(zs->dict,ele,score);
5796 server.dirty++;
5797 } else {
5798 zfree(score);
5799 }
5800 if (doincrement)
5801 addReplyDouble(c,*score);
5802 else
5803 addReply(c,shared.czero);
5804 }
5805 }
5806
5807 static void zaddCommand(redisClient *c) {
5808 double scoreval;
5809
5810 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5811 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5812 }
5813
5814 static void zincrbyCommand(redisClient *c) {
5815 double scoreval;
5816
5817 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5818 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5819 }
5820
5821 static void zremCommand(redisClient *c) {
5822 robj *zsetobj;
5823 zset *zs;
5824 dictEntry *de;
5825 double *oldscore;
5826 int deleted;
5827
5828 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5829 checkType(c,zsetobj,REDIS_ZSET)) return;
5830
5831 zs = zsetobj->ptr;
5832 de = dictFind(zs->dict,c->argv[2]);
5833 if (de == NULL) {
5834 addReply(c,shared.czero);
5835 return;
5836 }
5837 /* Delete from the skiplist */
5838 oldscore = dictGetEntryVal(de);
5839 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5840 redisAssert(deleted != 0);
5841
5842 /* Delete from the hash table */
5843 dictDelete(zs->dict,c->argv[2]);
5844 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5845 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5846 server.dirty++;
5847 addReply(c,shared.cone);
5848 }
5849
5850 static void zremrangebyscoreCommand(redisClient *c) {
5851 double min;
5852 double max;
5853 long deleted;
5854 robj *zsetobj;
5855 zset *zs;
5856
5857 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5858 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5859
5860 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5861 checkType(c,zsetobj,REDIS_ZSET)) return;
5862
5863 zs = zsetobj->ptr;
5864 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5865 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5866 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5867 server.dirty += deleted;
5868 addReplyLongLong(c,deleted);
5869 }
5870
5871 static void zremrangebyrankCommand(redisClient *c) {
5872 long start;
5873 long end;
5874 int llen;
5875 long deleted;
5876 robj *zsetobj;
5877 zset *zs;
5878
5879 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5880 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5881
5882 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5883 checkType(c,zsetobj,REDIS_ZSET)) return;
5884 zs = zsetobj->ptr;
5885 llen = zs->zsl->length;
5886
5887 /* convert negative indexes */
5888 if (start < 0) start = llen+start;
5889 if (end < 0) end = llen+end;
5890 if (start < 0) start = 0;
5891 if (end < 0) end = 0;
5892
5893 /* indexes sanity checks */
5894 if (start > end || start >= llen) {
5895 addReply(c,shared.czero);
5896 return;
5897 }
5898 if (end >= llen) end = llen-1;
5899
5900 /* increment start and end because zsl*Rank functions
5901 * use 1-based rank */
5902 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5903 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5904 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5905 server.dirty += deleted;
5906 addReplyLongLong(c, deleted);
5907 }
5908
/* One input of ZUNIONSTORE/ZINTERSTORE. */
typedef struct {
    dict *dict;     /* hash table of the source key, NULL when the key is missing */
    double weight;  /* factor applied to every score read from this source */
} zsetopsrc;
5913
5914 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5915 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5916 unsigned long size1, size2;
5917 size1 = d1->dict ? dictSize(d1->dict) : 0;
5918 size2 = d2->dict ? dictSize(d2->dict) : 0;
5919 return size1 - size2;
5920 }
5921
/* Ways of combining the scores of an element found in several inputs. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in a dict entry: entries with a NULL value count as 1.0,
 * otherwise the value is read as a pointer to double.
 * Note that the argument is evaluated more than once. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
5926
5927 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5928 if (aggregate == REDIS_AGGR_SUM) {
5929 *target = *target + val;
5930 } else if (aggregate == REDIS_AGGR_MIN) {
5931 *target = val < *target ? val : *target;
5932 } else if (aggregate == REDIS_AGGR_MAX) {
5933 *target = val > *target ? val : *target;
5934 } else {
5935 /* safety net */
5936 redisPanic("Unknown ZUNION/INTER aggregate type");
5937 }
5938 }
5939
5940 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5941 int i, j, setnum;
5942 int aggregate = REDIS_AGGR_SUM;
5943 zsetopsrc *src;
5944 robj *dstobj;
5945 zset *dstzset;
5946 dictIterator *di;
5947 dictEntry *de;
5948
5949 /* expect setnum input keys to be given */
5950 setnum = atoi(c->argv[2]->ptr);
5951 if (setnum < 1) {
5952 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
5953 return;
5954 }
5955
5956 /* test if the expected number of keys would overflow */
5957 if (3+setnum > c->argc) {
5958 addReply(c,shared.syntaxerr);
5959 return;
5960 }
5961
5962 /* read keys to be used for input */
5963 src = zmalloc(sizeof(zsetopsrc) * setnum);
5964 for (i = 0, j = 3; i < setnum; i++, j++) {
5965 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
5966 if (!obj) {
5967 src[i].dict = NULL;
5968 } else {
5969 if (obj->type == REDIS_ZSET) {
5970 src[i].dict = ((zset*)obj->ptr)->dict;
5971 } else if (obj->type == REDIS_SET) {
5972 src[i].dict = (obj->ptr);
5973 } else {
5974 zfree(src);
5975 addReply(c,shared.wrongtypeerr);
5976 return;
5977 }
5978 }
5979
5980 /* default all weights to 1 */
5981 src[i].weight = 1.0;
5982 }
5983
5984 /* parse optional extra arguments */
5985 if (j < c->argc) {
5986 int remaining = c->argc - j;
5987
5988 while (remaining) {
5989 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5990 j++; remaining--;
5991 for (i = 0; i < setnum; i++, j++, remaining--) {
5992 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5993 return;
5994 }
5995 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5996 j++; remaining--;
5997 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5998 aggregate = REDIS_AGGR_SUM;
5999 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6000 aggregate = REDIS_AGGR_MIN;
6001 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6002 aggregate = REDIS_AGGR_MAX;
6003 } else {
6004 zfree(src);
6005 addReply(c,shared.syntaxerr);
6006 return;
6007 }
6008 j++; remaining--;
6009 } else {
6010 zfree(src);
6011 addReply(c,shared.syntaxerr);
6012 return;
6013 }
6014 }
6015 }
6016
6017 /* sort sets from the smallest to largest, this will improve our
6018 * algorithm's performance */
6019 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6020
6021 dstobj = createZsetObject();
6022 dstzset = dstobj->ptr;
6023
6024 if (op == REDIS_OP_INTER) {
6025 /* skip going over all entries if the smallest zset is NULL or empty */
6026 if (src[0].dict && dictSize(src[0].dict) > 0) {
6027 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6028 * from small to large, all src[i > 0].dict are non-empty too */
6029 di = dictGetIterator(src[0].dict);
6030 while((de = dictNext(di)) != NULL) {
6031 double *score = zmalloc(sizeof(double)), value;
6032 *score = src[0].weight * zunionInterDictValue(de);
6033
6034 for (j = 1; j < setnum; j++) {
6035 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6036 if (other) {
6037 value = src[j].weight * zunionInterDictValue(other);
6038 zunionInterAggregate(score, value, aggregate);
6039 } else {
6040 break;
6041 }
6042 }
6043
6044 /* skip entry when not present in every source dict */
6045 if (j != setnum) {
6046 zfree(score);
6047 } else {
6048 robj *o = dictGetEntryKey(de);
6049 dictAdd(dstzset->dict,o,score);
6050 incrRefCount(o); /* added to dictionary */
6051 zslInsert(dstzset->zsl,*score,o);
6052 incrRefCount(o); /* added to skiplist */
6053 }
6054 }
6055 dictReleaseIterator(di);
6056 }
6057 } else if (op == REDIS_OP_UNION) {
6058 for (i = 0; i < setnum; i++) {
6059 if (!src[i].dict) continue;
6060
6061 di = dictGetIterator(src[i].dict);
6062 while((de = dictNext(di)) != NULL) {
6063 /* skip key when already processed */
6064 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6065
6066 double *score = zmalloc(sizeof(double)), value;
6067 *score = src[i].weight * zunionInterDictValue(de);
6068
6069 /* because the zsets are sorted by size, its only possible
6070 * for sets at larger indices to hold this entry */
6071 for (j = (i+1); j < setnum; j++) {
6072 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6073 if (other) {
6074 value = src[j].weight * zunionInterDictValue(other);
6075 zunionInterAggregate(score, value, aggregate);
6076 }
6077 }
6078
6079 robj *o = dictGetEntryKey(de);
6080 dictAdd(dstzset->dict,o,score);
6081 incrRefCount(o); /* added to dictionary */
6082 zslInsert(dstzset->zsl,*score,o);
6083 incrRefCount(o); /* added to skiplist */
6084 }
6085 dictReleaseIterator(di);
6086 }
6087 } else {
6088 /* unknown operator */
6089 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6090 }
6091
6092 deleteKey(c->db,dstkey);
6093 if (dstzset->zsl->length) {
6094 dictAdd(c->db->dict,dstkey,dstobj);
6095 incrRefCount(dstkey);
6096 addReplyLongLong(c, dstzset->zsl->length);
6097 server.dirty++;
6098 } else {
6099 decrRefCount(dstobj);
6100 addReply(c, shared.czero);
6101 }
6102 zfree(src);
6103 }
6104
6105 static void zunionstoreCommand(redisClient *c) {
6106 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6107 }
6108
6109 static void zinterstoreCommand(redisClient *c) {
6110 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6111 }
6112
6113 static void zrangeGenericCommand(redisClient *c, int reverse) {
6114 robj *o;
6115 long start;
6116 long end;
6117 int withscores = 0;
6118 int llen;
6119 int rangelen, j;
6120 zset *zsetobj;
6121 zskiplist *zsl;
6122 zskiplistNode *ln;
6123 robj *ele;
6124
6125 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6126 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6127
6128 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6129 withscores = 1;
6130 } else if (c->argc >= 5) {
6131 addReply(c,shared.syntaxerr);
6132 return;
6133 }
6134
6135 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6136 || checkType(c,o,REDIS_ZSET)) return;
6137 zsetobj = o->ptr;
6138 zsl = zsetobj->zsl;
6139 llen = zsl->length;
6140
6141 /* convert negative indexes */
6142 if (start < 0) start = llen+start;
6143 if (end < 0) end = llen+end;
6144 if (start < 0) start = 0;
6145 if (end < 0) end = 0;
6146
6147 /* indexes sanity checks */
6148 if (start > end || start >= llen) {
6149 /* Out of range start or start > end result in empty list */
6150 addReply(c,shared.emptymultibulk);
6151 return;
6152 }
6153 if (end >= llen) end = llen-1;
6154 rangelen = (end-start)+1;
6155
6156 /* check if starting point is trivial, before searching
6157 * the element in log(N) time */
6158 if (reverse) {
6159 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6160 } else {
6161 ln = start == 0 ?
6162 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6163 }
6164
6165 /* Return the result in form of a multi-bulk reply */
6166 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6167 withscores ? (rangelen*2) : rangelen));
6168 for (j = 0; j < rangelen; j++) {
6169 ele = ln->obj;
6170 addReplyBulk(c,ele);
6171 if (withscores)
6172 addReplyDouble(c,ln->score);
6173 ln = reverse ? ln->backward : ln->forward[0];
6174 }
6175 }
6176
6177 static void zrangeCommand(redisClient *c) {
6178 zrangeGenericCommand(c,0);
6179 }
6180
6181 static void zrevrangeCommand(redisClient *c) {
6182 zrangeGenericCommand(c,1);
6183 }
6184
6185 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6186 * If justcount is non-zero, just the count is returned. */
6187 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6188 robj *o;
6189 double min, max;
6190 int minex = 0, maxex = 0; /* are min or max exclusive? */
6191 int offset = 0, limit = -1;
6192 int withscores = 0;
6193 int badsyntax = 0;
6194
6195 /* Parse the min-max interval. If one of the values is prefixed
6196 * by the "(" character, it's considered "open". For instance
6197 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6198 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6199 if (((char*)c->argv[2]->ptr)[0] == '(') {
6200 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6201 minex = 1;
6202 } else {
6203 min = strtod(c->argv[2]->ptr,NULL);
6204 }
6205 if (((char*)c->argv[3]->ptr)[0] == '(') {
6206 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6207 maxex = 1;
6208 } else {
6209 max = strtod(c->argv[3]->ptr,NULL);
6210 }
6211
6212 /* Parse "WITHSCORES": note that if the command was called with
6213 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6214 * enter the following paths to parse WITHSCORES and LIMIT. */
6215 if (c->argc == 5 || c->argc == 8) {
6216 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6217 withscores = 1;
6218 else
6219 badsyntax = 1;
6220 }
6221 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6222 badsyntax = 1;
6223 if (badsyntax) {
6224 addReplySds(c,
6225 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6226 return;
6227 }
6228
6229 /* Parse "LIMIT" */
6230 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6231 addReply(c,shared.syntaxerr);
6232 return;
6233 } else if (c->argc == (7 + withscores)) {
6234 offset = atoi(c->argv[5]->ptr);
6235 limit = atoi(c->argv[6]->ptr);
6236 if (offset < 0) offset = 0;
6237 }
6238
6239 /* Ok, lookup the key and get the range */
6240 o = lookupKeyRead(c->db,c->argv[1]);
6241 if (o == NULL) {
6242 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6243 } else {
6244 if (o->type != REDIS_ZSET) {
6245 addReply(c,shared.wrongtypeerr);
6246 } else {
6247 zset *zsetobj = o->ptr;
6248 zskiplist *zsl = zsetobj->zsl;
6249 zskiplistNode *ln;
6250 robj *ele, *lenobj = NULL;
6251 unsigned long rangelen = 0;
6252
6253 /* Get the first node with the score >= min, or with
6254 * score > min if 'minex' is true. */
6255 ln = zslFirstWithScore(zsl,min);
6256 while (minex && ln && ln->score == min) ln = ln->forward[0];
6257
6258 if (ln == NULL) {
6259 /* No element matching the speciifed interval */
6260 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6261 return;
6262 }
6263
6264 /* We don't know in advance how many matching elements there
6265 * are in the list, so we push this object that will represent
6266 * the multi-bulk length in the output buffer, and will "fix"
6267 * it later */
6268 if (!justcount) {
6269 lenobj = createObject(REDIS_STRING,NULL);
6270 addReply(c,lenobj);
6271 decrRefCount(lenobj);
6272 }
6273
6274 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6275 if (offset) {
6276 offset--;
6277 ln = ln->forward[0];
6278 continue;
6279 }
6280 if (limit == 0) break;
6281 if (!justcount) {
6282 ele = ln->obj;
6283 addReplyBulk(c,ele);
6284 if (withscores)
6285 addReplyDouble(c,ln->score);
6286 }
6287 ln = ln->forward[0];
6288 rangelen++;
6289 if (limit > 0) limit--;
6290 }
6291 if (justcount) {
6292 addReplyLongLong(c,(long)rangelen);
6293 } else {
6294 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6295 withscores ? (rangelen*2) : rangelen);
6296 }
6297 }
6298 }
6299 }
6300
6301 static void zrangebyscoreCommand(redisClient *c) {
6302 genericZrangebyscoreCommand(c,0);
6303 }
6304
6305 static void zcountCommand(redisClient *c) {
6306 genericZrangebyscoreCommand(c,1);
6307 }
6308
6309 static void zcardCommand(redisClient *c) {
6310 robj *o;
6311 zset *zs;
6312
6313 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6314 checkType(c,o,REDIS_ZSET)) return;
6315
6316 zs = o->ptr;
6317 addReplyUlong(c,zs->zsl->length);
6318 }
6319
6320 static void zscoreCommand(redisClient *c) {
6321 robj *o;
6322 zset *zs;
6323 dictEntry *de;
6324
6325 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6326 checkType(c,o,REDIS_ZSET)) return;
6327
6328 zs = o->ptr;
6329 de = dictFind(zs->dict,c->argv[2]);
6330 if (!de) {
6331 addReply(c,shared.nullbulk);
6332 } else {
6333 double *score = dictGetEntryVal(de);
6334
6335 addReplyDouble(c,*score);
6336 }
6337 }
6338
6339 static void zrankGenericCommand(redisClient *c, int reverse) {
6340 robj *o;
6341 zset *zs;
6342 zskiplist *zsl;
6343 dictEntry *de;
6344 unsigned long rank;
6345 double *score;
6346
6347 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6348 checkType(c,o,REDIS_ZSET)) return;
6349
6350 zs = o->ptr;
6351 zsl = zs->zsl;
6352 de = dictFind(zs->dict,c->argv[2]);
6353 if (!de) {
6354 addReply(c,shared.nullbulk);
6355 return;
6356 }
6357
6358 score = dictGetEntryVal(de);
6359 rank = zslGetRank(zsl, *score, c->argv[2]);
6360 if (rank) {
6361 if (reverse) {
6362 addReplyLongLong(c, zsl->length - rank);
6363 } else {
6364 addReplyLongLong(c, rank-1);
6365 }
6366 } else {
6367 addReply(c,shared.nullbulk);
6368 }
6369 }
6370
6371 static void zrankCommand(redisClient *c) {
6372 zrankGenericCommand(c, 0);
6373 }
6374
6375 static void zrevrankCommand(redisClient *c) {
6376 zrankGenericCommand(c, 1);
6377 }
6378
6379 /* ========================= Hashes utility functions ======================= */
/* Flags for the 'what' argument of hashCurrent(): select the field (key)
 * or the value of the current entry. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
6382
6383 /* Check the length of a number of objects to see if we need to convert a
6384 * zipmap to a real hash. Note that we only check string encoded objects
6385 * as their string length can be queried in constant time. */
6386 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6387 int i;
6388 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6389
6390 for (i = start; i <= end; i++) {
6391 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6392 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6393 {
6394 convertToRealHash(subject);
6395 return;
6396 }
6397 }
6398 }
6399
6400 /* Encode given objects in-place when the hash uses a dict. */
6401 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6402 if (subject->encoding == REDIS_ENCODING_HT) {
6403 if (o1) *o1 = tryObjectEncoding(*o1);
6404 if (o2) *o2 = tryObjectEncoding(*o2);
6405 }
6406 }
6407
6408 /* Get the value from a hash identified by key. Returns either a string
6409 * object or NULL if the value cannot be found. The refcount of the object
6410 * is always increased by 1 when the value was found. */
6411 static robj *hashGet(robj *o, robj *key) {
6412 robj *value = NULL;
6413 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6414 unsigned char *v;
6415 unsigned int vlen;
6416 key = getDecodedObject(key);
6417 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6418 value = createStringObject((char*)v,vlen);
6419 }
6420 decrRefCount(key);
6421 } else {
6422 dictEntry *de = dictFind(o->ptr,key);
6423 if (de != NULL) {
6424 value = dictGetEntryVal(de);
6425 incrRefCount(value);
6426 }
6427 }
6428 return value;
6429 }
6430
6431 /* Test if the key exists in the given hash. Returns 1 if the key
6432 * exists and 0 when it doesn't. */
6433 static int hashExists(robj *o, robj *key) {
6434 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6435 key = getDecodedObject(key);
6436 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6437 decrRefCount(key);
6438 return 1;
6439 }
6440 decrRefCount(key);
6441 } else {
6442 if (dictFind(o->ptr,key) != NULL) {
6443 return 1;
6444 }
6445 }
6446 return 0;
6447 }
6448
6449 /* Add an element, discard the old if the key already exists.
6450 * Return 0 on insert and 1 on update. */
6451 static int hashSet(robj *o, robj *key, robj *value) {
6452 int update = 0;
6453 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6454 key = getDecodedObject(key);
6455 value = getDecodedObject(value);
6456 o->ptr = zipmapSet(o->ptr,
6457 key->ptr,sdslen(key->ptr),
6458 value->ptr,sdslen(value->ptr), &update);
6459 decrRefCount(key);
6460 decrRefCount(value);
6461
6462 /* Check if the zipmap needs to be upgraded to a real hash table */
6463 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6464 convertToRealHash(o);
6465 } else {
6466 if (dictReplace(o->ptr,key,value)) {
6467 /* Insert */
6468 incrRefCount(key);
6469 } else {
6470 /* Update */
6471 update = 1;
6472 }
6473 incrRefCount(value);
6474 }
6475 return update;
6476 }
6477
6478 /* Delete an element from a hash.
6479 * Return 1 on deleted and 0 on not found. */
6480 static int hashDelete(robj *o, robj *key) {
6481 int deleted = 0;
6482 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6483 key = getDecodedObject(key);
6484 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6485 decrRefCount(key);
6486 } else {
6487 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6488 /* Always check if the dictionary needs a resize after a delete. */
6489 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6490 }
6491 return deleted;
6492 }
6493
6494 /* Return the number of elements in a hash. */
6495 static unsigned long hashLength(robj *o) {
6496 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6497 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6498 }
6499
/* Structure to hold hash iteration abstraction. Note that iteration over
 * hashes involves both fields and values. Because it is possible that
 * not both are required, store pointers in the iterator to avoid
 * unnecessary memory allocation for fields/values. */
typedef struct {
    int encoding;               /* encoding of the hash being iterated */
    unsigned char *zi;          /* zipmap cursor, advanced by zipmapNext() */
    unsigned char *zk, *zv;     /* current zipmap field and value */
    unsigned int zklen, zvlen;  /* lengths of zk and zv */

    dictIterator *di;           /* iterator used for dict encoded hashes */
    dictEntry *de;              /* current dict entry */
} hashIterator;
6513
6514 static hashIterator *hashInitIterator(robj *subject) {
6515 hashIterator *hi = zmalloc(sizeof(hashIterator));
6516 hi->encoding = subject->encoding;
6517 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6518 hi->zi = zipmapRewind(subject->ptr);
6519 } else if (hi->encoding == REDIS_ENCODING_HT) {
6520 hi->di = dictGetIterator(subject->ptr);
6521 } else {
6522 redisAssert(NULL);
6523 }
6524 return hi;
6525 }
6526
6527 static void hashReleaseIterator(hashIterator *hi) {
6528 if (hi->encoding == REDIS_ENCODING_HT) {
6529 dictReleaseIterator(hi->di);
6530 }
6531 zfree(hi);
6532 }
6533
6534 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6535 * could be found and REDIS_ERR when the iterator reaches the end. */
6536 static int hashNext(hashIterator *hi) {
6537 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6538 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6539 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6540 } else {
6541 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6542 }
6543 return REDIS_OK;
6544 }
6545
6546 /* Get key or value object at current iteration position.
6547 * This increases the refcount of the field object by 1. */
6548 static robj *hashCurrent(hashIterator *hi, int what) {
6549 robj *o;
6550 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6551 if (what & REDIS_HASH_KEY) {
6552 o = createStringObject((char*)hi->zk,hi->zklen);
6553 } else {
6554 o = createStringObject((char*)hi->zv,hi->zvlen);
6555 }
6556 } else {
6557 if (what & REDIS_HASH_KEY) {
6558 o = dictGetEntryKey(hi->de);
6559 } else {
6560 o = dictGetEntryVal(hi->de);
6561 }
6562 incrRefCount(o);
6563 }
6564 return o;
6565 }
6566
6567 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6568 robj *o = lookupKeyWrite(c->db,key);
6569 if (o == NULL) {
6570 o = createHashObject();
6571 dictAdd(c->db->dict,key,o);
6572 incrRefCount(key);
6573 } else {
6574 if (o->type != REDIS_HASH) {
6575 addReply(c,shared.wrongtypeerr);
6576 return NULL;
6577 }
6578 }
6579 return o;
6580 }
6581
6582 /* ============================= Hash commands ============================== */
6583 static void hsetCommand(redisClient *c) {
6584 int update;
6585 robj *o;
6586
6587 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6588 hashTryConversion(o,c->argv,2,3);
6589 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6590 update = hashSet(o,c->argv[2],c->argv[3]);
6591 addReply(c, update ? shared.czero : shared.cone);
6592 server.dirty++;
6593 }
6594
6595 static void hsetnxCommand(redisClient *c) {
6596 robj *o;
6597 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6598 hashTryConversion(o,c->argv,2,3);
6599
6600 if (hashExists(o, c->argv[2])) {
6601 addReply(c, shared.czero);
6602 } else {
6603 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6604 hashSet(o,c->argv[2],c->argv[3]);
6605 addReply(c, shared.cone);
6606 server.dirty++;
6607 }
6608 }
6609
6610 static void hmsetCommand(redisClient *c) {
6611 int i;
6612 robj *o;
6613
6614 if ((c->argc % 2) == 1) {
6615 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6616 return;
6617 }
6618
6619 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6620 hashTryConversion(o,c->argv,2,c->argc-1);
6621 for (i = 2; i < c->argc; i += 2) {
6622 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6623 hashSet(o,c->argv[i],c->argv[i+1]);
6624 }
6625 addReply(c, shared.ok);
6626 server.dirty++;
6627 }
6628
6629 static void hincrbyCommand(redisClient *c) {
6630 long long value, incr;
6631 robj *o, *current, *new;
6632
6633 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6634 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6635 if ((current = hashGet(o,c->argv[2])) != NULL) {
6636 if (getLongLongFromObjectOrReply(c,current,&value,
6637 "hash value is not an integer") != REDIS_OK) {
6638 decrRefCount(current);
6639 return;
6640 }
6641 decrRefCount(current);
6642 } else {
6643 value = 0;
6644 }
6645
6646 value += incr;
6647 new = createStringObjectFromLongLong(value);
6648 hashTryObjectEncoding(o,&c->argv[2],NULL);
6649 hashSet(o,c->argv[2],new);
6650 decrRefCount(new);
6651 addReplyLongLong(c,value);
6652 server.dirty++;
6653 }
6654
6655 static void hgetCommand(redisClient *c) {
6656 robj *o, *value;
6657 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6658 checkType(c,o,REDIS_HASH)) return;
6659
6660 if ((value = hashGet(o,c->argv[2])) != NULL) {
6661 addReplyBulk(c,value);
6662 decrRefCount(value);
6663 } else {
6664 addReply(c,shared.nullbulk);
6665 }
6666 }
6667
6668 static void hmgetCommand(redisClient *c) {
6669 int i;
6670 robj *o, *value;
6671 o = lookupKeyRead(c->db,c->argv[1]);
6672 if (o != NULL && o->type != REDIS_HASH) {
6673 addReply(c,shared.wrongtypeerr);
6674 }
6675
6676 /* Note the check for o != NULL happens inside the loop. This is
6677 * done because objects that cannot be found are considered to be
6678 * an empty hash. The reply should then be a series of NULLs. */
6679 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6680 for (i = 2; i < c->argc; i++) {
6681 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6682 addReplyBulk(c,value);
6683 decrRefCount(value);
6684 } else {
6685 addReply(c,shared.nullbulk);
6686 }
6687 }
6688 }
6689
6690 static void hdelCommand(redisClient *c) {
6691 robj *o;
6692 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6693 checkType(c,o,REDIS_HASH)) return;
6694
6695 if (hashDelete(o,c->argv[2])) {
6696 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6697 addReply(c,shared.cone);
6698 server.dirty++;
6699 } else {
6700 addReply(c,shared.czero);
6701 }
6702 }
6703
6704 static void hlenCommand(redisClient *c) {
6705 robj *o;
6706 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6707 checkType(c,o,REDIS_HASH)) return;
6708
6709 addReplyUlong(c,hashLength(o));
6710 }
6711
6712 static void genericHgetallCommand(redisClient *c, int flags) {
6713 robj *o, *lenobj, *obj;
6714 unsigned long count = 0;
6715 hashIterator *hi;
6716
6717 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6718 || checkType(c,o,REDIS_HASH)) return;
6719
6720 lenobj = createObject(REDIS_STRING,NULL);
6721 addReply(c,lenobj);
6722 decrRefCount(lenobj);
6723
6724 hi = hashInitIterator(o);
6725 while (hashNext(hi) != REDIS_ERR) {
6726 if (flags & REDIS_HASH_KEY) {
6727 obj = hashCurrent(hi,REDIS_HASH_KEY);
6728 addReplyBulk(c,obj);
6729 decrRefCount(obj);
6730 count++;
6731 }
6732 if (flags & REDIS_HASH_VALUE) {
6733 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6734 addReplyBulk(c,obj);
6735 decrRefCount(obj);
6736 count++;
6737 }
6738 }
6739 hashReleaseIterator(hi);
6740
6741 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6742 }
6743
6744 static void hkeysCommand(redisClient *c) {
6745 genericHgetallCommand(c,REDIS_HASH_KEY);
6746 }
6747
6748 static void hvalsCommand(redisClient *c) {
6749 genericHgetallCommand(c,REDIS_HASH_VALUE);
6750 }
6751
6752 static void hgetallCommand(redisClient *c) {
6753 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6754 }
6755
6756 static void hexistsCommand(redisClient *c) {
6757 robj *o;
6758 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6759 checkType(c,o,REDIS_HASH)) return;
6760
6761 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6762 }
6763
6764 static void convertToRealHash(robj *o) {
6765 unsigned char *key, *val, *p, *zm = o->ptr;
6766 unsigned int klen, vlen;
6767 dict *dict = dictCreate(&hashDictType,NULL);
6768
6769 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6770 p = zipmapRewind(zm);
6771 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6772 robj *keyobj, *valobj;
6773
6774 keyobj = createStringObject((char*)key,klen);
6775 valobj = createStringObject((char*)val,vlen);
6776 keyobj = tryObjectEncoding(keyobj);
6777 valobj = tryObjectEncoding(valobj);
6778 dictAdd(dict,keyobj,valobj);
6779 }
6780 o->encoding = REDIS_ENCODING_HT;
6781 o->ptr = dict;
6782 zfree(zm);
6783 }
6784
6785 /* ========================= Non type-specific commands ==================== */
6786
6787 static void flushdbCommand(redisClient *c) {
6788 server.dirty += dictSize(c->db->dict);
6789 touchWatchedKeysOnFlush(c->db->id);
6790 dictEmpty(c->db->dict);
6791 dictEmpty(c->db->expires);
6792 addReply(c,shared.ok);
6793 }
6794
6795 static void flushallCommand(redisClient *c) {
6796 touchWatchedKeysOnFlush(-1);
6797 server.dirty += emptyDb();
6798 addReply(c,shared.ok);
6799 if (server.bgsavechildpid != -1) {
6800 kill(server.bgsavechildpid,SIGKILL);
6801 rdbRemoveTempFile(server.bgsavechildpid);
6802 }
6803 rdbSave(server.dbfilename);
6804 server.dirty++;
6805 }
6806
6807 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6808 redisSortOperation *so = zmalloc(sizeof(*so));
6809 so->type = type;
6810 so->pattern = pattern;
6811 return so;
6812 }
6813
6814 /* Return the value associated to the key with a name obtained
6815 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6816 * The returned object will always have its refcount increased by 1
6817 * when it is non-NULL. */
6818 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6819 char *p, *f;
6820 sds spat, ssub;
6821 robj keyobj, fieldobj, *o;
6822 int prefixlen, sublen, postfixlen, fieldlen;
6823 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6824 struct {
6825 long len;
6826 long free;
6827 char buf[REDIS_SORTKEY_MAX+1];
6828 } keyname, fieldname;
6829
6830 /* If the pattern is "#" return the substitution object itself in order
6831 * to implement the "SORT ... GET #" feature. */
6832 spat = pattern->ptr;
6833 if (spat[0] == '#' && spat[1] == '\0') {
6834 incrRefCount(subst);
6835 return subst;
6836 }
6837
6838 /* The substitution object may be specially encoded. If so we create
6839 * a decoded object on the fly. Otherwise getDecodedObject will just
6840 * increment the ref count, that we'll decrement later. */
6841 subst = getDecodedObject(subst);
6842
6843 ssub = subst->ptr;
6844 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6845 p = strchr(spat,'*');
6846 if (!p) {
6847 decrRefCount(subst);
6848 return NULL;
6849 }
6850
6851 /* Find out if we're dealing with a hash dereference. */
6852 if ((f = strstr(p+1, "->")) != NULL) {
6853 fieldlen = sdslen(spat)-(f-spat);
6854 /* this also copies \0 character */
6855 memcpy(fieldname.buf,f+2,fieldlen-1);
6856 fieldname.len = fieldlen-2;
6857 } else {
6858 fieldlen = 0;
6859 }
6860
6861 prefixlen = p-spat;
6862 sublen = sdslen(ssub);
6863 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6864 memcpy(keyname.buf,spat,prefixlen);
6865 memcpy(keyname.buf+prefixlen,ssub,sublen);
6866 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6867 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6868 keyname.len = prefixlen+sublen+postfixlen;
6869 decrRefCount(subst);
6870
6871 /* Lookup substituted key */
6872 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6873 o = lookupKeyRead(db,&keyobj);
6874 if (o == NULL) return NULL;
6875
6876 if (fieldlen > 0) {
6877 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6878
6879 /* Retrieve value from hash by the field name. This operation
6880 * already increases the refcount of the returned object. */
6881 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6882 o = hashGet(o, &fieldobj);
6883 } else {
6884 if (o->type != REDIS_STRING) return NULL;
6885
6886 /* Every object that this function returns needs to have its refcount
6887 * increased. sortCommand decreases it again. */
6888 incrRefCount(o);
6889 }
6890
6891 return o;
6892 }
6893
6894 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6895 * the additional parameter is not standard but a BSD-specific we have to
6896 * pass sorting parameters via the global 'server' structure */
6897 static int sortCompare(const void *s1, const void *s2) {
6898 const redisSortObject *so1 = s1, *so2 = s2;
6899 int cmp;
6900
6901 if (!server.sort_alpha) {
6902 /* Numeric sorting. Here it's trivial as we precomputed scores */
6903 if (so1->u.score > so2->u.score) {
6904 cmp = 1;
6905 } else if (so1->u.score < so2->u.score) {
6906 cmp = -1;
6907 } else {
6908 cmp = 0;
6909 }
6910 } else {
6911 /* Alphanumeric sorting */
6912 if (server.sort_bypattern) {
6913 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6914 /* At least one compare object is NULL */
6915 if (so1->u.cmpobj == so2->u.cmpobj)
6916 cmp = 0;
6917 else if (so1->u.cmpobj == NULL)
6918 cmp = -1;
6919 else
6920 cmp = 1;
6921 } else {
6922 /* We have both the objects, use strcoll */
6923 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6924 }
6925 } else {
6926 /* Compare elements directly. */
6927 cmp = compareStringObjects(so1->obj,so2->obj);
6928 }
6929 }
6930 return server.sort_desc ? -cmp : cmp;
6931 }
6932
6933 /* The SORT command is the most complex command in Redis. Warning: this code
6934 * is optimized for speed and a bit less for readability */
/* Overview of what the code below does:
 * - argv[1] is the key to sort. A missing key replies shared.emptymultibulk;
 *   a key that is not a list, set or sorted set replies shared.wrongtypeerr.
 * - Options parsed from argv[2..]: ASC, DESC, ALPHA, LIMIT <start> <count>,
 *   STORE <key>, BY <pattern>, GET <pattern> (repeatable). Anything else
 *   replies shared.syntaxerr.
 * - All elements are loaded into 'vector', weights are computed (unless the
 *   BY pattern has no '*'), the vector is sorted, and the [start,end] range
 *   is either sent to the client or, with STORE, saved as a list under the
 *   given key with the number of stored items as the reply. */
6935 static void sortCommand(redisClient *c) {
6936 list *operations;
6937 int outputlen = 0;
6938 int desc = 0, alpha = 0;
6939 int limit_start = 0, limit_count = -1, start, end;
6940 int j, dontsort = 0, vectorlen;
6941 int getop = 0; /* GET operation counter */
6942 robj *sortval, *sortby = NULL, *storekey = NULL;
6943 redisSortObject *vector; /* Resulting vector to sort */
6944
6945 /* Lookup the key to sort. It must be of the right types */
6946 sortval = lookupKeyRead(c->db,c->argv[1]);
6947 if (sortval == NULL) {
6948 addReply(c,shared.emptymultibulk);
6949 return;
6950 }
6951 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6952 sortval->type != REDIS_ZSET)
6953 {
6954 addReply(c,shared.wrongtypeerr);
6955 return;
6956 }
6957
6958 /* Create a list of operations to perform for every sorted element.
6959 * Operations can be GET/DEL/INCR/DECR */
/* NOTE(review): only REDIS_SORT_GET operations are ever created by the
 * parser below. */
6960 operations = listCreate();
6961 listSetFreeMethod(operations,zfree);
6962 j = 2;
6963
6964 /* Now we need to protect sortval incrementing its count, in the future
6965 * SORT may have options able to overwrite/delete keys during the sorting
6966 * and the sorted key itself may get destroied */
6967 incrRefCount(sortval);
6968
6969 /* The SORT command has an SQL-alike syntax, parse it */
6970 while(j < c->argc) {
/* Number of arguments remaining after the current one. */
6971 int leftargs = c->argc-j-1;
6972 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6973 desc = 0;
6974 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6975 desc = 1;
6976 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6977 alpha = 1;
6978 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
/* NOTE(review): atoi() reports no errors, so non-numeric LIMIT
 * arguments are not rejected here. */
6979 limit_start = atoi(c->argv[j+1]->ptr);
6980 limit_count = atoi(c->argv[j+2]->ptr);
6981 j+=2;
6982 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6983 storekey = c->argv[j+1];
6984 j++;
6985 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6986 sortby = c->argv[j+1];
6987 /* If the BY pattern does not contain '*', i.e. it is constant,
6988 * we don't need to sort nor to lookup the weight keys. */
6989 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6990 j++;
6991 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6992 listAddNodeTail(operations,createSortOperation(
6993 REDIS_SORT_GET,c->argv[j+1]));
6994 getop++;
6995 j++;
6996 } else {
/* Unknown option, or an option missing its arguments: undo the
 * setup done so far and report a syntax error. */
6997 decrRefCount(sortval);
6998 listRelease(operations);
6999 addReply(c,shared.syntaxerr);
7000 return;
7001 }
7002 j++;
7003 }
7004
7005 /* Load the sorting vector with all the objects to sort */
7006 switch(sortval->type) {
7007 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7008 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7009 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7010 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7011 }
7012 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7013 j = 0;
7014
/* Every slot starts with score 0 and a NULL cmpobj; vector[j].obj points
 * at the element stored in the container (no refcount change here). */
7015 if (sortval->type == REDIS_LIST) {
7016 list *list = sortval->ptr;
7017 listNode *ln;
7018 listIter li;
7019
7020 listRewind(list,&li);
7021 while((ln = listNext(&li))) {
7022 robj *ele = ln->value;
7023 vector[j].obj = ele;
7024 vector[j].u.score = 0;
7025 vector[j].u.cmpobj = NULL;
7026 j++;
7027 }
7028 } else {
7029 dict *set;
7030 dictIterator *di;
7031 dictEntry *setele;
7032
7033 if (sortval->type == REDIS_SET) {
7034 set = sortval->ptr;
7035 } else {
7036 zset *zs = sortval->ptr;
7037 set = zs->dict;
7038 }
7039
7040 di = dictGetIterator(set);
7041 while((setele = dictNext(di)) != NULL) {
7042 vector[j].obj = dictGetEntryKey(setele);
7043 vector[j].u.score = 0;
7044 vector[j].u.cmpobj = NULL;
7045 j++;
7046 }
7047 dictReleaseIterator(di);
7048 }
7049 redisAssert(j == vectorlen);
7050
7051 /* Now it's time to load the right scores in the sorting vector */
7052 if (dontsort == 0) {
7053 for (j = 0; j < vectorlen; j++) {
7054 robj *byval;
7055 if (sortby) {
7056 /* lookup value to sort by */
7057 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
/* No weight found: the slot keeps the defaults assigned while
 * loading the vector. */
7058 if (!byval) continue;
7059 } else {
7060 /* use object itself to sort by */
7061 byval = vector[j].obj;
7062 }
7063
7064 if (alpha) {
/* Only BY-pattern sorts need cmpobj; without BY, sortCompare()
 * compares the elements themselves. */
7065 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7066 } else {
7067 if (byval->encoding == REDIS_ENCODING_RAW) {
7068 vector[j].u.score = strtod(byval->ptr,NULL);
7069 } else if (byval->encoding == REDIS_ENCODING_INT) {
7070 /* Don't need to decode the object if it's
7071 * integer-encoded (the only encoding supported) so
7072 * far. We can just cast it */
7073 vector[j].u.score = (long)byval->ptr;
7074 } else {
7075 redisAssert(1 != 1);
7076 }
7077 }
7078
7079 /* when the object was retrieved using lookupKeyByPattern,
7080 * its refcount needs to be decreased. */
7081 if (sortby) {
7082 decrRefCount(byval);
7083 }
7084 }
7085 }
7086
7087 /* We are ready to sort the vector... perform a bit of sanity check
7088 * on the LIMIT option too. We'll use a partial version of quicksort. */
7089 start = (limit_start < 0) ? 0 : limit_start;
7090 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
/* Offset past the last element: force an empty range (end < start). */
7091 if (start >= vectorlen) {
7092 start = vectorlen-1;
7093 end = vectorlen-2;
7094 }
7095 if (end >= vectorlen) end = vectorlen-1;
7096
7097 if (dontsort == 0) {
/* sortCompare() reads its parameters from these globals. */
7098 server.sort_desc = desc;
7099 server.sort_alpha = alpha;
7100 server.sort_bypattern = sortby ? 1 : 0;
7101 if (sortby && (start != 0 || end != vectorlen-1))
7102 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7103 else
7104 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7105 }
7106
7107 /* Send command output to the output buffer, performing the specified
7108 * GET/DEL/INCR/DECR operations if any. */
/* With GET operations each selected element yields one item per GET. */
7109 outputlen = getop ? getop*(end-start+1) : end-start+1;
7110 if (storekey == NULL) {
7111 /* STORE option not specified, sent the sorting result to client */
7112 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7113 for (j = start; j <= end; j++) {
7114 listNode *ln;
7115 listIter li;
7116
7117 if (!getop) addReplyBulk(c,vector[j].obj);
7118 listRewind(operations,&li);
7119 while((ln = listNext(&li))) {
7120 redisSortOperation *sop = ln->value;
7121 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7122 vector[j].obj);
7123
7124 if (sop->type == REDIS_SORT_GET) {
7125 if (!val) {
7126 addReply(c,shared.nullbulk);
7127 } else {
7128 addReplyBulk(c,val);
7129 decrRefCount(val);
7130 }
7131 } else {
7132 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7133 }
7134 }
7135 }
7136 } else {
7137 robj *listObject = createListObject();
7138 list *listPtr = (list*) listObject->ptr;
7139
7140 /* STORE option specified, set the sorting result as a List object */
7141 for (j = start; j <= end; j++) {
7142 listNode *ln;
7143 listIter li;
7144
7145 if (!getop) {
7146 listAddNodeTail(listPtr,vector[j].obj);
7147 incrRefCount(vector[j].obj);
7148 }
7149 listRewind(operations,&li);
7150 while((ln = listNext(&li))) {
7151 redisSortOperation *sop = ln->value;
7152 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7153 vector[j].obj);
7154
7155 if (sop->type == REDIS_SORT_GET) {
7156 if (!val) {
/* A missing GET value is stored as an empty string. */
7157 listAddNodeTail(listPtr,createStringObject("",0));
7158 } else {
7159 /* We should do a incrRefCount on val because it is
7160 * added to the list, but also a decrRefCount because
7161 * it is returned by lookupKeyByPattern. This results
7162 * in doing nothing at all. */
7163 listAddNodeTail(listPtr,val);
7164 }
7165 } else {
7166 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7167 }
7168 }
7169 }
/* NOTE(review): presumably dictReplace() returns non-zero when the key
 * was newly added, in which case the dict now references storekey --
 * confirm against dict.c. */
7170 if (dictReplace(c->db->dict,storekey,listObject)) {
7171 incrRefCount(storekey);
7172 }
7173 /* Note: we add 1 because the DB is dirty anyway since even if the
7174 * SORT result is empty a new key is set and maybe the old content
7175 * replaced. */
7176 server.dirty += 1+outputlen;
7177 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7178 }
7179
7180 /* Cleanup */
7181 decrRefCount(sortval);
7182 listRelease(operations);
/* Release the decoded weight objects created for ALPHA + BY sorting. */
7183 for (j = 0; j < vectorlen; j++) {
7184 if (alpha && vector[j].u.cmpobj)
7185 decrRefCount(vector[j].u.cmpobj);
7186 }
7187 zfree(vector);
7188 }
7189
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the result (at most 22 bytes
 * including the terminator are written). Values below 1024 are printed as
 * plain bytes; larger values are scaled to K, M, G, T or P with two decimal
 * digits. Values of 2^60 bytes and above fall back to the plain byte count,
 * so 's' is always written. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too large for the units above: never leave 's' unset. */
        sprintf(s,"%lluB",n);
    }
}
7210
7211 /* Create the string returned by the INFO command. This is decoupled
7212 * by the INFO command itself as we need to report the same information
7213 * on memory corruption problems. */
/* Returns an sds string made of "name:value\r\n" lines: a general section
 * that is always present, a master_* section when server.masterhost is set,
 * a vm_* section when server.vm_enabled is set, and one "dbN:" line for
 * every database holding keys or expires.
 * NOTE(review): each conversion in the format strings below is matched
 * positionally to an argument; keep both lists in the same order when
 * adding a field. */
7214 static sds genRedisInfoString(void) {
7215 sds info;
7216 time_t uptime = time(NULL)-server.stat_starttime;
7217 int j;
7218 char hmem[64];
7219
/* Human readable form of the used memory, for used_memory_human. */
7220 bytesToHuman(hmem,zmalloc_used_memory());
7221 info = sdscatprintf(sdsempty(),
7222 "redis_version:%s\r\n"
7223 "redis_git_sha1:%s\r\n"
7224 "redis_git_dirty:%d\r\n"
7225 "arch_bits:%s\r\n"
7226 "multiplexing_api:%s\r\n"
7227 "process_id:%ld\r\n"
7228 "uptime_in_seconds:%ld\r\n"
7229 "uptime_in_days:%ld\r\n"
7230 "connected_clients:%d\r\n"
7231 "connected_slaves:%d\r\n"
7232 "blocked_clients:%d\r\n"
7233 "used_memory:%zu\r\n"
7234 "used_memory_human:%s\r\n"
7235 "changes_since_last_save:%lld\r\n"
7236 "bgsave_in_progress:%d\r\n"
7237 "last_save_time:%ld\r\n"
7238 "bgrewriteaof_in_progress:%d\r\n"
7239 "total_connections_received:%lld\r\n"
7240 "total_commands_processed:%lld\r\n"
7241 "expired_keys:%lld\r\n"
7242 "hash_max_zipmap_entries:%zu\r\n"
7243 "hash_max_zipmap_value:%zu\r\n"
7244 "pubsub_channels:%ld\r\n"
7245 "pubsub_patterns:%u\r\n"
7246 "vm_enabled:%d\r\n"
7247 "role:%s\r\n"
7248 ,REDIS_VERSION,
7249 REDIS_GIT_SHA1,
7250 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7251 (sizeof(long) == 8) ? "64" : "32",
7252 aeGetApiName(),
7253 (long) getpid(),
7254 uptime,
7255 uptime/(3600*24),
/* Slaves are also in server.clients, so they are subtracted here. */
7256 listLength(server.clients)-listLength(server.slaves),
7257 listLength(server.slaves),
7258 server.blpop_blocked_clients,
7259 zmalloc_used_memory(),
7260 hmem,
7261 server.dirty,
7262 server.bgsavechildpid != -1,
7263 server.lastsave,
7264 server.bgrewritechildpid != -1,
7265 server.stat_numconnections,
7266 server.stat_numcommands,
7267 server.stat_expiredkeys,
7268 server.hash_max_zipmap_entries,
7269 server.hash_max_zipmap_value,
7270 dictSize(server.pubsub_channels),
7271 listLength(server.pubsub_patterns),
7272 server.vm_enabled != 0,
7273 server.masterhost == NULL ? "master" : "slave"
7274 );
/* Replication details, only when this instance has a master configured. */
7275 if (server.masterhost) {
7276 info = sdscatprintf(info,
7277 "master_host:%s\r\n"
7278 "master_port:%d\r\n"
7279 "master_link_status:%s\r\n"
7280 "master_last_io_seconds_ago:%d\r\n"
7281 ,server.masterhost,
7282 server.masterport,
7283 (server.replstate == REDIS_REPL_CONNECTED) ?
7284 "up" : "down",
/* -1 is reported when there is no master client object. */
7285 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7286 );
7287 }
/* Virtual memory statistics, read between lockThreadedIO() and
 * unlockThreadedIO(). */
7288 if (server.vm_enabled) {
7289 lockThreadedIO();
7290 info = sdscatprintf(info,
7291 "vm_conf_max_memory:%llu\r\n"
7292 "vm_conf_page_size:%llu\r\n"
7293 "vm_conf_pages:%llu\r\n"
7294 "vm_stats_used_pages:%llu\r\n"
7295 "vm_stats_swapped_objects:%llu\r\n"
7296 "vm_stats_swappin_count:%llu\r\n"
7297 "vm_stats_swappout_count:%llu\r\n"
7298 "vm_stats_io_newjobs_len:%lu\r\n"
7299 "vm_stats_io_processing_len:%lu\r\n"
7300 "vm_stats_io_processed_len:%lu\r\n"
7301 "vm_stats_io_active_threads:%lu\r\n"
7302 "vm_stats_blocked_clients:%lu\r\n"
7303 ,(unsigned long long) server.vm_max_memory,
7304 (unsigned long long) server.vm_page_size,
7305 (unsigned long long) server.vm_pages,
7306 (unsigned long long) server.vm_stats_used_pages,
7307 (unsigned long long) server.vm_stats_swapped_objects,
7308 (unsigned long long) server.vm_stats_swapins,
7309 (unsigned long long) server.vm_stats_swapouts,
7310 (unsigned long) listLength(server.io_newjobs),
7311 (unsigned long) listLength(server.io_processing),
7312 (unsigned long) listLength(server.io_processed),
7313 (unsigned long) server.io_active_threads,
7314 (unsigned long) server.vm_blocked_clients
7315 );
7316 unlockThreadedIO();
7317 }
/* One line per database that has at least one key or one expire. */
7318 for (j = 0; j < server.dbnum; j++) {
7319 long long keys, vkeys;
7320
7321 keys = dictSize(server.db[j].dict);
7322 vkeys = dictSize(server.db[j].expires);
7323 if (keys || vkeys) {
7324 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7325 j, keys, vkeys);
7326 }
7327 }
7328 return info;
7329 }
7330
7331 static void infoCommand(redisClient *c) {
7332 sds info = genRedisInfoString();
7333 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7334 (unsigned long)sdslen(info)));
7335 addReplySds(c,info);
7336 addReply(c,shared.crlf);
7337 }
7338
7339 static void monitorCommand(redisClient *c) {
7340 /* ignore MONITOR if aleady slave or in monitor mode */
7341 if (c->flags & REDIS_SLAVE) return;
7342
7343 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7344 c->slaveseldb = 0;
7345 listAddNodeTail(server.monitors,c);
7346 addReply(c,shared.ok);
7347 }
7348
7349 /* ================================= Expire ================================= */
7350 static int removeExpire(redisDb *db, robj *key) {
7351 if (dictDelete(db->expires,key) == DICT_OK) {
7352 return 1;
7353 } else {
7354 return 0;
7355 }
7356 }
7357
7358 static int setExpire(redisDb *db, robj *key, time_t when) {
7359 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7360 return 0;
7361 } else {
7362 incrRefCount(key);
7363 return 1;
7364 }
7365 }
7366
7367 /* Return the expire time of the specified key, or -1 if no expire
7368 * is associated with this key (i.e. the key is non volatile) */
7369 static time_t getExpire(redisDb *db, robj *key) {
7370 dictEntry *de;
7371
7372 /* No expire? return ASAP */
7373 if (dictSize(db->expires) == 0 ||
7374 (de = dictFind(db->expires,key)) == NULL) return -1;
7375
7376 return (time_t) dictGetEntryVal(de);
7377 }
7378
7379 static int expireIfNeeded(redisDb *db, robj *key) {
7380 time_t when;
7381 dictEntry *de;
7382
7383 /* No expire? return ASAP */
7384 if (dictSize(db->expires) == 0 ||
7385 (de = dictFind(db->expires,key)) == NULL) return 0;
7386
7387 /* Lookup the expire */
7388 when = (time_t) dictGetEntryVal(de);
7389 if (time(NULL) <= when) return 0;
7390
7391 /* Delete the key */
7392 dictDelete(db->expires,key);
7393 server.stat_expiredkeys++;
7394 return dictDelete(db->dict,key) == DICT_OK;
7395 }
7396
7397 static int deleteIfVolatile(redisDb *db, robj *key) {
7398 dictEntry *de;
7399
7400 /* No expire? return ASAP */
7401 if (dictSize(db->expires) == 0 ||
7402 (de = dictFind(db->expires,key)) == NULL) return 0;
7403
7404 /* Delete the key */
7405 server.dirty++;
7406 server.stat_expiredkeys++;
7407 dictDelete(db->expires,key);
7408 return dictDelete(db->dict,key) == DICT_OK;
7409 }
7410
7411 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7412 dictEntry *de;
7413 time_t seconds;
7414
7415 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7416
7417 seconds -= offset;
7418
7419 de = dictFind(c->db->dict,key);
7420 if (de == NULL) {
7421 addReply(c,shared.czero);
7422 return;
7423 }
7424 if (seconds <= 0) {
7425 if (deleteKey(c->db,key)) server.dirty++;
7426 addReply(c, shared.cone);
7427 return;
7428 } else {
7429 time_t when = time(NULL)+seconds;
7430 if (setExpire(c->db,key,when)) {
7431 addReply(c,shared.cone);
7432 server.dirty++;
7433 } else {
7434 addReply(c,shared.czero);
7435 }
7436 return;
7437 }
7438 }
7439
7440 static void expireCommand(redisClient *c) {
7441 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7442 }
7443
7444 static void expireatCommand(redisClient *c) {
7445 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7446 }
7447
7448 static void ttlCommand(redisClient *c) {
7449 time_t expire;
7450 int ttl = -1;
7451
7452 expire = getExpire(c->db,c->argv[1]);
7453 if (expire != -1) {
7454 ttl = (int) (expire-time(NULL));
7455 if (ttl < 0) ttl = -1;
7456 }
7457 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7458 }
7459
7460 /* ================================ MULTI/EXEC ============================== */
7461
7462 /* Client state initialization for MULTI/EXEC */
7463 static void initClientMultiState(redisClient *c) {
7464 c->mstate.commands = NULL;
7465 c->mstate.count = 0;
7466 }
7467
7468 /* Release all the resources associated with MULTI/EXEC state */
7469 static void freeClientMultiState(redisClient *c) {
7470 int j;
7471
7472 for (j = 0; j < c->mstate.count; j++) {
7473 int i;
7474 multiCmd *mc = c->mstate.commands+j;
7475
7476 for (i = 0; i < mc->argc; i++)
7477 decrRefCount(mc->argv[i]);
7478 zfree(mc->argv);
7479 }
7480 zfree(c->mstate.commands);
7481 }
7482
7483 /* Add a new command into the MULTI commands queue */
7484 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7485 multiCmd *mc;
7486 int j;
7487
7488 c->mstate.commands = zrealloc(c->mstate.commands,
7489 sizeof(multiCmd)*(c->mstate.count+1));
7490 mc = c->mstate.commands+c->mstate.count;
7491 mc->cmd = cmd;
7492 mc->argc = c->argc;
7493 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7494 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7495 for (j = 0; j < c->argc; j++)
7496 incrRefCount(mc->argv[j]);
7497 c->mstate.count++;
7498 }
7499
7500 static void multiCommand(redisClient *c) {
7501 c->flags |= REDIS_MULTI;
7502 addReply(c,shared.ok);
7503 }
7504
7505 static void discardCommand(redisClient *c) {
7506 if (!(c->flags & REDIS_MULTI)) {
7507 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7508 return;
7509 }
7510
7511 freeClientMultiState(c);
7512 initClientMultiState(c);
7513 c->flags &= (~REDIS_MULTI);
7514 addReply(c,shared.ok);
7515 }
7516
7517 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7518 * implememntation for more information. */
7519 static void execCommandReplicateMulti(redisClient *c) {
7520 struct redisCommand *cmd;
7521 robj *multistring = createStringObject("MULTI",5);
7522
7523 cmd = lookupCommand("multi");
7524 if (server.appendonly)
7525 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7526 if (listLength(server.slaves))
7527 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7528 decrRefCount(multistring);
7529 }
7530
7531 static void execCommand(redisClient *c) {
7532 int j;
7533 robj **orig_argv;
7534 int orig_argc;
7535
7536 if (!(c->flags & REDIS_MULTI)) {
7537 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7538 return;
7539 }
7540
7541 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7542 * A failed EXEC will return a multi bulk nil object. */
7543 if (c->flags & REDIS_DIRTY_CAS) {
7544 freeClientMultiState(c);
7545 initClientMultiState(c);
7546 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7547 unwatchAllKeys(c);
7548 addReply(c,shared.nullmultibulk);
7549 return;
7550 }
7551
7552 /* Replicate a MULTI request now that we are sure the block is executed.
7553 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7554 * both the AOF and the replication link will have the same consistency
7555 * and atomicity guarantees. */
7556 execCommandReplicateMulti(c);
7557
7558 /* Exec all the queued commands */
7559 orig_argv = c->argv;
7560 orig_argc = c->argc;
7561 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7562 for (j = 0; j < c->mstate.count; j++) {
7563 c->argc = c->mstate.commands[j].argc;
7564 c->argv = c->mstate.commands[j].argv;
7565 call(c,c->mstate.commands[j].cmd);
7566 }
7567 c->argv = orig_argv;
7568 c->argc = orig_argc;
7569 freeClientMultiState(c);
7570 initClientMultiState(c);
7571 c->flags &= (~REDIS_MULTI);
7572 unwatchAllKeys(c);
7573 /* Make sure the EXEC command is always replicated / AOF, since we
7574 * always send the MULTI command (we can't know beforehand if the
7575 * next operations will contain at least a modification to the DB). */
7576 server.dirty++;
7577 }
7578
7579 /* =========================== Blocking Operations ========================= */
7580
7581 /* Currently Redis blocking operations support is limited to list POP ops,
7582 * so the current implementation is not fully generic, but it is also not
7583 * completely specific so it will not require a rewrite to support new
7584 * kind of blocking operations in the future.
7585 *
7586 * Still it's important to note that list blocking operations can be already
7587 * used as a notification mechanism in order to implement other blocking
7588 * operations at application level, so there must be a very strong evidence
7589 * of usefulness and generality before new blocking operations are implemented.
7590 *
7591 * This is how the current blocking POP works, using BLPOP as an example:
7592 * - If the user calls BLPOP and the key exists and contains a non-empty list
7593 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7594 * when there is no need to block.
7595 * - If instead BLPOP is called and the key does not exist or the list is
7596 * empty we need to block. In order to do so we remove the notification for
7597 * new data to read in the client socket (so that we'll not serve new
7598 * requests if the blocking request is not served). Also we put the client
7599 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
7600 * blocked on these keys.
7601 * - If a PUSH operation against a key with blocked clients waiting is
7602 * performed, we serve the first in the list: basically instead of pushing
7603 * the new element inside the list we return it to the (first / oldest)
7604 * blocking client, unblock the client, and remove it from the list.
7605 *
7606 * The above comment and the source code should be enough in order to understand
7607 * the implementation and modify / fix it later.
7608 */
7609
7610 /* Set a client in blocking mode for the specified key, with the specified
7611 * timeout */
7612 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7613 dictEntry *de;
7614 list *l;
7615 int j;
7616
7617 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7618 c->blocking_keys_num = numkeys;
7619 c->blockingto = timeout;
7620 for (j = 0; j < numkeys; j++) {
7621 /* Add the key in the client structure, to map clients -> keys */
7622 c->blocking_keys[j] = keys[j];
7623 incrRefCount(keys[j]);
7624
7625 /* And in the other "side", to map keys -> clients */
7626 de = dictFind(c->db->blocking_keys,keys[j]);
7627 if (de == NULL) {
7628 int retval;
7629
7630 /* For every key we take a list of clients blocked for it */
7631 l = listCreate();
7632 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7633 incrRefCount(keys[j]);
7634 assert(retval == DICT_OK);
7635 } else {
7636 l = dictGetEntryVal(de);
7637 }
7638 listAddNodeTail(l,c);
7639 }
7640 /* Mark the client as a blocked client */
7641 c->flags |= REDIS_BLOCKED;
7642 server.blpop_blocked_clients++;
7643 }
7644
7645 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7646 static void unblockClientWaitingData(redisClient *c) {
7647 dictEntry *de;
7648 list *l;
7649 int j;
7650
7651 assert(c->blocking_keys != NULL);
7652 /* The client may wait for multiple keys, so unblock it for every key. */
7653 for (j = 0; j < c->blocking_keys_num; j++) {
7654 /* Remove this client from the list of clients waiting for this key. */
7655 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7656 assert(de != NULL);
7657 l = dictGetEntryVal(de);
7658 listDelNode(l,listSearchKey(l,c));
7659 /* If the list is empty we need to remove it to avoid wasting memory */
7660 if (listLength(l) == 0)
7661 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7662 decrRefCount(c->blocking_keys[j]);
7663 }
7664 /* Cleanup the client structure */
7665 zfree(c->blocking_keys);
7666 c->blocking_keys = NULL;
7667 c->flags &= (~REDIS_BLOCKED);
7668 server.blpop_blocked_clients--;
7669 /* We want to process data if there is some command waiting
7670 * in the input buffer. Note that this is safe even if
7671 * unblockClientWaitingData() gets called from freeClient() because
7672 * freeClient() will be smart enough to call this function
7673 * *after* c->querybuf was set to NULL. */
7674 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7675 }
7676
7677 /* This should be called from any function PUSHing into lists.
7678 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7679 * 'ele' is the element pushed.
7680 *
7681 * If the function returns 0 there was no client waiting for a list push
7682 * against this key.
7683 *
7684 * If the function returns 1 there was a client waiting for a list push
7685 * against this key, the element was passed to this client thus it's not
7686 * needed to actually add it to the list and the caller should return asap. */
7687 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7688 struct dictEntry *de;
7689 redisClient *receiver;
7690 list *l;
7691 listNode *ln;
7692
7693 de = dictFind(c->db->blocking_keys,key);
7694 if (de == NULL) return 0;
7695 l = dictGetEntryVal(de);
7696 ln = listFirst(l);
7697 assert(ln != NULL);
7698 receiver = ln->value;
7699
7700 addReplySds(receiver,sdsnew("*2\r\n"));
7701 addReplyBulk(receiver,key);
7702 addReplyBulk(receiver,ele);
7703 unblockClientWaitingData(receiver);
7704 return 1;
7705 }
7706
7707 /* Blocking RPOP/LPOP */
7708 static void blockingPopGenericCommand(redisClient *c, int where) {
7709 robj *o;
7710 time_t timeout;
7711 int j;
7712
7713 for (j = 1; j < c->argc-1; j++) {
7714 o = lookupKeyWrite(c->db,c->argv[j]);
7715 if (o != NULL) {
7716 if (o->type != REDIS_LIST) {
7717 addReply(c,shared.wrongtypeerr);
7718 return;
7719 } else {
7720 list *list = o->ptr;
7721 if (listLength(list) != 0) {
7722 /* If the list contains elements fall back to the usual
7723 * non-blocking POP operation */
7724 robj *argv[2], **orig_argv;
7725 int orig_argc;
7726
7727 /* We need to alter the command arguments before to call
7728 * popGenericCommand() as the command takes a single key. */
7729 orig_argv = c->argv;
7730 orig_argc = c->argc;
7731 argv[1] = c->argv[j];
7732 c->argv = argv;
7733 c->argc = 2;
7734
7735 /* Also the return value is different, we need to output
7736 * the multi bulk reply header and the key name. The
7737 * "real" command will add the last element (the value)
7738 * for us. If this souds like an hack to you it's just
7739 * because it is... */
7740 addReplySds(c,sdsnew("*2\r\n"));
7741 addReplyBulk(c,argv[1]);
7742 popGenericCommand(c,where);
7743
7744 /* Fix the client structure with the original stuff */
7745 c->argv = orig_argv;
7746 c->argc = orig_argc;
7747 return;
7748 }
7749 }
7750 }
7751 }
7752 /* If the list is empty or the key does not exists we must block */
7753 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7754 if (timeout > 0) timeout += time(NULL);
7755 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7756 }
7757
7758 static void blpopCommand(redisClient *c) {
7759 blockingPopGenericCommand(c,REDIS_HEAD);
7760 }
7761
7762 static void brpopCommand(redisClient *c) {
7763 blockingPopGenericCommand(c,REDIS_TAIL);
7764 }
7765
7766 /* =============================== Replication ============================= */
7767
7768 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7769 ssize_t nwritten, ret = size;
7770 time_t start = time(NULL);
7771
7772 timeout++;
7773 while(size) {
7774 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7775 nwritten = write(fd,ptr,size);
7776 if (nwritten == -1) return -1;
7777 ptr += nwritten;
7778 size -= nwritten;
7779 }
7780 if ((time(NULL)-start) > timeout) {
7781 errno = ETIMEDOUT;
7782 return -1;
7783 }
7784 }
7785 return ret;
7786 }
7787
7788 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7789 ssize_t nread, totread = 0;
7790 time_t start = time(NULL);
7791
7792 timeout++;
7793 while(size) {
7794 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7795 nread = read(fd,ptr,size);
7796 if (nread == -1) return -1;
7797 ptr += nread;
7798 size -= nread;
7799 totread += nread;
7800 }
7801 if ((time(NULL)-start) > timeout) {
7802 errno = ETIMEDOUT;
7803 return -1;
7804 }
7805 }
7806 return totread;
7807 }
7808
/* Read a line from 'fd' one byte at a time using syncRead(), storing it as
 * a null terminated string into 'ptr', a buffer of 'size' bytes.
 *
 * Reading stops at the first '\n', that is not stored. If the character
 * stored just before it is '\r', it is replaced with the terminator.
 * Reading also stops when size-1 characters were stored, so that the
 * terminator always fits inside the buffer.
 *
 * 'timeout' is passed to syncRead() for every single byte.
 *
 * Returns the number of characters stored before the newline (a stripped
 * '\r' is still counted), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* Reserve one byte for the null terminator. */
    size--;
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One less byte available: without this the loop never ends on
         * lines longer than the buffer and writes past its end. */
        size--;
    }
    return nread;
}
7829
7830 static void syncCommand(redisClient *c) {
7831 /* ignore SYNC if aleady slave or in monitor mode */
7832 if (c->flags & REDIS_SLAVE) return;
7833
7834 /* SYNC can't be issued when the server has pending data to send to
7835 * the client about already issued commands. We need a fresh reply
7836 * buffer registering the differences between the BGSAVE and the current
7837 * dataset, so that we can copy to other slaves if needed. */
7838 if (listLength(c->reply) != 0) {
7839 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7840 return;
7841 }
7842
7843 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7844 /* Here we need to check if there is a background saving operation
7845 * in progress, or if it is required to start one */
7846 if (server.bgsavechildpid != -1) {
7847 /* Ok a background save is in progress. Let's check if it is a good
7848 * one for replication, i.e. if there is another slave that is
7849 * registering differences since the server forked to save */
7850 redisClient *slave;
7851 listNode *ln;
7852 listIter li;
7853
7854 listRewind(server.slaves,&li);
7855 while((ln = listNext(&li))) {
7856 slave = ln->value;
7857 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7858 }
7859 if (ln) {
7860 /* Perfect, the server is already registering differences for
7861 * another slave. Set the right state, and copy the buffer. */
7862 listRelease(c->reply);
7863 c->reply = listDup(slave->reply);
7864 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7865 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7866 } else {
7867 /* No way, we need to wait for the next BGSAVE in order to
7868 * register differences */
7869 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7870 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7871 }
7872 } else {
7873 /* Ok we don't have a BGSAVE in progress, let's start one */
7874 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7875 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7876 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7877 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7878 return;
7879 }
7880 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7881 }
7882 c->repldbfd = -1;
7883 c->flags |= REDIS_SLAVE;
7884 c->slaveseldb = 0;
7885 listAddNodeTail(server.slaves,c);
7886 return;
7887 }
7888
7889 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7890 redisClient *slave = privdata;
7891 REDIS_NOTUSED(el);
7892 REDIS_NOTUSED(mask);
7893 char buf[REDIS_IOBUF_LEN];
7894 ssize_t nwritten, buflen;
7895
7896 if (slave->repldboff == 0) {
7897 /* Write the bulk write count before to transfer the DB. In theory here
7898 * we don't know how much room there is in the output buffer of the
7899 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7900 * operations) will never be smaller than the few bytes we need. */
7901 sds bulkcount;
7902
7903 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7904 slave->repldbsize);
7905 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7906 {
7907 sdsfree(bulkcount);
7908 freeClient(slave);
7909 return;
7910 }
7911 sdsfree(bulkcount);
7912 }
7913 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7914 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7915 if (buflen <= 0) {
7916 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7917 (buflen == 0) ? "premature EOF" : strerror(errno));
7918 freeClient(slave);
7919 return;
7920 }
7921 if ((nwritten = write(fd,buf,buflen)) == -1) {
7922 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7923 strerror(errno));
7924 freeClient(slave);
7925 return;
7926 }
7927 slave->repldboff += nwritten;
7928 if (slave->repldboff == slave->repldbsize) {
7929 close(slave->repldbfd);
7930 slave->repldbfd = -1;
7931 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7932 slave->replstate = REDIS_REPL_ONLINE;
7933 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7934 sendReplyToClient, slave) == AE_ERR) {
7935 freeClient(slave);
7936 return;
7937 }
7938 addReplySds(slave,sdsempty());
7939 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7940 }
7941 }
7942
7943 /* This function is called at the end of every backgrond saving.
7944 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7945 * otherwise REDIS_ERR is passed to the function.
7946 *
7947 * The goal of this function is to handle slaves waiting for a successful
7948 * background saving in order to perform non-blocking synchronization. */
7949 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7950 listNode *ln;
7951 int startbgsave = 0;
7952 listIter li;
7953
7954 listRewind(server.slaves,&li);
7955 while((ln = listNext(&li))) {
7956 redisClient *slave = ln->value;
7957
7958 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7959 startbgsave = 1;
7960 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7961 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7962 struct redis_stat buf;
7963
7964 if (bgsaveerr != REDIS_OK) {
7965 freeClient(slave);
7966 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7967 continue;
7968 }
7969 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7970 redis_fstat(slave->repldbfd,&buf) == -1) {
7971 freeClient(slave);
7972 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7973 continue;
7974 }
7975 slave->repldboff = 0;
7976 slave->repldbsize = buf.st_size;
7977 slave->replstate = REDIS_REPL_SEND_BULK;
7978 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7979 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7980 freeClient(slave);
7981 continue;
7982 }
7983 }
7984 }
7985 if (startbgsave) {
7986 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7987 listIter li;
7988
7989 listRewind(server.slaves,&li);
7990 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7991 while((ln = listNext(&li))) {
7992 redisClient *slave = ln->value;
7993
7994 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7995 freeClient(slave);
7996 }
7997 }
7998 }
7999 }
8000
8001 static int syncWithMaster(void) {
8002 char buf[1024], tmpfile[256], authcmd[1024];
8003 long dumpsize;
8004 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8005 int dfd, maxtries = 5;
8006
8007 if (fd == -1) {
8008 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8009 strerror(errno));
8010 return REDIS_ERR;
8011 }
8012
8013 /* AUTH with the master if required. */
8014 if(server.masterauth) {
8015 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8016 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8017 close(fd);
8018 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8019 strerror(errno));
8020 return REDIS_ERR;
8021 }
8022 /* Read the AUTH result. */
8023 if (syncReadLine(fd,buf,1024,3600) == -1) {
8024 close(fd);
8025 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8026 strerror(errno));
8027 return REDIS_ERR;
8028 }
8029 if (buf[0] != '+') {
8030 close(fd);
8031 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8032 return REDIS_ERR;
8033 }
8034 }
8035
8036 /* Issue the SYNC command */
8037 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8038 close(fd);
8039 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8040 strerror(errno));
8041 return REDIS_ERR;
8042 }
8043 /* Read the bulk write count */
8044 if (syncReadLine(fd,buf,1024,3600) == -1) {
8045 close(fd);
8046 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8047 strerror(errno));
8048 return REDIS_ERR;
8049 }
8050 if (buf[0] != '$') {
8051 close(fd);
8052 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8053 return REDIS_ERR;
8054 }
8055 dumpsize = strtol(buf+1,NULL,10);
8056 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8057 /* Read the bulk write data on a temp file */
8058 while(maxtries--) {
8059 snprintf(tmpfile,256,
8060 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8061 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8062 if (dfd != -1) break;
8063 sleep(1);
8064 }
8065 if (dfd == -1) {
8066 close(fd);
8067 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8068 return REDIS_ERR;
8069 }
8070 while(dumpsize) {
8071 int nread, nwritten;
8072
8073 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8074 if (nread == -1) {
8075 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8076 strerror(errno));
8077 close(fd);
8078 close(dfd);
8079 return REDIS_ERR;
8080 }
8081 nwritten = write(dfd,buf,nread);
8082 if (nwritten == -1) {
8083 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8084 close(fd);
8085 close(dfd);
8086 return REDIS_ERR;
8087 }
8088 dumpsize -= nread;
8089 }
8090 close(dfd);
8091 if (rename(tmpfile,server.dbfilename) == -1) {
8092 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8093 unlink(tmpfile);
8094 close(fd);
8095 return REDIS_ERR;
8096 }
8097 emptyDb();
8098 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8099 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8100 close(fd);
8101 return REDIS_ERR;
8102 }
8103 server.master = createClient(fd);
8104 server.master->flags |= REDIS_MASTER;
8105 server.master->authenticated = 1;
8106 server.replstate = REDIS_REPL_CONNECTED;
8107 return REDIS_OK;
8108 }
8109
8110 static void slaveofCommand(redisClient *c) {
8111 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8112 !strcasecmp(c->argv[2]->ptr,"one")) {
8113 if (server.masterhost) {
8114 sdsfree(server.masterhost);
8115 server.masterhost = NULL;
8116 if (server.master) freeClient(server.master);
8117 server.replstate = REDIS_REPL_NONE;
8118 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8119 }
8120 } else {
8121 sdsfree(server.masterhost);
8122 server.masterhost = sdsdup(c->argv[1]->ptr);
8123 server.masterport = atoi(c->argv[2]->ptr);
8124 if (server.master) freeClient(server.master);
8125 server.replstate = REDIS_REPL_CONNECT;
8126 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8127 server.masterhost, server.masterport);
8128 }
8129 addReply(c,shared.ok);
8130 }
8131
8132 /* ============================ Maxmemory directive ======================== */
8133
8134 /* Try to free one object form the pre-allocated objects free list.
8135 * This is useful under low mem conditions as by default we take 1 million
8136 * free objects allocated. On success REDIS_OK is returned, otherwise
8137 * REDIS_ERR. */
8138 static int tryFreeOneObjectFromFreelist(void) {
8139 robj *o;
8140
8141 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8142 if (listLength(server.objfreelist)) {
8143 listNode *head = listFirst(server.objfreelist);
8144 o = listNodeValue(head);
8145 listDelNode(server.objfreelist,head);
8146 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8147 zfree(o);
8148 return REDIS_OK;
8149 } else {
8150 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8151 return REDIS_ERR;
8152 }
8153 }
8154
8155 /* This function gets called when 'maxmemory' is set on the config file to limit
8156 * the max memory used by the server, and we are out of memory.
8157 * This function will try to, in order:
8158 *
8159 * - Free objects from the free list
8160 * - Try to remove keys with an EXPIRE set
8161 *
8162 * It is not possible to free enough memory to reach used-memory < maxmemory
8163 * the server will start refusing commands that will enlarge even more the
8164 * memory usage.
8165 */
8166 static void freeMemoryIfNeeded(void) {
8167 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8168 int j, k, freed = 0;
8169
8170 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8171 for (j = 0; j < server.dbnum; j++) {
8172 int minttl = -1;
8173 robj *minkey = NULL;
8174 struct dictEntry *de;
8175
8176 if (dictSize(server.db[j].expires)) {
8177 freed = 1;
8178 /* From a sample of three keys drop the one nearest to
8179 * the natural expire */
8180 for (k = 0; k < 3; k++) {
8181 time_t t;
8182
8183 de = dictGetRandomKey(server.db[j].expires);
8184 t = (time_t) dictGetEntryVal(de);
8185 if (minttl == -1 || t < minttl) {
8186 minkey = dictGetEntryKey(de);
8187 minttl = t;
8188 }
8189 }
8190 deleteKey(server.db+j,minkey);
8191 }
8192 }
8193 if (!freed) return; /* nothing to free... */
8194 }
8195 }
8196
8197 /* ============================== Append Only file ========================== */
8198
8199 /* Write the append only file buffer on disk.
8200 *
8201 * Since we are required to write the AOF before replying to the client,
8202 * and the only way the client socket can get a write is entering when the
8203 * the event loop, we accumulate all the AOF writes in a memory
8204 * buffer and write it on disk using this function just before entering
8205 * the event loop again. */
8206 static void flushAppendOnlyFile(void) {
8207 time_t now;
8208 ssize_t nwritten;
8209
8210 if (sdslen(server.aofbuf) == 0) return;
8211
8212 /* We want to perform a single write. This should be guaranteed atomic
8213 * at least if the filesystem we are writing is a real physical one.
8214 * While this will save us against the server being killed I don't think
8215 * there is much to do about the whole server stopping for power problems
8216 * or alike */
8217 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8218 if (nwritten != (signed)sdslen(server.aofbuf)) {
8219 /* Ooops, we are in troubles. The best thing to do for now is
8220 * aborting instead of giving the illusion that everything is
8221 * working as expected. */
8222 if (nwritten == -1) {
8223 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8224 } else {
8225 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8226 }
8227 exit(1);
8228 }
8229 sdsfree(server.aofbuf);
8230 server.aofbuf = sdsempty();
8231
8232 /* Fsync if needed */
8233 now = time(NULL);
8234 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8235 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8236 now-server.lastfsync > 1))
8237 {
8238 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8239 * flushing metadata. */
8240 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8241 server.lastfsync = now;
8242 }
8243 }
8244
8245 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8246 int j;
8247 buf = sdscatprintf(buf,"*%d\r\n",argc);
8248 for (j = 0; j < argc; j++) {
8249 robj *o = getDecodedObject(argv[j]);
8250 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8251 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8252 buf = sdscatlen(buf,"\r\n",2);
8253 decrRefCount(o);
8254 }
8255 return buf;
8256 }
8257
8258 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8259 int argc = 3;
8260 long when;
8261 robj *argv[3];
8262
8263 /* Make sure we can use strtol */
8264 seconds = getDecodedObject(seconds);
8265 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8266 decrRefCount(seconds);
8267
8268 argv[0] = createStringObject("EXPIREAT",8);
8269 argv[1] = key;
8270 argv[2] = createObject(REDIS_STRING,
8271 sdscatprintf(sdsempty(),"%ld",when));
8272 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8273 decrRefCount(argv[0]);
8274 decrRefCount(argv[2]);
8275 return buf;
8276 }
8277
8278 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8279 sds buf = sdsempty();
8280 robj *tmpargv[3];
8281
8282 /* The DB this command was targetting is not the same as the last command
8283 * we appendend. To issue a SELECT command is needed. */
8284 if (dictid != server.appendseldb) {
8285 char seldb[64];
8286
8287 snprintf(seldb,sizeof(seldb),"%d",dictid);
8288 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8289 (unsigned long)strlen(seldb),seldb);
8290 server.appendseldb = dictid;
8291 }
8292
8293 if (cmd->proc == expireCommand) {
8294 /* Translate EXPIRE into EXPIREAT */
8295 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8296 } else if (cmd->proc == setexCommand) {
8297 /* Translate SETEX to SET and EXPIREAT */
8298 tmpargv[0] = createStringObject("SET",3);
8299 tmpargv[1] = argv[1];
8300 tmpargv[2] = argv[3];
8301 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8302 decrRefCount(tmpargv[0]);
8303 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8304 } else {
8305 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8306 }
8307
8308 /* Append to the AOF buffer. This will be flushed on disk just before
8309 * of re-entering the event loop, so before the client will get a
8310 * positive reply about the operation performed. */
8311 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8312
8313 /* If a background append only file rewriting is in progress we want to
8314 * accumulate the differences between the child DB and the current one
8315 * in a buffer, so that when the child process will do its work we
8316 * can append the differences to the new append only file. */
8317 if (server.bgrewritechildpid != -1)
8318 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8319
8320 sdsfree(buf);
8321 }
8322
8323 /* In Redis commands are always executed in the context of a client, so in
8324 * order to load the append only file we need to create a fake client. */
8325 static struct redisClient *createFakeClient(void) {
8326 struct redisClient *c = zmalloc(sizeof(*c));
8327
8328 selectDb(c,0);
8329 c->fd = -1;
8330 c->querybuf = sdsempty();
8331 c->argc = 0;
8332 c->argv = NULL;
8333 c->flags = 0;
8334 /* We set the fake client as a slave waiting for the synchronization
8335 * so that Redis will not try to send replies to this client. */
8336 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8337 c->reply = listCreate();
8338 listSetFreeMethod(c->reply,decrRefCount);
8339 listSetDupMethod(c->reply,dupClientReplyValue);
8340 initClientMultiState(c);
8341 return c;
8342 }
8343
8344 static void freeFakeClient(struct redisClient *c) {
8345 sdsfree(c->querybuf);
8346 listRelease(c->reply);
8347 freeClientMultiState(c);
8348 zfree(c);
8349 }
8350
8351 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8352 * error (the append only file is zero-length) REDIS_ERR is returned. On
8353 * fatal error an error message is logged and the program exists. */
8354 int loadAppendOnlyFile(char *filename) {
8355 struct redisClient *fakeClient;
8356 FILE *fp = fopen(filename,"r");
8357 struct redis_stat sb;
8358 unsigned long long loadedkeys = 0;
8359 int appendonly = server.appendonly;
8360
8361 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8362 return REDIS_ERR;
8363
8364 if (fp == NULL) {
8365 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8366 exit(1);
8367 }
8368
8369 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8370 * to the same file we're about to read. */
8371 server.appendonly = 0;
8372
8373 fakeClient = createFakeClient();
8374 while(1) {
8375 int argc, j;
8376 unsigned long len;
8377 robj **argv;
8378 char buf[128];
8379 sds argsds;
8380 struct redisCommand *cmd;
8381
8382 if (fgets(buf,sizeof(buf),fp) == NULL) {
8383 if (feof(fp))
8384 break;
8385 else
8386 goto readerr;
8387 }
8388 if (buf[0] != '*') goto fmterr;
8389 argc = atoi(buf+1);
8390 argv = zmalloc(sizeof(robj*)*argc);
8391 for (j = 0; j < argc; j++) {
8392 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8393 if (buf[0] != '$') goto fmterr;
8394 len = strtol(buf+1,NULL,10);
8395 argsds = sdsnewlen(NULL,len);
8396 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8397 argv[j] = createObject(REDIS_STRING,argsds);
8398 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8399 }
8400
8401 /* Command lookup */
8402 cmd = lookupCommand(argv[0]->ptr);
8403 if (!cmd) {
8404 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8405 exit(1);
8406 }
8407 /* Try object encoding */
8408 if (cmd->flags & REDIS_CMD_BULK)
8409 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8410 /* Run the command in the context of a fake client */
8411 fakeClient->argc = argc;
8412 fakeClient->argv = argv;
8413 cmd->proc(fakeClient);
8414 /* Discard the reply objects list from the fake client */
8415 while(listLength(fakeClient->reply))
8416 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8417 /* Clean up, ready for the next command */
8418 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8419 zfree(argv);
8420 /* Handle swapping while loading big datasets when VM is on */
8421 loadedkeys++;
8422 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8423 while (zmalloc_used_memory() > server.vm_max_memory) {
8424 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8425 }
8426 }
8427 }
8428
8429 /* This point can only be reached when EOF is reached without errors.
8430 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8431 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8432
8433 fclose(fp);
8434 freeFakeClient(fakeClient);
8435 server.appendonly = appendonly;
8436 return REDIS_OK;
8437
8438 readerr:
8439 if (feof(fp)) {
8440 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8441 } else {
8442 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8443 }
8444 exit(1);
8445 fmterr:
8446 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8447 exit(1);
8448 }
8449
8450 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8451 static int fwriteBulkObject(FILE *fp, robj *obj) {
8452 char buf[128];
8453 int decrrc = 0;
8454
8455 /* Avoid the incr/decr ref count business if possible to help
8456 * copy-on-write (we are often in a child process when this function
8457 * is called).
8458 * Also makes sure that key objects don't get incrRefCount-ed when VM
8459 * is enabled */
8460 if (obj->encoding != REDIS_ENCODING_RAW) {
8461 obj = getDecodedObject(obj);
8462 decrrc = 1;
8463 }
8464 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8465 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8466 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8467 goto err;
8468 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8469 if (decrrc) decrRefCount(obj);
8470 return 1;
8471 err:
8472 if (decrrc) decrRefCount(obj);
8473 return 0;
8474 }
8475
/* Write the binary-safe string 's', 'len' bytes long, into 'fp' in the
 * bulk format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: use the matching conversion specifier. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* Nothing to write for the payload of an empty string. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8487
/* Write the double 'd' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n where the payload is the value formatted with
 * "%.17g".
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload is formatted together with its trailing CRLF, that
     * must not be counted in the length announced by the header. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8498
/* Write the long 'l' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n where the payload is the decimal
 * representation of the value.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t plen;

    /* The payload is formatted together with its trailing CRLF, that
     * must not be counted in the length announced by the header. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8509
8510 /* Write a sequence of commands able to fully rebuild the dataset into
8511 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8512 static int rewriteAppendOnlyFile(char *filename) {
8513 dictIterator *di = NULL;
8514 dictEntry *de;
8515 FILE *fp;
8516 char tmpfile[256];
8517 int j;
8518 time_t now = time(NULL);
8519
8520 /* Note that we have to use a different temp name here compared to the
8521 * one used by rewriteAppendOnlyFileBackground() function. */
8522 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8523 fp = fopen(tmpfile,"w");
8524 if (!fp) {
8525 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8526 return REDIS_ERR;
8527 }
8528 for (j = 0; j < server.dbnum; j++) {
8529 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8530 redisDb *db = server.db+j;
8531 dict *d = db->dict;
8532 if (dictSize(d) == 0) continue;
8533 di = dictGetIterator(d);
8534 if (!di) {
8535 fclose(fp);
8536 return REDIS_ERR;
8537 }
8538
8539 /* SELECT the new DB */
8540 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8541 if (fwriteBulkLong(fp,j) == 0) goto werr;
8542
8543 /* Iterate this DB writing every entry */
8544 while((de = dictNext(di)) != NULL) {
8545 robj *key, *o;
8546 time_t expiretime;
8547 int swapped;
8548
8549 key = dictGetEntryKey(de);
8550 /* If the value for this key is swapped, load a preview in memory.
8551 * We use a "swapped" flag to remember if we need to free the
8552 * value object instead to just increment the ref count anyway
8553 * in order to avoid copy-on-write of pages if we are forked() */
8554 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8555 key->storage == REDIS_VM_SWAPPING) {
8556 o = dictGetEntryVal(de);
8557 swapped = 0;
8558 } else {
8559 o = vmPreviewObject(key);
8560 swapped = 1;
8561 }
8562 expiretime = getExpire(db,key);
8563
8564 /* Save the key and associated value */
8565 if (o->type == REDIS_STRING) {
8566 /* Emit a SET command */
8567 char cmd[]="*3\r\n$3\r\nSET\r\n";
8568 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8569 /* Key and value */
8570 if (fwriteBulkObject(fp,key) == 0) goto werr;
8571 if (fwriteBulkObject(fp,o) == 0) goto werr;
8572 } else if (o->type == REDIS_LIST) {
8573 /* Emit the RPUSHes needed to rebuild the list */
8574 list *list = o->ptr;
8575 listNode *ln;
8576 listIter li;
8577
8578 listRewind(list,&li);
8579 while((ln = listNext(&li))) {
8580 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8581 robj *eleobj = listNodeValue(ln);
8582
8583 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8584 if (fwriteBulkObject(fp,key) == 0) goto werr;
8585 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8586 }
8587 } else if (o->type == REDIS_SET) {
8588 /* Emit the SADDs needed to rebuild the set */
8589 dict *set = o->ptr;
8590 dictIterator *di = dictGetIterator(set);
8591 dictEntry *de;
8592
8593 while((de = dictNext(di)) != NULL) {
8594 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8595 robj *eleobj = dictGetEntryKey(de);
8596
8597 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8598 if (fwriteBulkObject(fp,key) == 0) goto werr;
8599 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8600 }
8601 dictReleaseIterator(di);
8602 } else if (o->type == REDIS_ZSET) {
8603 /* Emit the ZADDs needed to rebuild the sorted set */
8604 zset *zs = o->ptr;
8605 dictIterator *di = dictGetIterator(zs->dict);
8606 dictEntry *de;
8607
8608 while((de = dictNext(di)) != NULL) {
8609 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8610 robj *eleobj = dictGetEntryKey(de);
8611 double *score = dictGetEntryVal(de);
8612
8613 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8614 if (fwriteBulkObject(fp,key) == 0) goto werr;
8615 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8616 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8617 }
8618 dictReleaseIterator(di);
8619 } else if (o->type == REDIS_HASH) {
8620 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8621
8622 /* Emit the HSETs needed to rebuild the hash */
8623 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8624 unsigned char *p = zipmapRewind(o->ptr);
8625 unsigned char *field, *val;
8626 unsigned int flen, vlen;
8627
8628 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8629 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8630 if (fwriteBulkObject(fp,key) == 0) goto werr;
8631 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8632 return -1;
8633 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8634 return -1;
8635 }
8636 } else {
8637 dictIterator *di = dictGetIterator(o->ptr);
8638 dictEntry *de;
8639
8640 while((de = dictNext(di)) != NULL) {
8641 robj *field = dictGetEntryKey(de);
8642 robj *val = dictGetEntryVal(de);
8643
8644 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8645 if (fwriteBulkObject(fp,key) == 0) goto werr;
8646 if (fwriteBulkObject(fp,field) == -1) return -1;
8647 if (fwriteBulkObject(fp,val) == -1) return -1;
8648 }
8649 dictReleaseIterator(di);
8650 }
8651 } else {
8652 redisPanic("Unknown object type");
8653 }
8654 /* Save the expire time */
8655 if (expiretime != -1) {
8656 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8657 /* If this key is already expired skip it */
8658 if (expiretime < now) continue;
8659 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8660 if (fwriteBulkObject(fp,key) == 0) goto werr;
8661 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8662 }
8663 if (swapped) decrRefCount(o);
8664 }
8665 dictReleaseIterator(di);
8666 }
8667
8668 /* Make sure data will not remain on the OS's output buffers */
8669 fflush(fp);
8670 fsync(fileno(fp));
8671 fclose(fp);
8672
8673 /* Use RENAME to make sure the DB file is changed atomically only
8674 * if the generate DB file is ok. */
8675 if (rename(tmpfile,filename) == -1) {
8676 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8677 unlink(tmpfile);
8678 return REDIS_ERR;
8679 }
8680 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8681 return REDIS_OK;
8682
8683 werr:
8684 fclose(fp);
8685 unlink(tmpfile);
8686 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8687 if (di) dictReleaseIterator(di);
8688 return REDIS_ERR;
8689 }
8690
8691 /* This is how rewriting of the append only file in background works:
8692 *
8693 * 1) The user calls BGREWRITEAOF
8694 * 2) Redis calls this function, that forks():
8695 * 2a) the child rewrite the append only file in a temp file.
8696 * 2b) the parent accumulates differences in server.bgrewritebuf.
8697 * 3) When the child finished '2a' exists.
8698 * 4) The parent will trap the exit code, if it's OK, will append the
8699 * data accumulated into server.bgrewritebuf into the temp file, and
8700 * finally will rename(2) the temp file in the actual file name.
8701 * The the new file is reopened as the new append only file. Profit!
8702 */
8703 static int rewriteAppendOnlyFileBackground(void) {
8704 pid_t childpid;
8705
8706 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8707 if (server.vm_enabled) waitEmptyIOJobsQueue();
8708 if ((childpid = fork()) == 0) {
8709 /* Child */
8710 char tmpfile[256];
8711
8712 if (server.vm_enabled) vmReopenSwapFile();
8713 close(server.fd);
8714 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8715 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8716 _exit(0);
8717 } else {
8718 _exit(1);
8719 }
8720 } else {
8721 /* Parent */
8722 if (childpid == -1) {
8723 redisLog(REDIS_WARNING,
8724 "Can't rewrite append only file in background: fork: %s",
8725 strerror(errno));
8726 return REDIS_ERR;
8727 }
8728 redisLog(REDIS_NOTICE,
8729 "Background append only file rewriting started by pid %d",childpid);
8730 server.bgrewritechildpid = childpid;
8731 updateDictResizePolicy();
8732 /* We set appendseldb to -1 in order to force the next call to the
8733 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8734 * accumulated by the parent into server.bgrewritebuf will start
8735 * with a SELECT statement and it will be safe to merge. */
8736 server.appendseldb = -1;
8737 return REDIS_OK;
8738 }
8739 return REDIS_OK; /* unreached */
8740 }
8741
8742 static void bgrewriteaofCommand(redisClient *c) {
8743 if (server.bgrewritechildpid != -1) {
8744 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8745 return;
8746 }
8747 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8748 char *status = "+Background append only file rewriting started\r\n";
8749 addReplySds(c,sdsnew(status));
8750 } else {
8751 addReply(c,shared.err);
8752 }
8753 }
8754
/* Remove the temp file used by the background rewrite child having the
 * specified pid. The name is built with the same pattern used by
 * rewriteAppendOnlyFileBackground(). The unlink(2) result is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8761
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8782
8783 /* Called when the user switches from "appendonly yes" to "appendonly no"
8784 * at runtime using the CONFIG command. */
8785 static void stopAppendOnly(void) {
8786 flushAppendOnlyFile();
8787 fsync(server.appendfd);
8788 close(server.appendfd);
8789
8790 server.appendfd = -1;
8791 server.appendseldb = -1;
8792 server.appendonly = 0;
8793 /* rewrite operation in progress? kill it, wait child exit */
8794 if (server.bgsavechildpid != -1) {
8795 int statloc;
8796
8797 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8798 wait3(&statloc,0,NULL);
8799 /* reset the buffer accumulating changes while the child saves */
8800 sdsfree(server.bgrewritebuf);
8801 server.bgrewritebuf = sdsempty();
8802 server.bgsavechildpid = -1;
8803 }
8804 }
8805
8806 /* Called when the user switches from "appendonly no" to "appendonly yes"
8807 * at runtime using the CONFIG command. */
8808 static int startAppendOnly(void) {
8809 server.appendonly = 1;
8810 server.lastfsync = time(NULL);
8811 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8812 if (server.appendfd == -1) {
8813 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8814 return REDIS_ERR;
8815 }
8816 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8817 server.appendonly = 0;
8818 close(server.appendfd);
8819 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8820 return REDIS_ERR;
8821 }
8822 return REDIS_OK;
8823 }
8824
8825 /* =================== Virtual Memory - Blocking Side ====================== */
8826
8827 static void vmInit(void) {
8828 off_t totsize;
8829 int pipefds[2];
8830 size_t stacksize;
8831 struct flock fl;
8832
8833 if (server.vm_max_threads != 0)
8834 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8835
8836 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8837 /* Try to open the old swap file, otherwise create it */
8838 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8839 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8840 }
8841 if (server.vm_fp == NULL) {
8842 redisLog(REDIS_WARNING,
8843 "Can't open the swap file: %s. Exiting.",
8844 strerror(errno));
8845 exit(1);
8846 }
8847 server.vm_fd = fileno(server.vm_fp);
8848 /* Lock the swap file for writing, this is useful in order to avoid
8849 * another instance to use the same swap file for a config error. */
8850 fl.l_type = F_WRLCK;
8851 fl.l_whence = SEEK_SET;
8852 fl.l_start = fl.l_len = 0;
8853 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8854 redisLog(REDIS_WARNING,
8855 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8856 exit(1);
8857 }
8858 /* Initialize */
8859 server.vm_next_page = 0;
8860 server.vm_near_pages = 0;
8861 server.vm_stats_used_pages = 0;
8862 server.vm_stats_swapped_objects = 0;
8863 server.vm_stats_swapouts = 0;
8864 server.vm_stats_swapins = 0;
8865 totsize = server.vm_pages*server.vm_page_size;
8866 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8867 if (ftruncate(server.vm_fd,totsize) == -1) {
8868 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8869 strerror(errno));
8870 exit(1);
8871 } else {
8872 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8873 }
8874 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8875 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8876 (long long) (server.vm_pages+7)/8, server.vm_pages);
8877 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8878
8879 /* Initialize threaded I/O (used by Virtual Memory) */
8880 server.io_newjobs = listCreate();
8881 server.io_processing = listCreate();
8882 server.io_processed = listCreate();
8883 server.io_ready_clients = listCreate();
8884 pthread_mutex_init(&server.io_mutex,NULL);
8885 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8886 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8887 server.io_active_threads = 0;
8888 if (pipe(pipefds) == -1) {
8889 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8890 ,strerror(errno));
8891 exit(1);
8892 }
8893 server.io_ready_pipe_read = pipefds[0];
8894 server.io_ready_pipe_write = pipefds[1];
8895 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8896 /* LZF requires a lot of stack */
8897 pthread_attr_init(&server.io_threads_attr);
8898 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8899 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8900 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8901 /* Listen for events in the threaded I/O pipe */
8902 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8903 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8904 oom("creating file event");
8905 }
8906
8907 /* Mark the page as used */
8908 static void vmMarkPageUsed(off_t page) {
8909 off_t byte = page/8;
8910 int bit = page&7;
8911 redisAssert(vmFreePage(page) == 1);
8912 server.vm_bitmap[byte] |= 1<<bit;
8913 }
8914
8915 /* Mark N contiguous pages as used, with 'page' being the first. */
8916 static void vmMarkPagesUsed(off_t page, off_t count) {
8917 off_t j;
8918
8919 for (j = 0; j < count; j++)
8920 vmMarkPageUsed(page+j);
8921 server.vm_stats_used_pages += count;
8922 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8923 (long long)count, (long long)page);
8924 }
8925
8926 /* Mark the page as free */
8927 static void vmMarkPageFree(off_t page) {
8928 off_t byte = page/8;
8929 int bit = page&7;
8930 redisAssert(vmFreePage(page) == 0);
8931 server.vm_bitmap[byte] &= ~(1<<bit);
8932 }
8933
8934 /* Mark N contiguous pages as free, with 'page' being the first. */
8935 static void vmMarkPagesFree(off_t page, off_t count) {
8936 off_t j;
8937
8938 for (j = 0; j < count; j++)
8939 vmMarkPageFree(page+j);
8940 server.vm_stats_used_pages -= count;
8941 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8942 (long long)count, (long long)page);
8943 }
8944
8945 /* Test if the page is free */
8946 static int vmFreePage(off_t page) {
8947 off_t byte = page/8;
8948 int bit = page&7;
8949 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8950 }
8951
8952 /* Find N contiguous free pages storing the first page of the cluster in *first.
8953 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8954 * REDIS_ERR is returned.
8955 *
8956 * This function uses a simple algorithm: we try to allocate
8957 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8958 * again from the start of the swap file searching for free spaces.
8959 *
8960 * If it looks pretty clear that there are no free pages near our offset
8961 * we try to find less populated places doing a forward jump of
8962 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8963 * without hurry, and then we jump again and so forth...
8964 *
8965 * This function can be improved using a free list to avoid to guess
8966 * too much, since we could collect data about freed pages.
8967 *
8968 * note: I implemented this function just after watching an episode of
8969 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8970 */
8971 static int vmFindContiguousPages(off_t *first, off_t n) {
8972 off_t base, offset = 0, since_jump = 0, numfree = 0;
8973
8974 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8975 server.vm_near_pages = 0;
8976 server.vm_next_page = 0;
8977 }
8978 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8979 base = server.vm_next_page;
8980
8981 while(offset < server.vm_pages) {
8982 off_t this = base+offset;
8983
8984 /* If we overflow, restart from page zero */
8985 if (this >= server.vm_pages) {
8986 this -= server.vm_pages;
8987 if (this == 0) {
8988 /* Just overflowed, what we found on tail is no longer
8989 * interesting, as it's no longer contiguous. */
8990 numfree = 0;
8991 }
8992 }
8993 if (vmFreePage(this)) {
8994 /* This is a free page */
8995 numfree++;
8996 /* Already got N free pages? Return to the caller, with success */
8997 if (numfree == n) {
8998 *first = this-(n-1);
8999 server.vm_next_page = this+1;
9000 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9001 return REDIS_OK;
9002 }
9003 } else {
9004 /* The current one is not a free page */
9005 numfree = 0;
9006 }
9007
9008 /* Fast-forward if the current page is not free and we already
9009 * searched enough near this place. */
9010 since_jump++;
9011 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9012 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9013 since_jump = 0;
9014 /* Note that even if we rewind after the jump, we are don't need
9015 * to make sure numfree is set to zero as we only jump *if* it
9016 * is set to zero. */
9017 } else {
9018 /* Otherwise just check the next page */
9019 offset++;
9020 }
9021 }
9022 return REDIS_ERR;
9023 }
9024
9025 /* Write the specified object at the specified page of the swap file */
9026 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9027 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9028 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9029 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9030 redisLog(REDIS_WARNING,
9031 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9032 strerror(errno));
9033 return REDIS_ERR;
9034 }
9035 rdbSaveObject(server.vm_fp,o);
9036 fflush(server.vm_fp);
9037 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9038 return REDIS_OK;
9039 }
9040
9041 /* Swap the 'val' object relative to 'key' into disk. Store all the information
9042 * needed to later retrieve the object into the key object.
9043 * If we can't find enough contiguous empty pages to swap the object on disk
9044 * REDIS_ERR is returned. */
9045 static int vmSwapObjectBlocking(robj *key, robj *val) {
9046 off_t pages = rdbSavedObjectPages(val,NULL);
9047 off_t page;
9048
9049 assert(key->storage == REDIS_VM_MEMORY);
9050 assert(key->refcount == 1);
9051 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
9052 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
9053 key->vm.page = page;
9054 key->vm.usedpages = pages;
9055 key->storage = REDIS_VM_SWAPPED;
9056 key->vtype = val->type;
9057 decrRefCount(val); /* Deallocate the object from memory. */
9058 vmMarkPagesUsed(page,pages);
9059 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9060 (unsigned char*) key->ptr,
9061 (unsigned long long) page, (unsigned long long) pages);
9062 server.vm_stats_swapped_objects++;
9063 server.vm_stats_swapouts++;
9064 return REDIS_OK;
9065 }
9066
9067 static robj *vmReadObjectFromSwap(off_t page, int type) {
9068 robj *o;
9069
9070 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9071 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9072 redisLog(REDIS_WARNING,
9073 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9074 strerror(errno));
9075 _exit(1);
9076 }
9077 o = rdbLoadObject(type,server.vm_fp);
9078 if (o == NULL) {
9079 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9080 _exit(1);
9081 }
9082 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9083 return o;
9084 }
9085
9086 /* Load the value object relative to the 'key' object from swap to memory.
9087 * The newly allocated object is returned.
9088 *
9089 * If preview is true the unserialized object is returned to the caller but
9090 * no changes are made to the key object, nor the pages are marked as freed */
9091 static robj *vmGenericLoadObject(robj *key, int preview) {
9092 robj *val;
9093
9094 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
9095 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
9096 if (!preview) {
9097 key->storage = REDIS_VM_MEMORY;
9098 key->vm.atime = server.unixtime;
9099 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9100 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9101 (unsigned char*) key->ptr);
9102 server.vm_stats_swapped_objects--;
9103 } else {
9104 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9105 (unsigned char*) key->ptr);
9106 }
9107 server.vm_stats_swapins++;
9108 return val;
9109 }
9110
9111 /* Plain object loading, from swap to memory */
9112 static robj *vmLoadObject(robj *key) {
9113 /* If we are loading the object in background, stop it, we
9114 * need to load this object synchronously ASAP. */
9115 if (key->storage == REDIS_VM_LOADING)
9116 vmCancelThreadedIOJob(key);
9117 return vmGenericLoadObject(key,0);
9118 }
9119
9120 /* Just load the value on disk, without to modify the key.
9121 * This is useful when we want to perform some operation on the value
9122 * without to really bring it from swap to memory, like while saving the
9123 * dataset or rewriting the append only log. */
9124 static robj *vmPreviewObject(robj *key) {
9125 return vmGenericLoadObject(key,1);
9126 }
9127
9128 /* How a good candidate is this object for swapping?
9129 * The better candidate it is, the greater the returned value.
9130 *
9131 * Currently we try to perform a fast estimation of the object size in
9132 * memory, and combine it with aging informations.
9133 *
9134 * Basically swappability = idle-time * log(estimated size)
9135 *
9136 * Bigger objects are preferred over smaller objects, but not
9137 * proportionally, this is why we use the logarithm. This algorithm is
9138 * just a first try and will probably be tuned later. */
9139 static double computeObjectSwappability(robj *o) {
9140 time_t age = server.unixtime - o->vm.atime;
9141 long asize = 0;
9142 list *l;
9143 dict *d;
9144 struct dictEntry *de;
9145 int z;
9146
9147 if (age <= 0) return 0;
9148 switch(o->type) {
9149 case REDIS_STRING:
9150 if (o->encoding != REDIS_ENCODING_RAW) {
9151 asize = sizeof(*o);
9152 } else {
9153 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9154 }
9155 break;
9156 case REDIS_LIST:
9157 l = o->ptr;
9158 listNode *ln = listFirst(l);
9159
9160 asize = sizeof(list);
9161 if (ln) {
9162 robj *ele = ln->value;
9163 long elesize;
9164
9165 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9166 (sizeof(*o)+sdslen(ele->ptr)) :
9167 sizeof(*o);
9168 asize += (sizeof(listNode)+elesize)*listLength(l);
9169 }
9170 break;
9171 case REDIS_SET:
9172 case REDIS_ZSET:
9173 z = (o->type == REDIS_ZSET);
9174 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9175
9176 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9177 if (z) asize += sizeof(zset)-sizeof(dict);
9178 if (dictSize(d)) {
9179 long elesize;
9180 robj *ele;
9181
9182 de = dictGetRandomKey(d);
9183 ele = dictGetEntryKey(de);
9184 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9185 (sizeof(*o)+sdslen(ele->ptr)) :
9186 sizeof(*o);
9187 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9188 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9189 }
9190 break;
9191 case REDIS_HASH:
9192 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9193 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9194 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9195 unsigned int klen, vlen;
9196 unsigned char *key, *val;
9197
9198 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9199 klen = 0;
9200 vlen = 0;
9201 }
9202 asize = len*(klen+vlen+3);
9203 } else if (o->encoding == REDIS_ENCODING_HT) {
9204 d = o->ptr;
9205 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9206 if (dictSize(d)) {
9207 long elesize;
9208 robj *ele;
9209
9210 de = dictGetRandomKey(d);
9211 ele = dictGetEntryKey(de);
9212 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9213 (sizeof(*o)+sdslen(ele->ptr)) :
9214 sizeof(*o);
9215 ele = dictGetEntryVal(de);
9216 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9217 (sizeof(*o)+sdslen(ele->ptr)) :
9218 sizeof(*o);
9219 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9220 }
9221 }
9222 break;
9223 }
9224 return (double)age*log(1+asize);
9225 }
9226
9227 /* Try to swap an object that's a good candidate for swapping.
9228 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9229 * to swap any object at all.
9230 *
9231 * If 'usethreaded' is true, Redis will try to swap the object in background
9232 * using I/O threads. */
9233 static int vmSwapOneObject(int usethreads) {
9234 int j, i;
9235 struct dictEntry *best = NULL;
9236 double best_swappability = 0;
9237 redisDb *best_db = NULL;
9238 robj *key, *val;
9239
9240 for (j = 0; j < server.dbnum; j++) {
9241 redisDb *db = server.db+j;
9242 /* Why maxtries is set to 100?
9243 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9244 * are swappable objects */
9245 int maxtries = 100;
9246
9247 if (dictSize(db->dict) == 0) continue;
9248 for (i = 0; i < 5; i++) {
9249 dictEntry *de;
9250 double swappability;
9251
9252 if (maxtries) maxtries--;
9253 de = dictGetRandomKey(db->dict);
9254 key = dictGetEntryKey(de);
9255 val = dictGetEntryVal(de);
9256 /* Only swap objects that are currently in memory.
9257 *
9258 * Also don't swap shared objects if threaded VM is on, as we
9259 * try to ensure that the main thread does not touch the
9260 * object while the I/O thread is using it, but we can't
9261 * control other keys without adding additional mutex. */
9262 if (key->storage != REDIS_VM_MEMORY ||
9263 (server.vm_max_threads != 0 && val->refcount != 1)) {
9264 if (maxtries) i--; /* don't count this try */
9265 continue;
9266 }
9267 swappability = computeObjectSwappability(val);
9268 if (!best || swappability > best_swappability) {
9269 best = de;
9270 best_swappability = swappability;
9271 best_db = db;
9272 }
9273 }
9274 }
9275 if (best == NULL) return REDIS_ERR;
9276 key = dictGetEntryKey(best);
9277 val = dictGetEntryVal(best);
9278
9279 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9280 key->ptr, best_swappability);
9281
9282 /* Unshare the key if needed */
9283 if (key->refcount > 1) {
9284 robj *newkey = dupStringObject(key);
9285 decrRefCount(key);
9286 key = dictGetEntryKey(best) = newkey;
9287 }
9288 /* Swap it */
9289 if (usethreads) {
9290 vmSwapObjectThreaded(key,val,best_db);
9291 return REDIS_OK;
9292 } else {
9293 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9294 dictGetEntryVal(best) = NULL;
9295 return REDIS_OK;
9296 } else {
9297 return REDIS_ERR;
9298 }
9299 }
9300 }
9301
/* Swap out one object synchronously. See vmSwapOneObject() for the return
 * value. The parameter list is spelled (void): an empty list would leave
 * the parameters unspecified instead of declaring there are none. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9305
/* Swap out one object using the I/O threads. See vmSwapOneObject() for the
 * return value. The parameter list is spelled (void): an empty list would
 * leave the parameters unspecified instead of declaring there are none. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9309
9310 /* Return true if it's safe to swap out objects in a given moment.
9311 * Basically we don't want to swap objects out while there is a BGSAVE
9312 * or a BGAEOREWRITE running in backgroud. */
9313 static int vmCanSwapOut(void) {
9314 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9315 }
9316
9317 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9318 * and was deleted. Otherwise 0 is returned. */
9319 static int deleteIfSwapped(redisDb *db, robj *key) {
9320 dictEntry *de;
9321 robj *foundkey;
9322
9323 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9324 foundkey = dictGetEntryKey(de);
9325 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9326 deleteKey(db,key);
9327 return 1;
9328 }
9329
9330 /* =================== Virtual Memory - Threaded I/O ======================= */
9331
9332 static void freeIOJob(iojob *j) {
9333 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9334 j->type == REDIS_IOJOB_DO_SWAP ||
9335 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9336 decrRefCount(j->val);
9337 /* We don't decrRefCount the j->key field as we did't incremented
9338 * the count creating IO Jobs. This is because the key field here is
9339 * just used as an indentifier and if a key is removed the Job should
9340 * never be touched again. */
9341 zfree(j);
9342 }
9343
9344 /* Every time a thread finished a Job, it writes a byte into the write side
9345 * of an unix pipe in order to "awake" the main thread, and this function
9346 * is called. */
9347 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9348 int mask)
9349 {
9350 char buf[1];
9351 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9352 REDIS_NOTUSED(el);
9353 REDIS_NOTUSED(mask);
9354 REDIS_NOTUSED(privdata);
9355
9356 /* For every byte we read in the read side of the pipe, there is one
9357 * I/O job completed to process. */
9358 while((retval = read(fd,buf,1)) == 1) {
9359 iojob *j;
9360 listNode *ln;
9361 robj *key;
9362 struct dictEntry *de;
9363
9364 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9365
9366 /* Get the processed element (the oldest one) */
9367 lockThreadedIO();
9368 assert(listLength(server.io_processed) != 0);
9369 if (toprocess == -1) {
9370 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9371 if (toprocess <= 0) toprocess = 1;
9372 }
9373 ln = listFirst(server.io_processed);
9374 j = ln->value;
9375 listDelNode(server.io_processed,ln);
9376 unlockThreadedIO();
9377 /* If this job is marked as canceled, just ignore it */
9378 if (j->canceled) {
9379 freeIOJob(j);
9380 continue;
9381 }
9382 /* Post process it in the main thread, as there are things we
9383 * can do just here to avoid race conditions and/or invasive locks */
9384 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9385 de = dictFind(j->db->dict,j->key);
9386 assert(de != NULL);
9387 key = dictGetEntryKey(de);
9388 if (j->type == REDIS_IOJOB_LOAD) {
9389 redisDb *db;
9390
9391 /* Key loaded, bring it at home */
9392 key->storage = REDIS_VM_MEMORY;
9393 key->vm.atime = server.unixtime;
9394 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9395 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9396 (unsigned char*) key->ptr);
9397 server.vm_stats_swapped_objects--;
9398 server.vm_stats_swapins++;
9399 dictGetEntryVal(de) = j->val;
9400 incrRefCount(j->val);
9401 db = j->db;
9402 freeIOJob(j);
9403 /* Handle clients waiting for this key to be loaded. */
9404 handleClientsBlockedOnSwappedKey(db,key);
9405 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9406 /* Now we know the amount of pages required to swap this object.
9407 * Let's find some space for it, and queue this task again
9408 * rebranded as REDIS_IOJOB_DO_SWAP. */
9409 if (!vmCanSwapOut() ||
9410 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9411 {
9412 /* Ooops... no space or we can't swap as there is
9413 * a fork()ed Redis trying to save stuff on disk. */
9414 freeIOJob(j);
9415 key->storage = REDIS_VM_MEMORY; /* undo operation */
9416 } else {
9417 /* Note that we need to mark this pages as used now,
9418 * if the job will be canceled, we'll mark them as freed
9419 * again. */
9420 vmMarkPagesUsed(j->page,j->pages);
9421 j->type = REDIS_IOJOB_DO_SWAP;
9422 lockThreadedIO();
9423 queueIOJob(j);
9424 unlockThreadedIO();
9425 }
9426 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9427 robj *val;
9428
9429 /* Key swapped. We can finally free some memory. */
9430 if (key->storage != REDIS_VM_SWAPPING) {
9431 printf("key->storage: %d\n",key->storage);
9432 printf("key->name: %s\n",(char*)key->ptr);
9433 printf("key->refcount: %d\n",key->refcount);
9434 printf("val: %p\n",(void*)j->val);
9435 printf("val->type: %d\n",j->val->type);
9436 printf("val->ptr: %s\n",(char*)j->val->ptr);
9437 }
9438 redisAssert(key->storage == REDIS_VM_SWAPPING);
9439 val = dictGetEntryVal(de);
9440 key->vm.page = j->page;
9441 key->vm.usedpages = j->pages;
9442 key->storage = REDIS_VM_SWAPPED;
9443 key->vtype = j->val->type;
9444 decrRefCount(val); /* Deallocate the object from memory. */
9445 dictGetEntryVal(de) = NULL;
9446 redisLog(REDIS_DEBUG,
9447 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9448 (unsigned char*) key->ptr,
9449 (unsigned long long) j->page, (unsigned long long) j->pages);
9450 server.vm_stats_swapped_objects++;
9451 server.vm_stats_swapouts++;
9452 freeIOJob(j);
9453 /* Put a few more swap requests in queue if we are still
9454 * out of memory */
9455 if (trytoswap && vmCanSwapOut() &&
9456 zmalloc_used_memory() > server.vm_max_memory)
9457 {
9458 int more = 1;
9459 while(more) {
9460 lockThreadedIO();
9461 more = listLength(server.io_newjobs) <
9462 (unsigned) server.vm_max_threads;
9463 unlockThreadedIO();
9464 /* Don't waste CPU time if swappable objects are rare. */
9465 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9466 trytoswap = 0;
9467 break;
9468 }
9469 }
9470 }
9471 }
9472 processed++;
9473 if (processed == toprocess) return;
9474 }
9475 if (retval < 0 && errno != EAGAIN) {
9476 redisLog(REDIS_WARNING,
9477 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9478 strerror(errno));
9479 }
9480 }
9481
9482 static void lockThreadedIO(void) {
9483 pthread_mutex_lock(&server.io_mutex);
9484 }
9485
9486 static void unlockThreadedIO(void) {
9487 pthread_mutex_unlock(&server.io_mutex);
9488 }
9489
9490 /* Remove the specified object from the threaded I/O queue if still not
9491 * processed, otherwise make sure to flag it as canceled. */
9492 static void vmCancelThreadedIOJob(robj *o) {
9493 list *lists[3] = {
9494 server.io_newjobs, /* 0 */
9495 server.io_processing, /* 1 */
9496 server.io_processed /* 2 */
9497 };
9498 int i;
9499
9500 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9501 again:
9502 lockThreadedIO();
9503 /* Search for a matching key in one of the queues */
9504 for (i = 0; i < 3; i++) {
9505 listNode *ln;
9506 listIter li;
9507
9508 listRewind(lists[i],&li);
9509 while ((ln = listNext(&li)) != NULL) {
9510 iojob *job = ln->value;
9511
9512 if (job->canceled) continue; /* Skip this, already canceled. */
9513 if (job->key == o) {
9514 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9515 (void*)job, (char*)o->ptr, job->type, i);
9516 /* Mark the pages as free since the swap didn't happened
9517 * or happened but is now discarded. */
9518 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9519 vmMarkPagesFree(job->page,job->pages);
9520 /* Cancel the job. It depends on the list the job is
9521 * living in. */
9522 switch(i) {
9523 case 0: /* io_newjobs */
9524 /* If the job was yet not processed the best thing to do
9525 * is to remove it from the queue at all */
9526 freeIOJob(job);
9527 listDelNode(lists[i],ln);
9528 break;
9529 case 1: /* io_processing */
9530 /* Oh Shi- the thread is messing with the Job:
9531 *
9532 * Probably it's accessing the object if this is a
9533 * PREPARE_SWAP or DO_SWAP job.
9534 * If it's a LOAD job it may be reading from disk and
9535 * if we don't wait for the job to terminate before to
9536 * cancel it, maybe in a few microseconds data can be
9537 * corrupted in this pages. So the short story is:
9538 *
9539 * Better to wait for the job to move into the
9540 * next queue (processed)... */
9541
9542 /* We try again and again until the job is completed. */
9543 unlockThreadedIO();
9544 /* But let's wait some time for the I/O thread
9545 * to finish with this job. After all this condition
9546 * should be very rare. */
9547 usleep(1);
9548 goto again;
9549 case 2: /* io_processed */
9550 /* The job was already processed, that's easy...
9551 * just mark it as canceled so that we'll ignore it
9552 * when processing completed jobs. */
9553 job->canceled = 1;
9554 break;
9555 }
9556 /* Finally we have to adjust the storage type of the object
9557 * in order to "UNDO" the operaiton. */
9558 if (o->storage == REDIS_VM_LOADING)
9559 o->storage = REDIS_VM_SWAPPED;
9560 else if (o->storage == REDIS_VM_SWAPPING)
9561 o->storage = REDIS_VM_MEMORY;
9562 unlockThreadedIO();
9563 return;
9564 }
9565 }
9566 }
9567 unlockThreadedIO();
9568 assert(1 != 1); /* We should never reach this */
9569 }
9570
9571 static void *IOThreadEntryPoint(void *arg) {
9572 iojob *j;
9573 listNode *ln;
9574 REDIS_NOTUSED(arg);
9575
9576 pthread_detach(pthread_self());
9577 while(1) {
9578 /* Get a new job to process */
9579 lockThreadedIO();
9580 if (listLength(server.io_newjobs) == 0) {
9581 /* No new jobs in queue, exit. */
9582 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9583 (long) pthread_self());
9584 server.io_active_threads--;
9585 unlockThreadedIO();
9586 return NULL;
9587 }
9588 ln = listFirst(server.io_newjobs);
9589 j = ln->value;
9590 listDelNode(server.io_newjobs,ln);
9591 /* Add the job in the processing queue */
9592 j->thread = pthread_self();
9593 listAddNodeTail(server.io_processing,j);
9594 ln = listLast(server.io_processing); /* We use ln later to remove it */
9595 unlockThreadedIO();
9596 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9597 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9598
9599 /* Process the Job */
9600 if (j->type == REDIS_IOJOB_LOAD) {
9601 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9602 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9603 FILE *fp = fopen("/dev/null","w+");
9604 j->pages = rdbSavedObjectPages(j->val,fp);
9605 fclose(fp);
9606 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9607 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9608 j->canceled = 1;
9609 }
9610
9611 /* Done: insert the job into the processed queue */
9612 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9613 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9614 lockThreadedIO();
9615 listDelNode(server.io_processing,ln);
9616 listAddNodeTail(server.io_processed,j);
9617 unlockThreadedIO();
9618
9619 /* Signal the main thread there is new stuff to process */
9620 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9621 }
9622 return NULL; /* never reached */
9623 }
9624
9625 static void spawnIOThread(void) {
9626 pthread_t thread;
9627 sigset_t mask, omask;
9628 int err;
9629
9630 sigemptyset(&mask);
9631 sigaddset(&mask,SIGCHLD);
9632 sigaddset(&mask,SIGHUP);
9633 sigaddset(&mask,SIGPIPE);
9634 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9635 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9636 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9637 strerror(err));
9638 usleep(1000000);
9639 }
9640 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9641 server.io_active_threads++;
9642 }
9643
9644 /* We need to wait for the last thread to exit before we are able to
9645 * fork() in order to BGSAVE or BGREWRITEAOF. */
9646 static void waitEmptyIOJobsQueue(void) {
9647 while(1) {
9648 int io_processed_len;
9649
9650 lockThreadedIO();
9651 if (listLength(server.io_newjobs) == 0 &&
9652 listLength(server.io_processing) == 0 &&
9653 server.io_active_threads == 0)
9654 {
9655 unlockThreadedIO();
9656 return;
9657 }
9658 /* While waiting for empty jobs queue condition we post-process some
9659 * finshed job, as I/O threads may be hanging trying to write against
9660 * the io_ready_pipe_write FD but there are so much pending jobs that
9661 * it's blocking. */
9662 io_processed_len = listLength(server.io_processed);
9663 unlockThreadedIO();
9664 if (io_processed_len) {
9665 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9666 usleep(1000); /* 1 millisecond */
9667 } else {
9668 usleep(10000); /* 10 milliseconds */
9669 }
9670 }
9671 }
9672
9673 static void vmReopenSwapFile(void) {
9674 /* Note: we don't close the old one as we are in the child process
9675 * and don't want to mess at all with the original file object. */
9676 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9677 if (server.vm_fp == NULL) {
9678 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9679 server.vm_swap_file);
9680 _exit(1);
9681 }
9682 server.vm_fd = fileno(server.vm_fp);
9683 }
9684
9685 /* This function must be called while with threaded IO locked */
9686 static void queueIOJob(iojob *j) {
9687 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9688 (void*)j, j->type, (char*)j->key->ptr);
9689 listAddNodeTail(server.io_newjobs,j);
9690 if (server.io_active_threads < server.vm_max_threads)
9691 spawnIOThread();
9692 }
9693
9694 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9695 iojob *j;
9696
9697 assert(key->storage == REDIS_VM_MEMORY);
9698 assert(key->refcount == 1);
9699
9700 j = zmalloc(sizeof(*j));
9701 j->type = REDIS_IOJOB_PREPARE_SWAP;
9702 j->db = db;
9703 j->key = key;
9704 j->val = val;
9705 incrRefCount(val);
9706 j->canceled = 0;
9707 j->thread = (pthread_t) -1;
9708 key->storage = REDIS_VM_SWAPPING;
9709
9710 lockThreadedIO();
9711 queueIOJob(j);
9712 unlockThreadedIO();
9713 return REDIS_OK;
9714 }
9715
9716 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9717
9718 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9719 * If there is not already a job loading the key, it is craeted.
9720 * The key is added to the io_keys list in the client structure, and also
9721 * in the hash table mapping swapped keys to waiting clients, that is,
9722 * server.io_waited_keys. */
9723 static int waitForSwappedKey(redisClient *c, robj *key) {
9724 struct dictEntry *de;
9725 robj *o;
9726 list *l;
9727
9728 /* If the key does not exist or is already in RAM we don't need to
9729 * block the client at all. */
9730 de = dictFind(c->db->dict,key);
9731 if (de == NULL) return 0;
9732 o = dictGetEntryKey(de);
9733 if (o->storage == REDIS_VM_MEMORY) {
9734 return 0;
9735 } else if (o->storage == REDIS_VM_SWAPPING) {
9736 /* We were swapping the key, undo it! */
9737 vmCancelThreadedIOJob(o);
9738 return 0;
9739 }
9740
9741 /* OK: the key is either swapped, or being loaded just now. */
9742
9743 /* Add the key to the list of keys this client is waiting for.
9744 * This maps clients to keys they are waiting for. */
9745 listAddNodeTail(c->io_keys,key);
9746 incrRefCount(key);
9747
9748 /* Add the client to the swapped keys => clients waiting map. */
9749 de = dictFind(c->db->io_keys,key);
9750 if (de == NULL) {
9751 int retval;
9752
9753 /* For every key we take a list of clients blocked for it */
9754 l = listCreate();
9755 retval = dictAdd(c->db->io_keys,key,l);
9756 incrRefCount(key);
9757 assert(retval == DICT_OK);
9758 } else {
9759 l = dictGetEntryVal(de);
9760 }
9761 listAddNodeTail(l,c);
9762
9763 /* Are we already loading the key from disk? If not create a job */
9764 if (o->storage == REDIS_VM_SWAPPED) {
9765 iojob *j;
9766
9767 o->storage = REDIS_VM_LOADING;
9768 j = zmalloc(sizeof(*j));
9769 j->type = REDIS_IOJOB_LOAD;
9770 j->db = c->db;
9771 j->key = o;
9772 j->key->vtype = o->vtype;
9773 j->page = o->vm.page;
9774 j->val = NULL;
9775 j->canceled = 0;
9776 j->thread = (pthread_t) -1;
9777 lockThreadedIO();
9778 queueIOJob(j);
9779 unlockThreadedIO();
9780 }
9781 return 1;
9782 }
9783
9784 /* Preload keys for any command with first, last and step values for
9785 * the command keys prototype, as defined in the command table. */
9786 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9787 int j, last;
9788 if (cmd->vm_firstkey == 0) return;
9789 last = cmd->vm_lastkey;
9790 if (last < 0) last = argc+last;
9791 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9792 redisAssert(j < argc);
9793 waitForSwappedKey(c,argv[j]);
9794 }
9795 }
9796
9797 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9798 * Note that the number of keys to preload is user-defined, so we need to
9799 * apply a sanity check against argc. */
9800 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9801 int i, num;
9802 REDIS_NOTUSED(cmd);
9803
9804 num = atoi(argv[2]->ptr);
9805 if (num > (argc-3)) return;
9806 for (i = 0; i < num; i++) {
9807 waitForSwappedKey(c,argv[3+i]);
9808 }
9809 }
9810
9811 /* Preload keys needed to execute the entire MULTI/EXEC block.
9812 *
9813 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9814 * and will block the client when any command requires a swapped out value. */
9815 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9816 int i, margc;
9817 struct redisCommand *mcmd;
9818 robj **margv;
9819 REDIS_NOTUSED(cmd);
9820 REDIS_NOTUSED(argc);
9821 REDIS_NOTUSED(argv);
9822
9823 if (!(c->flags & REDIS_MULTI)) return;
9824 for (i = 0; i < c->mstate.count; i++) {
9825 mcmd = c->mstate.commands[i].cmd;
9826 margc = c->mstate.commands[i].argc;
9827 margv = c->mstate.commands[i].argv;
9828
9829 if (mcmd->vm_preload_proc != NULL) {
9830 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9831 } else {
9832 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9833 }
9834 }
9835 }
9836
9837 /* Is this client attempting to run a command against swapped keys?
9838 * If so, block it ASAP, load the keys in background, then resume it.
9839 *
9840 * The important idea about this function is that it can fail! If keys will
9841 * still be swapped when the client is resumed, this key lookups will
9842 * just block loading keys from disk. In practical terms this should only
9843 * happen with SORT BY command or if there is a bug in this function.
9844 *
9845 * Return 1 if the client is marked as blocked, 0 if the client can
9846 * continue as the keys it is going to access appear to be in memory. */
9847 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9848 if (cmd->vm_preload_proc != NULL) {
9849 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9850 } else {
9851 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9852 }
9853
9854 /* If the client was blocked for at least one key, mark it as blocked. */
9855 if (listLength(c->io_keys)) {
9856 c->flags |= REDIS_IO_WAIT;
9857 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9858 server.vm_blocked_clients++;
9859 return 1;
9860 } else {
9861 return 0;
9862 }
9863 }
9864
9865 /* Remove the 'key' from the list of blocked keys for a given client.
9866 *
9867 * The function returns 1 when there are no longer blocking keys after
9868 * the current one was removed (and the client can be unblocked). */
9869 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9870 list *l;
9871 listNode *ln;
9872 listIter li;
9873 struct dictEntry *de;
9874
9875 /* Remove the key from the list of keys this client is waiting for. */
9876 listRewind(c->io_keys,&li);
9877 while ((ln = listNext(&li)) != NULL) {
9878 if (equalStringObjects(ln->value,key)) {
9879 listDelNode(c->io_keys,ln);
9880 break;
9881 }
9882 }
9883 assert(ln != NULL);
9884
9885 /* Remove the client form the key => waiting clients map. */
9886 de = dictFind(c->db->io_keys,key);
9887 assert(de != NULL);
9888 l = dictGetEntryVal(de);
9889 ln = listSearchKey(l,c);
9890 assert(ln != NULL);
9891 listDelNode(l,ln);
9892 if (listLength(l) == 0)
9893 dictDelete(c->db->io_keys,key);
9894
9895 return listLength(c->io_keys) == 0;
9896 }
9897
9898 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9899 struct dictEntry *de;
9900 list *l;
9901 listNode *ln;
9902 int len;
9903
9904 de = dictFind(db->io_keys,key);
9905 if (!de) return;
9906
9907 l = dictGetEntryVal(de);
9908 len = listLength(l);
9909 /* Note: we can't use something like while(listLength(l)) as the list
9910 * can be freed by the calling function when we remove the last element. */
9911 while (len--) {
9912 ln = listFirst(l);
9913 redisClient *c = ln->value;
9914
9915 if (dontWaitForSwappedKey(c,key)) {
9916 /* Put the client in the list of clients ready to go as we
9917 * loaded all the keys about it. */
9918 listAddNodeTail(server.io_ready_clients,c);
9919 }
9920 }
9921 }
9922
9923 /* =========================== Remote Configuration ========================= */
9924
9925 static void configSetCommand(redisClient *c) {
9926 robj *o = getDecodedObject(c->argv[3]);
9927 long long ll;
9928
9929 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9930 zfree(server.dbfilename);
9931 server.dbfilename = zstrdup(o->ptr);
9932 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9933 zfree(server.requirepass);
9934 server.requirepass = zstrdup(o->ptr);
9935 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9936 zfree(server.masterauth);
9937 server.masterauth = zstrdup(o->ptr);
9938 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9939 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9940 ll < 0) goto badfmt;
9941 server.maxmemory = ll;
9942 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9943 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9944 ll < 0 || ll > LONG_MAX) goto badfmt;
9945 server.maxidletime = ll;
9946 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9947 if (!strcasecmp(o->ptr,"no")) {
9948 server.appendfsync = APPENDFSYNC_NO;
9949 } else if (!strcasecmp(o->ptr,"everysec")) {
9950 server.appendfsync = APPENDFSYNC_EVERYSEC;
9951 } else if (!strcasecmp(o->ptr,"always")) {
9952 server.appendfsync = APPENDFSYNC_ALWAYS;
9953 } else {
9954 goto badfmt;
9955 }
9956 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
9957 int old = server.appendonly;
9958 int new = yesnotoi(o->ptr);
9959
9960 if (new == -1) goto badfmt;
9961 if (old != new) {
9962 if (new == 0) {
9963 stopAppendOnly();
9964 } else {
9965 if (startAppendOnly() == REDIS_ERR) {
9966 addReplySds(c,sdscatprintf(sdsempty(),
9967 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
9968 decrRefCount(o);
9969 return;
9970 }
9971 }
9972 }
9973 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9974 int vlen, j;
9975 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
9976
9977 /* Perform sanity check before setting the new config:
9978 * - Even number of args
9979 * - Seconds >= 1, changes >= 0 */
9980 if (vlen & 1) {
9981 sdsfreesplitres(v,vlen);
9982 goto badfmt;
9983 }
9984 for (j = 0; j < vlen; j++) {
9985 char *eptr;
9986 long val;
9987
9988 val = strtoll(v[j], &eptr, 10);
9989 if (eptr[0] != '\0' ||
9990 ((j & 1) == 0 && val < 1) ||
9991 ((j & 1) == 1 && val < 0)) {
9992 sdsfreesplitres(v,vlen);
9993 goto badfmt;
9994 }
9995 }
9996 /* Finally set the new config */
9997 resetServerSaveParams();
9998 for (j = 0; j < vlen; j += 2) {
9999 time_t seconds;
10000 int changes;
10001
10002 seconds = strtoll(v[j],NULL,10);
10003 changes = strtoll(v[j+1],NULL,10);
10004 appendServerSaveParams(seconds, changes);
10005 }
10006 sdsfreesplitres(v,vlen);
10007 } else {
10008 addReplySds(c,sdscatprintf(sdsempty(),
10009 "-ERR not supported CONFIG parameter %s\r\n",
10010 (char*)c->argv[2]->ptr));
10011 decrRefCount(o);
10012 return;
10013 }
10014 decrRefCount(o);
10015 addReply(c,shared.ok);
10016 return;
10017
10018 badfmt: /* Bad format errors */
10019 addReplySds(c,sdscatprintf(sdsempty(),
10020 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10021 (char*)o->ptr,
10022 (char*)c->argv[2]->ptr));
10023 decrRefCount(o);
10024 }
10025
10026 static void configGetCommand(redisClient *c) {
10027 robj *o = getDecodedObject(c->argv[2]);
10028 robj *lenobj = createObject(REDIS_STRING,NULL);
10029 char *pattern = o->ptr;
10030 int matches = 0;
10031
10032 addReply(c,lenobj);
10033 decrRefCount(lenobj);
10034
10035 if (stringmatch(pattern,"dbfilename",0)) {
10036 addReplyBulkCString(c,"dbfilename");
10037 addReplyBulkCString(c,server.dbfilename);
10038 matches++;
10039 }
10040 if (stringmatch(pattern,"requirepass",0)) {
10041 addReplyBulkCString(c,"requirepass");
10042 addReplyBulkCString(c,server.requirepass);
10043 matches++;
10044 }
10045 if (stringmatch(pattern,"masterauth",0)) {
10046 addReplyBulkCString(c,"masterauth");
10047 addReplyBulkCString(c,server.masterauth);
10048 matches++;
10049 }
10050 if (stringmatch(pattern,"maxmemory",0)) {
10051 char buf[128];
10052
10053 ll2string(buf,128,server.maxmemory);
10054 addReplyBulkCString(c,"maxmemory");
10055 addReplyBulkCString(c,buf);
10056 matches++;
10057 }
10058 if (stringmatch(pattern,"timeout",0)) {
10059 char buf[128];
10060
10061 ll2string(buf,128,server.maxidletime);
10062 addReplyBulkCString(c,"timeout");
10063 addReplyBulkCString(c,buf);
10064 matches++;
10065 }
10066 if (stringmatch(pattern,"appendonly",0)) {
10067 addReplyBulkCString(c,"appendonly");
10068 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10069 matches++;
10070 }
10071 if (stringmatch(pattern,"appendfsync",0)) {
10072 char *policy;
10073
10074 switch(server.appendfsync) {
10075 case APPENDFSYNC_NO: policy = "no"; break;
10076 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10077 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10078 default: policy = "unknown"; break; /* too harmless to panic */
10079 }
10080 addReplyBulkCString(c,"appendfsync");
10081 addReplyBulkCString(c,policy);
10082 matches++;
10083 }
10084 if (stringmatch(pattern,"save",0)) {
10085 sds buf = sdsempty();
10086 int j;
10087
10088 for (j = 0; j < server.saveparamslen; j++) {
10089 buf = sdscatprintf(buf,"%ld %d",
10090 server.saveparams[j].seconds,
10091 server.saveparams[j].changes);
10092 if (j != server.saveparamslen-1)
10093 buf = sdscatlen(buf," ",1);
10094 }
10095 addReplyBulkCString(c,"save");
10096 addReplyBulkCString(c,buf);
10097 sdsfree(buf);
10098 matches++;
10099 }
10100 decrRefCount(o);
10101 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10102 }
10103
10104 static void configCommand(redisClient *c) {
10105 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10106 if (c->argc != 4) goto badarity;
10107 configSetCommand(c);
10108 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10109 if (c->argc != 3) goto badarity;
10110 configGetCommand(c);
10111 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10112 if (c->argc != 2) goto badarity;
10113 server.stat_numcommands = 0;
10114 server.stat_numconnections = 0;
10115 server.stat_expiredkeys = 0;
10116 server.stat_starttime = time(NULL);
10117 addReply(c,shared.ok);
10118 } else {
10119 addReplySds(c,sdscatprintf(sdsempty(),
10120 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10121 }
10122 return;
10123
10124 badarity:
10125 addReplySds(c,sdscatprintf(sdsempty(),
10126 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10127 (char*) c->argv[1]->ptr));
10128 }
10129
10130 /* =========================== Pubsub implementation ======================== */
10131
10132 static void freePubsubPattern(void *p) {
10133 pubsubPattern *pat = p;
10134
10135 decrRefCount(pat->pattern);
10136 zfree(pat);
10137 }
10138
10139 static int listMatchPubsubPattern(void *a, void *b) {
10140 pubsubPattern *pa = a, *pb = b;
10141
10142 return (pa->client == pb->client) &&
10143 (equalStringObjects(pa->pattern,pb->pattern));
10144 }
10145
10146 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10147 * 0 if the client was already subscribed to that channel. */
10148 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10149 struct dictEntry *de;
10150 list *clients = NULL;
10151 int retval = 0;
10152
10153 /* Add the channel to the client -> channels hash table */
10154 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10155 retval = 1;
10156 incrRefCount(channel);
10157 /* Add the client to the channel -> list of clients hash table */
10158 de = dictFind(server.pubsub_channels,channel);
10159 if (de == NULL) {
10160 clients = listCreate();
10161 dictAdd(server.pubsub_channels,channel,clients);
10162 incrRefCount(channel);
10163 } else {
10164 clients = dictGetEntryVal(de);
10165 }
10166 listAddNodeTail(clients,c);
10167 }
10168 /* Notify the client */
10169 addReply(c,shared.mbulk3);
10170 addReply(c,shared.subscribebulk);
10171 addReplyBulk(c,channel);
10172 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10173 return retval;
10174 }
10175
10176 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10177 * 0 if the client was not subscribed to the specified channel. */
10178 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10179 struct dictEntry *de;
10180 list *clients;
10181 listNode *ln;
10182 int retval = 0;
10183
10184 /* Remove the channel from the client -> channels hash table */
10185 incrRefCount(channel); /* channel may be just a pointer to the same object
10186 we have in the hash tables. Protect it... */
10187 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10188 retval = 1;
10189 /* Remove the client from the channel -> clients list hash table */
10190 de = dictFind(server.pubsub_channels,channel);
10191 assert(de != NULL);
10192 clients = dictGetEntryVal(de);
10193 ln = listSearchKey(clients,c);
10194 assert(ln != NULL);
10195 listDelNode(clients,ln);
10196 if (listLength(clients) == 0) {
10197 /* Free the list and associated hash entry at all if this was
10198 * the latest client, so that it will be possible to abuse
10199 * Redis PUBSUB creating millions of channels. */
10200 dictDelete(server.pubsub_channels,channel);
10201 }
10202 }
10203 /* Notify the client */
10204 if (notify) {
10205 addReply(c,shared.mbulk3);
10206 addReply(c,shared.unsubscribebulk);
10207 addReplyBulk(c,channel);
10208 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10209 listLength(c->pubsub_patterns));
10210
10211 }
10212 decrRefCount(channel); /* it is finally safe to release it */
10213 return retval;
10214 }
10215
10216 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10217 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10218 int retval = 0;
10219
10220 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10221 retval = 1;
10222 pubsubPattern *pat;
10223 listAddNodeTail(c->pubsub_patterns,pattern);
10224 incrRefCount(pattern);
10225 pat = zmalloc(sizeof(*pat));
10226 pat->pattern = getDecodedObject(pattern);
10227 pat->client = c;
10228 listAddNodeTail(server.pubsub_patterns,pat);
10229 }
10230 /* Notify the client */
10231 addReply(c,shared.mbulk3);
10232 addReply(c,shared.psubscribebulk);
10233 addReplyBulk(c,pattern);
10234 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10235 return retval;
10236 }
10237
10238 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10239 * 0 if the client was not subscribed to the specified channel. */
10240 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10241 listNode *ln;
10242 pubsubPattern pat;
10243 int retval = 0;
10244
10245 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10246 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10247 retval = 1;
10248 listDelNode(c->pubsub_patterns,ln);
10249 pat.client = c;
10250 pat.pattern = pattern;
10251 ln = listSearchKey(server.pubsub_patterns,&pat);
10252 listDelNode(server.pubsub_patterns,ln);
10253 }
10254 /* Notify the client */
10255 if (notify) {
10256 addReply(c,shared.mbulk3);
10257 addReply(c,shared.punsubscribebulk);
10258 addReplyBulk(c,pattern);
10259 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10260 listLength(c->pubsub_patterns));
10261 }
10262 decrRefCount(pattern);
10263 return retval;
10264 }
10265
10266 /* Unsubscribe from all the channels. Return the number of channels the
10267 * client was subscribed from. */
10268 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10269 dictIterator *di = dictGetIterator(c->pubsub_channels);
10270 dictEntry *de;
10271 int count = 0;
10272
10273 while((de = dictNext(di)) != NULL) {
10274 robj *channel = dictGetEntryKey(de);
10275
10276 count += pubsubUnsubscribeChannel(c,channel,notify);
10277 }
10278 dictReleaseIterator(di);
10279 return count;
10280 }
10281
10282 /* Unsubscribe from all the patterns. Return the number of patterns the
10283 * client was subscribed from. */
10284 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10285 listNode *ln;
10286 listIter li;
10287 int count = 0;
10288
10289 listRewind(c->pubsub_patterns,&li);
10290 while ((ln = listNext(&li)) != NULL) {
10291 robj *pattern = ln->value;
10292
10293 count += pubsubUnsubscribePattern(c,pattern,notify);
10294 }
10295 return count;
10296 }
10297
10298 /* Publish a message */
10299 static int pubsubPublishMessage(robj *channel, robj *message) {
10300 int receivers = 0;
10301 struct dictEntry *de;
10302 listNode *ln;
10303 listIter li;
10304
10305 /* Send to clients listening for that channel */
10306 de = dictFind(server.pubsub_channels,channel);
10307 if (de) {
10308 list *list = dictGetEntryVal(de);
10309 listNode *ln;
10310 listIter li;
10311
10312 listRewind(list,&li);
10313 while ((ln = listNext(&li)) != NULL) {
10314 redisClient *c = ln->value;
10315
10316 addReply(c,shared.mbulk3);
10317 addReply(c,shared.messagebulk);
10318 addReplyBulk(c,channel);
10319 addReplyBulk(c,message);
10320 receivers++;
10321 }
10322 }
10323 /* Send to clients listening to matching channels */
10324 if (listLength(server.pubsub_patterns)) {
10325 listRewind(server.pubsub_patterns,&li);
10326 channel = getDecodedObject(channel);
10327 while ((ln = listNext(&li)) != NULL) {
10328 pubsubPattern *pat = ln->value;
10329
10330 if (stringmatchlen((char*)pat->pattern->ptr,
10331 sdslen(pat->pattern->ptr),
10332 (char*)channel->ptr,
10333 sdslen(channel->ptr),0)) {
10334 addReply(pat->client,shared.mbulk4);
10335 addReply(pat->client,shared.pmessagebulk);
10336 addReplyBulk(pat->client,pat->pattern);
10337 addReplyBulk(pat->client,channel);
10338 addReplyBulk(pat->client,message);
10339 receivers++;
10340 }
10341 }
10342 decrRefCount(channel);
10343 }
10344 return receivers;
10345 }
10346
10347 static void subscribeCommand(redisClient *c) {
10348 int j;
10349
10350 for (j = 1; j < c->argc; j++)
10351 pubsubSubscribeChannel(c,c->argv[j]);
10352 }
10353
10354 static void unsubscribeCommand(redisClient *c) {
10355 if (c->argc == 1) {
10356 pubsubUnsubscribeAllChannels(c,1);
10357 return;
10358 } else {
10359 int j;
10360
10361 for (j = 1; j < c->argc; j++)
10362 pubsubUnsubscribeChannel(c,c->argv[j],1);
10363 }
10364 }
10365
10366 static void psubscribeCommand(redisClient *c) {
10367 int j;
10368
10369 for (j = 1; j < c->argc; j++)
10370 pubsubSubscribePattern(c,c->argv[j]);
10371 }
10372
10373 static void punsubscribeCommand(redisClient *c) {
10374 if (c->argc == 1) {
10375 pubsubUnsubscribeAllPatterns(c,1);
10376 return;
10377 } else {
10378 int j;
10379
10380 for (j = 1; j < c->argc; j++)
10381 pubsubUnsubscribePattern(c,c->argv[j],1);
10382 }
10383 }
10384
10385 static void publishCommand(redisClient *c) {
10386 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10387 addReplyLongLong(c,receivers);
10388 }
10389
10390 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10391 *
10392 * The implementation uses a per-DB hash table mapping keys to list of clients
10393 * WATCHing those keys, so that given a key that is going to be modified
10394 * we can mark all the associated clients as dirty.
10395 *
10396 * Also every client contains a list of WATCHed keys so that's possible to
10397 * un-watch such keys when the client is freed or when UNWATCH is called. */
10398
10399 /* In the client->watched_keys list we need to use watchedKey structures
10400 * as in order to identify a key in Redis we need both the key name and the
10401 * DB */
10402 typedef struct watchedKey {
10403 robj *key;
10404 redisDb *db;
10405 } watchedKey;
10406
10407 /* Watch for the specified key */
10408 static void watchForKey(redisClient *c, robj *key) {
10409 list *clients = NULL;
10410 listIter li;
10411 listNode *ln;
10412 watchedKey *wk;
10413
10414 /* Check if we are already watching for this key */
10415 listRewind(c->watched_keys,&li);
10416 while((ln = listNext(&li))) {
10417 wk = listNodeValue(ln);
10418 if (wk->db == c->db && equalStringObjects(key,wk->key))
10419 return; /* Key already watched */
10420 }
10421 /* This key is not already watched in this DB. Let's add it */
10422 clients = dictFetchValue(c->db->watched_keys,key);
10423 if (!clients) {
10424 clients = listCreate();
10425 dictAdd(c->db->watched_keys,key,clients);
10426 incrRefCount(key);
10427 }
10428 listAddNodeTail(clients,c);
10429 /* Add the new key to the lits of keys watched by this client */
10430 wk = zmalloc(sizeof(*wk));
10431 wk->key = key;
10432 wk->db = c->db;
10433 incrRefCount(key);
10434 listAddNodeTail(c->watched_keys,wk);
10435 }
10436
10437 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10438 * flag is up to the caller. */
10439 static void unwatchAllKeys(redisClient *c) {
10440 listIter li;
10441 listNode *ln;
10442
10443 if (listLength(c->watched_keys) == 0) return;
10444 listRewind(c->watched_keys,&li);
10445 while((ln = listNext(&li))) {
10446 list *clients;
10447 watchedKey *wk;
10448
10449 /* Lookup the watched key -> clients list and remove the client
10450 * from the list */
10451 wk = listNodeValue(ln);
10452 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10453 assert(clients != NULL);
10454 listDelNode(clients,listSearchKey(clients,c));
10455 /* Kill the entry at all if this was the only client */
10456 if (listLength(clients) == 0)
10457 dictDelete(wk->db->watched_keys, wk->key);
10458 /* Remove this watched key from the client->watched list */
10459 listDelNode(c->watched_keys,ln);
10460 decrRefCount(wk->key);
10461 zfree(wk);
10462 }
10463 }
10464
10465 /* "Touch" a key, so that if this key is being WATCHed by soem client the
10466 * next EXEC will fail. */
10467 static void touchWatchedKey(redisDb *db, robj *key) {
10468 list *clients;
10469 listIter li;
10470 listNode *ln;
10471
10472 if (dictSize(db->watched_keys) == 0) return;
10473 clients = dictFetchValue(db->watched_keys, key);
10474 if (!clients) return;
10475
10476 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10477 /* Check if we are already watching for this key */
10478 listRewind(clients,&li);
10479 while((ln = listNext(&li))) {
10480 redisClient *c = listNodeValue(ln);
10481
10482 c->flags |= REDIS_DIRTY_CAS;
10483 }
10484 }
10485
10486 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10487 * flush but will be deleted as effect of the flushing operation should
10488 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10489 * a FLUSHALL operation (all the DBs flushed). */
10490 static void touchWatchedKeysOnFlush(int dbid) {
10491 listIter li1, li2;
10492 listNode *ln;
10493
10494 /* For every client, check all the waited keys */
10495 listRewind(server.clients,&li1);
10496 while((ln = listNext(&li1))) {
10497 redisClient *c = listNodeValue(ln);
10498 listRewind(c->watched_keys,&li2);
10499 while((ln = listNext(&li2))) {
10500 watchedKey *wk = listNodeValue(ln);
10501
10502 /* For every watched key matching the specified DB, if the
10503 * key exists, mark the client as dirty, as the key will be
10504 * removed. */
10505 if (dbid == -1 || wk->db->id == dbid) {
10506 if (dictFind(wk->db->dict, wk->key) != NULL)
10507 c->flags |= REDIS_DIRTY_CAS;
10508 }
10509 }
10510 }
10511 }
10512
10513 static void watchCommand(redisClient *c) {
10514 int j;
10515
10516 for (j = 1; j < c->argc; j++)
10517 watchForKey(c,c->argv[j]);
10518 addReply(c,shared.ok);
10519 }
10520
10521 static void unwatchCommand(redisClient *c) {
10522 unwatchAllKeys(c);
10523 c->flags &= (~REDIS_DIRTY_CAS);
10524 addReply(c,shared.ok);
10525 }
10526
10527 /* ================================= Debugging ============================== */
10528
10529 /* Compute the sha1 of string at 's' with 'len' bytes long.
10530 * The SHA1 is then xored againt the string pointed by digest.
10531 * Since xor is commutative, this operation is used in order to
10532 * "add" digests relative to unordered elements.
10533 *
10534 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10535 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10536 SHA1_CTX ctx;
10537 unsigned char hash[20], *s = ptr;
10538 int j;
10539
10540 SHA1Init(&ctx);
10541 SHA1Update(&ctx,s,len);
10542 SHA1Final(hash,&ctx);
10543
10544 for (j = 0; j < 20; j++)
10545 digest[j] ^= hash[j];
10546 }
10547
10548 static void xorObjectDigest(unsigned char *digest, robj *o) {
10549 o = getDecodedObject(o);
10550 xorDigest(digest,o->ptr,sdslen(o->ptr));
10551 decrRefCount(o);
10552 }
10553
10554 /* This function instead of just computing the SHA1 and xoring it
10555 * against diget, also perform the digest of "digest" itself and
10556 * replace the old value with the new one.
10557 *
10558 * So the final digest will be:
10559 *
10560 * digest = SHA1(digest xor SHA1(data))
10561 *
10562 * This function is used every time we want to preserve the order so
10563 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10564 *
10565 * Also note that mixdigest("foo") followed by mixdigest("bar")
10566 * will lead to a different digest compared to "fo", "obar".
10567 */
10568 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10569 SHA1_CTX ctx;
10570 char *s = ptr;
10571
10572 xorDigest(digest,s,len);
10573 SHA1Init(&ctx);
10574 SHA1Update(&ctx,digest,20);
10575 SHA1Final(digest,&ctx);
10576 }
10577
10578 static void mixObjectDigest(unsigned char *digest, robj *o) {
10579 o = getDecodedObject(o);
10580 mixDigest(digest,o->ptr,sdslen(o->ptr));
10581 decrRefCount(o);
10582 }
10583
10584 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10585 * are not ordered, we use a trick: every aggregate digest is the xor
10586 * of the digests of their elements. This way the order will not change
10587 * the result. For list instead we use a feedback entering the output digest
10588 * as input in order to ensure that a different ordered list will result in
10589 * a different digest. */
10590 static void computeDatasetDigest(unsigned char *final) {
10591 unsigned char digest[20];
10592 char buf[128];
10593 dictIterator *di = NULL;
10594 dictEntry *de;
10595 int j;
10596 uint32_t aux;
10597
10598 memset(final,0,20); /* Start with a clean result */
10599
10600 for (j = 0; j < server.dbnum; j++) {
10601 redisDb *db = server.db+j;
10602
10603 if (dictSize(db->dict) == 0) continue;
10604 di = dictGetIterator(db->dict);
10605
10606 /* hash the DB id, so the same dataset moved in a different
10607 * DB will lead to a different digest */
10608 aux = htonl(j);
10609 mixDigest(final,&aux,sizeof(aux));
10610
10611 /* Iterate this DB writing every entry */
10612 while((de = dictNext(di)) != NULL) {
10613 robj *key, *o, *kcopy;
10614 time_t expiretime;
10615
10616 memset(digest,0,20); /* This key-val digest */
10617 key = dictGetEntryKey(de);
10618
10619 if (!server.vm_enabled) {
10620 mixObjectDigest(digest,key);
10621 o = dictGetEntryVal(de);
10622 } else {
10623 /* Don't work with the key directly as when VM is active
10624 * this is unsafe: TODO: fix decrRefCount to check if the
10625 * count really reached 0 to avoid this mess */
10626 kcopy = dupStringObject(key);
10627 mixObjectDigest(digest,kcopy);
10628 o = lookupKeyRead(db,kcopy);
10629 decrRefCount(kcopy);
10630 }
10631 aux = htonl(o->type);
10632 mixDigest(digest,&aux,sizeof(aux));
10633 expiretime = getExpire(db,key);
10634
10635 /* Save the key and associated value */
10636 if (o->type == REDIS_STRING) {
10637 mixObjectDigest(digest,o);
10638 } else if (o->type == REDIS_LIST) {
10639 list *list = o->ptr;
10640 listNode *ln;
10641 listIter li;
10642
10643 listRewind(list,&li);
10644 while((ln = listNext(&li))) {
10645 robj *eleobj = listNodeValue(ln);
10646
10647 mixObjectDigest(digest,eleobj);
10648 }
10649 } else if (o->type == REDIS_SET) {
10650 dict *set = o->ptr;
10651 dictIterator *di = dictGetIterator(set);
10652 dictEntry *de;
10653
10654 while((de = dictNext(di)) != NULL) {
10655 robj *eleobj = dictGetEntryKey(de);
10656
10657 xorObjectDigest(digest,eleobj);
10658 }
10659 dictReleaseIterator(di);
10660 } else if (o->type == REDIS_ZSET) {
10661 zset *zs = o->ptr;
10662 dictIterator *di = dictGetIterator(zs->dict);
10663 dictEntry *de;
10664
10665 while((de = dictNext(di)) != NULL) {
10666 robj *eleobj = dictGetEntryKey(de);
10667 double *score = dictGetEntryVal(de);
10668 unsigned char eledigest[20];
10669
10670 snprintf(buf,sizeof(buf),"%.17g",*score);
10671 memset(eledigest,0,20);
10672 mixObjectDigest(eledigest,eleobj);
10673 mixDigest(eledigest,buf,strlen(buf));
10674 xorDigest(digest,eledigest,20);
10675 }
10676 dictReleaseIterator(di);
10677 } else if (o->type == REDIS_HASH) {
10678 hashIterator *hi;
10679 robj *obj;
10680
10681 hi = hashInitIterator(o);
10682 while (hashNext(hi) != REDIS_ERR) {
10683 unsigned char eledigest[20];
10684
10685 memset(eledigest,0,20);
10686 obj = hashCurrent(hi,REDIS_HASH_KEY);
10687 mixObjectDigest(eledigest,obj);
10688 decrRefCount(obj);
10689 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10690 mixObjectDigest(eledigest,obj);
10691 decrRefCount(obj);
10692 xorDigest(digest,eledigest,20);
10693 }
10694 hashReleaseIterator(hi);
10695 } else {
10696 redisPanic("Unknown object type");
10697 }
10698 /* If the key has an expire, add it to the mix */
10699 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10700 /* We can finally xor the key-val digest to the final digest */
10701 xorDigest(final,digest,20);
10702 }
10703 dictReleaseIterator(di);
10704 }
10705 }
10706
10707 static void debugCommand(redisClient *c) {
10708 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10709 *((char*)-1) = 'x';
10710 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10711 if (rdbSave(server.dbfilename) != REDIS_OK) {
10712 addReply(c,shared.err);
10713 return;
10714 }
10715 emptyDb();
10716 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10717 addReply(c,shared.err);
10718 return;
10719 }
10720 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10721 addReply(c,shared.ok);
10722 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10723 emptyDb();
10724 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10725 addReply(c,shared.err);
10726 return;
10727 }
10728 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10729 addReply(c,shared.ok);
10730 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10731 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10732 robj *key, *val;
10733
10734 if (!de) {
10735 addReply(c,shared.nokeyerr);
10736 return;
10737 }
10738 key = dictGetEntryKey(de);
10739 val = dictGetEntryVal(de);
10740 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10741 key->storage == REDIS_VM_SWAPPING)) {
10742 char *strenc;
10743 char buf[128];
10744
10745 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10746 strenc = strencoding[val->encoding];
10747 } else {
10748 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10749 strenc = buf;
10750 }
10751 addReplySds(c,sdscatprintf(sdsempty(),
10752 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10753 "encoding:%s serializedlength:%lld\r\n",
10754 (void*)key, key->refcount, (void*)val, val->refcount,
10755 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10756 } else {
10757 addReplySds(c,sdscatprintf(sdsempty(),
10758 "+Key at:%p refcount:%d, value swapped at: page %llu "
10759 "using %llu pages\r\n",
10760 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10761 (unsigned long long) key->vm.usedpages));
10762 }
10763 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10764 lookupKeyRead(c->db,c->argv[2]);
10765 addReply(c,shared.ok);
10766 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10767 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10768 robj *key, *val;
10769
10770 if (!server.vm_enabled) {
10771 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10772 return;
10773 }
10774 if (!de) {
10775 addReply(c,shared.nokeyerr);
10776 return;
10777 }
10778 key = dictGetEntryKey(de);
10779 val = dictGetEntryVal(de);
10780 /* If the key is shared we want to create a copy */
10781 if (key->refcount > 1) {
10782 robj *newkey = dupStringObject(key);
10783 decrRefCount(key);
10784 key = dictGetEntryKey(de) = newkey;
10785 }
10786 /* Swap it */
10787 if (key->storage != REDIS_VM_MEMORY) {
10788 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10789 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10790 dictGetEntryVal(de) = NULL;
10791 addReply(c,shared.ok);
10792 } else {
10793 addReply(c,shared.err);
10794 }
10795 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10796 long keys, j;
10797 robj *key, *val;
10798 char buf[128];
10799
10800 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10801 return;
10802 for (j = 0; j < keys; j++) {
10803 snprintf(buf,sizeof(buf),"key:%lu",j);
10804 key = createStringObject(buf,strlen(buf));
10805 if (lookupKeyRead(c->db,key) != NULL) {
10806 decrRefCount(key);
10807 continue;
10808 }
10809 snprintf(buf,sizeof(buf),"value:%lu",j);
10810 val = createStringObject(buf,strlen(buf));
10811 dictAdd(c->db->dict,key,val);
10812 }
10813 addReply(c,shared.ok);
10814 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10815 unsigned char digest[20];
10816 sds d = sdsnew("+");
10817 int j;
10818
10819 computeDatasetDigest(digest);
10820 for (j = 0; j < 20; j++)
10821 d = sdscatprintf(d, "%02x",digest[j]);
10822
10823 d = sdscatlen(d,"\r\n",2);
10824 addReplySds(c,d);
10825 } else {
10826 addReplySds(c,sdsnew(
10827 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10828 }
10829 }
10830
10831 static void _redisAssert(char *estr, char *file, int line) {
10832 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10833 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
10834 #ifdef HAVE_BACKTRACE
10835 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10836 *((char*)-1) = 'x';
10837 #endif
10838 }
10839
10840 static void _redisPanic(char *msg, char *file, int line) {
10841 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10842 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10843 #ifdef HAVE_BACKTRACE
10844 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10845 *((char*)-1) = 'x';
10846 #endif
10847 }
10848
10849 /* =================================== Main! ================================ */
10850
10851 #ifdef __linux__
/* Return the integer found on the first line of
 * /proc/sys/vm/overcommit_memory, or -1 if the file can't be opened or
 * read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return value;
    if (fgets(line,sizeof(line),procfile) != NULL) value = atoi(line);
    fclose(procfile);
    return value;
}
10865
10866 void linuxOvercommitMemoryWarning(void) {
10867 if (linuxOvercommitMemoryValue() == 0) {
10868 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10869 }
10870 }
10871 #endif /* __linux__ */
10872
10873 static void daemonize(void) {
10874 int fd;
10875 FILE *fp;
10876
10877 if (fork() != 0) exit(0); /* parent exits */
10878 setsid(); /* create a new session */
10879
10880 /* Every output goes to /dev/null. If Redis is daemonized but
10881 * the 'logfile' is set to 'stdout' in the configuration file
10882 * it will not log at all. */
10883 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10884 dup2(fd, STDIN_FILENO);
10885 dup2(fd, STDOUT_FILENO);
10886 dup2(fd, STDERR_FILENO);
10887 if (fd > STDERR_FILENO) close(fd);
10888 }
10889 /* Try to write the pid file */
10890 fp = fopen(server.pidfile,"w");
10891 if (fp) {
10892 fprintf(fp,"%d\n",getpid());
10893 fclose(fp);
10894 }
10895 }
10896
10897 static void version() {
10898 printf("Redis server version %s\n", REDIS_VERSION);
10899 exit(0);
10900 }
10901
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n"
          " ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
10907
10908 int main(int argc, char **argv) {
10909 time_t start;
10910
10911 initServerConfig();
10912 if (argc == 2) {
10913 if (strcmp(argv[1], "-v") == 0 ||
10914 strcmp(argv[1], "--version") == 0) version();
10915 if (strcmp(argv[1], "--help") == 0) usage();
10916 resetServerSaveParams();
10917 loadServerConfig(argv[1]);
10918 } else if ((argc > 2)) {
10919 usage();
10920 } else {
10921 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10922 }
10923 if (server.daemonize) daemonize();
10924 initServer();
10925 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10926 #ifdef __linux__
10927 linuxOvercommitMemoryWarning();
10928 #endif
10929 start = time(NULL);
10930 if (server.appendonly) {
10931 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10932 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10933 } else {
10934 if (rdbLoad(server.dbfilename) == REDIS_OK)
10935 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10936 }
10937 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10938 aeSetBeforeSleepProc(server.el,beforeSleep);
10939 aeMain(server.el);
10940 aeDeleteEventLoop(server.el);
10941 return 0;
10942 }
10943
10944 /* ============================= Backtrace support ========================= */
10945
10946 #ifdef HAVE_BACKTRACE
10947 static char *findFuncName(void *pointer, unsigned long *offset);
10948
10949 static void *getMcontextEip(ucontext_t *uc) {
10950 #if defined(__FreeBSD__)
10951 return (void*) uc->uc_mcontext.mc_eip;
10952 #elif defined(__dietlibc__)
10953 return (void*) uc->uc_mcontext.eip;
10954 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10955 #if __x86_64__
10956 return (void*) uc->uc_mcontext->__ss.__rip;
10957 #else
10958 return (void*) uc->uc_mcontext->__ss.__eip;
10959 #endif
10960 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10961 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10962 return (void*) uc->uc_mcontext->__ss.__rip;
10963 #else
10964 return (void*) uc->uc_mcontext->__ss.__eip;
10965 #endif
10966 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10967 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10968 #elif defined(__ia64__) /* Linux IA64 */
10969 return (void*) uc->uc_mcontext.sc_ip;
10970 #else
10971 return NULL;
10972 #endif
10973 }
10974
10975 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10976 void *trace[100];
10977 char **messages = NULL;
10978 int i, trace_size = 0;
10979 unsigned long offset=0;
10980 ucontext_t *uc = (ucontext_t*) secret;
10981 sds infostring;
10982 REDIS_NOTUSED(info);
10983
10984 redisLog(REDIS_WARNING,
10985 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10986 infostring = genRedisInfoString();
10987 redisLog(REDIS_WARNING, "%s",infostring);
10988 /* It's not safe to sdsfree() the returned string under memory
10989 * corruption conditions. Let it leak as we are going to abort */
10990
10991 trace_size = backtrace(trace, 100);
10992 /* overwrite sigaction with caller's address */
10993 if (getMcontextEip(uc) != NULL) {
10994 trace[1] = getMcontextEip(uc);
10995 }
10996 messages = backtrace_symbols(trace, trace_size);
10997
10998 for (i=1; i<trace_size; ++i) {
10999 char *fn = findFuncName(trace[i], &offset), *p;
11000
11001 p = strchr(messages[i],'+');
11002 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11003 redisLog(REDIS_WARNING,"%s", messages[i]);
11004 } else {
11005 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11006 }
11007 }
11008 /* free(messages); Don't call free() with possibly corrupted memory. */
11009 _exit(0);
11010 }
11011
11012 static void sigtermHandler(int sig) {
11013 REDIS_NOTUSED(sig);
11014
11015 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11016 server.shutdown_asap = 1;
11017 }
11018
11019 static void setupSigSegvAction(void) {
11020 struct sigaction act;
11021
11022 sigemptyset (&act.sa_mask);
11023 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11024 * is used. Otherwise, sa_handler is used */
11025 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11026 act.sa_sigaction = segvHandler;
11027 sigaction (SIGSEGV, &act, NULL);
11028 sigaction (SIGBUS, &act, NULL);
11029 sigaction (SIGFPE, &act, NULL);
11030 sigaction (SIGILL, &act, NULL);
11031 sigaction (SIGBUS, &act, NULL);
11032
11033 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11034 act.sa_handler = sigtermHandler;
11035 sigaction (SIGTERM, &act, NULL);
11036 return;
11037 }
11038
11039 #include "staticsymbols.h"
11040 /* This function try to convert a pointer into a function name. It's used in
11041 * oreder to provide a backtrace under segmentation fault that's able to
11042 * display functions declared as static (otherwise the backtrace is useless). */
11043 static char *findFuncName(void *pointer, unsigned long *offset){
11044 int i, ret = -1;
11045 unsigned long off, minoff = 0;
11046
11047 /* Try to match against the Symbol with the smallest offset */
11048 for (i=0; symsTable[i].pointer; i++) {
11049 unsigned long lp = (unsigned long) pointer;
11050
11051 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11052 off=lp-symsTable[i].pointer;
11053 if (ret < 0 || off < minoff) {
11054 minoff=off;
11055 ret=i;
11056 }
11057 }
11058 }
11059 if (ret == -1) return NULL;
11060 *offset = minoff;
11061 return symsTable[ret].name;
11062 }
11063 #else /* HAVE_BACKTRACE */
/* Built without HAVE_BACKTRACE: this version installs no handlers. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
11066 #endif /* HAVE_BACKTRACE */
11067
11068
11069
11070 /* The End */
11071
11072
11073