]> git.saurik.com Git - redis.git/blob - redis.c
don't fsync after a rewrite if appendfsync is set to no. use aof_fsycn instead of...
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
/* Error codes */
#define REDIS_OK 0
#define REDIS_ERR (-1) /* parenthesized so it is safe inside any expression */
84
/* Static server configuration */
#define REDIS_SERVERPORT 6379 /* TCP port */
#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
#define REDIS_IOBUF_LEN 1024
#define REDIS_LOADBUF_LEN 1024
#define REDIS_STATIC_ARGS 8
#define REDIS_DEFAULT_DBNUM 16
#define REDIS_CONFIGLINE_MAX 1024
#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256

/* Hash table parameters */
#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
106
/* Command flags (bit values, may be OR-ed together) */
#define REDIS_CMD_BULK 1 /* Bulk write command */
#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
 * this flag will return an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM 4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
116
/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4

/* Objects encoding. Some kind of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values for this object. */
#define REDIS_ENCODING_RAW 0 /* Raw representation */
#define REDIS_ENCODING_INT 1 /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3 /* Encoded as a hash table */

/* Human readable encoding names, indexed by the REDIS_ENCODING_* value. */
static char* strencoding[] = {
    "raw", "int", "zipmap", "hashtable"
};
135
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

/* Defines related to the dump file format. To store 32 bits lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specify the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * accordingly to the following defines: */
#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
167
/* Virtual memory object->where field. */
#define REDIS_VM_MEMORY 0 /* The object is on memory */
#define REDIS_VM_SWAPPED 1 /* The object is on disk */
#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185
/* Client flags (bit values, may be OR-ed together) */
#define REDIS_SLAVE 1 /* This client is a slave server */
#define REDIS_MASTER 2 /* This client is a master server */
#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8 /* This client is in a MULTI context */
#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0 /* No active replication */
#define REDIS_REPL_CONNECT 1 /* Must connect to master */
#define REDIS_REPL_CONNECTED 2 /* Connected to master */

/* Slave replication state - from the point of view of master
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
208
/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3

/* Anti-warning macro: evaluates V and discards the result so the compiler
 * does not complain about an unused variable/parameter. The argument is
 * parenthesized so that any expression can be passed safely. */
#define REDIS_NOTUSED(V) ((void) (V))

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when the expression is false, reports the stringified
 * expression together with file and line via _redisAssert(), then calls
 * _exit(1).
 * redisPanic(_e): unconditionally reports via _redisPanic(), then calls
 * _exit(1). Note that the argument is stringified with '#', so the text
 * passed to _redisPanic() is the literal source spelling of the argument.
 *
 * Both expansions are fully parenthesized so they behave as a single
 * expression wherever they are used. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
245
/*================================= Data types ============================== */

/* A redis object, that is a type able to hold a string / list / set */

/* The VM object structure */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm; /* NOTE(review): this also defines a global variable named 'vm';
       * kept because code outside this block may reference it. */
256
257 /* The actual Redis Object */
258 typedef struct redisObject {
259 void *ptr;
260 unsigned char type;
261 unsigned char encoding;
262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
266 int refcount;
267 /* VM fields, this are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
272 } robj;
273
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is taken near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj lvalue to initialize (it is expanded several times, so
 * it must not have side effects); _ptr becomes its 'ptr' field. Both
 * parameters are parenthesized so that expressions such as '*objp' or
 * 'a ? b : c' expand correctly. The 'storage' field is only set when
 * server.vm_enabled is true.
 *
 * NOTE(review): the expansion still ends with ';' after while(0). That
 * defeats the usual do/while(0) idiom (the macro cannot be the body of an
 * 'if' that is followed by 'else'), but it is kept because callers outside
 * this block may rely on it. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0);
285
286 typedef struct redisDb {
287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
290 dict *io_keys; /* Keys with clients waiting for VM I/O */
291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
292 int id;
293 } redisDb;
294
295 /* Client MULTI/EXEC state */
296 typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300 } multiCmd;
301
302 typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305 } multiState;
306
307 /* With multiplexing we need to take per-clinet state.
308 * Clients are taken in a liked list. */
309 typedef struct redisClient {
310 int fd;
311 redisDb *db;
312 int dictid;
313 sds querybuf;
314 robj **argv, **mbargv;
315 int argc, mbargc;
316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
317 int multibulk; /* multi bulk command format active */
318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
326 long repldboff; /* replication DB file offset */
327 off_t repldbsize; /* replication DB file size */
328 multiState mstate; /* MULTI/EXEC state */
329 robj **blocking_keys; /* The key we are waiting to terminate a blocking
330 * operation such as BLPOP. Otherwise NULL. */
331 int blocking_keys_num; /* Number of blocking keys */
332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
339 } redisClient;
340
/* One save point: a (seconds, changes) pair. An array of these is kept in
 * the server state (see the 'saveparams' field of struct redisServer). */
struct saveparam {
    time_t seconds;
    int changes;
};
345
346 /* Global server state structure */
347 struct redisServer {
348 int port;
349 int fd;
350 redisDb *db;
351 long long dirty; /* changes to DB from the last save */
352 list *clients;
353 list *slaves, *monitors;
354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function run */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of last save succeeede */
359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
363 long long stat_expiredkeys; /* number of expired keys */
364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
370 int appendonly;
371 int appendfsync;
372 int no_appendfsync_on_rewrite;
373 int shutdown_asap;
374 time_t lastfsync;
375 int appendfd;
376 int appendseldb;
377 char *pidfile;
378 pid_t bgsavechildpid;
379 pid_t bgrewritechildpid;
380 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
381 sds aofbuf; /* AOF buffer, written before entering the event loop */
382 struct saveparam *saveparams;
383 int saveparamslen;
384 char *logfile;
385 char *bindaddr;
386 char *dbfilename;
387 char *appendfilename;
388 char *requirepass;
389 int rdbcompression;
390 int activerehashing;
391 /* Replication related */
392 int isslave;
393 char *masterauth;
394 char *masterhost;
395 int masterport;
396 redisClient *master; /* client that is master for this slave */
397 int replstate;
398 unsigned int maxclients;
399 unsigned long long maxmemory;
400 unsigned int blpop_blocked_clients;
401 unsigned int vm_blocked_clients;
402 /* Sort parameters - qsort_r() is only available under BSD so we
403 * have to take this state global, in order to pass it to sortCompare() */
404 int sort_desc;
405 int sort_alpha;
406 int sort_bypattern;
407 /* Virtual memory configuration */
408 int vm_enabled;
409 char *vm_swap_file;
410 off_t vm_page_size;
411 off_t vm_pages;
412 unsigned long long vm_max_memory;
413 /* Hashes config */
414 size_t hash_max_zipmap_entries;
415 size_t hash_max_zipmap_value;
416 /* Virtual memory state */
417 FILE *vm_fp;
418 int vm_fd;
419 off_t vm_next_page; /* Next probably empty page */
420 off_t vm_near_pages; /* Number of pages allocated sequentially */
421 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
422 time_t unixtime; /* Unix time sampled every second. */
423 /* Virtual memory I/O threads stuff */
424 /* An I/O thread process an element taken from the io_jobs queue and
425 * put the result of the operation in the io_done list. While the
426 * job is being processed, it's put on io_processing queue. */
427 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
428 list *io_processing; /* List of VM I/O jobs being processed */
429 list *io_processed; /* List of VM I/O jobs already processed */
430 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
431 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
432 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
433 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
434 pthread_attr_t io_threads_attr; /* attributes for threads creation */
435 int io_active_threads; /* Number of running I/O threads */
436 int vm_max_threads; /* Max number of I/O threads running at the same time */
437 /* Our main thread is blocked on the event loop, locking for sockets ready
438 * to be read or written, so when a threaded I/O operation is ready to be
439 * processed by the main thread, the I/O thread will use a unix pipe to
440 * awake the main thread. The followings are the two pipe FDs. */
441 int io_ready_pipe_read;
442 int io_ready_pipe_write;
443 /* Virtual memory stats */
444 unsigned long long vm_stats_used_pages;
445 unsigned long long vm_stats_swapped_objects;
446 unsigned long long vm_stats_swapouts;
447 unsigned long long vm_stats_swapins;
448 /* Pubsub */
449 dict *pubsub_channels; /* Map channels to list of subscribed clients */
450 list *pubsub_patterns; /* A list of pubsub_patterns */
451 /* Misc */
452 FILE *devnull;
453 };
454
455 typedef struct pubsubPattern {
456 redisClient *client;
457 robj *pattern;
458 } pubsubPattern;
459
460 typedef void redisCommandProc(redisClient *c);
461 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
462 struct redisCommand {
463 char *name;
464 redisCommandProc *proc;
465 int arity;
466 int flags;
467 /* Use a function to determine which keys need to be loaded
468 * in the background prior to executing this command. Takes precedence
469 * over vm_firstkey and others, ignored when NULL */
470 redisVmPreloadProc *vm_preload_proc;
471 /* What keys should be loaded in background when calling this command? */
472 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
473 int vm_lastkey; /* THe last argument that's a key */
474 int vm_keystep; /* The step between first and last key */
475 };
476
/* Associates a name with an address stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
481
482 typedef struct _redisSortObject {
483 robj *obj;
484 union {
485 double score;
486 robj *cmpobj;
487 } u;
488 } redisSortObject;
489
490 typedef struct _redisSortOperation {
491 int type;
492 robj *pattern;
493 } redisSortOperation;
494
495 /* ZSETs use a specialized version of Skiplists */
496
497 typedef struct zskiplistNode {
498 struct zskiplistNode **forward;
499 struct zskiplistNode *backward;
500 unsigned int *span;
501 double score;
502 robj *obj;
503 } zskiplistNode;
504
/* The skiplist itself: header and tail nodes, length and level. */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;
510
511 typedef struct zset {
512 dict *dict;
513 zskiplist *zsl;
514 } zset;
515
516 /* Our shared "common" objects */
517
518 #define REDIS_SHARED_INTEGERS 10000
519 struct sharedObjectsStruct {
520 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
521 *colon, *nullbulk, *nullmultibulk, *queued,
522 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
523 *outofrangeerr, *plus,
524 *select0, *select1, *select2, *select3, *select4,
525 *select5, *select6, *select7, *select8, *select9,
526 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
527 *mbulk4, *psubscribebulk, *punsubscribebulk,
528 *integers[REDIS_SHARED_INTEGERS];
529 } shared;
530
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
536
537 /* VM threaded I/O request message */
538 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
539 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
540 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
541 typedef struct iojob {
542 int type; /* Request type, REDIS_IOJOB_* */
543 redisDb *db;/* Redis database */
544 robj *key; /* This I/O request is about swapping this key */
545 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
546 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
547 off_t page; /* Swap page where to read/write the object */
548 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
549 int canceled; /* True if this command was canceled by blocking side of VM */
550 pthread_t thread; /* ID of the thread processing this entry */
551 } iojob;
552
553 /*================================ Prototypes =============================== */
554
555 static void freeStringObject(robj *o);
556 static void freeListObject(robj *o);
557 static void freeSetObject(robj *o);
558 static void decrRefCount(void *o);
559 static robj *createObject(int type, void *ptr);
560 static void freeClient(redisClient *c);
561 static int rdbLoad(char *filename);
562 static void addReply(redisClient *c, robj *obj);
563 static void addReplySds(redisClient *c, sds s);
564 static void incrRefCount(robj *o);
565 static int rdbSaveBackground(char *filename);
566 static robj *createStringObject(char *ptr, size_t len);
567 static robj *dupStringObject(robj *o);
568 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
569 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
570 static void flushAppendOnlyFile(void);
571 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
572 static int syncWithMaster(void);
573 static robj *tryObjectEncoding(robj *o);
574 static robj *getDecodedObject(robj *o);
575 static int removeExpire(redisDb *db, robj *key);
576 static int expireIfNeeded(redisDb *db, robj *key);
577 static int deleteIfVolatile(redisDb *db, robj *key);
578 static int deleteIfSwapped(redisDb *db, robj *key);
579 static int deleteKey(redisDb *db, robj *key);
580 static time_t getExpire(redisDb *db, robj *key);
581 static int setExpire(redisDb *db, robj *key, time_t when);
582 static void updateSlavesWaitingBgsave(int bgsaveerr);
583 static void freeMemoryIfNeeded(void);
584 static int processCommand(redisClient *c);
585 static void setupSigSegvAction(void);
586 static void rdbRemoveTempFile(pid_t childpid);
587 static void aofRemoveTempFile(pid_t childpid);
588 static size_t stringObjectLen(robj *o);
589 static void processInputBuffer(redisClient *c);
590 static zskiplist *zslCreate(void);
591 static void zslFree(zskiplist *zsl);
592 static void zslInsert(zskiplist *zsl, double score, robj *obj);
593 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
594 static void initClientMultiState(redisClient *c);
595 static void freeClientMultiState(redisClient *c);
596 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
597 static void unblockClientWaitingData(redisClient *c);
598 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
599 static void vmInit(void);
600 static void vmMarkPagesFree(off_t page, off_t count);
601 static robj *vmLoadObject(robj *key);
602 static robj *vmPreviewObject(robj *key);
603 static int vmSwapOneObjectBlocking(void);
604 static int vmSwapOneObjectThreaded(void);
605 static int vmCanSwapOut(void);
606 static int tryFreeOneObjectFromFreelist(void);
607 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
608 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
609 static void vmCancelThreadedIOJob(robj *o);
610 static void lockThreadedIO(void);
611 static void unlockThreadedIO(void);
612 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
613 static void freeIOJob(iojob *j);
614 static void queueIOJob(iojob *j);
615 static int vmWriteObjectOnSwap(robj *o, off_t page);
616 static robj *vmReadObjectFromSwap(off_t page, int type);
617 static void waitEmptyIOJobsQueue(void);
618 static void vmReopenSwapFile(void);
619 static int vmFreePage(off_t page);
620 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
621 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
622 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
623 static int dontWaitForSwappedKey(redisClient *c, robj *key);
624 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
625 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
626 static struct redisCommand *lookupCommand(char *name);
627 static void call(redisClient *c, struct redisCommand *cmd);
628 static void resetClient(redisClient *c);
629 static void convertToRealHash(robj *o);
630 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
631 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
632 static void freePubsubPattern(void *p);
633 static int listMatchPubsubPattern(void *a, void *b);
634 static int compareStringObjects(robj *a, robj *b);
635 static int equalStringObjects(robj *a, robj *b);
636 static void usage();
637 static int rewriteAppendOnlyFileBackground(void);
638 static int vmSwapObjectBlocking(robj *key, robj *val);
639 static int prepareForShutdown();
640 static void touchWatchedKey(redisDb *db, robj *key);
641 static void touchWatchedKeysOnFlush(int dbid);
642 static void unwatchAllKeys(redisClient *c);
643
644 static void authCommand(redisClient *c);
645 static void pingCommand(redisClient *c);
646 static void echoCommand(redisClient *c);
647 static void setCommand(redisClient *c);
648 static void setnxCommand(redisClient *c);
649 static void setexCommand(redisClient *c);
650 static void getCommand(redisClient *c);
651 static void delCommand(redisClient *c);
652 static void existsCommand(redisClient *c);
653 static void incrCommand(redisClient *c);
654 static void decrCommand(redisClient *c);
655 static void incrbyCommand(redisClient *c);
656 static void decrbyCommand(redisClient *c);
657 static void selectCommand(redisClient *c);
658 static void randomkeyCommand(redisClient *c);
659 static void keysCommand(redisClient *c);
660 static void dbsizeCommand(redisClient *c);
661 static void lastsaveCommand(redisClient *c);
662 static void saveCommand(redisClient *c);
663 static void bgsaveCommand(redisClient *c);
664 static void bgrewriteaofCommand(redisClient *c);
665 static void shutdownCommand(redisClient *c);
666 static void moveCommand(redisClient *c);
667 static void renameCommand(redisClient *c);
668 static void renamenxCommand(redisClient *c);
669 static void lpushCommand(redisClient *c);
670 static void rpushCommand(redisClient *c);
671 static void lpopCommand(redisClient *c);
672 static void rpopCommand(redisClient *c);
673 static void llenCommand(redisClient *c);
674 static void lindexCommand(redisClient *c);
675 static void lrangeCommand(redisClient *c);
676 static void ltrimCommand(redisClient *c);
677 static void typeCommand(redisClient *c);
678 static void lsetCommand(redisClient *c);
679 static void saddCommand(redisClient *c);
680 static void sremCommand(redisClient *c);
681 static void smoveCommand(redisClient *c);
682 static void sismemberCommand(redisClient *c);
683 static void scardCommand(redisClient *c);
684 static void spopCommand(redisClient *c);
685 static void srandmemberCommand(redisClient *c);
686 static void sinterCommand(redisClient *c);
687 static void sinterstoreCommand(redisClient *c);
688 static void sunionCommand(redisClient *c);
689 static void sunionstoreCommand(redisClient *c);
690 static void sdiffCommand(redisClient *c);
691 static void sdiffstoreCommand(redisClient *c);
692 static void syncCommand(redisClient *c);
693 static void flushdbCommand(redisClient *c);
694 static void flushallCommand(redisClient *c);
695 static void sortCommand(redisClient *c);
696 static void lremCommand(redisClient *c);
697 static void rpoplpushcommand(redisClient *c);
698 static void infoCommand(redisClient *c);
699 static void mgetCommand(redisClient *c);
700 static void monitorCommand(redisClient *c);
701 static void expireCommand(redisClient *c);
702 static void expireatCommand(redisClient *c);
703 static void getsetCommand(redisClient *c);
704 static void ttlCommand(redisClient *c);
705 static void slaveofCommand(redisClient *c);
706 static void debugCommand(redisClient *c);
707 static void msetCommand(redisClient *c);
708 static void msetnxCommand(redisClient *c);
709 static void zaddCommand(redisClient *c);
710 static void zincrbyCommand(redisClient *c);
711 static void zrangeCommand(redisClient *c);
712 static void zrangebyscoreCommand(redisClient *c);
713 static void zcountCommand(redisClient *c);
714 static void zrevrangeCommand(redisClient *c);
715 static void zcardCommand(redisClient *c);
716 static void zremCommand(redisClient *c);
717 static void zscoreCommand(redisClient *c);
718 static void zremrangebyscoreCommand(redisClient *c);
719 static void multiCommand(redisClient *c);
720 static void execCommand(redisClient *c);
721 static void discardCommand(redisClient *c);
722 static void blpopCommand(redisClient *c);
723 static void brpopCommand(redisClient *c);
724 static void appendCommand(redisClient *c);
725 static void substrCommand(redisClient *c);
726 static void zrankCommand(redisClient *c);
727 static void zrevrankCommand(redisClient *c);
728 static void hsetCommand(redisClient *c);
729 static void hsetnxCommand(redisClient *c);
730 static void hgetCommand(redisClient *c);
731 static void hmsetCommand(redisClient *c);
732 static void hmgetCommand(redisClient *c);
733 static void hdelCommand(redisClient *c);
734 static void hlenCommand(redisClient *c);
735 static void zremrangebyrankCommand(redisClient *c);
736 static void zunionstoreCommand(redisClient *c);
737 static void zinterstoreCommand(redisClient *c);
738 static void hkeysCommand(redisClient *c);
739 static void hvalsCommand(redisClient *c);
740 static void hgetallCommand(redisClient *c);
741 static void hexistsCommand(redisClient *c);
742 static void configCommand(redisClient *c);
743 static void hincrbyCommand(redisClient *c);
744 static void subscribeCommand(redisClient *c);
745 static void unsubscribeCommand(redisClient *c);
746 static void psubscribeCommand(redisClient *c);
747 static void punsubscribeCommand(redisClient *c);
748 static void publishCommand(redisClient *c);
749 static void watchCommand(redisClient *c);
750 static void unwatchCommand(redisClient *c);
751
752 /*================================= Globals ================================= */
753
/* Global vars */
static struct redisServer server; /* server global state */

/* Command table. Each entry lists, in order: the command name, the function
 * implementing it, a numeric argument-count field, a mask of REDIS_CMD_*
 * flags, an optional hook (only the zunionstore, zinterstore and exec entries
 * set one: a *BlockClientOnSwappedKeys function) and three integers.
 *
 * NOTE(review): the numeric field looks like the command arity, with negative
 * values meaning "at least N arguments", and the three trailing integers look
 * like the first key / last key / key step positions -- confirm against the
 * definition of struct redisCommand.
 *
 * The table is terminated by an all-NULL/zero sentinel entry. */
static struct redisCommand cmdTable[] = {
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
    {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {NULL,NULL,0,0,NULL,0,0,0}
};
867
868 /*============================ Utility functions ============================ */
869
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [...]   one byte out of a set: "a-z" denotes a range, a leading '^'
 *           negates the set and "\x" takes x literally
 *   \x      the byte x taken literally
 *
 * If 'nocase' is non zero, literals and sets are compared after tolower().
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 * Neither buffer is read beyond the given length, so they do not need to be
 * nul terminated. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of '*' into the last one (never peeking past
             * the end of the pattern). */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
            break;
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one byte: without this check the code
             * below would read string[0] out of bounds and drive stringLen
             * negative. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Escaped member. A backslash that is the very last
                     * byte of the pattern is handled as a plain member by
                     * the final branch instead. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs one byte of input to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: any number of trailing '*' still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
992
/* Same as stringmatchlen() for nul terminated pattern and string: the
 * lengths are obtained with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
996
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted format is an optional '-', a run of decimal digits and an
 * optional, case insensitive unit:
 *
 *   (none) or "b" -> 1
 *   "k"  -> 1000            "kb" -> 1024
 *   "m"  -> 1000*1000       "mb" -> 1024*1024
 *   "g"  -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * On parsing error, if err is not NULL, *err is set to 1, otherwise it's
 * set to 0. With an unknown unit the numeric part is still returned
 * unscaled; if the numeric part does not fit the internal buffer LLONG_MAX
 * is returned. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is required because
     * passing a negative char to isdigit() is undefined behavior. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy the sign and digits into a nul terminated buffer for strtoll(). */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    return val*mul;
}
1043
/* Convert a long long into a string.
 *
 * At most len-1 characters of the decimal representation of 'value' are
 * stored into 's', followed by a nul term; when the buffer is too small the
 * rightmost characters are dropped. Returns the number of characters stored
 * (nul term excluded), or 0 without touching 's' if len is 0. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value using unsigned arithmetic: -value is a signed
     * overflow (undefined behavior) when value is LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits right to left. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1067
1068 static void redisLog(int level, const char *fmt, ...) {
1069 va_list ap;
1070 FILE *fp;
1071
1072 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1073 if (!fp) return;
1074
1075 va_start(ap, fmt);
1076 if (level >= server.verbosity) {
1077 char *c = ".-*#";
1078 char buf[64];
1079 time_t now;
1080
1081 now = time(NULL);
1082 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1083 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1084 vfprintf(fp, fmt, ap);
1085 fprintf(fp,"\n");
1086 fflush(fp);
1087 }
1088 va_end(ap);
1089
1090 if (server.logfile) fclose(fp);
1091 }
1092
1093 /*====================== Hash table type implementation ==================== */
1094
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1098
/* Dict destructor releasing 'val' with zfree(). The private data is
 * not used. */
static void dictVanillaFree(void *privdata, void *val) {
    zfree(val);
    DICT_NOTUSED(privdata);
}
1104
1105 static void dictListDestructor(void *privdata, void *val)
1106 {
1107 DICT_NOTUSED(privdata);
1108 listRelease((list*)val);
1109 }
1110
1111 static int sdsDictKeyCompare(void *privdata, const void *key1,
1112 const void *key2)
1113 {
1114 int l1,l2;
1115 DICT_NOTUSED(privdata);
1116
1117 l1 = sdslen((sds)key1);
1118 l2 = sdslen((sds)key2);
1119 if (l1 != l2) return 0;
1120 return memcmp(key1, key2, l1) == 0;
1121 }
1122
/* Dict destructor for redis objects: drops one reference to 'val'.
 * The private data is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1130
1131 static int dictObjKeyCompare(void *privdata, const void *key1,
1132 const void *key2)
1133 {
1134 const robj *o1 = key1, *o2 = key2;
1135 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1136 }
1137
1138 static unsigned int dictObjHash(const void *key) {
1139 const robj *o = key;
1140 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1141 }
1142
1143 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1144 const void *key2)
1145 {
1146 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1147 int cmp;
1148
1149 if (o1->encoding == REDIS_ENCODING_INT &&
1150 o2->encoding == REDIS_ENCODING_INT)
1151 return o1->ptr == o2->ptr;
1152
1153 o1 = getDecodedObject(o1);
1154 o2 = getDecodedObject(o2);
1155 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1156 decrRefCount(o1);
1157 decrRefCount(o2);
1158 return cmp;
1159 }
1160
1161 static unsigned int dictEncObjHash(const void *key) {
1162 robj *o = (robj*) key;
1163
1164 if (o->encoding == REDIS_ENCODING_RAW) {
1165 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1166 } else {
1167 if (o->encoding == REDIS_ENCODING_INT) {
1168 char buf[32];
1169 int len;
1170
1171 len = ll2string(buf,32,(long)o->ptr);
1172 return dictGenHashFunction((unsigned char*)buf, len);
1173 } else {
1174 unsigned int hash;
1175
1176 o = getDecodedObject(o);
1177 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1178 decrRefCount(o);
1179 return hash;
1180 }
1181 }
1182 }
1183
/* Sets type and expires.
 * Keys are hashed and compared with the encoding aware callbacks and are
 * released with dictRedisObjectDestructor; no callback is installed for
 * the values. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1193
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Same key callbacks as setDictType; the values are released with
 * dictVanillaFree, that is, with a plain zfree(). */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1203
/* Db->dict
 * Keys are hashed and compared through the sds string in their ptr field
 * (no decoding); both keys and values are released with
 * dictRedisObjectDestructor. */
static dictType dbDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1213
/* Db->expires
 * Same key callbacks as dbDictType, but no value destructor is installed:
 * serverCron() reads the values of this table by casting them directly to
 * time_t. */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1223
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Keys use the encoding aware hash and compare callbacks; both keys and
 * values are released with dictRedisObjectDestructor. */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1233
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * The lists are released with dictListDestructor. */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};
1245
1246 static void version();
1247
1248 /* ========================= Random utility functions ======================= */
1249
1250 /* Redis generally does not try to recover from out of memory conditions
1251 * when allocating objects or strings, it is not clear if it will be possible
1252 * to report this condition to the client since the networking layer itself
1253 * is based on heap allocation for send buffers, so we simply abort.
1254 * At least the code will be simpler to read... */
1255 static void oom(const char *msg) {
1256 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1257 sleep(1);
1258 abort();
1259 }
1260
1261 /* ====================== Redis server networking stuff ===================== */
1262 static void closeTimedoutClients(void) {
1263 redisClient *c;
1264 listNode *ln;
1265 time_t now = time(NULL);
1266 listIter li;
1267
1268 listRewind(server.clients,&li);
1269 while ((ln = listNext(&li)) != NULL) {
1270 c = listNodeValue(ln);
1271 if (server.maxidletime &&
1272 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1273 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1274 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1275 listLength(c->pubsub_patterns) == 0 &&
1276 (now - c->lastinteraction > server.maxidletime))
1277 {
1278 redisLog(REDIS_VERBOSE,"Closing idle client");
1279 freeClient(c);
1280 } else if (c->flags & REDIS_BLOCKED) {
1281 if (c->blockingto != 0 && c->blockingto < now) {
1282 addReply(c,shared.nullmultibulk);
1283 unblockClientWaitingData(c);
1284 }
1285 }
1286 }
1287 }
1288
1289 static int htNeedsResize(dict *dict) {
1290 long long size, used;
1291
1292 size = dictSlots(dict);
1293 used = dictSize(dict);
1294 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1295 (used*100/size < REDIS_HT_MINFILL));
1296 }
1297
1298 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1299 * we resize the hash table to save memory */
1300 static void tryResizeHashTables(void) {
1301 int j;
1302
1303 for (j = 0; j < server.dbnum; j++) {
1304 if (htNeedsResize(server.db[j].dict))
1305 dictResize(server.db[j].dict);
1306 if (htNeedsResize(server.db[j].expires))
1307 dictResize(server.db[j].expires);
1308 }
1309 }
1310
1311 /* Our hash table implementation performs rehashing incrementally while
1312 * we write/read from the hash table. Still if the server is idle, the hash
1313 * table will use two tables for a long time. So we try to use 1 millisecond
1314 * of CPU time at every serverCron() loop in order to rehash some key. */
1315 static void incrementallyRehash(void) {
1316 int j;
1317
1318 for (j = 0; j < server.dbnum; j++) {
1319 if (dictIsRehashing(server.db[j].dict)) {
1320 dictRehashMilliseconds(server.db[j].dict,1);
1321 break; /* already used our millisecond for this loop... */
1322 }
1323 }
1324 }
1325
1326 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1327 void backgroundSaveDoneHandler(int statloc) {
1328 int exitcode = WEXITSTATUS(statloc);
1329 int bysignal = WIFSIGNALED(statloc);
1330
1331 if (!bysignal && exitcode == 0) {
1332 redisLog(REDIS_NOTICE,
1333 "Background saving terminated with success");
1334 server.dirty = 0;
1335 server.lastsave = time(NULL);
1336 } else if (!bysignal && exitcode != 0) {
1337 redisLog(REDIS_WARNING, "Background saving error");
1338 } else {
1339 redisLog(REDIS_WARNING,
1340 "Background saving terminated by signal %d", WTERMSIG(statloc));
1341 rdbRemoveTempFile(server.bgsavechildpid);
1342 }
1343 server.bgsavechildpid = -1;
1344 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1345 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1346 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1347 }
1348
1349 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1350 * Handle this. */
1351 void backgroundRewriteDoneHandler(int statloc) {
1352 int exitcode = WEXITSTATUS(statloc);
1353 int bysignal = WIFSIGNALED(statloc);
1354
1355 if (!bysignal && exitcode == 0) {
1356 int fd;
1357 char tmpfile[256];
1358
1359 redisLog(REDIS_NOTICE,
1360 "Background append only file rewriting terminated with success");
1361 /* Now it's time to flush the differences accumulated by the parent */
1362 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1363 fd = open(tmpfile,O_WRONLY|O_APPEND);
1364 if (fd == -1) {
1365 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1366 goto cleanup;
1367 }
1368 /* Flush our data... */
1369 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1370 (signed) sdslen(server.bgrewritebuf)) {
1371 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1372 close(fd);
1373 goto cleanup;
1374 }
1375 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1376 /* Now our work is to rename the temp file into the stable file. And
1377 * switch the file descriptor used by the server for append only. */
1378 if (rename(tmpfile,server.appendfilename) == -1) {
1379 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1380 close(fd);
1381 goto cleanup;
1382 }
1383 /* Mission completed... almost */
1384 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1385 if (server.appendfd != -1) {
1386 /* If append only is actually enabled... */
1387 close(server.appendfd);
1388 server.appendfd = fd;
1389 if (appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1390 server.appendseldb = -1; /* Make sure it will issue SELECT */
1391 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1392 } else {
1393 /* If append only is disabled we just generate a dump in this
1394 * format. Why not? */
1395 close(fd);
1396 }
1397 } else if (!bysignal && exitcode != 0) {
1398 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1399 } else {
1400 redisLog(REDIS_WARNING,
1401 "Background append only file rewriting terminated by signal %d",
1402 WTERMSIG(statloc));
1403 }
1404 cleanup:
1405 sdsfree(server.bgrewritebuf);
1406 server.bgrewritebuf = sdsempty();
1407 aofRemoveTempFile(server.bgrewritechildpid);
1408 server.bgrewritechildpid = -1;
1409 }
1410
1411 /* This function is called once a background process of some kind terminates,
1412 * as we want to avoid resizing the hash tables when there is a child in order
1413 * to play well with copy-on-write (otherwise when a resize happens lots of
1414 * memory pages are copied). The goal of this function is to update the ability
1415 * for dict.c to resize the hash tables accordingly to the fact we have o not
1416 * running childs. */
1417 static void updateDictResizePolicy(void) {
1418 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1419 dictEnableResize();
1420 else
1421 dictDisableResize();
1422 }
1423
1424 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1425 int j, loops = server.cronloops++;
1426 REDIS_NOTUSED(eventLoop);
1427 REDIS_NOTUSED(id);
1428 REDIS_NOTUSED(clientData);
1429
1430 /* We take a cached value of the unix time in the global state because
1431 * with virtual memory and aging there is to store the current time
1432 * in objects at every object access, and accuracy is not needed.
1433 * To access a global var is faster than calling time(NULL) */
1434 server.unixtime = time(NULL);
1435
1436 /* We received a SIGTERM, shutting down here in a safe way, as it is
1437 * not ok doing so inside the signal handler. */
1438 if (server.shutdown_asap) {
1439 if (prepareForShutdown() == REDIS_OK) exit(0);
1440 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1441 }
1442
1443 /* Show some info about non-empty databases */
1444 for (j = 0; j < server.dbnum; j++) {
1445 long long size, used, vkeys;
1446
1447 size = dictSlots(server.db[j].dict);
1448 used = dictSize(server.db[j].dict);
1449 vkeys = dictSize(server.db[j].expires);
1450 if (!(loops % 50) && (used || vkeys)) {
1451 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1452 /* dictPrintStats(server.dict); */
1453 }
1454 }
1455
1456 /* We don't want to resize the hash tables while a bacground saving
1457 * is in progress: the saving child is created using fork() that is
1458 * implemented with a copy-on-write semantic in most modern systems, so
1459 * if we resize the HT while there is the saving child at work actually
1460 * a lot of memory movements in the parent will cause a lot of pages
1461 * copied. */
1462 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1463 if (!(loops % 10)) tryResizeHashTables();
1464 if (server.activerehashing) incrementallyRehash();
1465 }
1466
1467 /* Show information about connected clients */
1468 if (!(loops % 50)) {
1469 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1470 listLength(server.clients)-listLength(server.slaves),
1471 listLength(server.slaves),
1472 zmalloc_used_memory());
1473 }
1474
1475 /* Close connections of timedout clients */
1476 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1477 closeTimedoutClients();
1478
1479 /* Check if a background saving or AOF rewrite in progress terminated */
1480 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1481 int statloc;
1482 pid_t pid;
1483
1484 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1485 if (pid == server.bgsavechildpid) {
1486 backgroundSaveDoneHandler(statloc);
1487 } else {
1488 backgroundRewriteDoneHandler(statloc);
1489 }
1490 updateDictResizePolicy();
1491 }
1492 } else {
1493 /* If there is not a background saving in progress check if
1494 * we have to save now */
1495 time_t now = time(NULL);
1496 for (j = 0; j < server.saveparamslen; j++) {
1497 struct saveparam *sp = server.saveparams+j;
1498
1499 if (server.dirty >= sp->changes &&
1500 now-server.lastsave > sp->seconds) {
1501 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1502 sp->changes, sp->seconds);
1503 rdbSaveBackground(server.dbfilename);
1504 break;
1505 }
1506 }
1507 }
1508
1509 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1510 * will use few CPU cycles if there are few expiring keys, otherwise
1511 * it will get more aggressive to avoid that too much memory is used by
1512 * keys that can be removed from the keyspace. */
1513 for (j = 0; j < server.dbnum; j++) {
1514 int expired;
1515 redisDb *db = server.db+j;
1516
1517 /* Continue to expire if at the end of the cycle more than 25%
1518 * of the keys were expired. */
1519 do {
1520 long num = dictSize(db->expires);
1521 time_t now = time(NULL);
1522
1523 expired = 0;
1524 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1525 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1526 while (num--) {
1527 dictEntry *de;
1528 time_t t;
1529
1530 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1531 t = (time_t) dictGetEntryVal(de);
1532 if (now > t) {
1533 deleteKey(db,dictGetEntryKey(de));
1534 expired++;
1535 server.stat_expiredkeys++;
1536 }
1537 }
1538 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1539 }
1540
1541 /* Swap a few keys on disk if we are over the memory limit and VM
1542 * is enbled. Try to free objects from the free list first. */
1543 if (vmCanSwapOut()) {
1544 while (server.vm_enabled && zmalloc_used_memory() >
1545 server.vm_max_memory)
1546 {
1547 int retval;
1548
1549 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1550 retval = (server.vm_max_threads == 0) ?
1551 vmSwapOneObjectBlocking() :
1552 vmSwapOneObjectThreaded();
1553 if (retval == REDIS_ERR && !(loops % 300) &&
1554 zmalloc_used_memory() >
1555 (server.vm_max_memory+server.vm_max_memory/10))
1556 {
1557 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1558 }
1559 /* Note that when using threade I/O we free just one object,
1560 * because anyway when the I/O thread in charge to swap this
1561 * object out will finish, the handler of completed jobs
1562 * will try to swap more objects if we are still out of memory. */
1563 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1564 }
1565 }
1566
1567 /* Check if we should connect to a MASTER */
1568 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1569 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1570 if (syncWithMaster() == REDIS_OK) {
1571 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1572 if (server.appendonly) rewriteAppendOnlyFileBackground();
1573 }
1574 }
1575 return 100;
1576 }
1577
1578 /* This function gets called every time Redis is entering the
1579 * main loop of the event driven library, that is, before to sleep
1580 * for ready file descriptors. */
1581 static void beforeSleep(struct aeEventLoop *eventLoop) {
1582 REDIS_NOTUSED(eventLoop);
1583
1584 /* Awake clients that got all the swapped keys they requested */
1585 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1586 listIter li;
1587 listNode *ln;
1588
1589 listRewind(server.io_ready_clients,&li);
1590 while((ln = listNext(&li))) {
1591 redisClient *c = ln->value;
1592 struct redisCommand *cmd;
1593
1594 /* Resume the client. */
1595 listDelNode(server.io_ready_clients,ln);
1596 c->flags &= (~REDIS_IO_WAIT);
1597 server.vm_blocked_clients--;
1598 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1599 readQueryFromClient, c);
1600 cmd = lookupCommand(c->argv[0]->ptr);
1601 assert(cmd != NULL);
1602 call(c,cmd);
1603 resetClient(c);
1604 /* There may be more data to process in the input buffer. */
1605 if (c->querybuf && sdslen(c->querybuf) > 0)
1606 processInputBuffer(c);
1607 }
1608 }
1609 /* Write the AOF buffer on disk */
1610 flushAppendOnlyFile();
1611 }
1612
1613 static void createSharedObjects(void) {
1614 int j;
1615
1616 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1617 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1618 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1619 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1620 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1621 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1622 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1623 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1624 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1625 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1626 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1627 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1628 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1629 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1630 "-ERR no such key\r\n"));
1631 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1632 "-ERR syntax error\r\n"));
1633 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1634 "-ERR source and destination objects are the same\r\n"));
1635 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1636 "-ERR index out of range\r\n"));
1637 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1638 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1639 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1640 shared.select0 = createStringObject("select 0\r\n",10);
1641 shared.select1 = createStringObject("select 1\r\n",10);
1642 shared.select2 = createStringObject("select 2\r\n",10);
1643 shared.select3 = createStringObject("select 3\r\n",10);
1644 shared.select4 = createStringObject("select 4\r\n",10);
1645 shared.select5 = createStringObject("select 5\r\n",10);
1646 shared.select6 = createStringObject("select 6\r\n",10);
1647 shared.select7 = createStringObject("select 7\r\n",10);
1648 shared.select8 = createStringObject("select 8\r\n",10);
1649 shared.select9 = createStringObject("select 9\r\n",10);
1650 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1651 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1652 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1653 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1654 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1655 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1656 shared.mbulk3 = createStringObject("*3\r\n",4);
1657 shared.mbulk4 = createStringObject("*4\r\n",4);
1658 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1659 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1660 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1661 }
1662 }
1663
1664 static void appendServerSaveParams(time_t seconds, int changes) {
1665 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1666 server.saveparams[server.saveparamslen].seconds = seconds;
1667 server.saveparams[server.saveparamslen].changes = changes;
1668 server.saveparamslen++;
1669 }
1670
1671 static void resetServerSaveParams() {
1672 zfree(server.saveparams);
1673 server.saveparams = NULL;
1674 server.saveparamslen = 0;
1675 }
1676
1677 static void initServerConfig() {
1678 server.dbnum = REDIS_DEFAULT_DBNUM;
1679 server.port = REDIS_SERVERPORT;
1680 server.verbosity = REDIS_VERBOSE;
1681 server.maxidletime = REDIS_MAXIDLETIME;
1682 server.saveparams = NULL;
1683 server.logfile = NULL; /* NULL = log on standard output */
1684 server.bindaddr = NULL;
1685 server.glueoutputbuf = 1;
1686 server.daemonize = 0;
1687 server.appendonly = 0;
1688 server.appendfsync = APPENDFSYNC_EVERYSEC;
1689 server.no_appendfsync_on_rewrite = 0;
1690 server.lastfsync = time(NULL);
1691 server.appendfd = -1;
1692 server.appendseldb = -1; /* Make sure the first time will not match */
1693 server.pidfile = zstrdup("/var/run/redis.pid");
1694 server.dbfilename = zstrdup("dump.rdb");
1695 server.appendfilename = zstrdup("appendonly.aof");
1696 server.requirepass = NULL;
1697 server.rdbcompression = 1;
1698 server.activerehashing = 1;
1699 server.maxclients = 0;
1700 server.blpop_blocked_clients = 0;
1701 server.maxmemory = 0;
1702 server.vm_enabled = 0;
1703 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1704 server.vm_page_size = 256; /* 256 bytes per page */
1705 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1706 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1707 server.vm_max_threads = 4;
1708 server.vm_blocked_clients = 0;
1709 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1710 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1711 server.shutdown_asap = 0;
1712
1713 resetServerSaveParams();
1714
1715 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1716 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1717 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1718 /* Replication related */
1719 server.isslave = 0;
1720 server.masterauth = NULL;
1721 server.masterhost = NULL;
1722 server.masterport = 6379;
1723 server.master = NULL;
1724 server.replstate = REDIS_REPL_NONE;
1725
1726 /* Double constants initialization */
1727 R_Zero = 0.0;
1728 R_PosInf = 1.0/R_Zero;
1729 R_NegInf = -1.0/R_Zero;
1730 R_Nan = R_Zero/R_Zero;
1731 }
1732
1733 static void initServer() {
1734 int j;
1735
1736 signal(SIGHUP, SIG_IGN);
1737 signal(SIGPIPE, SIG_IGN);
1738 setupSigSegvAction();
1739
1740 server.devnull = fopen("/dev/null","w");
1741 if (server.devnull == NULL) {
1742 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1743 exit(1);
1744 }
1745 server.clients = listCreate();
1746 server.slaves = listCreate();
1747 server.monitors = listCreate();
1748 server.objfreelist = listCreate();
1749 createSharedObjects();
1750 server.el = aeCreateEventLoop();
1751 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1752 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1753 if (server.fd == -1) {
1754 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1755 exit(1);
1756 }
1757 for (j = 0; j < server.dbnum; j++) {
1758 server.db[j].dict = dictCreate(&dbDictType,NULL);
1759 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1760 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1761 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1762 if (server.vm_enabled)
1763 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1764 server.db[j].id = j;
1765 }
1766 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1767 server.pubsub_patterns = listCreate();
1768 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1769 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1770 server.cronloops = 0;
1771 server.bgsavechildpid = -1;
1772 server.bgrewritechildpid = -1;
1773 server.bgrewritebuf = sdsempty();
1774 server.aofbuf = sdsempty();
1775 server.lastsave = time(NULL);
1776 server.dirty = 0;
1777 server.stat_numcommands = 0;
1778 server.stat_numconnections = 0;
1779 server.stat_expiredkeys = 0;
1780 server.stat_starttime = time(NULL);
1781 server.unixtime = time(NULL);
1782 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1783 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1784 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1785
1786 if (server.appendonly) {
1787 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1788 if (server.appendfd == -1) {
1789 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1790 strerror(errno));
1791 exit(1);
1792 }
1793 }
1794
1795 if (server.vm_enabled) vmInit();
1796 }
1797
1798 /* Empty the whole database */
1799 static long long emptyDb() {
1800 int j;
1801 long long removed = 0;
1802
1803 for (j = 0; j < server.dbnum; j++) {
1804 removed += dictSize(server.db[j].dict);
1805 dictEmpty(server.db[j].dict);
1806 dictEmpty(server.db[j].expires);
1807 }
1808 return removed;
1809 }
1810
/* Convert a "yes" / "no" string (compared ignoring case) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1816
1817 /* I agree, this is a very rudimental way to load a configuration...
1818 will improve later if the config gets more complex */
1819 static void loadServerConfig(char *filename) {
1820 FILE *fp;
1821 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1822 int linenum = 0;
1823 sds line = NULL;
1824
1825 if (filename[0] == '-' && filename[1] == '\0')
1826 fp = stdin;
1827 else {
1828 if ((fp = fopen(filename,"r")) == NULL) {
1829 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1830 exit(1);
1831 }
1832 }
1833
1834 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1835 sds *argv;
1836 int argc, j;
1837
1838 linenum++;
1839 line = sdsnew(buf);
1840 line = sdstrim(line," \t\r\n");
1841
1842 /* Skip comments and blank lines*/
1843 if (line[0] == '#' || line[0] == '\0') {
1844 sdsfree(line);
1845 continue;
1846 }
1847
1848 /* Split into arguments */
1849 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1850 sdstolower(argv[0]);
1851
1852 /* Execute config directives */
1853 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1854 server.maxidletime = atoi(argv[1]);
1855 if (server.maxidletime < 0) {
1856 err = "Invalid timeout value"; goto loaderr;
1857 }
1858 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1859 server.port = atoi(argv[1]);
1860 if (server.port < 1 || server.port > 65535) {
1861 err = "Invalid port"; goto loaderr;
1862 }
1863 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1864 server.bindaddr = zstrdup(argv[1]);
1865 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1866 int seconds = atoi(argv[1]);
1867 int changes = atoi(argv[2]);
1868 if (seconds < 1 || changes < 0) {
1869 err = "Invalid save parameters"; goto loaderr;
1870 }
1871 appendServerSaveParams(seconds,changes);
1872 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1873 if (chdir(argv[1]) == -1) {
1874 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1875 argv[1], strerror(errno));
1876 exit(1);
1877 }
1878 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1879 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1880 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1881 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1882 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1883 else {
1884 err = "Invalid log level. Must be one of debug, notice, warning";
1885 goto loaderr;
1886 }
1887 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1888 FILE *logfp;
1889
1890 server.logfile = zstrdup(argv[1]);
1891 if (!strcasecmp(server.logfile,"stdout")) {
1892 zfree(server.logfile);
1893 server.logfile = NULL;
1894 }
1895 if (server.logfile) {
1896 /* Test if we are able to open the file. The server will not
1897 * be able to abort just for this problem later... */
1898 logfp = fopen(server.logfile,"a");
1899 if (logfp == NULL) {
1900 err = sdscatprintf(sdsempty(),
1901 "Can't open the log file: %s", strerror(errno));
1902 goto loaderr;
1903 }
1904 fclose(logfp);
1905 }
1906 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1907 server.dbnum = atoi(argv[1]);
1908 if (server.dbnum < 1) {
1909 err = "Invalid number of databases"; goto loaderr;
1910 }
1911 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1912 loadServerConfig(argv[1]);
1913 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1914 server.maxclients = atoi(argv[1]);
1915 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1916 server.maxmemory = memtoll(argv[1],NULL);
1917 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1918 server.masterhost = sdsnew(argv[1]);
1919 server.masterport = atoi(argv[2]);
1920 server.replstate = REDIS_REPL_CONNECT;
1921 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1922 server.masterauth = zstrdup(argv[1]);
1923 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1924 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1925 err = "argument must be 'yes' or 'no'"; goto loaderr;
1926 }
1927 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1928 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1929 err = "argument must be 'yes' or 'no'"; goto loaderr;
1930 }
1931 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1932 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1933 err = "argument must be 'yes' or 'no'"; goto loaderr;
1934 }
1935 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1936 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1937 err = "argument must be 'yes' or 'no'"; goto loaderr;
1938 }
1939 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1940 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1941 err = "argument must be 'yes' or 'no'"; goto loaderr;
1942 }
1943 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1944 zfree(server.appendfilename);
1945 server.appendfilename = zstrdup(argv[1]);
1946 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
1947 && argc == 2) {
1948 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
1949 err = "argument must be 'yes' or 'no'"; goto loaderr;
1950 }
1951 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1952 if (!strcasecmp(argv[1],"no")) {
1953 server.appendfsync = APPENDFSYNC_NO;
1954 } else if (!strcasecmp(argv[1],"always")) {
1955 server.appendfsync = APPENDFSYNC_ALWAYS;
1956 } else if (!strcasecmp(argv[1],"everysec")) {
1957 server.appendfsync = APPENDFSYNC_EVERYSEC;
1958 } else {
1959 err = "argument must be 'no', 'always' or 'everysec'";
1960 goto loaderr;
1961 }
1962 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1963 server.requirepass = zstrdup(argv[1]);
1964 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1965 zfree(server.pidfile);
1966 server.pidfile = zstrdup(argv[1]);
1967 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1968 zfree(server.dbfilename);
1969 server.dbfilename = zstrdup(argv[1]);
1970 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1971 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1972 err = "argument must be 'yes' or 'no'"; goto loaderr;
1973 }
1974 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1975 zfree(server.vm_swap_file);
1976 server.vm_swap_file = zstrdup(argv[1]);
1977 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1978 server.vm_max_memory = memtoll(argv[1],NULL);
1979 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1980 server.vm_page_size = memtoll(argv[1], NULL);
1981 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1982 server.vm_pages = memtoll(argv[1], NULL);
1983 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1984 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1985 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1986 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1987 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1988 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1989 } else {
1990 err = "Bad directive or wrong number of arguments"; goto loaderr;
1991 }
1992 for (j = 0; j < argc; j++)
1993 sdsfree(argv[j]);
1994 zfree(argv);
1995 sdsfree(line);
1996 }
1997 if (fp != stdin) fclose(fp);
1998 return;
1999
2000 loaderr:
2001 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2002 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2003 fprintf(stderr, ">>> '%s'\n", line);
2004 fprintf(stderr, "%s\n", err);
2005 exit(1);
2006 }
2007
2008 static void freeClientArgv(redisClient *c) {
2009 int j;
2010
2011 for (j = 0; j < c->argc; j++)
2012 decrRefCount(c->argv[j]);
2013 for (j = 0; j < c->mbargc; j++)
2014 decrRefCount(c->mbargv[j]);
2015 c->argc = 0;
2016 c->mbargc = 0;
2017 }
2018
2019 static void freeClient(redisClient *c) {
2020 listNode *ln;
2021
2022 /* Note that if the client we are freeing is blocked into a blocking
2023 * call, we have to set querybuf to NULL *before* to call
2024 * unblockClientWaitingData() to avoid processInputBuffer() will get
2025 * called. Also it is important to remove the file events after
2026 * this, because this call adds the READABLE event. */
2027 sdsfree(c->querybuf);
2028 c->querybuf = NULL;
2029 if (c->flags & REDIS_BLOCKED)
2030 unblockClientWaitingData(c);
2031
2032 /* UNWATCH all the keys */
2033 unwatchAllKeys(c);
2034 listRelease(c->watched_keys);
2035 /* Unsubscribe from all the pubsub channels */
2036 pubsubUnsubscribeAllChannels(c,0);
2037 pubsubUnsubscribeAllPatterns(c,0);
2038 dictRelease(c->pubsub_channels);
2039 listRelease(c->pubsub_patterns);
2040 /* Obvious cleanup */
2041 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2042 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2043 listRelease(c->reply);
2044 freeClientArgv(c);
2045 close(c->fd);
2046 /* Remove from the list of clients */
2047 ln = listSearchKey(server.clients,c);
2048 redisAssert(ln != NULL);
2049 listDelNode(server.clients,ln);
2050 /* Remove from the list of clients that are now ready to be restarted
2051 * after waiting for swapped keys */
2052 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2053 ln = listSearchKey(server.io_ready_clients,c);
2054 if (ln) {
2055 listDelNode(server.io_ready_clients,ln);
2056 server.vm_blocked_clients--;
2057 }
2058 }
2059 /* Remove from the list of clients waiting for swapped keys */
2060 while (server.vm_enabled && listLength(c->io_keys)) {
2061 ln = listFirst(c->io_keys);
2062 dontWaitForSwappedKey(c,ln->value);
2063 }
2064 listRelease(c->io_keys);
2065 /* Master/slave cleanup */
2066 if (c->flags & REDIS_SLAVE) {
2067 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2068 close(c->repldbfd);
2069 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2070 ln = listSearchKey(l,c);
2071 redisAssert(ln != NULL);
2072 listDelNode(l,ln);
2073 }
2074 if (c->flags & REDIS_MASTER) {
2075 server.master = NULL;
2076 server.replstate = REDIS_REPL_CONNECT;
2077 }
2078 /* Release memory */
2079 zfree(c->argv);
2080 zfree(c->mbargv);
2081 freeClientMultiState(c);
2082 zfree(c);
2083 }
2084
2085 #define GLUEREPLY_UP_TO (1024)
2086 static void glueReplyBuffersIfNeeded(redisClient *c) {
2087 int copylen = 0;
2088 char buf[GLUEREPLY_UP_TO];
2089 listNode *ln;
2090 listIter li;
2091 robj *o;
2092
2093 listRewind(c->reply,&li);
2094 while((ln = listNext(&li))) {
2095 int objlen;
2096
2097 o = ln->value;
2098 objlen = sdslen(o->ptr);
2099 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2100 memcpy(buf+copylen,o->ptr,objlen);
2101 copylen += objlen;
2102 listDelNode(c->reply,ln);
2103 } else {
2104 if (copylen == 0) return;
2105 break;
2106 }
2107 }
2108 /* Now the output buffer is empty, add the new single element */
2109 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2110 listAddNodeHead(c->reply,o);
2111 }
2112
2113 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2114 redisClient *c = privdata;
2115 int nwritten = 0, totwritten = 0, objlen;
2116 robj *o;
2117 REDIS_NOTUSED(el);
2118 REDIS_NOTUSED(mask);
2119
2120 /* Use writev() if we have enough buffers to send */
2121 if (!server.glueoutputbuf &&
2122 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2123 !(c->flags & REDIS_MASTER))
2124 {
2125 sendReplyToClientWritev(el, fd, privdata, mask);
2126 return;
2127 }
2128
2129 while(listLength(c->reply)) {
2130 if (server.glueoutputbuf && listLength(c->reply) > 1)
2131 glueReplyBuffersIfNeeded(c);
2132
2133 o = listNodeValue(listFirst(c->reply));
2134 objlen = sdslen(o->ptr);
2135
2136 if (objlen == 0) {
2137 listDelNode(c->reply,listFirst(c->reply));
2138 continue;
2139 }
2140
2141 if (c->flags & REDIS_MASTER) {
2142 /* Don't reply to a master */
2143 nwritten = objlen - c->sentlen;
2144 } else {
2145 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2146 if (nwritten <= 0) break;
2147 }
2148 c->sentlen += nwritten;
2149 totwritten += nwritten;
2150 /* If we fully sent the object on head go to the next one */
2151 if (c->sentlen == objlen) {
2152 listDelNode(c->reply,listFirst(c->reply));
2153 c->sentlen = 0;
2154 }
2155 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2156 * bytes, in a single threaded server it's a good idea to serve
2157 * other clients as well, even if a very large request comes from
2158 * super fast link that is always able to accept data (in real world
2159 * scenario think about 'KEYS *' against the loopback interfae) */
2160 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2161 }
2162 if (nwritten == -1) {
2163 if (errno == EAGAIN) {
2164 nwritten = 0;
2165 } else {
2166 redisLog(REDIS_VERBOSE,
2167 "Error writing to client: %s", strerror(errno));
2168 freeClient(c);
2169 return;
2170 }
2171 }
2172 if (totwritten > 0) c->lastinteraction = time(NULL);
2173 if (listLength(c->reply) == 0) {
2174 c->sentlen = 0;
2175 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2176 }
2177 }
2178
2179 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2180 {
2181 redisClient *c = privdata;
2182 int nwritten = 0, totwritten = 0, objlen, willwrite;
2183 robj *o;
2184 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2185 int offset, ion = 0;
2186 REDIS_NOTUSED(el);
2187 REDIS_NOTUSED(mask);
2188
2189 listNode *node;
2190 while (listLength(c->reply)) {
2191 offset = c->sentlen;
2192 ion = 0;
2193 willwrite = 0;
2194
2195 /* fill-in the iov[] array */
2196 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2197 o = listNodeValue(node);
2198 objlen = sdslen(o->ptr);
2199
2200 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2201 break;
2202
2203 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2204 break; /* no more iovecs */
2205
2206 iov[ion].iov_base = ((char*)o->ptr) + offset;
2207 iov[ion].iov_len = objlen - offset;
2208 willwrite += objlen - offset;
2209 offset = 0; /* just for the first item */
2210 ion++;
2211 }
2212
2213 if(willwrite == 0)
2214 break;
2215
2216 /* write all collected blocks at once */
2217 if((nwritten = writev(fd, iov, ion)) < 0) {
2218 if (errno != EAGAIN) {
2219 redisLog(REDIS_VERBOSE,
2220 "Error writing to client: %s", strerror(errno));
2221 freeClient(c);
2222 return;
2223 }
2224 break;
2225 }
2226
2227 totwritten += nwritten;
2228 offset = c->sentlen;
2229
2230 /* remove written robjs from c->reply */
2231 while (nwritten && listLength(c->reply)) {
2232 o = listNodeValue(listFirst(c->reply));
2233 objlen = sdslen(o->ptr);
2234
2235 if(nwritten >= objlen - offset) {
2236 listDelNode(c->reply, listFirst(c->reply));
2237 nwritten -= objlen - offset;
2238 c->sentlen = 0;
2239 } else {
2240 /* partial write */
2241 c->sentlen += nwritten;
2242 break;
2243 }
2244 offset = 0;
2245 }
2246 }
2247
2248 if (totwritten > 0)
2249 c->lastinteraction = time(NULL);
2250
2251 if (listLength(c->reply) == 0) {
2252 c->sentlen = 0;
2253 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2254 }
2255 }
2256
2257 static struct redisCommand *lookupCommand(char *name) {
2258 int j = 0;
2259 while(cmdTable[j].name != NULL) {
2260 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2261 j++;
2262 }
2263 return NULL;
2264 }
2265
2266 /* resetClient prepare the client to process the next command */
2267 static void resetClient(redisClient *c) {
2268 freeClientArgv(c);
2269 c->bulklen = -1;
2270 c->multibulk = 0;
2271 }
2272
2273 /* Call() is the core of Redis execution of a command */
2274 static void call(redisClient *c, struct redisCommand *cmd) {
2275 long long dirty;
2276
2277 dirty = server.dirty;
2278 cmd->proc(c);
2279 dirty = server.dirty-dirty;
2280
2281 if (server.appendonly && dirty)
2282 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2283 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2284 listLength(server.slaves))
2285 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2286 if (listLength(server.monitors))
2287 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2288 server.stat_numcommands++;
2289 }
2290
2291 /* If this function gets called we already read a whole
2292 * command, argments are in the client argv/argc fields.
2293 * processCommand() execute the command or prepare the
2294 * server for a bulk read from the client.
2295 *
2296 * If 1 is returned the client is still alive and valid and
2297 * and other operations can be performed by the caller. Otherwise
2298 * if 0 is returned the client was destroied (i.e. after QUIT). */
2299 static int processCommand(redisClient *c) {
2300 struct redisCommand *cmd;
2301
2302 /* Free some memory if needed (maxmemory setting) */
2303 if (server.maxmemory) freeMemoryIfNeeded();
2304
2305 /* Handle the multi bulk command type. This is an alternative protocol
2306 * supported by Redis in order to receive commands that are composed of
2307 * multiple binary-safe "bulk" arguments. The latency of processing is
2308 * a bit higher but this allows things like multi-sets, so if this
2309 * protocol is used only for MSET and similar commands this is a big win. */
2310 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2311 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2312 if (c->multibulk <= 0) {
2313 resetClient(c);
2314 return 1;
2315 } else {
2316 decrRefCount(c->argv[c->argc-1]);
2317 c->argc--;
2318 return 1;
2319 }
2320 } else if (c->multibulk) {
2321 if (c->bulklen == -1) {
2322 if (((char*)c->argv[0]->ptr)[0] != '$') {
2323 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2324 resetClient(c);
2325 return 1;
2326 } else {
2327 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2328 decrRefCount(c->argv[0]);
2329 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2330 c->argc--;
2331 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2332 resetClient(c);
2333 return 1;
2334 }
2335 c->argc--;
2336 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2337 return 1;
2338 }
2339 } else {
2340 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2341 c->mbargv[c->mbargc] = c->argv[0];
2342 c->mbargc++;
2343 c->argc--;
2344 c->multibulk--;
2345 if (c->multibulk == 0) {
2346 robj **auxargv;
2347 int auxargc;
2348
2349 /* Here we need to swap the multi-bulk argc/argv with the
2350 * normal argc/argv of the client structure. */
2351 auxargv = c->argv;
2352 c->argv = c->mbargv;
2353 c->mbargv = auxargv;
2354
2355 auxargc = c->argc;
2356 c->argc = c->mbargc;
2357 c->mbargc = auxargc;
2358
2359 /* We need to set bulklen to something different than -1
2360 * in order for the code below to process the command without
2361 * to try to read the last argument of a bulk command as
2362 * a special argument. */
2363 c->bulklen = 0;
2364 /* continue below and process the command */
2365 } else {
2366 c->bulklen = -1;
2367 return 1;
2368 }
2369 }
2370 }
2371 /* -- end of multi bulk commands processing -- */
2372
2373 /* The QUIT command is handled as a special case. Normal command
2374 * procs are unable to close the client connection safely */
2375 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2376 freeClient(c);
2377 return 0;
2378 }
2379
2380 /* Now lookup the command and check ASAP about trivial error conditions
2381 * such wrong arity, bad command name and so forth. */
2382 cmd = lookupCommand(c->argv[0]->ptr);
2383 if (!cmd) {
2384 addReplySds(c,
2385 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2386 (char*)c->argv[0]->ptr));
2387 resetClient(c);
2388 return 1;
2389 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2390 (c->argc < -cmd->arity)) {
2391 addReplySds(c,
2392 sdscatprintf(sdsempty(),
2393 "-ERR wrong number of arguments for '%s' command\r\n",
2394 cmd->name));
2395 resetClient(c);
2396 return 1;
2397 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2398 /* This is a bulk command, we have to read the last argument yet. */
2399 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2400
2401 decrRefCount(c->argv[c->argc-1]);
2402 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2403 c->argc--;
2404 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2405 resetClient(c);
2406 return 1;
2407 }
2408 c->argc--;
2409 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2410 /* It is possible that the bulk read is already in the
2411 * buffer. Check this condition and handle it accordingly.
2412 * This is just a fast path, alternative to call processInputBuffer().
2413 * It's a good idea since the code is small and this condition
2414 * happens most of the times. */
2415 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2416 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2417 c->argc++;
2418 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2419 } else {
2420 /* Otherwise return... there is to read the last argument
2421 * from the socket. */
2422 return 1;
2423 }
2424 }
2425 /* Let's try to encode the bulk object to save space. */
2426 if (cmd->flags & REDIS_CMD_BULK)
2427 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2428
2429 /* Check if the user is authenticated */
2430 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2431 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2432 resetClient(c);
2433 return 1;
2434 }
2435
2436 /* Handle the maxmemory directive */
2437 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2438 zmalloc_used_memory() > server.maxmemory)
2439 {
2440 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2441 resetClient(c);
2442 return 1;
2443 }
2444
2445 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2446 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2447 &&
2448 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2449 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2450 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2451 resetClient(c);
2452 return 1;
2453 }
2454
2455 /* Exec the command */
2456 if (c->flags & REDIS_MULTI &&
2457 cmd->proc != execCommand && cmd->proc != discardCommand &&
2458 cmd->proc != multiCommand && cmd->proc != watchCommand)
2459 {
2460 queueMultiCommand(c,cmd);
2461 addReply(c,shared.queued);
2462 } else {
2463 if (server.vm_enabled && server.vm_max_threads > 0 &&
2464 blockClientOnSwappedKeys(c,cmd)) return 1;
2465 call(c,cmd);
2466 }
2467
2468 /* Prepare the client for the next command */
2469 resetClient(c);
2470 return 1;
2471 }
2472
2473 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2474 listNode *ln;
2475 listIter li;
2476 int outc = 0, j;
2477 robj **outv;
2478 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2479 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2480 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2481 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2482 robj *lenobj;
2483
2484 if (argc <= REDIS_STATIC_ARGS) {
2485 outv = static_outv;
2486 } else {
2487 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2488 }
2489
2490 lenobj = createObject(REDIS_STRING,
2491 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2492 lenobj->refcount = 0;
2493 outv[outc++] = lenobj;
2494 for (j = 0; j < argc; j++) {
2495 lenobj = createObject(REDIS_STRING,
2496 sdscatprintf(sdsempty(),"$%lu\r\n",
2497 (unsigned long) stringObjectLen(argv[j])));
2498 lenobj->refcount = 0;
2499 outv[outc++] = lenobj;
2500 outv[outc++] = argv[j];
2501 outv[outc++] = shared.crlf;
2502 }
2503
2504 /* Increment all the refcounts at start and decrement at end in order to
2505 * be sure to free objects if there is no slave in a replication state
2506 * able to be feed with commands */
2507 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2508 listRewind(slaves,&li);
2509 while((ln = listNext(&li))) {
2510 redisClient *slave = ln->value;
2511
2512 /* Don't feed slaves that are still waiting for BGSAVE to start */
2513 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2514
2515 /* Feed all the other slaves, MONITORs and so on */
2516 if (slave->slaveseldb != dictid) {
2517 robj *selectcmd;
2518
2519 switch(dictid) {
2520 case 0: selectcmd = shared.select0; break;
2521 case 1: selectcmd = shared.select1; break;
2522 case 2: selectcmd = shared.select2; break;
2523 case 3: selectcmd = shared.select3; break;
2524 case 4: selectcmd = shared.select4; break;
2525 case 5: selectcmd = shared.select5; break;
2526 case 6: selectcmd = shared.select6; break;
2527 case 7: selectcmd = shared.select7; break;
2528 case 8: selectcmd = shared.select8; break;
2529 case 9: selectcmd = shared.select9; break;
2530 default:
2531 selectcmd = createObject(REDIS_STRING,
2532 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2533 selectcmd->refcount = 0;
2534 break;
2535 }
2536 addReply(slave,selectcmd);
2537 slave->slaveseldb = dictid;
2538 }
2539 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2540 }
2541 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2542 if (outv != static_outv) zfree(outv);
2543 }
2544
2545 static sds sdscatrepr(sds s, char *p, size_t len) {
2546 s = sdscatlen(s,"\"",1);
2547 while(len--) {
2548 switch(*p) {
2549 case '\\':
2550 case '"':
2551 s = sdscatprintf(s,"\\%c",*p);
2552 break;
2553 case '\n': s = sdscatlen(s,"\\n",1); break;
2554 case '\r': s = sdscatlen(s,"\\r",1); break;
2555 case '\t': s = sdscatlen(s,"\\t",1); break;
2556 case '\a': s = sdscatlen(s,"\\a",1); break;
2557 case '\b': s = sdscatlen(s,"\\b",1); break;
2558 default:
2559 if (isprint(*p))
2560 s = sdscatprintf(s,"%c",*p);
2561 else
2562 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2563 break;
2564 }
2565 p++;
2566 }
2567 return sdscatlen(s,"\"",1);
2568 }
2569
2570 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2571 listNode *ln;
2572 listIter li;
2573 int j;
2574 sds cmdrepr = sdsnew("+");
2575 robj *cmdobj;
2576 struct timeval tv;
2577
2578 gettimeofday(&tv,NULL);
2579 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2580 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2581
2582 for (j = 0; j < argc; j++) {
2583 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2584 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2585 } else {
2586 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2587 sdslen(argv[j]->ptr));
2588 }
2589 if (j != argc-1)
2590 cmdrepr = sdscatlen(cmdrepr," ",1);
2591 }
2592 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2593 cmdobj = createObject(REDIS_STRING,cmdrepr);
2594
2595 listRewind(monitors,&li);
2596 while((ln = listNext(&li))) {
2597 redisClient *monitor = ln->value;
2598 addReply(monitor,cmdobj);
2599 }
2600 decrRefCount(cmdobj);
2601 }
2602
2603 static void processInputBuffer(redisClient *c) {
2604 again:
2605 /* Before to process the input buffer, make sure the client is not
2606 * waitig for a blocking operation such as BLPOP. Note that the first
2607 * iteration the client is never blocked, otherwise the processInputBuffer
2608 * would not be called at all, but after the execution of the first commands
2609 * in the input buffer the client may be blocked, and the "goto again"
2610 * will try to reiterate. The following line will make it return asap. */
2611 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2612 if (c->bulklen == -1) {
2613 /* Read the first line of the query */
2614 char *p = strchr(c->querybuf,'\n');
2615 size_t querylen;
2616
2617 if (p) {
2618 sds query, *argv;
2619 int argc, j;
2620
2621 query = c->querybuf;
2622 c->querybuf = sdsempty();
2623 querylen = 1+(p-(query));
2624 if (sdslen(query) > querylen) {
2625 /* leave data after the first line of the query in the buffer */
2626 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2627 }
2628 *p = '\0'; /* remove "\n" */
2629 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2630 sdsupdatelen(query);
2631
2632 /* Now we can split the query in arguments */
2633 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2634 sdsfree(query);
2635
2636 if (c->argv) zfree(c->argv);
2637 c->argv = zmalloc(sizeof(robj*)*argc);
2638
2639 for (j = 0; j < argc; j++) {
2640 if (sdslen(argv[j])) {
2641 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2642 c->argc++;
2643 } else {
2644 sdsfree(argv[j]);
2645 }
2646 }
2647 zfree(argv);
2648 if (c->argc) {
2649 /* Execute the command. If the client is still valid
2650 * after processCommand() return and there is something
2651 * on the query buffer try to process the next command. */
2652 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2653 } else {
2654 /* Nothing to process, argc == 0. Just process the query
2655 * buffer if it's not empty or return to the caller */
2656 if (sdslen(c->querybuf)) goto again;
2657 }
2658 return;
2659 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2660 redisLog(REDIS_VERBOSE, "Client protocol error");
2661 freeClient(c);
2662 return;
2663 }
2664 } else {
2665 /* Bulk read handling. Note that if we are at this point
2666 the client already sent a command terminated with a newline,
2667 we are reading the bulk data that is actually the last
2668 argument of the command. */
2669 int qbl = sdslen(c->querybuf);
2670
2671 if (c->bulklen <= qbl) {
2672 /* Copy everything but the final CRLF as final argument */
2673 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2674 c->argc++;
2675 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2676 /* Process the command. If the client is still valid after
2677 * the processing and there is more data in the buffer
2678 * try to parse it. */
2679 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2680 return;
2681 }
2682 }
2683 }
2684
2685 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2686 redisClient *c = (redisClient*) privdata;
2687 char buf[REDIS_IOBUF_LEN];
2688 int nread;
2689 REDIS_NOTUSED(el);
2690 REDIS_NOTUSED(mask);
2691
2692 nread = read(fd, buf, REDIS_IOBUF_LEN);
2693 if (nread == -1) {
2694 if (errno == EAGAIN) {
2695 nread = 0;
2696 } else {
2697 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2698 freeClient(c);
2699 return;
2700 }
2701 } else if (nread == 0) {
2702 redisLog(REDIS_VERBOSE, "Client closed connection");
2703 freeClient(c);
2704 return;
2705 }
2706 if (nread) {
2707 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2708 c->lastinteraction = time(NULL);
2709 } else {
2710 return;
2711 }
2712 processInputBuffer(c);
2713 }
2714
2715 static int selectDb(redisClient *c, int id) {
2716 if (id < 0 || id >= server.dbnum)
2717 return REDIS_ERR;
2718 c->db = &server.db[id];
2719 return REDIS_OK;
2720 }
2721
2722 static void *dupClientReplyValue(void *o) {
2723 incrRefCount((robj*)o);
2724 return o;
2725 }
2726
2727 static int listMatchObjects(void *a, void *b) {
2728 return equalStringObjects(a,b);
2729 }
2730
2731 static redisClient *createClient(int fd) {
2732 redisClient *c = zmalloc(sizeof(*c));
2733
2734 anetNonBlock(NULL,fd);
2735 anetTcpNoDelay(NULL,fd);
2736 if (!c) return NULL;
2737 selectDb(c,0);
2738 c->fd = fd;
2739 c->querybuf = sdsempty();
2740 c->argc = 0;
2741 c->argv = NULL;
2742 c->bulklen = -1;
2743 c->multibulk = 0;
2744 c->mbargc = 0;
2745 c->mbargv = NULL;
2746 c->sentlen = 0;
2747 c->flags = 0;
2748 c->lastinteraction = time(NULL);
2749 c->authenticated = 0;
2750 c->replstate = REDIS_REPL_NONE;
2751 c->reply = listCreate();
2752 listSetFreeMethod(c->reply,decrRefCount);
2753 listSetDupMethod(c->reply,dupClientReplyValue);
2754 c->blocking_keys = NULL;
2755 c->blocking_keys_num = 0;
2756 c->io_keys = listCreate();
2757 c->watched_keys = listCreate();
2758 listSetFreeMethod(c->io_keys,decrRefCount);
2759 c->pubsub_channels = dictCreate(&setDictType,NULL);
2760 c->pubsub_patterns = listCreate();
2761 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2762 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2763 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2764 readQueryFromClient, c) == AE_ERR) {
2765 freeClient(c);
2766 return NULL;
2767 }
2768 listAddNodeTail(server.clients,c);
2769 initClientMultiState(c);
2770 return c;
2771 }
2772
2773 static void addReply(redisClient *c, robj *obj) {
2774 if (listLength(c->reply) == 0 &&
2775 (c->replstate == REDIS_REPL_NONE ||
2776 c->replstate == REDIS_REPL_ONLINE) &&
2777 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2778 sendReplyToClient, c) == AE_ERR) return;
2779
2780 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2781 obj = dupStringObject(obj);
2782 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2783 }
2784 listAddNodeTail(c->reply,getDecodedObject(obj));
2785 }
2786
2787 static void addReplySds(redisClient *c, sds s) {
2788 robj *o = createObject(REDIS_STRING,s);
2789 addReply(c,o);
2790 decrRefCount(o);
2791 }
2792
2793 static void addReplyDouble(redisClient *c, double d) {
2794 char buf[128];
2795
2796 snprintf(buf,sizeof(buf),"%.17g",d);
2797 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2798 (unsigned long) strlen(buf),buf));
2799 }
2800
2801 static void addReplyLongLong(redisClient *c, long long ll) {
2802 char buf[128];
2803 size_t len;
2804
2805 if (ll == 0) {
2806 addReply(c,shared.czero);
2807 return;
2808 } else if (ll == 1) {
2809 addReply(c,shared.cone);
2810 return;
2811 }
2812 buf[0] = ':';
2813 len = ll2string(buf+1,sizeof(buf)-1,ll);
2814 buf[len+1] = '\r';
2815 buf[len+2] = '\n';
2816 addReplySds(c,sdsnewlen(buf,len+3));
2817 }
2818
2819 static void addReplyUlong(redisClient *c, unsigned long ul) {
2820 char buf[128];
2821 size_t len;
2822
2823 if (ul == 0) {
2824 addReply(c,shared.czero);
2825 return;
2826 } else if (ul == 1) {
2827 addReply(c,shared.cone);
2828 return;
2829 }
2830 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2831 addReplySds(c,sdsnewlen(buf,len));
2832 }
2833
2834 static void addReplyBulkLen(redisClient *c, robj *obj) {
2835 size_t len, intlen;
2836 char buf[128];
2837
2838 if (obj->encoding == REDIS_ENCODING_RAW) {
2839 len = sdslen(obj->ptr);
2840 } else {
2841 long n = (long)obj->ptr;
2842
2843 /* Compute how many bytes will take this integer as a radix 10 string */
2844 len = 1;
2845 if (n < 0) {
2846 len++;
2847 n = -n;
2848 }
2849 while((n = n/10) != 0) {
2850 len++;
2851 }
2852 }
2853 buf[0] = '$';
2854 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2855 buf[intlen+1] = '\r';
2856 buf[intlen+2] = '\n';
2857 addReplySds(c,sdsnewlen(buf,intlen+3));
2858 }
2859
2860 static void addReplyBulk(redisClient *c, robj *obj) {
2861 addReplyBulkLen(c,obj);
2862 addReply(c,obj);
2863 addReply(c,shared.crlf);
2864 }
2865
2866 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2867 static void addReplyBulkCString(redisClient *c, char *s) {
2868 if (s == NULL) {
2869 addReply(c,shared.nullbulk);
2870 } else {
2871 robj *o = createStringObject(s,strlen(s));
2872 addReplyBulk(c,o);
2873 decrRefCount(o);
2874 }
2875 }
2876
2877 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2878 int cport, cfd;
2879 char cip[128];
2880 redisClient *c;
2881 REDIS_NOTUSED(el);
2882 REDIS_NOTUSED(mask);
2883 REDIS_NOTUSED(privdata);
2884
2885 cfd = anetAccept(server.neterr, fd, cip, &cport);
2886 if (cfd == AE_ERR) {
2887 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2888 return;
2889 }
2890 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2891 if ((c = createClient(cfd)) == NULL) {
2892 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2893 close(cfd); /* May be already closed, just ingore errors */
2894 return;
2895 }
2896 /* If maxclient directive is set and this is one client more... close the
2897 * connection. Note that we create the client instead to check before
2898 * for this condition, since now the socket is already set in nonblocking
2899 * mode and we can send an error for free using the Kernel I/O */
2900 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2901 char *err = "-ERR max number of clients reached\r\n";
2902
2903 /* That's a best effort error message, don't check write errors */
2904 if (write(c->fd,err,strlen(err)) == -1) {
2905 /* Nothing to do, Just to avoid the warning... */
2906 }
2907 freeClient(c);
2908 return;
2909 }
2910 server.stat_numconnections++;
2911 }
2912
2913 /* ======================= Redis objects implementation ===================== */
2914
2915 static robj *createObject(int type, void *ptr) {
2916 robj *o;
2917
2918 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2919 if (listLength(server.objfreelist)) {
2920 listNode *head = listFirst(server.objfreelist);
2921 o = listNodeValue(head);
2922 listDelNode(server.objfreelist,head);
2923 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2924 } else {
2925 if (server.vm_enabled) {
2926 pthread_mutex_unlock(&server.obj_freelist_mutex);
2927 o = zmalloc(sizeof(*o));
2928 } else {
2929 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2930 }
2931 }
2932 o->type = type;
2933 o->encoding = REDIS_ENCODING_RAW;
2934 o->ptr = ptr;
2935 o->refcount = 1;
2936 if (server.vm_enabled) {
2937 /* Note that this code may run in the context of an I/O thread
2938 * and accessing to server.unixtime in theory is an error
2939 * (no locks). But in practice this is safe, and even if we read
2940 * garbage Redis will not fail, as it's just a statistical info */
2941 o->vm.atime = server.unixtime;
2942 o->storage = REDIS_VM_MEMORY;
2943 }
2944 return o;
2945 }
2946
2947 static robj *createStringObject(char *ptr, size_t len) {
2948 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2949 }
2950
2951 static robj *createStringObjectFromLongLong(long long value) {
2952 robj *o;
2953 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2954 incrRefCount(shared.integers[value]);
2955 o = shared.integers[value];
2956 } else {
2957 if (value >= LONG_MIN && value <= LONG_MAX) {
2958 o = createObject(REDIS_STRING, NULL);
2959 o->encoding = REDIS_ENCODING_INT;
2960 o->ptr = (void*)((long)value);
2961 } else {
2962 o = createObject(REDIS_STRING,sdsfromlonglong(value));
2963 }
2964 }
2965 return o;
2966 }
2967
2968 static robj *dupStringObject(robj *o) {
2969 assert(o->encoding == REDIS_ENCODING_RAW);
2970 return createStringObject(o->ptr,sdslen(o->ptr));
2971 }
2972
2973 static robj *createListObject(void) {
2974 list *l = listCreate();
2975
2976 listSetFreeMethod(l,decrRefCount);
2977 return createObject(REDIS_LIST,l);
2978 }
2979
2980 static robj *createSetObject(void) {
2981 dict *d = dictCreate(&setDictType,NULL);
2982 return createObject(REDIS_SET,d);
2983 }
2984
2985 static robj *createHashObject(void) {
2986 /* All the Hashes start as zipmaps. Will be automatically converted
2987 * into hash tables if there are enough elements or big elements
2988 * inside. */
2989 unsigned char *zm = zipmapNew();
2990 robj *o = createObject(REDIS_HASH,zm);
2991 o->encoding = REDIS_ENCODING_ZIPMAP;
2992 return o;
2993 }
2994
2995 static robj *createZsetObject(void) {
2996 zset *zs = zmalloc(sizeof(*zs));
2997
2998 zs->dict = dictCreate(&zsetDictType,NULL);
2999 zs->zsl = zslCreate();
3000 return createObject(REDIS_ZSET,zs);
3001 }
3002
3003 static void freeStringObject(robj *o) {
3004 if (o->encoding == REDIS_ENCODING_RAW) {
3005 sdsfree(o->ptr);
3006 }
3007 }
3008
3009 static void freeListObject(robj *o) {
3010 listRelease((list*) o->ptr);
3011 }
3012
3013 static void freeSetObject(robj *o) {
3014 dictRelease((dict*) o->ptr);
3015 }
3016
3017 static void freeZsetObject(robj *o) {
3018 zset *zs = o->ptr;
3019
3020 dictRelease(zs->dict);
3021 zslFree(zs->zsl);
3022 zfree(zs);
3023 }
3024
3025 static void freeHashObject(robj *o) {
3026 switch (o->encoding) {
3027 case REDIS_ENCODING_HT:
3028 dictRelease((dict*) o->ptr);
3029 break;
3030 case REDIS_ENCODING_ZIPMAP:
3031 zfree(o->ptr);
3032 break;
3033 default:
3034 redisPanic("Unknown hash encoding type");
3035 break;
3036 }
3037 }
3038
3039 static void incrRefCount(robj *o) {
3040 o->refcount++;
3041 }
3042
3043 static void decrRefCount(void *obj) {
3044 robj *o = obj;
3045
3046 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3047 /* Object is a key of a swapped out value, or in the process of being
3048 * loaded. */
3049 if (server.vm_enabled &&
3050 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3051 {
3052 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
3053 redisAssert(o->type == REDIS_STRING);
3054 freeStringObject(o);
3055 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
3056 pthread_mutex_lock(&server.obj_freelist_mutex);
3057 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3058 !listAddNodeHead(server.objfreelist,o))
3059 zfree(o);
3060 pthread_mutex_unlock(&server.obj_freelist_mutex);
3061 server.vm_stats_swapped_objects--;
3062 return;
3063 }
3064 /* Object is in memory, or in the process of being swapped out. */
3065 if (--(o->refcount) == 0) {
3066 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3067 vmCancelThreadedIOJob(obj);
3068 switch(o->type) {
3069 case REDIS_STRING: freeStringObject(o); break;
3070 case REDIS_LIST: freeListObject(o); break;
3071 case REDIS_SET: freeSetObject(o); break;
3072 case REDIS_ZSET: freeZsetObject(o); break;
3073 case REDIS_HASH: freeHashObject(o); break;
3074 default: redisPanic("Unknown object type"); break;
3075 }
3076 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3077 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3078 !listAddNodeHead(server.objfreelist,o))
3079 zfree(o);
3080 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3081 }
3082 }
3083
3084 static robj *lookupKey(redisDb *db, robj *key) {
3085 dictEntry *de = dictFind(db->dict,key);
3086 if (de) {
3087 robj *key = dictGetEntryKey(de);
3088 robj *val = dictGetEntryVal(de);
3089
3090 if (server.vm_enabled) {
3091 if (key->storage == REDIS_VM_MEMORY ||
3092 key->storage == REDIS_VM_SWAPPING)
3093 {
3094 /* If we were swapping the object out, stop it, this key
3095 * was requested. */
3096 if (key->storage == REDIS_VM_SWAPPING)
3097 vmCancelThreadedIOJob(key);
3098 /* Update the access time of the key for the aging algorithm. */
3099 key->vm.atime = server.unixtime;
3100 } else {
3101 int notify = (key->storage == REDIS_VM_LOADING);
3102
3103 /* Our value was swapped on disk. Bring it at home. */
3104 redisAssert(val == NULL);
3105 val = vmLoadObject(key);
3106 dictGetEntryVal(de) = val;
3107
3108 /* Clients blocked by the VM subsystem may be waiting for
3109 * this key... */
3110 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3111 }
3112 }
3113 return val;
3114 } else {
3115 return NULL;
3116 }
3117 }
3118
3119 static robj *lookupKeyRead(redisDb *db, robj *key) {
3120 expireIfNeeded(db,key);
3121 return lookupKey(db,key);
3122 }
3123
3124 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3125 deleteIfVolatile(db,key);
3126 touchWatchedKey(db,key);
3127 return lookupKey(db,key);
3128 }
3129
3130 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3131 robj *o = lookupKeyRead(c->db, key);
3132 if (!o) addReply(c,reply);
3133 return o;
3134 }
3135
3136 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3137 robj *o = lookupKeyWrite(c->db, key);
3138 if (!o) addReply(c,reply);
3139 return o;
3140 }
3141
3142 static int checkType(redisClient *c, robj *o, int type) {
3143 if (o->type != type) {
3144 addReply(c,shared.wrongtypeerr);
3145 return 1;
3146 }
3147 return 0;
3148 }
3149
3150 static int deleteKey(redisDb *db, robj *key) {
3151 int retval;
3152
3153 /* We need to protect key from destruction: after the first dictDelete()
3154 * it may happen that 'key' is no longer valid if we don't increment
3155 * it's count. This may happen when we get the object reference directly
3156 * from the hash table with dictRandomKey() or dict iterators */
3157 incrRefCount(key);
3158 if (dictSize(db->expires)) dictDelete(db->expires,key);
3159 retval = dictDelete(db->dict,key);
3160 decrRefCount(key);
3161
3162 return retval == DICT_OK;
3163 }
3164
3165 /* Check if the nul-terminated string 's' can be represented by a long
3166 * (that is, is a number that fits into long without any other space or
3167 * character before or after the digits).
3168 *
3169 * If so, the function returns REDIS_OK and *longval is set to the value
3170 * of the number. Otherwise REDIS_ERR is returned */
3171 static int isStringRepresentableAsLong(sds s, long *longval) {
3172 char buf[32], *endptr;
3173 long value;
3174 int slen;
3175
3176 value = strtol(s, &endptr, 10);
3177 if (endptr[0] != '\0') return REDIS_ERR;
3178 slen = ll2string(buf,32,value);
3179
3180 /* If the number converted back into a string is not identical
3181 * then it's not possible to encode the string as integer */
3182 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3183 if (longval) *longval = value;
3184 return REDIS_OK;
3185 }
3186
3187 /* Try to encode a string object in order to save space */
3188 static robj *tryObjectEncoding(robj *o) {
3189 long value;
3190 sds s = o->ptr;
3191
3192 if (o->encoding != REDIS_ENCODING_RAW)
3193 return o; /* Already encoded */
3194
3195 /* It's not safe to encode shared objects: shared objects can be shared
3196 * everywhere in the "object space" of Redis. Encoded objects can only
3197 * appear as "values" (and not, for instance, as keys) */
3198 if (o->refcount > 1) return o;
3199
3200 /* Currently we try to encode only strings */
3201 redisAssert(o->type == REDIS_STRING);
3202
3203 /* Check if we can represent this string as a long integer */
3204 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3205
3206 /* Ok, this object can be encoded */
3207 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3208 decrRefCount(o);
3209 incrRefCount(shared.integers[value]);
3210 return shared.integers[value];
3211 } else {
3212 o->encoding = REDIS_ENCODING_INT;
3213 sdsfree(o->ptr);
3214 o->ptr = (void*) value;
3215 return o;
3216 }
3217 }
3218
3219 /* Get a decoded version of an encoded object (returned as a new object).
3220 * If the object is already raw-encoded just increment the ref count. */
3221 static robj *getDecodedObject(robj *o) {
3222 robj *dec;
3223
3224 if (o->encoding == REDIS_ENCODING_RAW) {
3225 incrRefCount(o);
3226 return o;
3227 }
3228 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3229 char buf[32];
3230
3231 ll2string(buf,32,(long)o->ptr);
3232 dec = createStringObject(buf,strlen(buf));
3233 return dec;
3234 } else {
3235 redisPanic("Unknown encoding type");
3236 }
3237 }
3238
3239 /* Compare two string objects via strcmp() or alike.
3240 * Note that the objects may be integer-encoded. In such a case we
3241 * use ll2string() to get a string representation of the numbers on the stack
3242 * and compare the strings, it's much faster than calling getDecodedObject().
3243 *
3244 * Important note: if objects are not integer encoded, but binary-safe strings,
3245 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3246 * binary safe. */
3247 static int compareStringObjects(robj *a, robj *b) {
3248 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3249 char bufa[128], bufb[128], *astr, *bstr;
3250 int bothsds = 1;
3251
3252 if (a == b) return 0;
3253 if (a->encoding != REDIS_ENCODING_RAW) {
3254 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3255 astr = bufa;
3256 bothsds = 0;
3257 } else {
3258 astr = a->ptr;
3259 }
3260 if (b->encoding != REDIS_ENCODING_RAW) {
3261 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3262 bstr = bufb;
3263 bothsds = 0;
3264 } else {
3265 bstr = b->ptr;
3266 }
3267 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3268 }
3269
3270 /* Equal string objects return 1 if the two objects are the same from the
3271 * point of view of a string comparison, otherwise 0 is returned. Note that
3272 * this function is faster then checking for (compareStringObject(a,b) == 0)
3273 * because it can perform some more optimization. */
3274 static int equalStringObjects(robj *a, robj *b) {
3275 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3276 return a->ptr == b->ptr;
3277 } else {
3278 return compareStringObjects(a,b) == 0;
3279 }
3280 }
3281
3282 static size_t stringObjectLen(robj *o) {
3283 redisAssert(o->type == REDIS_STRING);
3284 if (o->encoding == REDIS_ENCODING_RAW) {
3285 return sdslen(o->ptr);
3286 } else {
3287 char buf[32];
3288
3289 return ll2string(buf,32,(long)o->ptr);
3290 }
3291 }
3292
3293 static int getDoubleFromObject(robj *o, double *target) {
3294 double value;
3295 char *eptr;
3296
3297 if (o == NULL) {
3298 value = 0;
3299 } else {
3300 redisAssert(o->type == REDIS_STRING);
3301 if (o->encoding == REDIS_ENCODING_RAW) {
3302 value = strtod(o->ptr, &eptr);
3303 if (eptr[0] != '\0') return REDIS_ERR;
3304 } else if (o->encoding == REDIS_ENCODING_INT) {
3305 value = (long)o->ptr;
3306 } else {
3307 redisPanic("Unknown string encoding");
3308 }
3309 }
3310
3311 *target = value;
3312 return REDIS_OK;
3313 }
3314
3315 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3316 double value;
3317 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3318 if (msg != NULL) {
3319 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3320 } else {
3321 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3322 }
3323 return REDIS_ERR;
3324 }
3325
3326 *target = value;
3327 return REDIS_OK;
3328 }
3329
3330 static int getLongLongFromObject(robj *o, long long *target) {
3331 long long value;
3332 char *eptr;
3333
3334 if (o == NULL) {
3335 value = 0;
3336 } else {
3337 redisAssert(o->type == REDIS_STRING);
3338 if (o->encoding == REDIS_ENCODING_RAW) {
3339 value = strtoll(o->ptr, &eptr, 10);
3340 if (eptr[0] != '\0') return REDIS_ERR;
3341 } else if (o->encoding == REDIS_ENCODING_INT) {
3342 value = (long)o->ptr;
3343 } else {
3344 redisPanic("Unknown string encoding");
3345 }
3346 }
3347
3348 *target = value;
3349 return REDIS_OK;
3350 }
3351
3352 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3353 long long value;
3354 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3355 if (msg != NULL) {
3356 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3357 } else {
3358 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3359 }
3360 return REDIS_ERR;
3361 }
3362
3363 *target = value;
3364 return REDIS_OK;
3365 }
3366
3367 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3368 long long value;
3369
3370 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3371 if (value < LONG_MIN || value > LONG_MAX) {
3372 if (msg != NULL) {
3373 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3374 } else {
3375 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3376 }
3377 return REDIS_ERR;
3378 }
3379
3380 *target = value;
3381 return REDIS_OK;
3382 }
3383
3384 /*============================ RDB saving/loading =========================== */
3385
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on write
 * error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3390
/* Write the time 't' to 'fp' as a 4 byte signed integer in host byte
 * order (the value is truncated to 32 bits). Returns 0 on success, -1 on
 * write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
3396
3397 /* check rdbLoadLen() comments for more info */
3398 static int rdbSaveLen(FILE *fp, uint32_t len) {
3399 unsigned char buf[2];
3400
3401 if (len < (1<<6)) {
3402 /* Save a 6 bit len */
3403 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3404 if (fwrite(buf,1,1,fp) == 0) return -1;
3405 } else if (len < (1<<14)) {
3406 /* Save a 14 bit len */
3407 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3408 buf[1] = len&0xFF;
3409 if (fwrite(buf,2,1,fp) == 0) return -1;
3410 } else {
3411 /* Save a 32 bit len */
3412 buf[0] = (REDIS_RDB_32BITLEN<<6);
3413 if (fwrite(buf,1,1,fp) == 0) return -1;
3414 len = htonl(len);
3415 if (fwrite(&len,4,1,fp) == 0) return -1;
3416 }
3417 return 0;
3418 }
3419
3420 /* Encode 'value' as an integer if possible (if integer will fit the
3421 * supported range). If the function sucessful encoded the integer
3422 * then the (up to 5 bytes) encoded representation is written in the
3423 * string pointed by 'enc' and the length is returned. Otherwise
3424 * 0 is returned. */
3425 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3426 /* Finally check if it fits in our ranges */
3427 if (value >= -(1<<7) && value <= (1<<7)-1) {
3428 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3429 enc[1] = value&0xFF;
3430 return 2;
3431 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3432 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3433 enc[1] = value&0xFF;
3434 enc[2] = (value>>8)&0xFF;
3435 return 3;
3436 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3437 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3438 enc[1] = value&0xFF;
3439 enc[2] = (value>>8)&0xFF;
3440 enc[3] = (value>>16)&0xFF;
3441 enc[4] = (value>>24)&0xFF;
3442 return 5;
3443 } else {
3444 return 0;
3445 }
3446 }
3447
/* Strings such as "2391" or "-100", made only of the canonical decimal
 * form of a number that fits an 8, 16 or 32 bit signed value, can be saved
 * as integers to save space.
 *
 * 's' must be NUL-terminated and 'len' bytes long. Returns the length of
 * the encoding written to 'enc', or 0 if the string can't be encoded. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    char canon[32], *end;
    long long parsed;

    /* The whole string must be a number... */
    parsed = strtoll(s, &end, 10);
    if (*end != '\0') return 0;

    /* ...and printing that number back must give the very same bytes,
     * otherwise information would be lost by the integer encoding. */
    ll2string(canon,32,parsed);
    if (strlen(canon) != len) return 0;
    if (memcmp(canon,s,len) != 0) return 0;

    return rdbEncodeInteger(parsed,enc);
}
3466
3467 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3468 size_t comprlen, outlen;
3469 unsigned char byte;
3470 void *out;
3471
3472 /* We require at least four bytes compression for this to be worth it */
3473 if (len <= 4) return 0;
3474 outlen = len-4;
3475 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3476 comprlen = lzf_compress(s, len, out, outlen);
3477 if (comprlen == 0) {
3478 zfree(out);
3479 return 0;
3480 }
3481 /* Data compressed! Let's save it on disk */
3482 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3483 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3484 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3485 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3486 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3487 zfree(out);
3488 return comprlen;
3489
3490 writeerr:
3491 zfree(out);
3492 return -1;
3493 }
3494
3495 /* Save a string objet as [len][data] on disk. If the object is a string
3496 * representation of an integer value we try to safe it in a special form */
3497 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3498 int enclen;
3499
3500 /* Try integer encoding */
3501 if (len <= 11) {
3502 unsigned char buf[5];
3503 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3504 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3505 return 0;
3506 }
3507 }
3508
3509 /* Try LZF compression - under 20 bytes it's unable to compress even
3510 * aaaaaaaaaaaaaaaaaa so skip it */
3511 if (server.rdbcompression && len > 20) {
3512 int retval;
3513
3514 retval = rdbSaveLzfStringObject(fp,s,len);
3515 if (retval == -1) return -1;
3516 if (retval > 0) return 0;
3517 /* retval == 0 means data can't be compressed, save the old way */
3518 }
3519
3520 /* Store verbatim */
3521 if (rdbSaveLen(fp,len) == -1) return -1;
3522 if (len && fwrite(s,len,1,fp) == 0) return -1;
3523 return 0;
3524 }
3525
3526 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3527 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3528 int retval;
3529
3530 /* Avoid to decode the object, then encode it again, if the
3531 * object is alrady integer encoded. */
3532 if (obj->encoding == REDIS_ENCODING_INT) {
3533 long val = (long) obj->ptr;
3534 unsigned char buf[5];
3535 int enclen;
3536
3537 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3538 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3539 return 0;
3540 }
3541 /* otherwise... fall throught and continue with the usual
3542 * code path. */
3543 }
3544
3545 /* Avoid incr/decr ref count business when possible.
3546 * This plays well with copy-on-write given that we are probably
3547 * in a child process (BGSAVE). Also this makes sure key objects
3548 * of swapped objects are not incRefCount-ed (an assert does not allow
3549 * this in order to avoid bugs) */
3550 if (obj->encoding != REDIS_ENCODING_RAW) {
3551 obj = getDecodedObject(obj);
3552 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3553 decrRefCount(obj);
3554 } else {
3555 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3556 }
3557 return retval;
3558 }
3559
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under these assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * the integer printing function, which is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        /* buf[0] holds the length of the textual representation */
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3602
3603 /* Save a Redis object. */
3604 static int rdbSaveObject(FILE *fp, robj *o) {
3605 if (o->type == REDIS_STRING) {
3606 /* Save a string value */
3607 if (rdbSaveStringObject(fp,o) == -1) return -1;
3608 } else if (o->type == REDIS_LIST) {
3609 /* Save a list value */
3610 list *list = o->ptr;
3611 listIter li;
3612 listNode *ln;
3613
3614 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3615 listRewind(list,&li);
3616 while((ln = listNext(&li))) {
3617 robj *eleobj = listNodeValue(ln);
3618
3619 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3620 }
3621 } else if (o->type == REDIS_SET) {
3622 /* Save a set value */
3623 dict *set = o->ptr;
3624 dictIterator *di = dictGetIterator(set);
3625 dictEntry *de;
3626
3627 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3628 while((de = dictNext(di)) != NULL) {
3629 robj *eleobj = dictGetEntryKey(de);
3630
3631 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3632 }
3633 dictReleaseIterator(di);
3634 } else if (o->type == REDIS_ZSET) {
3635 /* Save a set value */
3636 zset *zs = o->ptr;
3637 dictIterator *di = dictGetIterator(zs->dict);
3638 dictEntry *de;
3639
3640 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3641 while((de = dictNext(di)) != NULL) {
3642 robj *eleobj = dictGetEntryKey(de);
3643 double *score = dictGetEntryVal(de);
3644
3645 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3646 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3647 }
3648 dictReleaseIterator(di);
3649 } else if (o->type == REDIS_HASH) {
3650 /* Save a hash value */
3651 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3652 unsigned char *p = zipmapRewind(o->ptr);
3653 unsigned int count = zipmapLen(o->ptr);
3654 unsigned char *key, *val;
3655 unsigned int klen, vlen;
3656
3657 if (rdbSaveLen(fp,count) == -1) return -1;
3658 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3659 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3660 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3661 }
3662 } else {
3663 dictIterator *di = dictGetIterator(o->ptr);
3664 dictEntry *de;
3665
3666 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3667 while((de = dictNext(di)) != NULL) {
3668 robj *key = dictGetEntryKey(de);
3669 robj *val = dictGetEntryVal(de);
3670
3671 if (rdbSaveStringObject(fp,key) == -1) return -1;
3672 if (rdbSaveStringObject(fp,val) == -1) return -1;
3673 }
3674 dictReleaseIterator(di);
3675 }
3676 } else {
3677 redisPanic("Unknown object type");
3678 }
3679 return 0;
3680 }
3681
3682 /* Return the length the object will have on disk if saved with
3683 * the rdbSaveObject() function. Currently we use a trick to get
3684 * this length with very little changes to the code. In the future
3685 * we could switch to a faster solution. */
3686 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3687 if (fp == NULL) fp = server.devnull;
3688 rewind(fp);
3689 assert(rdbSaveObject(fp,o) != 1);
3690 return ftello(fp);
3691 }
3692
3693 /* Return the number of pages required to save this object in the swap file */
3694 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3695 off_t bytes = rdbSavedObjectLen(o,fp);
3696
3697 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3698 }
3699
3700 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3701 static int rdbSave(char *filename) {
3702 dictIterator *di = NULL;
3703 dictEntry *de;
3704 FILE *fp;
3705 char tmpfile[256];
3706 int j;
3707 time_t now = time(NULL);
3708
3709 /* Wait for I/O therads to terminate, just in case this is a
3710 * foreground-saving, to avoid seeking the swap file descriptor at the
3711 * same time. */
3712 if (server.vm_enabled)
3713 waitEmptyIOJobsQueue();
3714
3715 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3716 fp = fopen(tmpfile,"w");
3717 if (!fp) {
3718 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3719 return REDIS_ERR;
3720 }
3721 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3722 for (j = 0; j < server.dbnum; j++) {
3723 redisDb *db = server.db+j;
3724 dict *d = db->dict;
3725 if (dictSize(d) == 0) continue;
3726 di = dictGetIterator(d);
3727 if (!di) {
3728 fclose(fp);
3729 return REDIS_ERR;
3730 }
3731
3732 /* Write the SELECT DB opcode */
3733 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3734 if (rdbSaveLen(fp,j) == -1) goto werr;
3735
3736 /* Iterate this DB writing every entry */
3737 while((de = dictNext(di)) != NULL) {
3738 robj *key = dictGetEntryKey(de);
3739 robj *o = dictGetEntryVal(de);
3740 time_t expiretime = getExpire(db,key);
3741
3742 /* Save the expire time */
3743 if (expiretime != -1) {
3744 /* If this key is already expired skip it */
3745 if (expiretime < now) continue;
3746 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3747 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3748 }
3749 /* Save the key and associated value. This requires special
3750 * handling if the value is swapped out. */
3751 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3752 key->storage == REDIS_VM_SWAPPING) {
3753 /* Save type, key, value */
3754 if (rdbSaveType(fp,o->type) == -1) goto werr;
3755 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3756 if (rdbSaveObject(fp,o) == -1) goto werr;
3757 } else {
3758 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3759 robj *po;
3760 /* Get a preview of the object in memory */
3761 po = vmPreviewObject(key);
3762 /* Save type, key, value */
3763 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3764 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3765 if (rdbSaveObject(fp,po) == -1) goto werr;
3766 /* Remove the loaded object from memory */
3767 decrRefCount(po);
3768 }
3769 }
3770 dictReleaseIterator(di);
3771 }
3772 /* EOF opcode */
3773 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3774
3775 /* Make sure data will not remain on the OS's output buffers */
3776 fflush(fp);
3777 fsync(fileno(fp));
3778 fclose(fp);
3779
3780 /* Use RENAME to make sure the DB file is changed atomically only
3781 * if the generate DB file is ok. */
3782 if (rename(tmpfile,filename) == -1) {
3783 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3784 unlink(tmpfile);
3785 return REDIS_ERR;
3786 }
3787 redisLog(REDIS_NOTICE,"DB saved on disk");
3788 server.dirty = 0;
3789 server.lastsave = time(NULL);
3790 return REDIS_OK;
3791
3792 werr:
3793 fclose(fp);
3794 unlink(tmpfile);
3795 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3796 if (di) dictReleaseIterator(di);
3797 return REDIS_ERR;
3798 }
3799
3800 static int rdbSaveBackground(char *filename) {
3801 pid_t childpid;
3802
3803 if (server.bgsavechildpid != -1) return REDIS_ERR;
3804 if (server.vm_enabled) waitEmptyIOJobsQueue();
3805 if ((childpid = fork()) == 0) {
3806 /* Child */
3807 if (server.vm_enabled) vmReopenSwapFile();
3808 close(server.fd);
3809 if (rdbSave(filename) == REDIS_OK) {
3810 _exit(0);
3811 } else {
3812 _exit(1);
3813 }
3814 } else {
3815 /* Parent */
3816 if (childpid == -1) {
3817 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3818 strerror(errno));
3819 return REDIS_ERR;
3820 }
3821 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3822 server.bgsavechildpid = childpid;
3823 updateDictResizePolicy();
3824 return REDIS_OK;
3825 }
3826 return REDIS_OK; /* unreached */
3827 }
3828
/* Remove the temporary dump file "temp-<pid>.rdb" belonging to the given
 * child pid. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3835
/* Read the next one-byte type/opcode from 'fp'.
 * Returns the byte value (0-255), or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 0) ? -1 : opcode;
}
3841
/* Read a 4 byte time value, stored as a native-endian int32_t, from 'fp'.
 * Returns it as time_t, or -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t) raw;
}
3847
3848 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3849 * of this file for a description of how this are stored on disk.
3850 *
3851 * isencoded is set to 1 if the readed length is not actually a length but
3852 * an "encoding type", check the above comments for more info */
3853 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3854 unsigned char buf[2];
3855 uint32_t len;
3856 int type;
3857
3858 if (isencoded) *isencoded = 0;
3859 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3860 type = (buf[0]&0xC0)>>6;
3861 if (type == REDIS_RDB_6BITLEN) {
3862 /* Read a 6 bit len */
3863 return buf[0]&0x3F;
3864 } else if (type == REDIS_RDB_ENCVAL) {
3865 /* Read a 6 bit len encoding type */
3866 if (isencoded) *isencoded = 1;
3867 return buf[0]&0x3F;
3868 } else if (type == REDIS_RDB_14BITLEN) {
3869 /* Read a 14 bit len */
3870 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3871 return ((buf[0]&0x3F)<<8)|buf[1];
3872 } else {
3873 /* Read a 32 bit len */
3874 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3875 return ntohl(len);
3876 }
3877 }
3878
3879 /* Load an integer-encoded object from file 'fp', with the specified
3880 * encoding type 'enctype'. If encode is true the function may return
3881 * an integer-encoded object as reply, otherwise the returned object
3882 * will always be encoded as a raw string. */
3883 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
3884 unsigned char enc[4];
3885 long long val;
3886
3887 if (enctype == REDIS_RDB_ENC_INT8) {
3888 if (fread(enc,1,1,fp) == 0) return NULL;
3889 val = (signed char)enc[0];
3890 } else if (enctype == REDIS_RDB_ENC_INT16) {
3891 uint16_t v;
3892 if (fread(enc,2,1,fp) == 0) return NULL;
3893 v = enc[0]|(enc[1]<<8);
3894 val = (int16_t)v;
3895 } else if (enctype == REDIS_RDB_ENC_INT32) {
3896 uint32_t v;
3897 if (fread(enc,4,1,fp) == 0) return NULL;
3898 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3899 val = (int32_t)v;
3900 } else {
3901 val = 0; /* anti-warning */
3902 redisPanic("Unknown RDB integer encoding type");
3903 }
3904 if (encode)
3905 return createStringObjectFromLongLong(val);
3906 else
3907 return createObject(REDIS_STRING,sdsfromlonglong(val));
3908 }
3909
3910 static robj *rdbLoadLzfStringObject(FILE*fp) {
3911 unsigned int len, clen;
3912 unsigned char *c = NULL;
3913 sds val = NULL;
3914
3915 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3916 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3917 if ((c = zmalloc(clen)) == NULL) goto err;
3918 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3919 if (fread(c,clen,1,fp) == 0) goto err;
3920 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3921 zfree(c);
3922 return createObject(REDIS_STRING,val);
3923 err:
3924 zfree(c);
3925 sdsfree(val);
3926 return NULL;
3927 }
3928
3929 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
3930 int isencoded;
3931 uint32_t len;
3932 sds val;
3933
3934 len = rdbLoadLen(fp,&isencoded);
3935 if (isencoded) {
3936 switch(len) {
3937 case REDIS_RDB_ENC_INT8:
3938 case REDIS_RDB_ENC_INT16:
3939 case REDIS_RDB_ENC_INT32:
3940 return rdbLoadIntegerObject(fp,len,encode);
3941 case REDIS_RDB_ENC_LZF:
3942 return rdbLoadLzfStringObject(fp);
3943 default:
3944 redisPanic("Unknown RDB encoding type");
3945 }
3946 }
3947
3948 if (len == REDIS_RDB_LENERR) return NULL;
3949 val = sdsnewlen(NULL,len);
3950 if (len && fread(val,len,1,fp) == 0) {
3951 sdsfree(val);
3952 return NULL;
3953 }
3954 return createObject(REDIS_STRING,val);
3955 }
3956
3957 static robj *rdbLoadStringObject(FILE *fp) {
3958 return rdbGenericLoadStringObject(fp,0);
3959 }
3960
3961 static robj *rdbLoadEncodedStringObject(FILE *fp) {
3962 return rdbGenericLoadStringObject(fp,1);
3963 }
3964
3965 /* For information about double serialization check rdbSaveDoubleValue() */
3966 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3967 char buf[128];
3968 unsigned char len;
3969
3970 if (fread(&len,1,1,fp) == 0) return -1;
3971 switch(len) {
3972 case 255: *val = R_NegInf; return 0;
3973 case 254: *val = R_PosInf; return 0;
3974 case 253: *val = R_Nan; return 0;
3975 default:
3976 if (fread(buf,len,1,fp) == 0) return -1;
3977 buf[len] = '\0';
3978 sscanf(buf, "%lg", val);
3979 return 0;
3980 }
3981 }
3982
3983 /* Load a Redis object of the specified type from the specified file.
3984 * On success a newly allocated object is returned, otherwise NULL. */
3985 static robj *rdbLoadObject(int type, FILE *fp) {
3986 robj *o;
3987
3988 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3989 if (type == REDIS_STRING) {
3990 /* Read string value */
3991 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
3992 o = tryObjectEncoding(o);
3993 } else if (type == REDIS_LIST || type == REDIS_SET) {
3994 /* Read list/set value */
3995 uint32_t listlen;
3996
3997 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3998 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3999 /* It's faster to expand the dict to the right size asap in order
4000 * to avoid rehashing */
4001 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4002 dictExpand(o->ptr,listlen);
4003 /* Load every single element of the list/set */
4004 while(listlen--) {
4005 robj *ele;
4006
4007 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4008 ele = tryObjectEncoding(ele);
4009 if (type == REDIS_LIST) {
4010 listAddNodeTail((list*)o->ptr,ele);
4011 } else {
4012 dictAdd((dict*)o->ptr,ele,NULL);
4013 }
4014 }
4015 } else if (type == REDIS_ZSET) {
4016 /* Read list/set value */
4017 size_t zsetlen;
4018 zset *zs;
4019
4020 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4021 o = createZsetObject();
4022 zs = o->ptr;
4023 /* Load every single element of the list/set */
4024 while(zsetlen--) {
4025 robj *ele;
4026 double *score = zmalloc(sizeof(double));
4027
4028 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4029 ele = tryObjectEncoding(ele);
4030 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4031 dictAdd(zs->dict,ele,score);
4032 zslInsert(zs->zsl,*score,ele);
4033 incrRefCount(ele); /* added to skiplist */
4034 }
4035 } else if (type == REDIS_HASH) {
4036 size_t hashlen;
4037
4038 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4039 o = createHashObject();
4040 /* Too many entries? Use an hash table. */
4041 if (hashlen > server.hash_max_zipmap_entries)
4042 convertToRealHash(o);
4043 /* Load every key/value, then set it into the zipmap or hash
4044 * table, as needed. */
4045 while(hashlen--) {
4046 robj *key, *val;
4047
4048 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4049 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4050 /* If we are using a zipmap and there are too big values
4051 * the object is converted to real hash table encoding. */
4052 if (o->encoding != REDIS_ENCODING_HT &&
4053 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4054 sdslen(val->ptr) > server.hash_max_zipmap_value))
4055 {
4056 convertToRealHash(o);
4057 }
4058
4059 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4060 unsigned char *zm = o->ptr;
4061
4062 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4063 val->ptr,sdslen(val->ptr),NULL);
4064 o->ptr = zm;
4065 decrRefCount(key);
4066 decrRefCount(val);
4067 } else {
4068 key = tryObjectEncoding(key);
4069 val = tryObjectEncoding(val);
4070 dictAdd((dict*)o->ptr,key,val);
4071 }
4072 }
4073 } else {
4074 redisPanic("Unknown object type");
4075 }
4076 return o;
4077 }
4078
/* Load the dump file 'filename' into the server databases.
 *
 * Returns REDIS_ERR if the file cannot be opened, does not start with the
 * "REDIS" signature, or has a format version other than 1. Returns REDIS_OK
 * once the EOF opcode is reached. A short read, a SELECT DB index outside
 * server.dbnum, or a duplicated key terminates the process with exit(1). */
static int rdbLoad(char *filename) {
    FILE *fp;
    uint32_t dbid;
    int type, retval, rdbver;
    int swap_all_values = 0;
    /* Keys are loaded into DB 0 until a SELECT DB opcode says otherwise */
    dict *d = server.db[0].dict;
    redisDb *db = server.db+0;
    char buf[1024];
    time_t expiretime, now = time(NULL);
    long long loadedkeys = 0;

    fp = fopen(filename,"r");
    if (!fp) return REDIS_ERR;
    /* Header: 5 bytes of signature followed by 4 digits of version */
    if (fread(buf,9,1,fp) == 0) goto eoferr;
    buf[9] = '\0';
    if (memcmp(buf,"REDIS",5) != 0) {
        fclose(fp);
        redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
        return REDIS_ERR;
    }
    rdbver = atoi(buf+5);
    if (rdbver != 1) {
        fclose(fp);
        redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
        return REDIS_ERR;
    }
    while(1) {
        robj *key, *val;

        /* -1 means "no expire read for this key" */
        expiretime = -1;
        /* Read type. */
        if ((type = rdbLoadType(fp)) == -1) goto eoferr;
        if (type == REDIS_EXPIRETIME) {
            if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
            /* We read the time so we need to read the object type again */
            if ((type = rdbLoadType(fp)) == -1) goto eoferr;
        }
        if (type == REDIS_EOF) break;
        /* Handle SELECT DB opcode as a special case */
        if (type == REDIS_SELECTDB) {
            if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
                goto eoferr;
            if (dbid >= (unsigned)server.dbnum) {
                redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
                exit(1);
            }
            db = server.db+dbid;
            d = db->dict;
            continue;
        }
        /* Read key */
        if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
        /* Read value */
        if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
        /* Check if the key already expired: if so it was read only to
         * advance in the file, and is discarded. */
        if (expiretime != -1 && expiretime < now) {
            decrRefCount(key);
            decrRefCount(val);
            continue;
        }
        /* Add the new object in the hash table */
        retval = dictAdd(d,key,val);
        if (retval == DICT_ERR) {
            redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
            exit(1);
        }
        loadedkeys++;
        /* Set the expire time if needed */
        if (expiretime != -1) setExpire(db,key,expiretime);

        /* Handle swapping while loading big datasets when VM is on */

        /* If we detected we are hopeless about fitting something in memory
         * we just swap every new key on disk. Directly...
         * Note that it's important to check for this condition before
         * resorting to random sampling, otherwise we may try to swap already
         * swapped keys. */
        if (swap_all_values) {
            dictEntry *de = dictFind(d,key);

            /* de may be NULL since the key already expired */
            if (de) {
                key = dictGetEntryKey(de);
                val = dictGetEntryVal(de);

                /* On success the entry's value pointer is cleared */
                if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
                    dictGetEntryVal(de) = NULL;
                }
            }
            continue;
        }

        /* If we have still some hope of having some value fitting memory
         * then we try random sampling. This runs once every 5000 loaded
         * keys, and only when VM is enabled. */
        if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
            while (zmalloc_used_memory() > server.vm_max_memory) {
                if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
            }
            if (zmalloc_used_memory() > server.vm_max_memory)
                swap_all_values = 1; /* We are already using too much mem */
        }
    }
    fclose(fp);
    return REDIS_OK;

eoferr: /* unexpected end of file is handled here with a fatal exit */
    redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
    exit(1);
    return REDIS_ERR; /* Just to avoid warning */
}
4189
4190 /*================================== Shutdown =============================== */
4191 static int prepareForShutdown() {
4192 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4193 /* Kill the saving child if there is a background saving in progress.
4194 We want to avoid race conditions, for instance our saving child may
4195 overwrite the synchronous saving did by SHUTDOWN. */
4196 if (server.bgsavechildpid != -1) {
4197 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4198 kill(server.bgsavechildpid,SIGKILL);
4199 rdbRemoveTempFile(server.bgsavechildpid);
4200 }
4201 if (server.appendonly) {
4202 /* Append only file: fsync() the AOF and exit */
4203 aof_fsync(server.appendfd);
4204 if (server.vm_enabled) unlink(server.vm_swap_file);
4205 } else {
4206 /* Snapshotting. Perform a SYNC SAVE and exit */
4207 if (rdbSave(server.dbfilename) == REDIS_OK) {
4208 if (server.daemonize)
4209 unlink(server.pidfile);
4210 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4211 } else {
4212 /* Ooops.. error saving! The best we can do is to continue
4213 * operating. Note that if there was a background saving process,
4214 * in the next cron() Redis will be notified that the background
4215 * saving aborted, handling special stuff like slaves pending for
4216 * synchronization... */
4217 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4218 return REDIS_ERR;
4219 }
4220 }
4221 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4222 return REDIS_OK;
4223 }
4224
4225 /*================================== Commands =============================== */
4226
4227 static void authCommand(redisClient *c) {
4228 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4229 c->authenticated = 1;
4230 addReply(c,shared.ok);
4231 } else {
4232 c->authenticated = 0;
4233 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4234 }
4235 }
4236
4237 static void pingCommand(redisClient *c) {
4238 addReply(c,shared.pong);
4239 }
4240
4241 static void echoCommand(redisClient *c) {
4242 addReplyBulk(c,c->argv[1]);
4243 }
4244
4245 /*=================================== Strings =============================== */
4246
4247 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4248 int retval;
4249 long seconds = 0; /* initialized to avoid an harmness warning */
4250
4251 if (expire) {
4252 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4253 return;
4254 if (seconds <= 0) {
4255 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4256 return;
4257 }
4258 }
4259
4260 touchWatchedKey(c->db,key);
4261 if (nx) deleteIfVolatile(c->db,key);
4262 retval = dictAdd(c->db->dict,key,val);
4263 if (retval == DICT_ERR) {
4264 if (!nx) {
4265 /* If the key is about a swapped value, we want a new key object
4266 * to overwrite the old. So we delete the old key in the database.
4267 * This will also make sure that swap pages about the old object
4268 * will be marked as free. */
4269 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4270 incrRefCount(key);
4271 dictReplace(c->db->dict,key,val);
4272 incrRefCount(val);
4273 } else {
4274 addReply(c,shared.czero);
4275 return;
4276 }
4277 } else {
4278 incrRefCount(key);
4279 incrRefCount(val);
4280 }
4281 server.dirty++;
4282 removeExpire(c->db,key);
4283 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4284 addReply(c, nx ? shared.cone : shared.ok);
4285 }
4286
4287 static void setCommand(redisClient *c) {
4288 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4289 }
4290
4291 static void setnxCommand(redisClient *c) {
4292 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4293 }
4294
4295 static void setexCommand(redisClient *c) {
4296 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4297 }
4298
4299 static int getGenericCommand(redisClient *c) {
4300 robj *o;
4301
4302 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4303 return REDIS_OK;
4304
4305 if (o->type != REDIS_STRING) {
4306 addReply(c,shared.wrongtypeerr);
4307 return REDIS_ERR;
4308 } else {
4309 addReplyBulk(c,o);
4310 return REDIS_OK;
4311 }
4312 }
4313
4314 static void getCommand(redisClient *c) {
4315 getGenericCommand(c);
4316 }
4317
4318 static void getsetCommand(redisClient *c) {
4319 if (getGenericCommand(c) == REDIS_ERR) return;
4320 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4321 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4322 } else {
4323 incrRefCount(c->argv[1]);
4324 }
4325 incrRefCount(c->argv[2]);
4326 server.dirty++;
4327 removeExpire(c->db,c->argv[1]);
4328 }
4329
4330 static void mgetCommand(redisClient *c) {
4331 int j;
4332
4333 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4334 for (j = 1; j < c->argc; j++) {
4335 robj *o = lookupKeyRead(c->db,c->argv[j]);
4336 if (o == NULL) {
4337 addReply(c,shared.nullbulk);
4338 } else {
4339 if (o->type != REDIS_STRING) {
4340 addReply(c,shared.nullbulk);
4341 } else {
4342 addReplyBulk(c,o);
4343 }
4344 }
4345 }
4346 }
4347
4348 static void msetGenericCommand(redisClient *c, int nx) {
4349 int j, busykeys = 0;
4350
4351 if ((c->argc % 2) == 0) {
4352 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4353 return;
4354 }
4355 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4356 * set nothing at all if at least one already key exists. */
4357 if (nx) {
4358 for (j = 1; j < c->argc; j += 2) {
4359 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4360 busykeys++;
4361 }
4362 }
4363 }
4364 if (busykeys) {
4365 addReply(c, shared.czero);
4366 return;
4367 }
4368
4369 for (j = 1; j < c->argc; j += 2) {
4370 int retval;
4371
4372 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4373 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4374 if (retval == DICT_ERR) {
4375 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4376 incrRefCount(c->argv[j+1]);
4377 } else {
4378 incrRefCount(c->argv[j]);
4379 incrRefCount(c->argv[j+1]);
4380 }
4381 removeExpire(c->db,c->argv[j]);
4382 }
4383 server.dirty += (c->argc-1)/2;
4384 addReply(c, nx ? shared.cone : shared.ok);
4385 }
4386
4387 static void msetCommand(redisClient *c) {
4388 msetGenericCommand(c,0);
4389 }
4390
4391 static void msetnxCommand(redisClient *c) {
4392 msetGenericCommand(c,1);
4393 }
4394
4395 static void incrDecrCommand(redisClient *c, long long incr) {
4396 long long value;
4397 int retval;
4398 robj *o;
4399
4400 o = lookupKeyWrite(c->db,c->argv[1]);
4401 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4402 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4403
4404 value += incr;
4405 o = createStringObjectFromLongLong(value);
4406 retval = dictAdd(c->db->dict,c->argv[1],o);
4407 if (retval == DICT_ERR) {
4408 dictReplace(c->db->dict,c->argv[1],o);
4409 removeExpire(c->db,c->argv[1]);
4410 } else {
4411 incrRefCount(c->argv[1]);
4412 }
4413 server.dirty++;
4414 addReply(c,shared.colon);
4415 addReply(c,o);
4416 addReply(c,shared.crlf);
4417 }
4418
4419 static void incrCommand(redisClient *c) {
4420 incrDecrCommand(c,1);
4421 }
4422
4423 static void decrCommand(redisClient *c) {
4424 incrDecrCommand(c,-1);
4425 }
4426
4427 static void incrbyCommand(redisClient *c) {
4428 long long incr;
4429
4430 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4431 incrDecrCommand(c,incr);
4432 }
4433
4434 static void decrbyCommand(redisClient *c) {
4435 long long incr;
4436
4437 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4438 incrDecrCommand(c,-incr);
4439 }
4440
4441 static void appendCommand(redisClient *c) {
4442 int retval;
4443 size_t totlen;
4444 robj *o;
4445
4446 o = lookupKeyWrite(c->db,c->argv[1]);
4447 if (o == NULL) {
4448 /* Create the key */
4449 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4450 incrRefCount(c->argv[1]);
4451 incrRefCount(c->argv[2]);
4452 totlen = stringObjectLen(c->argv[2]);
4453 } else {
4454 dictEntry *de;
4455
4456 de = dictFind(c->db->dict,c->argv[1]);
4457 assert(de != NULL);
4458
4459 o = dictGetEntryVal(de);
4460 if (o->type != REDIS_STRING) {
4461 addReply(c,shared.wrongtypeerr);
4462 return;
4463 }
4464 /* If the object is specially encoded or shared we have to make
4465 * a copy */
4466 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4467 robj *decoded = getDecodedObject(o);
4468
4469 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4470 decrRefCount(decoded);
4471 dictReplace(c->db->dict,c->argv[1],o);
4472 }
4473 /* APPEND! */
4474 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4475 o->ptr = sdscatlen(o->ptr,
4476 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4477 } else {
4478 o->ptr = sdscatprintf(o->ptr, "%ld",
4479 (unsigned long) c->argv[2]->ptr);
4480 }
4481 totlen = sdslen(o->ptr);
4482 }
4483 server.dirty++;
4484 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4485 }
4486
4487 static void substrCommand(redisClient *c) {
4488 robj *o;
4489 long start = atoi(c->argv[2]->ptr);
4490 long end = atoi(c->argv[3]->ptr);
4491 size_t rangelen, strlen;
4492 sds range;
4493
4494 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4495 checkType(c,o,REDIS_STRING)) return;
4496
4497 o = getDecodedObject(o);
4498 strlen = sdslen(o->ptr);
4499
4500 /* convert negative indexes */
4501 if (start < 0) start = strlen+start;
4502 if (end < 0) end = strlen+end;
4503 if (start < 0) start = 0;
4504 if (end < 0) end = 0;
4505
4506 /* indexes sanity checks */
4507 if (start > end || (size_t)start >= strlen) {
4508 /* Out of range start or start > end result in null reply */
4509 addReply(c,shared.nullbulk);
4510 decrRefCount(o);
4511 return;
4512 }
4513 if ((size_t)end >= strlen) end = strlen-1;
4514 rangelen = (end-start)+1;
4515
4516 /* Return the result */
4517 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4518 range = sdsnewlen((char*)o->ptr+start,rangelen);
4519 addReplySds(c,range);
4520 addReply(c,shared.crlf);
4521 decrRefCount(o);
4522 }
4523
4524 /* ========================= Type agnostic commands ========================= */
4525
4526 static void delCommand(redisClient *c) {
4527 int deleted = 0, j;
4528
4529 for (j = 1; j < c->argc; j++) {
4530 if (deleteKey(c->db,c->argv[j])) {
4531 touchWatchedKey(c->db,c->argv[j]);
4532 server.dirty++;
4533 deleted++;
4534 }
4535 }
4536 addReplyLongLong(c,deleted);
4537 }
4538
4539 static void existsCommand(redisClient *c) {
4540 expireIfNeeded(c->db,c->argv[1]);
4541 if (dictFind(c->db->dict,c->argv[1])) {
4542 addReply(c, shared.cone);
4543 } else {
4544 addReply(c, shared.czero);
4545 }
4546 }
4547
4548 static void selectCommand(redisClient *c) {
4549 int id = atoi(c->argv[1]->ptr);
4550
4551 if (selectDb(c,id) == REDIS_ERR) {
4552 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4553 } else {
4554 addReply(c,shared.ok);
4555 }
4556 }
4557
4558 static void randomkeyCommand(redisClient *c) {
4559 dictEntry *de;
4560 robj *key;
4561
4562 while(1) {
4563 de = dictGetRandomKey(c->db->dict);
4564 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4565 }
4566
4567 if (de == NULL) {
4568 addReply(c,shared.nullbulk);
4569 return;
4570 }
4571
4572 key = dictGetEntryKey(de);
4573 if (server.vm_enabled) {
4574 key = dupStringObject(key);
4575 addReplyBulk(c,key);
4576 decrRefCount(key);
4577 } else {
4578 addReplyBulk(c,key);
4579 }
4580 }
4581
4582 static void keysCommand(redisClient *c) {
4583 dictIterator *di;
4584 dictEntry *de;
4585 sds pattern = c->argv[1]->ptr;
4586 int plen = sdslen(pattern);
4587 unsigned long numkeys = 0;
4588 robj *lenobj = createObject(REDIS_STRING,NULL);
4589
4590 di = dictGetIterator(c->db->dict);
4591 addReply(c,lenobj);
4592 decrRefCount(lenobj);
4593 while((de = dictNext(di)) != NULL) {
4594 robj *keyobj = dictGetEntryKey(de);
4595
4596 sds key = keyobj->ptr;
4597 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4598 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4599 if (expireIfNeeded(c->db,keyobj) == 0) {
4600 addReplyBulk(c,keyobj);
4601 numkeys++;
4602 }
4603 }
4604 }
4605 dictReleaseIterator(di);
4606 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4607 }
4608
4609 static void dbsizeCommand(redisClient *c) {
4610 addReplySds(c,
4611 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4612 }
4613
4614 static void lastsaveCommand(redisClient *c) {
4615 addReplySds(c,
4616 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4617 }
4618
4619 static void typeCommand(redisClient *c) {
4620 robj *o;
4621 char *type;
4622
4623 o = lookupKeyRead(c->db,c->argv[1]);
4624 if (o == NULL) {
4625 type = "+none";
4626 } else {
4627 switch(o->type) {
4628 case REDIS_STRING: type = "+string"; break;
4629 case REDIS_LIST: type = "+list"; break;
4630 case REDIS_SET: type = "+set"; break;
4631 case REDIS_ZSET: type = "+zset"; break;
4632 case REDIS_HASH: type = "+hash"; break;
4633 default: type = "+unknown"; break;
4634 }
4635 }
4636 addReplySds(c,sdsnew(type));
4637 addReply(c,shared.crlf);
4638 }
4639
4640 static void saveCommand(redisClient *c) {
4641 if (server.bgsavechildpid != -1) {
4642 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4643 return;
4644 }
4645 if (rdbSave(server.dbfilename) == REDIS_OK) {
4646 addReply(c,shared.ok);
4647 } else {
4648 addReply(c,shared.err);
4649 }
4650 }
4651
4652 static void bgsaveCommand(redisClient *c) {
4653 if (server.bgsavechildpid != -1) {
4654 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4655 return;
4656 }
4657 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4658 char *status = "+Background saving started\r\n";
4659 addReplySds(c,sdsnew(status));
4660 } else {
4661 addReply(c,shared.err);
4662 }
4663 }
4664
4665 static void shutdownCommand(redisClient *c) {
4666 if (prepareForShutdown() == REDIS_OK)
4667 exit(0);
4668 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4669 }
4670
4671 static void renameGenericCommand(redisClient *c, int nx) {
4672 robj *o;
4673
4674 /* To use the same key as src and dst is probably an error */
4675 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4676 addReply(c,shared.sameobjecterr);
4677 return;
4678 }
4679
4680 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4681 return;
4682
4683 incrRefCount(o);
4684 deleteIfVolatile(c->db,c->argv[2]);
4685 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4686 if (nx) {
4687 decrRefCount(o);
4688 addReply(c,shared.czero);
4689 return;
4690 }
4691 dictReplace(c->db->dict,c->argv[2],o);
4692 } else {
4693 incrRefCount(c->argv[2]);
4694 }
4695 deleteKey(c->db,c->argv[1]);
4696 touchWatchedKey(c->db,c->argv[2]);
4697 server.dirty++;
4698 addReply(c,nx ? shared.cone : shared.ok);
4699 }
4700
/* Command handler: calls renameGenericCommand() with nx = 0, i.e. an
 * existing destination key is replaced. */
static void renameCommand(redisClient *c) {
    renameGenericCommand(c,0);
}
4704
/* Command handler: calls renameGenericCommand() with nx = 1, i.e. the
 * rename is performed only if the destination key does not exist. */
static void renamenxCommand(redisClient *c) {
    renameGenericCommand(c,1);
}
4708
4709 static void moveCommand(redisClient *c) {
4710 robj *o;
4711 redisDb *src, *dst;
4712 int srcid;
4713
4714 /* Obtain source and target DB pointers */
4715 src = c->db;
4716 srcid = c->db->id;
4717 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4718 addReply(c,shared.outofrangeerr);
4719 return;
4720 }
4721 dst = c->db;
4722 selectDb(c,srcid); /* Back to the source DB */
4723
4724 /* If the user is moving using as target the same
4725 * DB as the source DB it is probably an error. */
4726 if (src == dst) {
4727 addReply(c,shared.sameobjecterr);
4728 return;
4729 }
4730
4731 /* Check if the element exists and get a reference */
4732 o = lookupKeyWrite(c->db,c->argv[1]);
4733 if (!o) {
4734 addReply(c,shared.czero);
4735 return;
4736 }
4737
4738 /* Try to add the element to the target DB */
4739 deleteIfVolatile(dst,c->argv[1]);
4740 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4741 addReply(c,shared.czero);
4742 return;
4743 }
4744 incrRefCount(c->argv[1]);
4745 incrRefCount(o);
4746
4747 /* OK! key moved, free the entry in the source DB */
4748 deleteKey(src,c->argv[1]);
4749 server.dirty++;
4750 addReply(c,shared.cone);
4751 }
4752
4753 /* =================================== Lists ================================ */
4754 static void pushGenericCommand(redisClient *c, int where) {
4755 robj *lobj;
4756 list *list;
4757
4758 lobj = lookupKeyWrite(c->db,c->argv[1]);
4759 if (lobj == NULL) {
4760 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4761 addReply(c,shared.cone);
4762 return;
4763 }
4764 lobj = createListObject();
4765 list = lobj->ptr;
4766 if (where == REDIS_HEAD) {
4767 listAddNodeHead(list,c->argv[2]);
4768 } else {
4769 listAddNodeTail(list,c->argv[2]);
4770 }
4771 dictAdd(c->db->dict,c->argv[1],lobj);
4772 incrRefCount(c->argv[1]);
4773 incrRefCount(c->argv[2]);
4774 } else {
4775 if (lobj->type != REDIS_LIST) {
4776 addReply(c,shared.wrongtypeerr);
4777 return;
4778 }
4779 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4780 addReply(c,shared.cone);
4781 return;
4782 }
4783 list = lobj->ptr;
4784 if (where == REDIS_HEAD) {
4785 listAddNodeHead(list,c->argv[2]);
4786 } else {
4787 listAddNodeTail(list,c->argv[2]);
4788 }
4789 incrRefCount(c->argv[2]);
4790 }
4791 server.dirty++;
4792 addReplyLongLong(c,listLength(list));
4793 }
4794
/* Command handler: push at the head of the list (REDIS_HEAD). */
static void lpushCommand(redisClient *c) {
    pushGenericCommand(c,REDIS_HEAD);
}
4798
/* Command handler: push at the tail of the list (REDIS_TAIL). */
static void rpushCommand(redisClient *c) {
    pushGenericCommand(c,REDIS_TAIL);
}
4802
4803 static void llenCommand(redisClient *c) {
4804 robj *o;
4805 list *l;
4806
4807 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4808 checkType(c,o,REDIS_LIST)) return;
4809
4810 l = o->ptr;
4811 addReplyUlong(c,listLength(l));
4812 }
4813
4814 static void lindexCommand(redisClient *c) {
4815 robj *o;
4816 int index = atoi(c->argv[2]->ptr);
4817 list *list;
4818 listNode *ln;
4819
4820 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4821 checkType(c,o,REDIS_LIST)) return;
4822 list = o->ptr;
4823
4824 ln = listIndex(list, index);
4825 if (ln == NULL) {
4826 addReply(c,shared.nullbulk);
4827 } else {
4828 robj *ele = listNodeValue(ln);
4829 addReplyBulk(c,ele);
4830 }
4831 }
4832
4833 static void lsetCommand(redisClient *c) {
4834 robj *o;
4835 int index = atoi(c->argv[2]->ptr);
4836 list *list;
4837 listNode *ln;
4838
4839 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4840 checkType(c,o,REDIS_LIST)) return;
4841 list = o->ptr;
4842
4843 ln = listIndex(list, index);
4844 if (ln == NULL) {
4845 addReply(c,shared.outofrangeerr);
4846 } else {
4847 robj *ele = listNodeValue(ln);
4848
4849 decrRefCount(ele);
4850 listNodeValue(ln) = c->argv[3];
4851 incrRefCount(c->argv[3]);
4852 addReply(c,shared.ok);
4853 server.dirty++;
4854 }
4855 }
4856
4857 static void popGenericCommand(redisClient *c, int where) {
4858 robj *o;
4859 list *list;
4860 listNode *ln;
4861
4862 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4863 checkType(c,o,REDIS_LIST)) return;
4864 list = o->ptr;
4865
4866 if (where == REDIS_HEAD)
4867 ln = listFirst(list);
4868 else
4869 ln = listLast(list);
4870
4871 if (ln == NULL) {
4872 addReply(c,shared.nullbulk);
4873 } else {
4874 robj *ele = listNodeValue(ln);
4875 addReplyBulk(c,ele);
4876 listDelNode(list,ln);
4877 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4878 server.dirty++;
4879 }
4880 }
4881
/* Command handler: pop from the head of the list (REDIS_HEAD). */
static void lpopCommand(redisClient *c) {
    popGenericCommand(c,REDIS_HEAD);
}
4885
/* Command handler: pop from the tail of the list (REDIS_TAIL). */
static void rpopCommand(redisClient *c) {
    popGenericCommand(c,REDIS_TAIL);
}
4889
4890 static void lrangeCommand(redisClient *c) {
4891 robj *o;
4892 int start = atoi(c->argv[2]->ptr);
4893 int end = atoi(c->argv[3]->ptr);
4894 int llen;
4895 int rangelen, j;
4896 list *list;
4897 listNode *ln;
4898 robj *ele;
4899
4900 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4901 || checkType(c,o,REDIS_LIST)) return;
4902 list = o->ptr;
4903 llen = listLength(list);
4904
4905 /* convert negative indexes */
4906 if (start < 0) start = llen+start;
4907 if (end < 0) end = llen+end;
4908 if (start < 0) start = 0;
4909 if (end < 0) end = 0;
4910
4911 /* indexes sanity checks */
4912 if (start > end || start >= llen) {
4913 /* Out of range start or start > end result in empty list */
4914 addReply(c,shared.emptymultibulk);
4915 return;
4916 }
4917 if (end >= llen) end = llen-1;
4918 rangelen = (end-start)+1;
4919
4920 /* Return the result in form of a multi-bulk reply */
4921 ln = listIndex(list, start);
4922 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4923 for (j = 0; j < rangelen; j++) {
4924 ele = listNodeValue(ln);
4925 addReplyBulk(c,ele);
4926 ln = ln->next;
4927 }
4928 }
4929
4930 static void ltrimCommand(redisClient *c) {
4931 robj *o;
4932 int start = atoi(c->argv[2]->ptr);
4933 int end = atoi(c->argv[3]->ptr);
4934 int llen;
4935 int j, ltrim, rtrim;
4936 list *list;
4937 listNode *ln;
4938
4939 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4940 checkType(c,o,REDIS_LIST)) return;
4941 list = o->ptr;
4942 llen = listLength(list);
4943
4944 /* convert negative indexes */
4945 if (start < 0) start = llen+start;
4946 if (end < 0) end = llen+end;
4947 if (start < 0) start = 0;
4948 if (end < 0) end = 0;
4949
4950 /* indexes sanity checks */
4951 if (start > end || start >= llen) {
4952 /* Out of range start or start > end result in empty list */
4953 ltrim = llen;
4954 rtrim = 0;
4955 } else {
4956 if (end >= llen) end = llen-1;
4957 ltrim = start;
4958 rtrim = llen-end-1;
4959 }
4960
4961 /* Remove list elements to perform the trim */
4962 for (j = 0; j < ltrim; j++) {
4963 ln = listFirst(list);
4964 listDelNode(list,ln);
4965 }
4966 for (j = 0; j < rtrim; j++) {
4967 ln = listLast(list);
4968 listDelNode(list,ln);
4969 }
4970 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4971 server.dirty++;
4972 addReply(c,shared.ok);
4973 }
4974
4975 static void lremCommand(redisClient *c) {
4976 robj *o;
4977 list *list;
4978 listNode *ln, *next;
4979 int toremove = atoi(c->argv[2]->ptr);
4980 int removed = 0;
4981 int fromtail = 0;
4982
4983 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4984 checkType(c,o,REDIS_LIST)) return;
4985 list = o->ptr;
4986
4987 if (toremove < 0) {
4988 toremove = -toremove;
4989 fromtail = 1;
4990 }
4991 ln = fromtail ? list->tail : list->head;
4992 while (ln) {
4993 robj *ele = listNodeValue(ln);
4994
4995 next = fromtail ? ln->prev : ln->next;
4996 if (equalStringObjects(ele,c->argv[3])) {
4997 listDelNode(list,ln);
4998 server.dirty++;
4999 removed++;
5000 if (toremove && removed == toremove) break;
5001 }
5002 ln = next;
5003 }
5004 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
5005 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5006 }
5007
5008 /* This is the semantic of this command:
5009 * RPOPLPUSH srclist dstlist:
5010 * IF LLEN(srclist) > 0
5011 * element = RPOP srclist
5012 * LPUSH dstlist element
5013 * RETURN element
5014 * ELSE
5015 * RETURN nil
5016 * END
5017 * END
5018 *
5019 * The idea is to be able to get an element from a list in a reliable way
5020 * since the element is not just returned but pushed against another list
5021 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5022 */
5023 static void rpoplpushcommand(redisClient *c) {
5024 robj *sobj;
5025 list *srclist;
5026 listNode *ln;
5027
5028 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5029 checkType(c,sobj,REDIS_LIST)) return;
5030 srclist = sobj->ptr;
5031 ln = listLast(srclist);
5032
5033 if (ln == NULL) {
5034 addReply(c,shared.nullbulk);
5035 } else {
5036 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5037 robj *ele = listNodeValue(ln);
5038 list *dstlist;
5039
5040 if (dobj && dobj->type != REDIS_LIST) {
5041 addReply(c,shared.wrongtypeerr);
5042 return;
5043 }
5044
5045 /* Add the element to the target list (unless it's directly
5046 * passed to some BLPOP-ing client */
5047 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5048 if (dobj == NULL) {
5049 /* Create the list if the key does not exist */
5050 dobj = createListObject();
5051 dictAdd(c->db->dict,c->argv[2],dobj);
5052 incrRefCount(c->argv[2]);
5053 }
5054 dstlist = dobj->ptr;
5055 listAddNodeHead(dstlist,ele);
5056 incrRefCount(ele);
5057 }
5058
5059 /* Send the element to the client as reply as well */
5060 addReplyBulk(c,ele);
5061
5062 /* Finally remove the element from the source list */
5063 listDelNode(srclist,ln);
5064 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
5065 server.dirty++;
5066 }
5067 }
5068
5069 /* ==================================== Sets ================================ */
5070
5071 static void saddCommand(redisClient *c) {
5072 robj *set;
5073
5074 set = lookupKeyWrite(c->db,c->argv[1]);
5075 if (set == NULL) {
5076 set = createSetObject();
5077 dictAdd(c->db->dict,c->argv[1],set);
5078 incrRefCount(c->argv[1]);
5079 } else {
5080 if (set->type != REDIS_SET) {
5081 addReply(c,shared.wrongtypeerr);
5082 return;
5083 }
5084 }
5085 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5086 incrRefCount(c->argv[2]);
5087 server.dirty++;
5088 addReply(c,shared.cone);
5089 } else {
5090 addReply(c,shared.czero);
5091 }
5092 }
5093
5094 static void sremCommand(redisClient *c) {
5095 robj *set;
5096
5097 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5098 checkType(c,set,REDIS_SET)) return;
5099
5100 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5101 server.dirty++;
5102 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5103 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5104 addReply(c,shared.cone);
5105 } else {
5106 addReply(c,shared.czero);
5107 }
5108 }
5109
5110 static void smoveCommand(redisClient *c) {
5111 robj *srcset, *dstset;
5112
5113 srcset = lookupKeyWrite(c->db,c->argv[1]);
5114 dstset = lookupKeyWrite(c->db,c->argv[2]);
5115
5116 /* If the source key does not exist return 0, if it's of the wrong type
5117 * raise an error */
5118 if (srcset == NULL || srcset->type != REDIS_SET) {
5119 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5120 return;
5121 }
5122 /* Error if the destination key is not a set as well */
5123 if (dstset && dstset->type != REDIS_SET) {
5124 addReply(c,shared.wrongtypeerr);
5125 return;
5126 }
5127 /* Remove the element from the source set */
5128 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5129 /* Key not found in the src set! return zero */
5130 addReply(c,shared.czero);
5131 return;
5132 }
5133 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5134 deleteKey(c->db,c->argv[1]);
5135 server.dirty++;
5136 /* Add the element to the destination set */
5137 if (!dstset) {
5138 dstset = createSetObject();
5139 dictAdd(c->db->dict,c->argv[2],dstset);
5140 incrRefCount(c->argv[2]);
5141 }
5142 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5143 incrRefCount(c->argv[3]);
5144 addReply(c,shared.cone);
5145 }
5146
5147 static void sismemberCommand(redisClient *c) {
5148 robj *set;
5149
5150 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5151 checkType(c,set,REDIS_SET)) return;
5152
5153 if (dictFind(set->ptr,c->argv[2]))
5154 addReply(c,shared.cone);
5155 else
5156 addReply(c,shared.czero);
5157 }
5158
5159 static void scardCommand(redisClient *c) {
5160 robj *o;
5161 dict *s;
5162
5163 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5164 checkType(c,o,REDIS_SET)) return;
5165
5166 s = o->ptr;
5167 addReplyUlong(c,dictSize(s));
5168 }
5169
5170 static void spopCommand(redisClient *c) {
5171 robj *set;
5172 dictEntry *de;
5173
5174 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5175 checkType(c,set,REDIS_SET)) return;
5176
5177 de = dictGetRandomKey(set->ptr);
5178 if (de == NULL) {
5179 addReply(c,shared.nullbulk);
5180 } else {
5181 robj *ele = dictGetEntryKey(de);
5182
5183 addReplyBulk(c,ele);
5184 dictDelete(set->ptr,ele);
5185 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5186 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5187 server.dirty++;
5188 }
5189 }
5190
5191 static void srandmemberCommand(redisClient *c) {
5192 robj *set;
5193 dictEntry *de;
5194
5195 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5196 checkType(c,set,REDIS_SET)) return;
5197
5198 de = dictGetRandomKey(set->ptr);
5199 if (de == NULL) {
5200 addReply(c,shared.nullbulk);
5201 } else {
5202 robj *ele = dictGetEntryKey(de);
5203
5204 addReplyBulk(c,ele);
5205 }
5206 }
5207
5208 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5209 dict **d1 = (void*) s1, **d2 = (void*) s2;
5210
5211 return dictSize(*d1)-dictSize(*d2);
5212 }
5213
5214 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5215 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5216 dictIterator *di;
5217 dictEntry *de;
5218 robj *lenobj = NULL, *dstset = NULL;
5219 unsigned long j, cardinality = 0;
5220
5221 for (j = 0; j < setsnum; j++) {
5222 robj *setobj;
5223
5224 setobj = dstkey ?
5225 lookupKeyWrite(c->db,setskeys[j]) :
5226 lookupKeyRead(c->db,setskeys[j]);
5227 if (!setobj) {
5228 zfree(dv);
5229 if (dstkey) {
5230 if (deleteKey(c->db,dstkey))
5231 server.dirty++;
5232 addReply(c,shared.czero);
5233 } else {
5234 addReply(c,shared.emptymultibulk);
5235 }
5236 return;
5237 }
5238 if (setobj->type != REDIS_SET) {
5239 zfree(dv);
5240 addReply(c,shared.wrongtypeerr);
5241 return;
5242 }
5243 dv[j] = setobj->ptr;
5244 }
5245 /* Sort sets from the smallest to largest, this will improve our
5246 * algorithm's performace */
5247 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5248
5249 /* The first thing we should output is the total number of elements...
5250 * since this is a multi-bulk write, but at this stage we don't know
5251 * the intersection set size, so we use a trick, append an empty object
5252 * to the output list and save the pointer to later modify it with the
5253 * right length */
5254 if (!dstkey) {
5255 lenobj = createObject(REDIS_STRING,NULL);
5256 addReply(c,lenobj);
5257 decrRefCount(lenobj);
5258 } else {
5259 /* If we have a target key where to store the resulting set
5260 * create this key with an empty set inside */
5261 dstset = createSetObject();
5262 }
5263
5264 /* Iterate all the elements of the first (smallest) set, and test
5265 * the element against all the other sets, if at least one set does
5266 * not include the element it is discarded */
5267 di = dictGetIterator(dv[0]);
5268
5269 while((de = dictNext(di)) != NULL) {
5270 robj *ele;
5271
5272 for (j = 1; j < setsnum; j++)
5273 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5274 if (j != setsnum)
5275 continue; /* at least one set does not contain the member */
5276 ele = dictGetEntryKey(de);
5277 if (!dstkey) {
5278 addReplyBulk(c,ele);
5279 cardinality++;
5280 } else {
5281 dictAdd(dstset->ptr,ele,NULL);
5282 incrRefCount(ele);
5283 }
5284 }
5285 dictReleaseIterator(di);
5286
5287 if (dstkey) {
5288 /* Store the resulting set into the target, if the intersection
5289 * is not an empty set. */
5290 deleteKey(c->db,dstkey);
5291 if (dictSize((dict*)dstset->ptr) > 0) {
5292 dictAdd(c->db->dict,dstkey,dstset);
5293 incrRefCount(dstkey);
5294 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5295 } else {
5296 decrRefCount(dstset);
5297 addReply(c,shared.czero);
5298 }
5299 server.dirty++;
5300 } else {
5301 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5302 }
5303 zfree(dv);
5304 }
5305
/* Command handler: intersect the sets named by every argument after the
 * command name and send the result to the client (no destination key). */
static void sinterCommand(redisClient *c) {
    sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
}
5309
/* Command handler: intersect the sets named by c->argv[2..] and store the
 * result at the destination key c->argv[1]. */
static void sinterstoreCommand(redisClient *c) {
    sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
}
5313
/* Operation selectors passed as the 'op' argument of
 * sunionDiffGenericCommand().
 * NOTE(review): REDIS_OP_INTER is not referenced by the set commands
 * nearby -- confirm whether it is used elsewhere. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
5317
5318 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5319 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5320 dictIterator *di;
5321 dictEntry *de;
5322 robj *dstset = NULL;
5323 int j, cardinality = 0;
5324
5325 for (j = 0; j < setsnum; j++) {
5326 robj *setobj;
5327
5328 setobj = dstkey ?
5329 lookupKeyWrite(c->db,setskeys[j]) :
5330 lookupKeyRead(c->db,setskeys[j]);
5331 if (!setobj) {
5332 dv[j] = NULL;
5333 continue;
5334 }
5335 if (setobj->type != REDIS_SET) {
5336 zfree(dv);
5337 addReply(c,shared.wrongtypeerr);
5338 return;
5339 }
5340 dv[j] = setobj->ptr;
5341 }
5342
5343 /* We need a temp set object to store our union. If the dstkey
5344 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5345 * this set object will be the resulting object to set into the target key*/
5346 dstset = createSetObject();
5347
5348 /* Iterate all the elements of all the sets, add every element a single
5349 * time to the result set */
5350 for (j = 0; j < setsnum; j++) {
5351 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5352 if (!dv[j]) continue; /* non existing keys are like empty sets */
5353
5354 di = dictGetIterator(dv[j]);
5355
5356 while((de = dictNext(di)) != NULL) {
5357 robj *ele;
5358
5359 /* dictAdd will not add the same element multiple times */
5360 ele = dictGetEntryKey(de);
5361 if (op == REDIS_OP_UNION || j == 0) {
5362 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5363 incrRefCount(ele);
5364 cardinality++;
5365 }
5366 } else if (op == REDIS_OP_DIFF) {
5367 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5368 cardinality--;
5369 }
5370 }
5371 }
5372 dictReleaseIterator(di);
5373
5374 /* result set is empty? Exit asap. */
5375 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5376 }
5377
5378 /* Output the content of the resulting set, if not in STORE mode */
5379 if (!dstkey) {
5380 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5381 di = dictGetIterator(dstset->ptr);
5382 while((de = dictNext(di)) != NULL) {
5383 robj *ele;
5384
5385 ele = dictGetEntryKey(de);
5386 addReplyBulk(c,ele);
5387 }
5388 dictReleaseIterator(di);
5389 decrRefCount(dstset);
5390 } else {
5391 /* If we have a target key where to store the resulting set
5392 * create this key with the result set inside */
5393 deleteKey(c->db,dstkey);
5394 if (dictSize((dict*)dstset->ptr) > 0) {
5395 dictAdd(c->db->dict,dstkey,dstset);
5396 incrRefCount(dstkey);
5397 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5398 } else {
5399 decrRefCount(dstset);
5400 addReply(c,shared.czero);
5401 }
5402 server.dirty++;
5403 }
5404 zfree(dv);
5405 }
5406
/* Command handler: union of the sets named by every argument after the
 * command name, sent to the client (no destination key). */
static void sunionCommand(redisClient *c) {
    sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
}
5410
/* Command handler: union of the sets named by c->argv[2..], stored at the
 * destination key c->argv[1]. */
static void sunionstoreCommand(redisClient *c) {
    sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
}
5414
/* Command handler: difference between the first set and the following
 * ones (all arguments after the command name), sent to the client. */
static void sdiffCommand(redisClient *c) {
    sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
}
5418
/* Command handler: difference between the set at c->argv[2] and the
 * following ones, stored at the destination key c->argv[1]. */
static void sdiffstoreCommand(redisClient *c) {
    sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
}
5422
5423 /* ==================================== ZSets =============================== */
5424
5425 /* ZSETs are ordered sets using two data structures to hold the same elements
5426 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5427 * data structure.
5428 *
5429 * The elements are added to a hash table mapping Redis objects to scores.
5430 * At the same time the elements are added to a skip list mapping scores
5431 * to Redis objects (so objects are sorted by scores in this "view"). */
5432
5433 /* This skiplist implementation is almost a C translation of the original
5434 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5435 * Alternative to Balanced Trees", modified in three ways:
5436 * a) this implementation allows for repeated values.
5437 * b) the comparison is not just by key (our 'score') but by satellite data.
5438 * c) there is a back pointer, so it's a doubly linked list with the back
5439 * pointers being only at "level 1". This allows to traverse the list
5440 * from tail to head, useful for ZREVRANGE. */
5441
5442 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5443 zskiplistNode *zn = zmalloc(sizeof(*zn));
5444
5445 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5446 if (level > 1)
5447 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5448 else
5449 zn->span = NULL;
5450 zn->score = score;
5451 zn->obj = obj;
5452 return zn;
5453 }
5454
5455 static zskiplist *zslCreate(void) {
5456 int j;
5457 zskiplist *zsl;
5458
5459 zsl = zmalloc(sizeof(*zsl));
5460 zsl->level = 1;
5461 zsl->length = 0;
5462 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5463 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5464 zsl->header->forward[j] = NULL;
5465
5466 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5467 if (j < ZSKIPLIST_MAXLEVEL-1)
5468 zsl->header->span[j] = 0;
5469 }
5470 zsl->header->backward = NULL;
5471 zsl->tail = NULL;
5472 return zsl;
5473 }
5474
/* Release a skiplist node: drop the reference to the object it holds, then
 * free the forward array, the span array and the node itself. */
static void zslFreeNode(zskiplistNode *node) {
    decrRefCount(node->obj);
    zfree(node->forward);
    zfree(node->span);
    zfree(node);
}
5481
5482 static void zslFree(zskiplist *zsl) {
5483 zskiplistNode *node = zsl->header->forward[0], *next;
5484
5485 zfree(zsl->header->forward);
5486 zfree(zsl->header->span);
5487 zfree(zsl->header);
5488 while(node) {
5489 next = node->forward[0];
5490 zslFreeNode(node);
5491 node = next;
5492 }
5493 zfree(zsl);
5494 }
5495
5496 static int zslRandomLevel(void) {
5497 int level = 1;
5498 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5499 level += 1;
5500 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5501 }
5502
/* Insert a new node holding 'score' and 'obj' into the skiplist.
 *
 * Nodes are ordered by score and, for equal scores, by
 * compareStringObjects() on the objects. The function does not check for
 * duplicates and does not touch the reference count of 'obj'.
 *
 * For a level i > 0, span[i-1] of a node is the number of nodes crossed by
 * following forward[i]; at level 0 every step crosses exactly one node. */
static void zslInsert(zskiplist *zsl, double score, robj *obj) {
    /* update[i]: last node at level i placed before the insert position.
     * rank[i]: number of nodes crossed to reach update[i]. */
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
    unsigned int rank[ZSKIPLIST_MAXLEVEL];
    int i, level;

    x = zsl->header;
    for (i = zsl->level-1; i >= 0; i--) {
        /* store rank that is crossed to reach the insert position */
        rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];

        while (x->forward[i] &&
            (x->forward[i]->score < score ||
                (x->forward[i]->score == score &&
                compareStringObjects(x->forward[i]->obj,obj) < 0))) {
            rank[i] += i > 0 ? x->span[i-1] : 1;
            x = x->forward[i];
        }
        update[i] = x;
    }
    /* we assume the key is not already inside, since we allow duplicated
     * scores, and the re-insertion of score and redis object should never
     * happen since the caller of zslInsert() should test in the hash table
     * if the element is already inside or not. */
    level = zslRandomLevel();
    if (level > zsl->level) {
        /* Levels not in use so far: the predecessor is the header, whose
         * forward pointer at that level spans the whole list. */
        for (i = zsl->level; i < level; i++) {
            rank[i] = 0;
            update[i] = zsl->header;
            update[i]->span[i-1] = zsl->length;
        }
        zsl->level = level;
    }
    x = zslCreateNode(level,score,obj);
    for (i = 0; i < level; i++) {
        x->forward[i] = update[i]->forward[i];
        update[i]->forward[i] = x;

        /* update span covered by update[i] as x is inserted here */
        if (i > 0) {
            x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
            update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
        }
    }

    /* increment span for untouched levels */
    for (i = level; i < zsl->level; i++) {
        update[i]->span[i-1]++;
    }

    /* Level 0 back pointers: the header is never a backward target, and a
     * node without successor becomes the new tail. */
    x->backward = (update[0] == zsl->header) ? NULL : update[0];
    if (x->forward[0])
        x->forward[0]->backward = x;
    else
        zsl->tail = x;
    zsl->length++;
}
5559
/* Internal function used by zslDelete, zslDeleteRangeByScore and
 * zslDeleteRangeByRank.
 *
 * Unlink node 'x' from the skiplist, fixing forward pointers, spans, the
 * level 0 backward pointer (or the tail), the list level and the length.
 * update[i] must be, for every level in use, the last node at level i
 * placed before 'x'. The node itself is not freed. */
void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
    int i;
    for (i = 0; i < zsl->level; i++) {
        if (update[i]->forward[i] == x) {
            /* x is linked at this level: the predecessor inherits the
             * span of x, minus the removed node. */
            if (i > 0) {
                update[i]->span[i-1] += x->span[i-1] - 1;
            }
            update[i]->forward[i] = x->forward[i];
        } else {
            /* invariant: i > 0, because update[0]->forward[0]
             * is always equal to x */
            update[i]->span[i-1] -= 1;
        }
    }
    if (x->forward[0]) {
        x->forward[0]->backward = x->backward;
    } else {
        zsl->tail = x->backward;
    }
    /* Lower the list level while the topmost levels are empty. */
    while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
        zsl->level--;
    zsl->length--;
}
5584
5585 /* Delete an element with matching score/object from the skiplist. */
5586 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5587 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5588 int i;
5589
5590 x = zsl->header;
5591 for (i = zsl->level-1; i >= 0; i--) {
5592 while (x->forward[i] &&
5593 (x->forward[i]->score < score ||
5594 (x->forward[i]->score == score &&
5595 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5596 x = x->forward[i];
5597 update[i] = x;
5598 }
5599 /* We may have multiple elements with the same score, what we need
5600 * is to find the element with both the right score and object. */
5601 x = x->forward[0];
5602 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5603 zslDeleteNode(zsl, x, update);
5604 zslFreeNode(x);
5605 return 1;
5606 } else {
5607 return 0; /* not found */
5608 }
5609 return 0; /* not found */
5610 }
5611
5612 /* Delete all the elements with score between min and max from the skiplist.
5613 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5614 * Note that this function takes the reference to the hash table view of the
5615 * sorted set, in order to remove the elements from the hash table too. */
5616 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5617 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5618 unsigned long removed = 0;
5619 int i;
5620
5621 x = zsl->header;
5622 for (i = zsl->level-1; i >= 0; i--) {
5623 while (x->forward[i] && x->forward[i]->score < min)
5624 x = x->forward[i];
5625 update[i] = x;
5626 }
5627 /* We may have multiple elements with the same score, what we need
5628 * is to find the element with both the right score and object. */
5629 x = x->forward[0];
5630 while (x && x->score <= max) {
5631 zskiplistNode *next = x->forward[0];
5632 zslDeleteNode(zsl, x, update);
5633 dictDelete(dict,x->obj);
5634 zslFreeNode(x);
5635 removed++;
5636 x = next;
5637 }
5638 return removed; /* not found */
5639 }
5640
5641 /* Delete all the elements with rank between start and end from the skiplist.
5642 * Start and end are inclusive. Note that start and end need to be 1-based */
5643 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5644 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5645 unsigned long traversed = 0, removed = 0;
5646 int i;
5647
5648 x = zsl->header;
5649 for (i = zsl->level-1; i >= 0; i--) {
5650 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5651 traversed += i > 0 ? x->span[i-1] : 1;
5652 x = x->forward[i];
5653 }
5654 update[i] = x;
5655 }
5656
5657 traversed++;
5658 x = x->forward[0];
5659 while (x && traversed <= end) {
5660 zskiplistNode *next = x->forward[0];
5661 zslDeleteNode(zsl, x, update);
5662 dictDelete(dict,x->obj);
5663 zslFreeNode(x);
5664 removed++;
5665 traversed++;
5666 x = next;
5667 }
5668 return removed;
5669 }
5670
5671 /* Find the first node having a score equal or greater than the specified one.
5672 * Returns NULL if there is no match. */
5673 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5674 zskiplistNode *x;
5675 int i;
5676
5677 x = zsl->header;
5678 for (i = zsl->level-1; i >= 0; i--) {
5679 while (x->forward[i] && x->forward[i]->score < score)
5680 x = x->forward[i];
5681 }
5682 /* We may have multiple elements with the same score, what we need
5683 * is to find the element with both the right score and object. */
5684 return x->forward[0];
5685 }
5686
5687 /* Find the rank for an element by both score and key.
5688 * Returns 0 when the element cannot be found, rank otherwise.
5689 * Note that the rank is 1-based due to the span of zsl->header to the
5690 * first element. */
5691 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5692 zskiplistNode *x;
5693 unsigned long rank = 0;
5694 int i;
5695
5696 x = zsl->header;
5697 for (i = zsl->level-1; i >= 0; i--) {
5698 while (x->forward[i] &&
5699 (x->forward[i]->score < score ||
5700 (x->forward[i]->score == score &&
5701 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5702 rank += i > 0 ? x->span[i-1] : 1;
5703 x = x->forward[i];
5704 }
5705
5706 /* x might be equal to zsl->header, so test if obj is non-NULL */
5707 if (x->obj && equalStringObjects(x->obj,o)) {
5708 return rank;
5709 }
5710 }
5711 return 0;
5712 }
5713
5714 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5715 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5716 zskiplistNode *x;
5717 unsigned long traversed = 0;
5718 int i;
5719
5720 x = zsl->header;
5721 for (i = zsl->level-1; i >= 0; i--) {
5722 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5723 {
5724 traversed += i > 0 ? x->span[i-1] : 1;
5725 x = x->forward[i];
5726 }
5727 if (traversed == rank) {
5728 return x;
5729 }
5730 }
5731 return NULL;
5732 }
5733
5734 /* The actual Z-commands implementations */
5735
5736 /* This generic command implements both ZADD and ZINCRBY.
5737 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5738 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5739 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5740 robj *zsetobj;
5741 zset *zs;
5742 double *score;
5743
5744 zsetobj = lookupKeyWrite(c->db,key);
5745 if (zsetobj == NULL) {
5746 zsetobj = createZsetObject();
5747 dictAdd(c->db->dict,key,zsetobj);
5748 incrRefCount(key);
5749 } else {
5750 if (zsetobj->type != REDIS_ZSET) {
5751 addReply(c,shared.wrongtypeerr);
5752 return;
5753 }
5754 }
5755 zs = zsetobj->ptr;
5756
5757 /* Ok now since we implement both ZADD and ZINCRBY here the code
5758 * needs to handle the two different conditions. It's all about setting
5759 * '*score', that is, the new score to set, to the right value. */
5760 score = zmalloc(sizeof(double));
5761 if (doincrement) {
5762 dictEntry *de;
5763
5764 /* Read the old score. If the element was not present starts from 0 */
5765 de = dictFind(zs->dict,ele);
5766 if (de) {
5767 double *oldscore = dictGetEntryVal(de);
5768 *score = *oldscore + scoreval;
5769 } else {
5770 *score = scoreval;
5771 }
5772 } else {
5773 *score = scoreval;
5774 }
5775
5776 /* What follows is a simple remove and re-insert operation that is common
5777 * to both ZADD and ZINCRBY... */
5778 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5779 /* case 1: New element */
5780 incrRefCount(ele); /* added to hash */
5781 zslInsert(zs->zsl,*score,ele);
5782 incrRefCount(ele); /* added to skiplist */
5783 server.dirty++;
5784 if (doincrement)
5785 addReplyDouble(c,*score);
5786 else
5787 addReply(c,shared.cone);
5788 } else {
5789 dictEntry *de;
5790 double *oldscore;
5791
5792 /* case 2: Score update operation */
5793 de = dictFind(zs->dict,ele);
5794 redisAssert(de != NULL);
5795 oldscore = dictGetEntryVal(de);
5796 if (*score != *oldscore) {
5797 int deleted;
5798
5799 /* Remove and insert the element in the skip list with new score */
5800 deleted = zslDelete(zs->zsl,*oldscore,ele);
5801 redisAssert(deleted != 0);
5802 zslInsert(zs->zsl,*score,ele);
5803 incrRefCount(ele);
5804 /* Update the score in the hash table */
5805 dictReplace(zs->dict,ele,score);
5806 server.dirty++;
5807 } else {
5808 zfree(score);
5809 }
5810 if (doincrement)
5811 addReplyDouble(c,*score);
5812 else
5813 addReply(c,shared.czero);
5814 }
5815 }
5816
5817 static void zaddCommand(redisClient *c) {
5818 double scoreval;
5819
5820 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5821 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5822 }
5823
5824 static void zincrbyCommand(redisClient *c) {
5825 double scoreval;
5826
5827 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5828 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5829 }
5830
5831 static void zremCommand(redisClient *c) {
5832 robj *zsetobj;
5833 zset *zs;
5834 dictEntry *de;
5835 double *oldscore;
5836 int deleted;
5837
5838 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5839 checkType(c,zsetobj,REDIS_ZSET)) return;
5840
5841 zs = zsetobj->ptr;
5842 de = dictFind(zs->dict,c->argv[2]);
5843 if (de == NULL) {
5844 addReply(c,shared.czero);
5845 return;
5846 }
5847 /* Delete from the skiplist */
5848 oldscore = dictGetEntryVal(de);
5849 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5850 redisAssert(deleted != 0);
5851
5852 /* Delete from the hash table */
5853 dictDelete(zs->dict,c->argv[2]);
5854 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5855 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5856 server.dirty++;
5857 addReply(c,shared.cone);
5858 }
5859
5860 static void zremrangebyscoreCommand(redisClient *c) {
5861 double min;
5862 double max;
5863 long deleted;
5864 robj *zsetobj;
5865 zset *zs;
5866
5867 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5868 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5869
5870 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5871 checkType(c,zsetobj,REDIS_ZSET)) return;
5872
5873 zs = zsetobj->ptr;
5874 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5875 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5876 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5877 server.dirty += deleted;
5878 addReplyLongLong(c,deleted);
5879 }
5880
5881 static void zremrangebyrankCommand(redisClient *c) {
5882 long start;
5883 long end;
5884 int llen;
5885 long deleted;
5886 robj *zsetobj;
5887 zset *zs;
5888
5889 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5890 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5891
5892 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5893 checkType(c,zsetobj,REDIS_ZSET)) return;
5894 zs = zsetobj->ptr;
5895 llen = zs->zsl->length;
5896
5897 /* convert negative indexes */
5898 if (start < 0) start = llen+start;
5899 if (end < 0) end = llen+end;
5900 if (start < 0) start = 0;
5901 if (end < 0) end = 0;
5902
5903 /* indexes sanity checks */
5904 if (start > end || start >= llen) {
5905 addReply(c,shared.czero);
5906 return;
5907 }
5908 if (end >= llen) end = llen-1;
5909
5910 /* increment start and end because zsl*Rank functions
5911 * use 1-based rank */
5912 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5913 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5914 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5915 server.dirty += deleted;
5916 addReplyLongLong(c, deleted);
5917 }
5918
5919 typedef struct {
5920 dict *dict;
5921 double weight;
5922 } zsetopsrc;
5923
5924 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5925 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5926 unsigned long size1, size2;
5927 size1 = d1->dict ? dictSize(d1->dict) : 0;
5928 size2 = d2->dict ? dictSize(d2->dict) : 0;
5929 return size1 - size2;
5930 }
5931
/* Ways of combining the scores of an element present in several inputs. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Score carried by dict entry '_e': the double its value points to, or 1.0
 * when the entry has a NULL value (presumably the case for members of plain
 * sets, which are accepted as inputs too). '_e' is evaluated twice. */
#define zunionInterDictValue(_e) \
    (dictGetEntryVal(_e) != NULL ? *(double*)dictGetEntryVal(_e) : 1.0)
5936
5937 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5938 if (aggregate == REDIS_AGGR_SUM) {
5939 *target = *target + val;
5940 } else if (aggregate == REDIS_AGGR_MIN) {
5941 *target = val < *target ? val : *target;
5942 } else if (aggregate == REDIS_AGGR_MAX) {
5943 *target = val > *target ? val : *target;
5944 } else {
5945 /* safety net */
5946 redisPanic("Unknown ZUNION/INTER aggregate type");
5947 }
5948 }
5949
5950 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5951 int i, j, setnum;
5952 int aggregate = REDIS_AGGR_SUM;
5953 zsetopsrc *src;
5954 robj *dstobj;
5955 zset *dstzset;
5956 dictIterator *di;
5957 dictEntry *de;
5958
5959 /* expect setnum input keys to be given */
5960 setnum = atoi(c->argv[2]->ptr);
5961 if (setnum < 1) {
5962 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
5963 return;
5964 }
5965
5966 /* test if the expected number of keys would overflow */
5967 if (3+setnum > c->argc) {
5968 addReply(c,shared.syntaxerr);
5969 return;
5970 }
5971
5972 /* read keys to be used for input */
5973 src = zmalloc(sizeof(zsetopsrc) * setnum);
5974 for (i = 0, j = 3; i < setnum; i++, j++) {
5975 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
5976 if (!obj) {
5977 src[i].dict = NULL;
5978 } else {
5979 if (obj->type == REDIS_ZSET) {
5980 src[i].dict = ((zset*)obj->ptr)->dict;
5981 } else if (obj->type == REDIS_SET) {
5982 src[i].dict = (obj->ptr);
5983 } else {
5984 zfree(src);
5985 addReply(c,shared.wrongtypeerr);
5986 return;
5987 }
5988 }
5989
5990 /* default all weights to 1 */
5991 src[i].weight = 1.0;
5992 }
5993
5994 /* parse optional extra arguments */
5995 if (j < c->argc) {
5996 int remaining = c->argc - j;
5997
5998 while (remaining) {
5999 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6000 j++; remaining--;
6001 for (i = 0; i < setnum; i++, j++, remaining--) {
6002 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6003 return;
6004 }
6005 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6006 j++; remaining--;
6007 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6008 aggregate = REDIS_AGGR_SUM;
6009 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6010 aggregate = REDIS_AGGR_MIN;
6011 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6012 aggregate = REDIS_AGGR_MAX;
6013 } else {
6014 zfree(src);
6015 addReply(c,shared.syntaxerr);
6016 return;
6017 }
6018 j++; remaining--;
6019 } else {
6020 zfree(src);
6021 addReply(c,shared.syntaxerr);
6022 return;
6023 }
6024 }
6025 }
6026
6027 /* sort sets from the smallest to largest, this will improve our
6028 * algorithm's performance */
6029 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6030
6031 dstobj = createZsetObject();
6032 dstzset = dstobj->ptr;
6033
6034 if (op == REDIS_OP_INTER) {
6035 /* skip going over all entries if the smallest zset is NULL or empty */
6036 if (src[0].dict && dictSize(src[0].dict) > 0) {
6037 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6038 * from small to large, all src[i > 0].dict are non-empty too */
6039 di = dictGetIterator(src[0].dict);
6040 while((de = dictNext(di)) != NULL) {
6041 double *score = zmalloc(sizeof(double)), value;
6042 *score = src[0].weight * zunionInterDictValue(de);
6043
6044 for (j = 1; j < setnum; j++) {
6045 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6046 if (other) {
6047 value = src[j].weight * zunionInterDictValue(other);
6048 zunionInterAggregate(score, value, aggregate);
6049 } else {
6050 break;
6051 }
6052 }
6053
6054 /* skip entry when not present in every source dict */
6055 if (j != setnum) {
6056 zfree(score);
6057 } else {
6058 robj *o = dictGetEntryKey(de);
6059 dictAdd(dstzset->dict,o,score);
6060 incrRefCount(o); /* added to dictionary */
6061 zslInsert(dstzset->zsl,*score,o);
6062 incrRefCount(o); /* added to skiplist */
6063 }
6064 }
6065 dictReleaseIterator(di);
6066 }
6067 } else if (op == REDIS_OP_UNION) {
6068 for (i = 0; i < setnum; i++) {
6069 if (!src[i].dict) continue;
6070
6071 di = dictGetIterator(src[i].dict);
6072 while((de = dictNext(di)) != NULL) {
6073 /* skip key when already processed */
6074 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6075
6076 double *score = zmalloc(sizeof(double)), value;
6077 *score = src[i].weight * zunionInterDictValue(de);
6078
6079 /* because the zsets are sorted by size, its only possible
6080 * for sets at larger indices to hold this entry */
6081 for (j = (i+1); j < setnum; j++) {
6082 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6083 if (other) {
6084 value = src[j].weight * zunionInterDictValue(other);
6085 zunionInterAggregate(score, value, aggregate);
6086 }
6087 }
6088
6089 robj *o = dictGetEntryKey(de);
6090 dictAdd(dstzset->dict,o,score);
6091 incrRefCount(o); /* added to dictionary */
6092 zslInsert(dstzset->zsl,*score,o);
6093 incrRefCount(o); /* added to skiplist */
6094 }
6095 dictReleaseIterator(di);
6096 }
6097 } else {
6098 /* unknown operator */
6099 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6100 }
6101
6102 deleteKey(c->db,dstkey);
6103 if (dstzset->zsl->length) {
6104 dictAdd(c->db->dict,dstkey,dstobj);
6105 incrRefCount(dstkey);
6106 addReplyLongLong(c, dstzset->zsl->length);
6107 server.dirty++;
6108 } else {
6109 decrRefCount(dstobj);
6110 addReply(c, shared.czero);
6111 }
6112 zfree(src);
6113 }
6114
6115 static void zunionstoreCommand(redisClient *c) {
6116 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6117 }
6118
6119 static void zinterstoreCommand(redisClient *c) {
6120 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6121 }
6122
6123 static void zrangeGenericCommand(redisClient *c, int reverse) {
6124 robj *o;
6125 long start;
6126 long end;
6127 int withscores = 0;
6128 int llen;
6129 int rangelen, j;
6130 zset *zsetobj;
6131 zskiplist *zsl;
6132 zskiplistNode *ln;
6133 robj *ele;
6134
6135 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6136 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6137
6138 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6139 withscores = 1;
6140 } else if (c->argc >= 5) {
6141 addReply(c,shared.syntaxerr);
6142 return;
6143 }
6144
6145 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6146 || checkType(c,o,REDIS_ZSET)) return;
6147 zsetobj = o->ptr;
6148 zsl = zsetobj->zsl;
6149 llen = zsl->length;
6150
6151 /* convert negative indexes */
6152 if (start < 0) start = llen+start;
6153 if (end < 0) end = llen+end;
6154 if (start < 0) start = 0;
6155 if (end < 0) end = 0;
6156
6157 /* indexes sanity checks */
6158 if (start > end || start >= llen) {
6159 /* Out of range start or start > end result in empty list */
6160 addReply(c,shared.emptymultibulk);
6161 return;
6162 }
6163 if (end >= llen) end = llen-1;
6164 rangelen = (end-start)+1;
6165
6166 /* check if starting point is trivial, before searching
6167 * the element in log(N) time */
6168 if (reverse) {
6169 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6170 } else {
6171 ln = start == 0 ?
6172 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6173 }
6174
6175 /* Return the result in form of a multi-bulk reply */
6176 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6177 withscores ? (rangelen*2) : rangelen));
6178 for (j = 0; j < rangelen; j++) {
6179 ele = ln->obj;
6180 addReplyBulk(c,ele);
6181 if (withscores)
6182 addReplyDouble(c,ln->score);
6183 ln = reverse ? ln->backward : ln->forward[0];
6184 }
6185 }
6186
6187 static void zrangeCommand(redisClient *c) {
6188 zrangeGenericCommand(c,0);
6189 }
6190
6191 static void zrevrangeCommand(redisClient *c) {
6192 zrangeGenericCommand(c,1);
6193 }
6194
6195 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6196 * If justcount is non-zero, just the count is returned. */
6197 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6198 robj *o;
6199 double min, max;
6200 int minex = 0, maxex = 0; /* are min or max exclusive? */
6201 int offset = 0, limit = -1;
6202 int withscores = 0;
6203 int badsyntax = 0;
6204
6205 /* Parse the min-max interval. If one of the values is prefixed
6206 * by the "(" character, it's considered "open". For instance
6207 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6208 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6209 if (((char*)c->argv[2]->ptr)[0] == '(') {
6210 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6211 minex = 1;
6212 } else {
6213 min = strtod(c->argv[2]->ptr,NULL);
6214 }
6215 if (((char*)c->argv[3]->ptr)[0] == '(') {
6216 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6217 maxex = 1;
6218 } else {
6219 max = strtod(c->argv[3]->ptr,NULL);
6220 }
6221
6222 /* Parse "WITHSCORES": note that if the command was called with
6223 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6224 * enter the following paths to parse WITHSCORES and LIMIT. */
6225 if (c->argc == 5 || c->argc == 8) {
6226 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6227 withscores = 1;
6228 else
6229 badsyntax = 1;
6230 }
6231 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6232 badsyntax = 1;
6233 if (badsyntax) {
6234 addReplySds(c,
6235 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6236 return;
6237 }
6238
6239 /* Parse "LIMIT" */
6240 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6241 addReply(c,shared.syntaxerr);
6242 return;
6243 } else if (c->argc == (7 + withscores)) {
6244 offset = atoi(c->argv[5]->ptr);
6245 limit = atoi(c->argv[6]->ptr);
6246 if (offset < 0) offset = 0;
6247 }
6248
6249 /* Ok, lookup the key and get the range */
6250 o = lookupKeyRead(c->db,c->argv[1]);
6251 if (o == NULL) {
6252 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6253 } else {
6254 if (o->type != REDIS_ZSET) {
6255 addReply(c,shared.wrongtypeerr);
6256 } else {
6257 zset *zsetobj = o->ptr;
6258 zskiplist *zsl = zsetobj->zsl;
6259 zskiplistNode *ln;
6260 robj *ele, *lenobj = NULL;
6261 unsigned long rangelen = 0;
6262
6263 /* Get the first node with the score >= min, or with
6264 * score > min if 'minex' is true. */
6265 ln = zslFirstWithScore(zsl,min);
6266 while (minex && ln && ln->score == min) ln = ln->forward[0];
6267
6268 if (ln == NULL) {
6269 /* No element matching the speciifed interval */
6270 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6271 return;
6272 }
6273
6274 /* We don't know in advance how many matching elements there
6275 * are in the list, so we push this object that will represent
6276 * the multi-bulk length in the output buffer, and will "fix"
6277 * it later */
6278 if (!justcount) {
6279 lenobj = createObject(REDIS_STRING,NULL);
6280 addReply(c,lenobj);
6281 decrRefCount(lenobj);
6282 }
6283
6284 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6285 if (offset) {
6286 offset--;
6287 ln = ln->forward[0];
6288 continue;
6289 }
6290 if (limit == 0) break;
6291 if (!justcount) {
6292 ele = ln->obj;
6293 addReplyBulk(c,ele);
6294 if (withscores)
6295 addReplyDouble(c,ln->score);
6296 }
6297 ln = ln->forward[0];
6298 rangelen++;
6299 if (limit > 0) limit--;
6300 }
6301 if (justcount) {
6302 addReplyLongLong(c,(long)rangelen);
6303 } else {
6304 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6305 withscores ? (rangelen*2) : rangelen);
6306 }
6307 }
6308 }
6309 }
6310
6311 static void zrangebyscoreCommand(redisClient *c) {
6312 genericZrangebyscoreCommand(c,0);
6313 }
6314
6315 static void zcountCommand(redisClient *c) {
6316 genericZrangebyscoreCommand(c,1);
6317 }
6318
6319 static void zcardCommand(redisClient *c) {
6320 robj *o;
6321 zset *zs;
6322
6323 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6324 checkType(c,o,REDIS_ZSET)) return;
6325
6326 zs = o->ptr;
6327 addReplyUlong(c,zs->zsl->length);
6328 }
6329
6330 static void zscoreCommand(redisClient *c) {
6331 robj *o;
6332 zset *zs;
6333 dictEntry *de;
6334
6335 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6336 checkType(c,o,REDIS_ZSET)) return;
6337
6338 zs = o->ptr;
6339 de = dictFind(zs->dict,c->argv[2]);
6340 if (!de) {
6341 addReply(c,shared.nullbulk);
6342 } else {
6343 double *score = dictGetEntryVal(de);
6344
6345 addReplyDouble(c,*score);
6346 }
6347 }
6348
6349 static void zrankGenericCommand(redisClient *c, int reverse) {
6350 robj *o;
6351 zset *zs;
6352 zskiplist *zsl;
6353 dictEntry *de;
6354 unsigned long rank;
6355 double *score;
6356
6357 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6358 checkType(c,o,REDIS_ZSET)) return;
6359
6360 zs = o->ptr;
6361 zsl = zs->zsl;
6362 de = dictFind(zs->dict,c->argv[2]);
6363 if (!de) {
6364 addReply(c,shared.nullbulk);
6365 return;
6366 }
6367
6368 score = dictGetEntryVal(de);
6369 rank = zslGetRank(zsl, *score, c->argv[2]);
6370 if (rank) {
6371 if (reverse) {
6372 addReplyLongLong(c, zsl->length - rank);
6373 } else {
6374 addReplyLongLong(c, rank-1);
6375 }
6376 } else {
6377 addReply(c,shared.nullbulk);
6378 }
6379 }
6380
6381 static void zrankCommand(redisClient *c) {
6382 zrankGenericCommand(c, 0);
6383 }
6384
6385 static void zrevrankCommand(redisClient *c) {
6386 zrankGenericCommand(c, 1);
6387 }
6388
6389 /* ========================= Hashes utility functions ======================= */
/* Selectors telling hashCurrent() which part of the current entry to
 * return. They are tested there with a bitwise AND. */
#define REDIS_HASH_KEY   1
#define REDIS_HASH_VALUE 2
6392
6393 /* Check the length of a number of objects to see if we need to convert a
6394 * zipmap to a real hash. Note that we only check string encoded objects
6395 * as their string length can be queried in constant time. */
6396 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6397 int i;
6398 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6399
6400 for (i = start; i <= end; i++) {
6401 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6402 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6403 {
6404 convertToRealHash(subject);
6405 return;
6406 }
6407 }
6408 }
6409
6410 /* Encode given objects in-place when the hash uses a dict. */
6411 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6412 if (subject->encoding == REDIS_ENCODING_HT) {
6413 if (o1) *o1 = tryObjectEncoding(*o1);
6414 if (o2) *o2 = tryObjectEncoding(*o2);
6415 }
6416 }
6417
6418 /* Get the value from a hash identified by key. Returns either a string
6419 * object or NULL if the value cannot be found. The refcount of the object
6420 * is always increased by 1 when the value was found. */
6421 static robj *hashGet(robj *o, robj *key) {
6422 robj *value = NULL;
6423 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6424 unsigned char *v;
6425 unsigned int vlen;
6426 key = getDecodedObject(key);
6427 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6428 value = createStringObject((char*)v,vlen);
6429 }
6430 decrRefCount(key);
6431 } else {
6432 dictEntry *de = dictFind(o->ptr,key);
6433 if (de != NULL) {
6434 value = dictGetEntryVal(de);
6435 incrRefCount(value);
6436 }
6437 }
6438 return value;
6439 }
6440
6441 /* Test if the key exists in the given hash. Returns 1 if the key
6442 * exists and 0 when it doesn't. */
6443 static int hashExists(robj *o, robj *key) {
6444 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6445 key = getDecodedObject(key);
6446 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6447 decrRefCount(key);
6448 return 1;
6449 }
6450 decrRefCount(key);
6451 } else {
6452 if (dictFind(o->ptr,key) != NULL) {
6453 return 1;
6454 }
6455 }
6456 return 0;
6457 }
6458
6459 /* Add an element, discard the old if the key already exists.
6460 * Return 0 on insert and 1 on update. */
6461 static int hashSet(robj *o, robj *key, robj *value) {
6462 int update = 0;
6463 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6464 key = getDecodedObject(key);
6465 value = getDecodedObject(value);
6466 o->ptr = zipmapSet(o->ptr,
6467 key->ptr,sdslen(key->ptr),
6468 value->ptr,sdslen(value->ptr), &update);
6469 decrRefCount(key);
6470 decrRefCount(value);
6471
6472 /* Check if the zipmap needs to be upgraded to a real hash table */
6473 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6474 convertToRealHash(o);
6475 } else {
6476 if (dictReplace(o->ptr,key,value)) {
6477 /* Insert */
6478 incrRefCount(key);
6479 } else {
6480 /* Update */
6481 update = 1;
6482 }
6483 incrRefCount(value);
6484 }
6485 return update;
6486 }
6487
6488 /* Delete an element from a hash.
6489 * Return 1 on deleted and 0 on not found. */
6490 static int hashDelete(robj *o, robj *key) {
6491 int deleted = 0;
6492 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6493 key = getDecodedObject(key);
6494 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6495 decrRefCount(key);
6496 } else {
6497 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6498 /* Always check if the dictionary needs a resize after a delete. */
6499 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6500 }
6501 return deleted;
6502 }
6503
6504 /* Return the number of elements in a hash. */
6505 static unsigned long hashLength(robj *o) {
6506 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6507 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6508 }
6509
6510 /* Structure to hold hash iteration abstration. Note that iteration over
6511 * hashes involves both fields and values. Because it is possible that
6512 * not both are required, store pointers in the iterator to avoid
6513 * unnecessary memory allocation for fields/values. */
6514 typedef struct {
6515 int encoding;
6516 unsigned char *zi;
6517 unsigned char *zk, *zv;
6518 unsigned int zklen, zvlen;
6519
6520 dictIterator *di;
6521 dictEntry *de;
6522 } hashIterator;
6523
6524 static hashIterator *hashInitIterator(robj *subject) {
6525 hashIterator *hi = zmalloc(sizeof(hashIterator));
6526 hi->encoding = subject->encoding;
6527 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6528 hi->zi = zipmapRewind(subject->ptr);
6529 } else if (hi->encoding == REDIS_ENCODING_HT) {
6530 hi->di = dictGetIterator(subject->ptr);
6531 } else {
6532 redisAssert(NULL);
6533 }
6534 return hi;
6535 }
6536
6537 static void hashReleaseIterator(hashIterator *hi) {
6538 if (hi->encoding == REDIS_ENCODING_HT) {
6539 dictReleaseIterator(hi->di);
6540 }
6541 zfree(hi);
6542 }
6543
6544 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6545 * could be found and REDIS_ERR when the iterator reaches the end. */
6546 static int hashNext(hashIterator *hi) {
6547 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6548 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6549 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6550 } else {
6551 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6552 }
6553 return REDIS_OK;
6554 }
6555
6556 /* Get key or value object at current iteration position.
6557 * This increases the refcount of the field object by 1. */
6558 static robj *hashCurrent(hashIterator *hi, int what) {
6559 robj *o;
6560 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6561 if (what & REDIS_HASH_KEY) {
6562 o = createStringObject((char*)hi->zk,hi->zklen);
6563 } else {
6564 o = createStringObject((char*)hi->zv,hi->zvlen);
6565 }
6566 } else {
6567 if (what & REDIS_HASH_KEY) {
6568 o = dictGetEntryKey(hi->de);
6569 } else {
6570 o = dictGetEntryVal(hi->de);
6571 }
6572 incrRefCount(o);
6573 }
6574 return o;
6575 }
6576
6577 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6578 robj *o = lookupKeyWrite(c->db,key);
6579 if (o == NULL) {
6580 o = createHashObject();
6581 dictAdd(c->db->dict,key,o);
6582 incrRefCount(key);
6583 } else {
6584 if (o->type != REDIS_HASH) {
6585 addReply(c,shared.wrongtypeerr);
6586 return NULL;
6587 }
6588 }
6589 return o;
6590 }
6591
6592 /* ============================= Hash commands ============================== */
6593 static void hsetCommand(redisClient *c) {
6594 int update;
6595 robj *o;
6596
6597 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6598 hashTryConversion(o,c->argv,2,3);
6599 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6600 update = hashSet(o,c->argv[2],c->argv[3]);
6601 addReply(c, update ? shared.czero : shared.cone);
6602 server.dirty++;
6603 }
6604
6605 static void hsetnxCommand(redisClient *c) {
6606 robj *o;
6607 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6608 hashTryConversion(o,c->argv,2,3);
6609
6610 if (hashExists(o, c->argv[2])) {
6611 addReply(c, shared.czero);
6612 } else {
6613 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6614 hashSet(o,c->argv[2],c->argv[3]);
6615 addReply(c, shared.cone);
6616 server.dirty++;
6617 }
6618 }
6619
6620 static void hmsetCommand(redisClient *c) {
6621 int i;
6622 robj *o;
6623
6624 if ((c->argc % 2) == 1) {
6625 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6626 return;
6627 }
6628
6629 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6630 hashTryConversion(o,c->argv,2,c->argc-1);
6631 for (i = 2; i < c->argc; i += 2) {
6632 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6633 hashSet(o,c->argv[i],c->argv[i+1]);
6634 }
6635 addReply(c, shared.ok);
6636 server.dirty++;
6637 }
6638
6639 static void hincrbyCommand(redisClient *c) {
6640 long long value, incr;
6641 robj *o, *current, *new;
6642
6643 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6644 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6645 if ((current = hashGet(o,c->argv[2])) != NULL) {
6646 if (getLongLongFromObjectOrReply(c,current,&value,
6647 "hash value is not an integer") != REDIS_OK) {
6648 decrRefCount(current);
6649 return;
6650 }
6651 decrRefCount(current);
6652 } else {
6653 value = 0;
6654 }
6655
6656 value += incr;
6657 new = createStringObjectFromLongLong(value);
6658 hashTryObjectEncoding(o,&c->argv[2],NULL);
6659 hashSet(o,c->argv[2],new);
6660 decrRefCount(new);
6661 addReplyLongLong(c,value);
6662 server.dirty++;
6663 }
6664
6665 static void hgetCommand(redisClient *c) {
6666 robj *o, *value;
6667 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6668 checkType(c,o,REDIS_HASH)) return;
6669
6670 if ((value = hashGet(o,c->argv[2])) != NULL) {
6671 addReplyBulk(c,value);
6672 decrRefCount(value);
6673 } else {
6674 addReply(c,shared.nullbulk);
6675 }
6676 }
6677
6678 static void hmgetCommand(redisClient *c) {
6679 int i;
6680 robj *o, *value;
6681 o = lookupKeyRead(c->db,c->argv[1]);
6682 if (o != NULL && o->type != REDIS_HASH) {
6683 addReply(c,shared.wrongtypeerr);
6684 }
6685
6686 /* Note the check for o != NULL happens inside the loop. This is
6687 * done because objects that cannot be found are considered to be
6688 * an empty hash. The reply should then be a series of NULLs. */
6689 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6690 for (i = 2; i < c->argc; i++) {
6691 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6692 addReplyBulk(c,value);
6693 decrRefCount(value);
6694 } else {
6695 addReply(c,shared.nullbulk);
6696 }
6697 }
6698 }
6699
6700 static void hdelCommand(redisClient *c) {
6701 robj *o;
6702 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6703 checkType(c,o,REDIS_HASH)) return;
6704
6705 if (hashDelete(o,c->argv[2])) {
6706 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6707 addReply(c,shared.cone);
6708 server.dirty++;
6709 } else {
6710 addReply(c,shared.czero);
6711 }
6712 }
6713
6714 static void hlenCommand(redisClient *c) {
6715 robj *o;
6716 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6717 checkType(c,o,REDIS_HASH)) return;
6718
6719 addReplyUlong(c,hashLength(o));
6720 }
6721
6722 static void genericHgetallCommand(redisClient *c, int flags) {
6723 robj *o, *lenobj, *obj;
6724 unsigned long count = 0;
6725 hashIterator *hi;
6726
6727 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6728 || checkType(c,o,REDIS_HASH)) return;
6729
6730 lenobj = createObject(REDIS_STRING,NULL);
6731 addReply(c,lenobj);
6732 decrRefCount(lenobj);
6733
6734 hi = hashInitIterator(o);
6735 while (hashNext(hi) != REDIS_ERR) {
6736 if (flags & REDIS_HASH_KEY) {
6737 obj = hashCurrent(hi,REDIS_HASH_KEY);
6738 addReplyBulk(c,obj);
6739 decrRefCount(obj);
6740 count++;
6741 }
6742 if (flags & REDIS_HASH_VALUE) {
6743 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6744 addReplyBulk(c,obj);
6745 decrRefCount(obj);
6746 count++;
6747 }
6748 }
6749 hashReleaseIterator(hi);
6750
6751 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6752 }
6753
6754 static void hkeysCommand(redisClient *c) {
6755 genericHgetallCommand(c,REDIS_HASH_KEY);
6756 }
6757
6758 static void hvalsCommand(redisClient *c) {
6759 genericHgetallCommand(c,REDIS_HASH_VALUE);
6760 }
6761
6762 static void hgetallCommand(redisClient *c) {
6763 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6764 }
6765
6766 static void hexistsCommand(redisClient *c) {
6767 robj *o;
6768 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6769 checkType(c,o,REDIS_HASH)) return;
6770
6771 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6772 }
6773
6774 static void convertToRealHash(robj *o) {
6775 unsigned char *key, *val, *p, *zm = o->ptr;
6776 unsigned int klen, vlen;
6777 dict *dict = dictCreate(&hashDictType,NULL);
6778
6779 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6780 p = zipmapRewind(zm);
6781 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6782 robj *keyobj, *valobj;
6783
6784 keyobj = createStringObject((char*)key,klen);
6785 valobj = createStringObject((char*)val,vlen);
6786 keyobj = tryObjectEncoding(keyobj);
6787 valobj = tryObjectEncoding(valobj);
6788 dictAdd(dict,keyobj,valobj);
6789 }
6790 o->encoding = REDIS_ENCODING_HT;
6791 o->ptr = dict;
6792 zfree(zm);
6793 }
6794
6795 /* ========================= Non type-specific commands ==================== */
6796
6797 static void flushdbCommand(redisClient *c) {
6798 server.dirty += dictSize(c->db->dict);
6799 touchWatchedKeysOnFlush(c->db->id);
6800 dictEmpty(c->db->dict);
6801 dictEmpty(c->db->expires);
6802 addReply(c,shared.ok);
6803 }
6804
6805 static void flushallCommand(redisClient *c) {
6806 touchWatchedKeysOnFlush(-1);
6807 server.dirty += emptyDb();
6808 addReply(c,shared.ok);
6809 if (server.bgsavechildpid != -1) {
6810 kill(server.bgsavechildpid,SIGKILL);
6811 rdbRemoveTempFile(server.bgsavechildpid);
6812 }
6813 rdbSave(server.dbfilename);
6814 server.dirty++;
6815 }
6816
6817 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6818 redisSortOperation *so = zmalloc(sizeof(*so));
6819 so->type = type;
6820 so->pattern = pattern;
6821 return so;
6822 }
6823
6824 /* Return the value associated to the key with a name obtained
6825 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6826 * The returned object will always have its refcount increased by 1
6827 * when it is non-NULL. */
6828 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6829 char *p, *f;
6830 sds spat, ssub;
6831 robj keyobj, fieldobj, *o;
6832 int prefixlen, sublen, postfixlen, fieldlen;
6833 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6834 struct {
6835 long len;
6836 long free;
6837 char buf[REDIS_SORTKEY_MAX+1];
6838 } keyname, fieldname;
6839
6840 /* If the pattern is "#" return the substitution object itself in order
6841 * to implement the "SORT ... GET #" feature. */
6842 spat = pattern->ptr;
6843 if (spat[0] == '#' && spat[1] == '\0') {
6844 incrRefCount(subst);
6845 return subst;
6846 }
6847
6848 /* The substitution object may be specially encoded. If so we create
6849 * a decoded object on the fly. Otherwise getDecodedObject will just
6850 * increment the ref count, that we'll decrement later. */
6851 subst = getDecodedObject(subst);
6852
6853 ssub = subst->ptr;
6854 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6855 p = strchr(spat,'*');
6856 if (!p) {
6857 decrRefCount(subst);
6858 return NULL;
6859 }
6860
6861 /* Find out if we're dealing with a hash dereference. */
6862 if ((f = strstr(p+1, "->")) != NULL) {
6863 fieldlen = sdslen(spat)-(f-spat);
6864 /* this also copies \0 character */
6865 memcpy(fieldname.buf,f+2,fieldlen-1);
6866 fieldname.len = fieldlen-2;
6867 } else {
6868 fieldlen = 0;
6869 }
6870
6871 prefixlen = p-spat;
6872 sublen = sdslen(ssub);
6873 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6874 memcpy(keyname.buf,spat,prefixlen);
6875 memcpy(keyname.buf+prefixlen,ssub,sublen);
6876 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6877 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6878 keyname.len = prefixlen+sublen+postfixlen;
6879 decrRefCount(subst);
6880
6881 /* Lookup substituted key */
6882 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6883 o = lookupKeyRead(db,&keyobj);
6884 if (o == NULL) return NULL;
6885
6886 if (fieldlen > 0) {
6887 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6888
6889 /* Retrieve value from hash by the field name. This operation
6890 * already increases the refcount of the returned object. */
6891 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6892 o = hashGet(o, &fieldobj);
6893 } else {
6894 if (o->type != REDIS_STRING) return NULL;
6895
6896 /* Every object that this function returns needs to have its refcount
6897 * increased. sortCommand decreases it again. */
6898 incrRefCount(o);
6899 }
6900
6901 return o;
6902 }
6903
6904 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6905 * the additional parameter is not standard but a BSD-specific we have to
6906 * pass sorting parameters via the global 'server' structure */
6907 static int sortCompare(const void *s1, const void *s2) {
6908 const redisSortObject *so1 = s1, *so2 = s2;
6909 int cmp;
6910
6911 if (!server.sort_alpha) {
6912 /* Numeric sorting. Here it's trivial as we precomputed scores */
6913 if (so1->u.score > so2->u.score) {
6914 cmp = 1;
6915 } else if (so1->u.score < so2->u.score) {
6916 cmp = -1;
6917 } else {
6918 cmp = 0;
6919 }
6920 } else {
6921 /* Alphanumeric sorting */
6922 if (server.sort_bypattern) {
6923 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6924 /* At least one compare object is NULL */
6925 if (so1->u.cmpobj == so2->u.cmpobj)
6926 cmp = 0;
6927 else if (so1->u.cmpobj == NULL)
6928 cmp = -1;
6929 else
6930 cmp = 1;
6931 } else {
6932 /* We have both the objects, use strcoll */
6933 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6934 }
6935 } else {
6936 /* Compare elements directly. */
6937 cmp = compareStringObjects(so1->obj,so2->obj);
6938 }
6939 }
6940 return server.sort_desc ? -cmp : cmp;
6941 }
6942
6943 /* The SORT command is the most complex command in Redis. Warning: this code
6944 * is optimized for speed and a bit less for readability */
/* SORT key [ASC|DESC] [ALPHA] [LIMIT start count] [BY pattern]
 *          [GET pattern ...] [STORE dstkey]
 *
 * The key in argv[1] must hold a list, a set or a sorted set; a missing key
 * gets an empty multi bulk reply and any other type a wrong type error.
 * The function works in these steps:
 *  1. parse the options (an unknown or truncated option is a syntax error);
 *  2. load every element of the container into 'vector';
 *  3. unless sorting is disabled, compute the score / compare object of
 *     every element and sort the vector with qsort() or pqsort();
 *  4. either send the LIMIT-ed range to the client or, with STORE, save it
 *     as a list under the destination key and reply with its length;
 *  5. release the temporary objects. */
6945 static void sortCommand(redisClient *c) {
6946 list *operations;
6947 int outputlen = 0;
6948 int desc = 0, alpha = 0;
6949 int limit_start = 0, limit_count = -1, start, end;
6950 int j, dontsort = 0, vectorlen;
6951 int getop = 0; /* GET operation counter */
6952 robj *sortval, *sortby = NULL, *storekey = NULL;
6953 redisSortObject *vector; /* Resulting vector to sort */
6954
6955 /* Lookup the key to sort. It must be of the right types */
6956 sortval = lookupKeyRead(c->db,c->argv[1]);
6957 if (sortval == NULL) {
6958 addReply(c,shared.emptymultibulk);
6959 return;
6960 }
6961 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6962 sortval->type != REDIS_ZSET)
6963 {
6964 addReply(c,shared.wrongtypeerr);
6965 return;
6966 }
6967
6968 /* Create a list of operations to perform for every sorted element.
6969 * Operations can be GET/DEL/INCR/DECR */
/* Note: the parser below only ever creates REDIS_SORT_GET operations. */
6970 operations = listCreate();
6971 listSetFreeMethod(operations,zfree);
6972 j = 2;
6973
6974 /* Now we need to protect sortval incrementing its count, in the future
6975 * SORT may have options able to overwrite/delete keys during the sorting
6976 * and the sorted key itself may get destroyed */
6977 incrRefCount(sortval);
6978
6979 /* The SORT command has an SQL-alike syntax, parse it */
6980 while(j < c->argc) {
/* Number of arguments following the current one. */
6981 int leftargs = c->argc-j-1;
6982 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6983 desc = 0;
6984 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6985 desc = 1;
6986 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6987 alpha = 1;
6988 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6989 limit_start = atoi(c->argv[j+1]->ptr);
6990 limit_count = atoi(c->argv[j+2]->ptr);
6991 j+=2;
6992 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6993 storekey = c->argv[j+1];
6994 j++;
6995 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6996 sortby = c->argv[j+1];
6997 /* If the BY pattern does not contain '*', i.e. it is constant,
6998 * we don't need to sort nor to lookup the weight keys. */
6999 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7000 j++;
7001 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7002 listAddNodeTail(operations,createSortOperation(
7003 REDIS_SORT_GET,c->argv[j+1]));
7004 getop++;
7005 j++;
7006 } else {
/* Unknown option, or an option missing its argument(s): undo the
 * setup done so far and report a syntax error. */
7007 decrRefCount(sortval);
7008 listRelease(operations);
7009 addReply(c,shared.syntaxerr);
7010 return;
7011 }
7012 j++;
7013 }
7014
7015 /* Load the sorting vector with all the objects to sort */
7016 switch(sortval->type) {
7017 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7018 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7019 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7020 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7021 }
7022 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7023 j = 0;
7024
7025 if (sortval->type == REDIS_LIST) {
7026 list *list = sortval->ptr;
7027 listNode *ln;
7028 listIter li;
7029
7030 listRewind(list,&li);
7031 while((ln = listNext(&li))) {
7032 robj *ele = ln->value;
7033 vector[j].obj = ele;
7034 vector[j].u.score = 0;
7035 vector[j].u.cmpobj = NULL;
7036 j++;
7037 }
7038 } else {
7039 dict *set;
7040 dictIterator *di;
7041 dictEntry *setele;
7042
7043 if (sortval->type == REDIS_SET) {
7044 set = sortval->ptr;
7045 } else {
7046 zset *zs = sortval->ptr;
7047 set = zs->dict;
7048 }
7049
7050 di = dictGetIterator(set);
7051 while((setele = dictNext(di)) != NULL) {
7052 vector[j].obj = dictGetEntryKey(setele);
7053 vector[j].u.score = 0;
7054 vector[j].u.cmpobj = NULL;
7055 j++;
7056 }
7057 dictReleaseIterator(di);
7058 }
7059 redisAssert(j == vectorlen);
7060
7061 /* Now it's time to load the right scores in the sorting vector */
7062 if (dontsort == 0) {
7063 for (j = 0; j < vectorlen; j++) {
7064 robj *byval;
7065 if (sortby) {
7066 /* lookup value to sort by */
7067 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
/* No weight found: the element keeps the defaults set while
 * loading the vector. */
7068 if (!byval) continue;
7069 } else {
7070 /* use object itself to sort by */
7071 byval = vector[j].obj;
7072 }
7073
7074 if (alpha) {
7075 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7076 } else {
7077 if (byval->encoding == REDIS_ENCODING_RAW) {
7078 vector[j].u.score = strtod(byval->ptr,NULL);
7079 } else if (byval->encoding == REDIS_ENCODING_INT) {
7080 /* Don't need to decode the object if it's
7081 * integer-encoded (the only encoding supported) so
7082 * far. We can just cast it */
7083 vector[j].u.score = (long)byval->ptr;
7084 } else {
7085 redisAssert(1 != 1);
7086 }
7087 }
7088
7089 /* when the object was retrieved using lookupKeyByPattern,
7090 * its refcount needs to be decreased. */
7091 if (sortby) {
7092 decrRefCount(byval);
7093 }
7094 }
7095 }
7096
7097 /* We are ready to sort the vector... perform a bit of sanity check
7098 * on the LIMIT option too. We'll use a partial version of quicksort. */
7099 start = (limit_start < 0) ? 0 : limit_start;
7100 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
/* A start past the last element yields an empty range (end < start). */
7101 if (start >= vectorlen) {
7102 start = vectorlen-1;
7103 end = vectorlen-2;
7104 }
7105 if (end >= vectorlen) end = vectorlen-1;
7106
7107 if (dontsort == 0) {
/* sortCompare() reads its parameters from these globals. */
7108 server.sort_desc = desc;
7109 server.sort_alpha = alpha;
7110 server.sort_bypattern = sortby ? 1 : 0;
7111 if (sortby && (start != 0 || end != vectorlen-1))
7112 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7113 else
7114 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7115 }
7116
7117 /* Send command output to the output buffer, performing the specified
7118 * GET/DEL/INCR/DECR operations if any. */
/* With GET operations every element in range produces one reply item per
 * GET, otherwise one item (the element itself). */
7119 outputlen = getop ? getop*(end-start+1) : end-start+1;
7120 if (storekey == NULL) {
7121 /* STORE option not specified, send the sorting result to client */
7122 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7123 for (j = start; j <= end; j++) {
7124 listNode *ln;
7125 listIter li;
7126
7127 if (!getop) addReplyBulk(c,vector[j].obj);
7128 listRewind(operations,&li);
7129 while((ln = listNext(&li))) {
7130 redisSortOperation *sop = ln->value;
7131 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7132 vector[j].obj);
7133
7134 if (sop->type == REDIS_SORT_GET) {
7135 if (!val) {
7136 addReply(c,shared.nullbulk);
7137 } else {
7138 addReplyBulk(c,val);
7139 decrRefCount(val);
7140 }
7141 } else {
7142 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7143 }
7144 }
7145 }
7146 } else {
7147 robj *listObject = createListObject();
7148 list *listPtr = (list*) listObject->ptr;
7149
7150 /* STORE option specified, set the sorting result as a List object */
7151 for (j = start; j <= end; j++) {
7152 listNode *ln;
7153 listIter li;
7154
7155 if (!getop) {
7156 listAddNodeTail(listPtr,vector[j].obj);
7157 incrRefCount(vector[j].obj);
7158 }
7159 listRewind(operations,&li);
7160 while((ln = listNext(&li))) {
7161 redisSortOperation *sop = ln->value;
7162 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7163 vector[j].obj);
7164
7165 if (sop->type == REDIS_SORT_GET) {
7166 if (!val) {
/* A missing GET value is stored as an empty string. */
7167 listAddNodeTail(listPtr,createStringObject("",0));
7168 } else {
7169 /* We should do a incrRefCount on val because it is
7170 * added to the list, but also a decrRefCount because
7171 * it is returned by lookupKeyByPattern. This results
7172 * in doing nothing at all. */
7173 listAddNodeTail(listPtr,val);
7174 }
7175 } else {
7176 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7177 }
7178 }
7179 }
/* NOTE(review): the refcount of storekey is incremented only when
 * dictReplace() returns non-zero - presumably meaning that a new entry
 * was added rather than an existing one overwritten; confirm against
 * dictReplace(). */
7180 if (dictReplace(c->db->dict,storekey,listObject)) {
7181 incrRefCount(storekey);
7182 }
7183 /* Note: we add 1 because the DB is dirty anyway since even if the
7184 * SORT result is empty a new key is set and maybe the old content
7185 * replaced. */
7186 server.dirty += 1+outputlen;
7187 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7188 }
7189
7190 /* Cleanup */
7191 decrRefCount(sortval);
7192 listRelease(operations);
/* Compare objects only exist when sorting alphabetically BY a pattern;
 * they were obtained with getDecodedObject() and are released here. */
7193 for (j = 0; j < vectorlen; j++) {
7194 if (alpha && vector[j].u.cmpobj)
7195 decrRefCount(vector[j].u.cmpobj);
7196 }
7197 zfree(vector);
7198 }
7199
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and must be large enough for the result:
 * the longest output is a 20 digit number followed by 'B' and the
 * terminating NUL byte. 'n' is the amount of bytes to format.
 *
 * Values below 1024 are printed as an integer number of bytes; larger
 * values are scaled to K, M, G, T or P (powers of 1024) and printed with
 * two decimal digits. Values too large even for the P unit fall back to
 * the plain byte count, so 's' is always written. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024ULL*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024ULL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024ULL*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024ULL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Bigger than any unit handled above: print the raw byte count
         * rather than leaving the buffer untouched. */
        sprintf(s,"%lluB",n);
    }
}
7220
7221 /* Create the string returned by the INFO command. This is decoupled
7222 * by the INFO command itself as we need to report the same information
7223 * on memory corruption problems. */
7224 static sds genRedisInfoString(void) {
7225 sds info;
7226 time_t uptime = time(NULL)-server.stat_starttime;
7227 int j;
7228 char hmem[64];
7229
7230 bytesToHuman(hmem,zmalloc_used_memory());
7231 info = sdscatprintf(sdsempty(),
7232 "redis_version:%s\r\n"
7233 "redis_git_sha1:%s\r\n"
7234 "redis_git_dirty:%d\r\n"
7235 "arch_bits:%s\r\n"
7236 "multiplexing_api:%s\r\n"
7237 "process_id:%ld\r\n"
7238 "uptime_in_seconds:%ld\r\n"
7239 "uptime_in_days:%ld\r\n"
7240 "connected_clients:%d\r\n"
7241 "connected_slaves:%d\r\n"
7242 "blocked_clients:%d\r\n"
7243 "used_memory:%zu\r\n"
7244 "used_memory_human:%s\r\n"
7245 "changes_since_last_save:%lld\r\n"
7246 "bgsave_in_progress:%d\r\n"
7247 "last_save_time:%ld\r\n"
7248 "bgrewriteaof_in_progress:%d\r\n"
7249 "total_connections_received:%lld\r\n"
7250 "total_commands_processed:%lld\r\n"
7251 "expired_keys:%lld\r\n"
7252 "hash_max_zipmap_entries:%zu\r\n"
7253 "hash_max_zipmap_value:%zu\r\n"
7254 "pubsub_channels:%ld\r\n"
7255 "pubsub_patterns:%u\r\n"
7256 "vm_enabled:%d\r\n"
7257 "role:%s\r\n"
7258 ,REDIS_VERSION,
7259 REDIS_GIT_SHA1,
7260 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7261 (sizeof(long) == 8) ? "64" : "32",
7262 aeGetApiName(),
7263 (long) getpid(),
7264 uptime,
7265 uptime/(3600*24),
7266 listLength(server.clients)-listLength(server.slaves),
7267 listLength(server.slaves),
7268 server.blpop_blocked_clients,
7269 zmalloc_used_memory(),
7270 hmem,
7271 server.dirty,
7272 server.bgsavechildpid != -1,
7273 server.lastsave,
7274 server.bgrewritechildpid != -1,
7275 server.stat_numconnections,
7276 server.stat_numcommands,
7277 server.stat_expiredkeys,
7278 server.hash_max_zipmap_entries,
7279 server.hash_max_zipmap_value,
7280 dictSize(server.pubsub_channels),
7281 listLength(server.pubsub_patterns),
7282 server.vm_enabled != 0,
7283 server.masterhost == NULL ? "master" : "slave"
7284 );
7285 if (server.masterhost) {
7286 info = sdscatprintf(info,
7287 "master_host:%s\r\n"
7288 "master_port:%d\r\n"
7289 "master_link_status:%s\r\n"
7290 "master_last_io_seconds_ago:%d\r\n"
7291 ,server.masterhost,
7292 server.masterport,
7293 (server.replstate == REDIS_REPL_CONNECTED) ?
7294 "up" : "down",
7295 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7296 );
7297 }
7298 if (server.vm_enabled) {
7299 lockThreadedIO();
7300 info = sdscatprintf(info,
7301 "vm_conf_max_memory:%llu\r\n"
7302 "vm_conf_page_size:%llu\r\n"
7303 "vm_conf_pages:%llu\r\n"
7304 "vm_stats_used_pages:%llu\r\n"
7305 "vm_stats_swapped_objects:%llu\r\n"
7306 "vm_stats_swappin_count:%llu\r\n"
7307 "vm_stats_swappout_count:%llu\r\n"
7308 "vm_stats_io_newjobs_len:%lu\r\n"
7309 "vm_stats_io_processing_len:%lu\r\n"
7310 "vm_stats_io_processed_len:%lu\r\n"
7311 "vm_stats_io_active_threads:%lu\r\n"
7312 "vm_stats_blocked_clients:%lu\r\n"
7313 ,(unsigned long long) server.vm_max_memory,
7314 (unsigned long long) server.vm_page_size,
7315 (unsigned long long) server.vm_pages,
7316 (unsigned long long) server.vm_stats_used_pages,
7317 (unsigned long long) server.vm_stats_swapped_objects,
7318 (unsigned long long) server.vm_stats_swapins,
7319 (unsigned long long) server.vm_stats_swapouts,
7320 (unsigned long) listLength(server.io_newjobs),
7321 (unsigned long) listLength(server.io_processing),
7322 (unsigned long) listLength(server.io_processed),
7323 (unsigned long) server.io_active_threads,
7324 (unsigned long) server.vm_blocked_clients
7325 );
7326 unlockThreadedIO();
7327 }
7328 for (j = 0; j < server.dbnum; j++) {
7329 long long keys, vkeys;
7330
7331 keys = dictSize(server.db[j].dict);
7332 vkeys = dictSize(server.db[j].expires);
7333 if (keys || vkeys) {
7334 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7335 j, keys, vkeys);
7336 }
7337 }
7338 return info;
7339 }
7340
7341 static void infoCommand(redisClient *c) {
7342 sds info = genRedisInfoString();
7343 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7344 (unsigned long)sdslen(info)));
7345 addReplySds(c,info);
7346 addReply(c,shared.crlf);
7347 }
7348
7349 static void monitorCommand(redisClient *c) {
7350 /* ignore MONITOR if aleady slave or in monitor mode */
7351 if (c->flags & REDIS_SLAVE) return;
7352
7353 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7354 c->slaveseldb = 0;
7355 listAddNodeTail(server.monitors,c);
7356 addReply(c,shared.ok);
7357 }
7358
7359 /* ================================= Expire ================================= */
7360 static int removeExpire(redisDb *db, robj *key) {
7361 if (dictDelete(db->expires,key) == DICT_OK) {
7362 return 1;
7363 } else {
7364 return 0;
7365 }
7366 }
7367
7368 static int setExpire(redisDb *db, robj *key, time_t when) {
7369 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7370 return 0;
7371 } else {
7372 incrRefCount(key);
7373 return 1;
7374 }
7375 }
7376
7377 /* Return the expire time of the specified key, or -1 if no expire
7378 * is associated with this key (i.e. the key is non volatile) */
7379 static time_t getExpire(redisDb *db, robj *key) {
7380 dictEntry *de;
7381
7382 /* No expire? return ASAP */
7383 if (dictSize(db->expires) == 0 ||
7384 (de = dictFind(db->expires,key)) == NULL) return -1;
7385
7386 return (time_t) dictGetEntryVal(de);
7387 }
7388
7389 static int expireIfNeeded(redisDb *db, robj *key) {
7390 time_t when;
7391 dictEntry *de;
7392
7393 /* No expire? return ASAP */
7394 if (dictSize(db->expires) == 0 ||
7395 (de = dictFind(db->expires,key)) == NULL) return 0;
7396
7397 /* Lookup the expire */
7398 when = (time_t) dictGetEntryVal(de);
7399 if (time(NULL) <= when) return 0;
7400
7401 /* Delete the key */
7402 dictDelete(db->expires,key);
7403 server.stat_expiredkeys++;
7404 return dictDelete(db->dict,key) == DICT_OK;
7405 }
7406
7407 static int deleteIfVolatile(redisDb *db, robj *key) {
7408 dictEntry *de;
7409
7410 /* No expire? return ASAP */
7411 if (dictSize(db->expires) == 0 ||
7412 (de = dictFind(db->expires,key)) == NULL) return 0;
7413
7414 /* Delete the key */
7415 server.dirty++;
7416 server.stat_expiredkeys++;
7417 dictDelete(db->expires,key);
7418 return dictDelete(db->dict,key) == DICT_OK;
7419 }
7420
7421 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7422 dictEntry *de;
7423 time_t seconds;
7424
7425 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7426
7427 seconds -= offset;
7428
7429 de = dictFind(c->db->dict,key);
7430 if (de == NULL) {
7431 addReply(c,shared.czero);
7432 return;
7433 }
7434 if (seconds <= 0) {
7435 if (deleteKey(c->db,key)) server.dirty++;
7436 addReply(c, shared.cone);
7437 return;
7438 } else {
7439 time_t when = time(NULL)+seconds;
7440 if (setExpire(c->db,key,when)) {
7441 addReply(c,shared.cone);
7442 server.dirty++;
7443 } else {
7444 addReply(c,shared.czero);
7445 }
7446 return;
7447 }
7448 }
7449
7450 static void expireCommand(redisClient *c) {
7451 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7452 }
7453
7454 static void expireatCommand(redisClient *c) {
7455 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7456 }
7457
7458 static void ttlCommand(redisClient *c) {
7459 time_t expire;
7460 int ttl = -1;
7461
7462 expire = getExpire(c->db,c->argv[1]);
7463 if (expire != -1) {
7464 ttl = (int) (expire-time(NULL));
7465 if (ttl < 0) ttl = -1;
7466 }
7467 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7468 }
7469
7470 /* ================================ MULTI/EXEC ============================== */
7471
7472 /* Client state initialization for MULTI/EXEC */
7473 static void initClientMultiState(redisClient *c) {
7474 c->mstate.commands = NULL;
7475 c->mstate.count = 0;
7476 }
7477
7478 /* Release all the resources associated with MULTI/EXEC state */
7479 static void freeClientMultiState(redisClient *c) {
7480 int j;
7481
7482 for (j = 0; j < c->mstate.count; j++) {
7483 int i;
7484 multiCmd *mc = c->mstate.commands+j;
7485
7486 for (i = 0; i < mc->argc; i++)
7487 decrRefCount(mc->argv[i]);
7488 zfree(mc->argv);
7489 }
7490 zfree(c->mstate.commands);
7491 }
7492
7493 /* Add a new command into the MULTI commands queue */
7494 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7495 multiCmd *mc;
7496 int j;
7497
7498 c->mstate.commands = zrealloc(c->mstate.commands,
7499 sizeof(multiCmd)*(c->mstate.count+1));
7500 mc = c->mstate.commands+c->mstate.count;
7501 mc->cmd = cmd;
7502 mc->argc = c->argc;
7503 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7504 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7505 for (j = 0; j < c->argc; j++)
7506 incrRefCount(mc->argv[j]);
7507 c->mstate.count++;
7508 }
7509
7510 static void multiCommand(redisClient *c) {
7511 if (c->flags & REDIS_MULTI) {
7512 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7513 return;
7514 }
7515 c->flags |= REDIS_MULTI;
7516 addReply(c,shared.ok);
7517 }
7518
7519 static void discardCommand(redisClient *c) {
7520 if (!(c->flags & REDIS_MULTI)) {
7521 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7522 return;
7523 }
7524
7525 freeClientMultiState(c);
7526 initClientMultiState(c);
7527 c->flags &= (~REDIS_MULTI);
7528 addReply(c,shared.ok);
7529 }
7530
7531 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7532 * implememntation for more information. */
7533 static void execCommandReplicateMulti(redisClient *c) {
7534 struct redisCommand *cmd;
7535 robj *multistring = createStringObject("MULTI",5);
7536
7537 cmd = lookupCommand("multi");
7538 if (server.appendonly)
7539 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7540 if (listLength(server.slaves))
7541 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7542 decrRefCount(multistring);
7543 }
7544
7545 static void execCommand(redisClient *c) {
7546 int j;
7547 robj **orig_argv;
7548 int orig_argc;
7549
7550 if (!(c->flags & REDIS_MULTI)) {
7551 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7552 return;
7553 }
7554
7555 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7556 * A failed EXEC will return a multi bulk nil object. */
7557 if (c->flags & REDIS_DIRTY_CAS) {
7558 freeClientMultiState(c);
7559 initClientMultiState(c);
7560 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7561 unwatchAllKeys(c);
7562 addReply(c,shared.nullmultibulk);
7563 return;
7564 }
7565
7566 /* Replicate a MULTI request now that we are sure the block is executed.
7567 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7568 * both the AOF and the replication link will have the same consistency
7569 * and atomicity guarantees. */
7570 execCommandReplicateMulti(c);
7571
7572 /* Exec all the queued commands */
7573 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
7574 orig_argv = c->argv;
7575 orig_argc = c->argc;
7576 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7577 for (j = 0; j < c->mstate.count; j++) {
7578 c->argc = c->mstate.commands[j].argc;
7579 c->argv = c->mstate.commands[j].argv;
7580 call(c,c->mstate.commands[j].cmd);
7581 }
7582 c->argv = orig_argv;
7583 c->argc = orig_argc;
7584 freeClientMultiState(c);
7585 initClientMultiState(c);
7586 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7587 /* Make sure the EXEC command is always replicated / AOF, since we
7588 * always send the MULTI command (we can't know beforehand if the
7589 * next operations will contain at least a modification to the DB). */
7590 server.dirty++;
7591 }
7592
7593 /* =========================== Blocking Operations ========================= */
7594
7595 /* Currently Redis blocking operations support is limited to list POP ops,
7596 * so the current implementation is not fully generic, but it is also not
7597 * completely specific, so it will not require a rewrite to support new
7598 * kinds of blocking operations in the future.
7599 *
7600 * Still it's important to note that list blocking operations can already be
7601 * used as a notification mechanism in order to implement other blocking
7602 * operations at application level, so there must be very strong evidence
7603 * of usefulness and generality before new blocking operations are implemented.
7604 *
7605 * This is how the current blocking POP works, using BLPOP as an example:
7606 * - If the user calls BLPOP and the key exists and contains a non empty list
7607 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7608 * if there is no need to block.
7609 * - If instead BLPOP is called and the key does not exist or the list is
7610 * empty we need to block. In order to do so we remove the notification for
7611 * new data to read in the client socket (so that we'll not serve new
7612 * requests if the blocking request is not served). Also we put the client
7613 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
7614 * blocking for these keys.
7615 * - If a PUSH operation against a key with blocked clients waiting is
7616 * performed, we serve the first in the list: basically instead of pushing
7617 * the new element inside the list we return it to the (first / oldest)
7618 * blocking client, unblock the client, and remove it from the list.
7619 *
7620 * The above comment and the source code should be enough in order to understand
7621 * the implementation and modify / fix it later.
7622 */
7623
7624 /* Set a client in blocking mode for the specified key, with the specified
7625 * timeout */
7626 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7627 dictEntry *de;
7628 list *l;
7629 int j;
7630
7631 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7632 c->blocking_keys_num = numkeys;
7633 c->blockingto = timeout;
7634 for (j = 0; j < numkeys; j++) {
7635 /* Add the key in the client structure, to map clients -> keys */
7636 c->blocking_keys[j] = keys[j];
7637 incrRefCount(keys[j]);
7638
7639 /* And in the other "side", to map keys -> clients */
7640 de = dictFind(c->db->blocking_keys,keys[j]);
7641 if (de == NULL) {
7642 int retval;
7643
7644 /* For every key we take a list of clients blocked for it */
7645 l = listCreate();
7646 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7647 incrRefCount(keys[j]);
7648 assert(retval == DICT_OK);
7649 } else {
7650 l = dictGetEntryVal(de);
7651 }
7652 listAddNodeTail(l,c);
7653 }
7654 /* Mark the client as a blocked client */
7655 c->flags |= REDIS_BLOCKED;
7656 server.blpop_blocked_clients++;
7657 }
7658
7659 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7660 static void unblockClientWaitingData(redisClient *c) {
7661 dictEntry *de;
7662 list *l;
7663 int j;
7664
7665 assert(c->blocking_keys != NULL);
7666 /* The client may wait for multiple keys, so unblock it for every key. */
7667 for (j = 0; j < c->blocking_keys_num; j++) {
7668 /* Remove this client from the list of clients waiting for this key. */
7669 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7670 assert(de != NULL);
7671 l = dictGetEntryVal(de);
7672 listDelNode(l,listSearchKey(l,c));
7673 /* If the list is empty we need to remove it to avoid wasting memory */
7674 if (listLength(l) == 0)
7675 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7676 decrRefCount(c->blocking_keys[j]);
7677 }
7678 /* Cleanup the client structure */
7679 zfree(c->blocking_keys);
7680 c->blocking_keys = NULL;
7681 c->flags &= (~REDIS_BLOCKED);
7682 server.blpop_blocked_clients--;
7683 /* We want to process data if there is some command waiting
7684 * in the input buffer. Note that this is safe even if
7685 * unblockClientWaitingData() gets called from freeClient() because
7686 * freeClient() will be smart enough to call this function
7687 * *after* c->querybuf was set to NULL. */
7688 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7689 }
7690
7691 /* This should be called from any function PUSHing into lists.
7692 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7693 * 'ele' is the element pushed.
7694 *
7695 * If the function returns 0 there was no client waiting for a list push
7696 * against this key.
7697 *
7698 * If the function returns 1 there was a client waiting for a list push
7699 * against this key, the element was passed to this client thus it's not
7700 * needed to actually add it to the list and the caller should return asap. */
7701 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7702 struct dictEntry *de;
7703 redisClient *receiver;
7704 list *l;
7705 listNode *ln;
7706
7707 de = dictFind(c->db->blocking_keys,key);
7708 if (de == NULL) return 0;
7709 l = dictGetEntryVal(de);
7710 ln = listFirst(l);
7711 assert(ln != NULL);
7712 receiver = ln->value;
7713
7714 addReplySds(receiver,sdsnew("*2\r\n"));
7715 addReplyBulk(receiver,key);
7716 addReplyBulk(receiver,ele);
7717 unblockClientWaitingData(receiver);
7718 return 1;
7719 }
7720
7721 /* Blocking RPOP/LPOP */
7722 static void blockingPopGenericCommand(redisClient *c, int where) {
7723 robj *o;
7724 time_t timeout;
7725 int j;
7726
7727 for (j = 1; j < c->argc-1; j++) {
7728 o = lookupKeyWrite(c->db,c->argv[j]);
7729 if (o != NULL) {
7730 if (o->type != REDIS_LIST) {
7731 addReply(c,shared.wrongtypeerr);
7732 return;
7733 } else {
7734 list *list = o->ptr;
7735 if (listLength(list) != 0) {
7736 /* If the list contains elements fall back to the usual
7737 * non-blocking POP operation */
7738 robj *argv[2], **orig_argv;
7739 int orig_argc;
7740
7741 /* We need to alter the command arguments before to call
7742 * popGenericCommand() as the command takes a single key. */
7743 orig_argv = c->argv;
7744 orig_argc = c->argc;
7745 argv[1] = c->argv[j];
7746 c->argv = argv;
7747 c->argc = 2;
7748
7749 /* Also the return value is different, we need to output
7750 * the multi bulk reply header and the key name. The
7751 * "real" command will add the last element (the value)
7752 * for us. If this souds like an hack to you it's just
7753 * because it is... */
7754 addReplySds(c,sdsnew("*2\r\n"));
7755 addReplyBulk(c,argv[1]);
7756 popGenericCommand(c,where);
7757
7758 /* Fix the client structure with the original stuff */
7759 c->argv = orig_argv;
7760 c->argc = orig_argc;
7761 return;
7762 }
7763 }
7764 }
7765 }
7766 /* If the list is empty or the key does not exists we must block */
7767 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7768 if (timeout > 0) timeout += time(NULL);
7769 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7770 }
7771
/* BLPOP command: blocking pop that delegates to blockingPopGenericCommand()
 * passing REDIS_HEAD as the list end to pop from. */
7772 static void blpopCommand(redisClient *c) {
7773 blockingPopGenericCommand(c,REDIS_HEAD);
7774 }
7775
/* BRPOP command: blocking pop that delegates to blockingPopGenericCommand()
 * passing REDIS_TAIL as the list end to pop from. */
7776 static void brpopCommand(redisClient *c) {
7777 blockingPopGenericCommand(c,REDIS_TAIL);
7778 }
7779
7780 /* =============================== Replication ============================= */
7781
7782 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7783 ssize_t nwritten, ret = size;
7784 time_t start = time(NULL);
7785
7786 timeout++;
7787 while(size) {
7788 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7789 nwritten = write(fd,ptr,size);
7790 if (nwritten == -1) return -1;
7791 ptr += nwritten;
7792 size -= nwritten;
7793 }
7794 if ((time(NULL)-start) > timeout) {
7795 errno = ETIMEDOUT;
7796 return -1;
7797 }
7798 }
7799 return ret;
7800 }
7801
7802 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7803 ssize_t nread, totread = 0;
7804 time_t start = time(NULL);
7805
7806 timeout++;
7807 while(size) {
7808 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7809 nread = read(fd,ptr,size);
7810 if (nread == -1) return -1;
7811 ptr += nread;
7812 size -= nread;
7813 totread += nread;
7814 }
7815 if ((time(NULL)-start) > timeout) {
7816 errno = ETIMEDOUT;
7817 return -1;
7818 }
7819 }
7820 return totread;
7821 }
7822
/* Read a line from 'fd' one byte at a time, storing it null terminated in
 * the buffer 'ptr' of 'size' bytes. The trailing "\n" (and a preceding "\r",
 * if any) is not stored.
 *
 * At most 'size'-1 characters are stored: if the line is longer, reading
 * stops once the buffer is full. 'timeout' is passed to syncRead() for
 * every single byte.
 *
 * Returns the number of characters stored before the newline was seen (a
 * stripped "\r" is still counted), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve one byte for the null terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* Account for the byte just stored, otherwise a line longer than
         * the buffer would be written past its end. */
        size--;
    }
    return nread;
}
7843
7844 static void syncCommand(redisClient *c) {
7845 /* ignore SYNC if aleady slave or in monitor mode */
7846 if (c->flags & REDIS_SLAVE) return;
7847
7848 /* SYNC can't be issued when the server has pending data to send to
7849 * the client about already issued commands. We need a fresh reply
7850 * buffer registering the differences between the BGSAVE and the current
7851 * dataset, so that we can copy to other slaves if needed. */
7852 if (listLength(c->reply) != 0) {
7853 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7854 return;
7855 }
7856
7857 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7858 /* Here we need to check if there is a background saving operation
7859 * in progress, or if it is required to start one */
7860 if (server.bgsavechildpid != -1) {
7861 /* Ok a background save is in progress. Let's check if it is a good
7862 * one for replication, i.e. if there is another slave that is
7863 * registering differences since the server forked to save */
7864 redisClient *slave;
7865 listNode *ln;
7866 listIter li;
7867
7868 listRewind(server.slaves,&li);
7869 while((ln = listNext(&li))) {
7870 slave = ln->value;
7871 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7872 }
7873 if (ln) {
7874 /* Perfect, the server is already registering differences for
7875 * another slave. Set the right state, and copy the buffer. */
7876 listRelease(c->reply);
7877 c->reply = listDup(slave->reply);
7878 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7879 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7880 } else {
7881 /* No way, we need to wait for the next BGSAVE in order to
7882 * register differences */
7883 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7884 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7885 }
7886 } else {
7887 /* Ok we don't have a BGSAVE in progress, let's start one */
7888 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7889 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7890 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7891 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7892 return;
7893 }
7894 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7895 }
7896 c->repldbfd = -1;
7897 c->flags |= REDIS_SLAVE;
7898 c->slaveseldb = 0;
7899 listAddNodeTail(server.slaves,c);
7900 return;
7901 }
7902
7903 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7904 redisClient *slave = privdata;
7905 REDIS_NOTUSED(el);
7906 REDIS_NOTUSED(mask);
7907 char buf[REDIS_IOBUF_LEN];
7908 ssize_t nwritten, buflen;
7909
7910 if (slave->repldboff == 0) {
7911 /* Write the bulk write count before to transfer the DB. In theory here
7912 * we don't know how much room there is in the output buffer of the
7913 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7914 * operations) will never be smaller than the few bytes we need. */
7915 sds bulkcount;
7916
7917 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7918 slave->repldbsize);
7919 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7920 {
7921 sdsfree(bulkcount);
7922 freeClient(slave);
7923 return;
7924 }
7925 sdsfree(bulkcount);
7926 }
7927 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7928 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7929 if (buflen <= 0) {
7930 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7931 (buflen == 0) ? "premature EOF" : strerror(errno));
7932 freeClient(slave);
7933 return;
7934 }
7935 if ((nwritten = write(fd,buf,buflen)) == -1) {
7936 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7937 strerror(errno));
7938 freeClient(slave);
7939 return;
7940 }
7941 slave->repldboff += nwritten;
7942 if (slave->repldboff == slave->repldbsize) {
7943 close(slave->repldbfd);
7944 slave->repldbfd = -1;
7945 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7946 slave->replstate = REDIS_REPL_ONLINE;
7947 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7948 sendReplyToClient, slave) == AE_ERR) {
7949 freeClient(slave);
7950 return;
7951 }
7952 addReplySds(slave,sdsempty());
7953 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7954 }
7955 }
7956
7957 /* This function is called at the end of every backgrond saving.
7958 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7959 * otherwise REDIS_ERR is passed to the function.
7960 *
7961 * The goal of this function is to handle slaves waiting for a successful
7962 * background saving in order to perform non-blocking synchronization. */
7963 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7964 listNode *ln;
7965 int startbgsave = 0;
7966 listIter li;
7967
7968 listRewind(server.slaves,&li);
7969 while((ln = listNext(&li))) {
7970 redisClient *slave = ln->value;
7971
7972 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7973 startbgsave = 1;
7974 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7975 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7976 struct redis_stat buf;
7977
7978 if (bgsaveerr != REDIS_OK) {
7979 freeClient(slave);
7980 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7981 continue;
7982 }
7983 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7984 redis_fstat(slave->repldbfd,&buf) == -1) {
7985 freeClient(slave);
7986 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7987 continue;
7988 }
7989 slave->repldboff = 0;
7990 slave->repldbsize = buf.st_size;
7991 slave->replstate = REDIS_REPL_SEND_BULK;
7992 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7993 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7994 freeClient(slave);
7995 continue;
7996 }
7997 }
7998 }
7999 if (startbgsave) {
8000 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8001 listIter li;
8002
8003 listRewind(server.slaves,&li);
8004 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8005 while((ln = listNext(&li))) {
8006 redisClient *slave = ln->value;
8007
8008 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8009 freeClient(slave);
8010 }
8011 }
8012 }
8013 }
8014
8015 static int syncWithMaster(void) {
8016 char buf[1024], tmpfile[256], authcmd[1024];
8017 long dumpsize;
8018 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8019 int dfd, maxtries = 5;
8020
8021 if (fd == -1) {
8022 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8023 strerror(errno));
8024 return REDIS_ERR;
8025 }
8026
8027 /* AUTH with the master if required. */
8028 if(server.masterauth) {
8029 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8030 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8031 close(fd);
8032 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8033 strerror(errno));
8034 return REDIS_ERR;
8035 }
8036 /* Read the AUTH result. */
8037 if (syncReadLine(fd,buf,1024,3600) == -1) {
8038 close(fd);
8039 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8040 strerror(errno));
8041 return REDIS_ERR;
8042 }
8043 if (buf[0] != '+') {
8044 close(fd);
8045 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8046 return REDIS_ERR;
8047 }
8048 }
8049
8050 /* Issue the SYNC command */
8051 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8052 close(fd);
8053 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8054 strerror(errno));
8055 return REDIS_ERR;
8056 }
8057 /* Read the bulk write count */
8058 if (syncReadLine(fd,buf,1024,3600) == -1) {
8059 close(fd);
8060 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8061 strerror(errno));
8062 return REDIS_ERR;
8063 }
8064 if (buf[0] != '$') {
8065 close(fd);
8066 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8067 return REDIS_ERR;
8068 }
8069 dumpsize = strtol(buf+1,NULL,10);
8070 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8071 /* Read the bulk write data on a temp file */
8072 while(maxtries--) {
8073 snprintf(tmpfile,256,
8074 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8075 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8076 if (dfd != -1) break;
8077 sleep(1);
8078 }
8079 if (dfd == -1) {
8080 close(fd);
8081 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8082 return REDIS_ERR;
8083 }
8084 while(dumpsize) {
8085 int nread, nwritten;
8086
8087 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8088 if (nread == -1) {
8089 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8090 strerror(errno));
8091 close(fd);
8092 close(dfd);
8093 return REDIS_ERR;
8094 }
8095 nwritten = write(dfd,buf,nread);
8096 if (nwritten == -1) {
8097 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8098 close(fd);
8099 close(dfd);
8100 return REDIS_ERR;
8101 }
8102 dumpsize -= nread;
8103 }
8104 close(dfd);
8105 if (rename(tmpfile,server.dbfilename) == -1) {
8106 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8107 unlink(tmpfile);
8108 close(fd);
8109 return REDIS_ERR;
8110 }
8111 emptyDb();
8112 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8113 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8114 close(fd);
8115 return REDIS_ERR;
8116 }
8117 server.master = createClient(fd);
8118 server.master->flags |= REDIS_MASTER;
8119 server.master->authenticated = 1;
8120 server.replstate = REDIS_REPL_CONNECTED;
8121 return REDIS_OK;
8122 }
8123
8124 static void slaveofCommand(redisClient *c) {
8125 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8126 !strcasecmp(c->argv[2]->ptr,"one")) {
8127 if (server.masterhost) {
8128 sdsfree(server.masterhost);
8129 server.masterhost = NULL;
8130 if (server.master) freeClient(server.master);
8131 server.replstate = REDIS_REPL_NONE;
8132 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8133 }
8134 } else {
8135 sdsfree(server.masterhost);
8136 server.masterhost = sdsdup(c->argv[1]->ptr);
8137 server.masterport = atoi(c->argv[2]->ptr);
8138 if (server.master) freeClient(server.master);
8139 server.replstate = REDIS_REPL_CONNECT;
8140 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8141 server.masterhost, server.masterport);
8142 }
8143 addReply(c,shared.ok);
8144 }
8145
8146 /* ============================ Maxmemory directive ======================== */
8147
8148 /* Try to free one object form the pre-allocated objects free list.
8149 * This is useful under low mem conditions as by default we take 1 million
8150 * free objects allocated. On success REDIS_OK is returned, otherwise
8151 * REDIS_ERR. */
8152 static int tryFreeOneObjectFromFreelist(void) {
8153 robj *o;
8154
8155 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8156 if (listLength(server.objfreelist)) {
8157 listNode *head = listFirst(server.objfreelist);
8158 o = listNodeValue(head);
8159 listDelNode(server.objfreelist,head);
8160 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8161 zfree(o);
8162 return REDIS_OK;
8163 } else {
8164 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8165 return REDIS_ERR;
8166 }
8167 }
8168
8169 /* This function gets called when 'maxmemory' is set on the config file to limit
8170 * the max memory used by the server, and we are out of memory.
8171 * This function will try to, in order:
8172 *
8173 * - Free objects from the free list
8174 * - Try to remove keys with an EXPIRE set
8175 *
8176 * It is not possible to free enough memory to reach used-memory < maxmemory
8177 * the server will start refusing commands that will enlarge even more the
8178 * memory usage.
8179 */
8180 static void freeMemoryIfNeeded(void) {
8181 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8182 int j, k, freed = 0;
8183
8184 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8185 for (j = 0; j < server.dbnum; j++) {
8186 int minttl = -1;
8187 robj *minkey = NULL;
8188 struct dictEntry *de;
8189
8190 if (dictSize(server.db[j].expires)) {
8191 freed = 1;
8192 /* From a sample of three keys drop the one nearest to
8193 * the natural expire */
8194 for (k = 0; k < 3; k++) {
8195 time_t t;
8196
8197 de = dictGetRandomKey(server.db[j].expires);
8198 t = (time_t) dictGetEntryVal(de);
8199 if (minttl == -1 || t < minttl) {
8200 minkey = dictGetEntryKey(de);
8201 minttl = t;
8202 }
8203 }
8204 deleteKey(server.db+j,minkey);
8205 }
8206 }
8207 if (!freed) return; /* nothing to free... */
8208 }
8209 }
8210
8211 /* ============================== Append Only file ========================== */
8212
8213 /* Write the append only file buffer on disk.
8214 *
8215 * Since we are required to write the AOF before replying to the client,
8216 * and the only way the client socket can get a write is entering when the
8217 * the event loop, we accumulate all the AOF writes in a memory
8218 * buffer and write it on disk using this function just before entering
8219 * the event loop again. */
8220 static void flushAppendOnlyFile(void) {
8221 time_t now;
8222 ssize_t nwritten;
8223
8224 if (sdslen(server.aofbuf) == 0) return;
8225
8226 /* We want to perform a single write. This should be guaranteed atomic
8227 * at least if the filesystem we are writing is a real physical one.
8228 * While this will save us against the server being killed I don't think
8229 * there is much to do about the whole server stopping for power problems
8230 * or alike */
8231 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8232 if (nwritten != (signed)sdslen(server.aofbuf)) {
8233 /* Ooops, we are in troubles. The best thing to do for now is
8234 * aborting instead of giving the illusion that everything is
8235 * working as expected. */
8236 if (nwritten == -1) {
8237 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8238 } else {
8239 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8240 }
8241 exit(1);
8242 }
8243 sdsfree(server.aofbuf);
8244 server.aofbuf = sdsempty();
8245
8246 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8247 * childs performing heavy I/O on disk. */
8248 if (server.no_appendfsync_on_rewrite &&
8249 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8250 return;
8251 /* Fsync if needed */
8252 now = time(NULL);
8253 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8254 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8255 now-server.lastfsync > 1))
8256 {
8257 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8258 * flushing metadata. */
8259 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8260 server.lastfsync = now;
8261 }
8262 }
8263
8264 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8265 int j;
8266 buf = sdscatprintf(buf,"*%d\r\n",argc);
8267 for (j = 0; j < argc; j++) {
8268 robj *o = getDecodedObject(argv[j]);
8269 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8270 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8271 buf = sdscatlen(buf,"\r\n",2);
8272 decrRefCount(o);
8273 }
8274 return buf;
8275 }
8276
8277 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8278 int argc = 3;
8279 long when;
8280 robj *argv[3];
8281
8282 /* Make sure we can use strtol */
8283 seconds = getDecodedObject(seconds);
8284 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8285 decrRefCount(seconds);
8286
8287 argv[0] = createStringObject("EXPIREAT",8);
8288 argv[1] = key;
8289 argv[2] = createObject(REDIS_STRING,
8290 sdscatprintf(sdsempty(),"%ld",when));
8291 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8292 decrRefCount(argv[0]);
8293 decrRefCount(argv[2]);
8294 return buf;
8295 }
8296
8297 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8298 sds buf = sdsempty();
8299 robj *tmpargv[3];
8300
8301 /* The DB this command was targetting is not the same as the last command
8302 * we appendend. To issue a SELECT command is needed. */
8303 if (dictid != server.appendseldb) {
8304 char seldb[64];
8305
8306 snprintf(seldb,sizeof(seldb),"%d",dictid);
8307 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8308 (unsigned long)strlen(seldb),seldb);
8309 server.appendseldb = dictid;
8310 }
8311
8312 if (cmd->proc == expireCommand) {
8313 /* Translate EXPIRE into EXPIREAT */
8314 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8315 } else if (cmd->proc == setexCommand) {
8316 /* Translate SETEX to SET and EXPIREAT */
8317 tmpargv[0] = createStringObject("SET",3);
8318 tmpargv[1] = argv[1];
8319 tmpargv[2] = argv[3];
8320 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8321 decrRefCount(tmpargv[0]);
8322 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8323 } else {
8324 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8325 }
8326
8327 /* Append to the AOF buffer. This will be flushed on disk just before
8328 * of re-entering the event loop, so before the client will get a
8329 * positive reply about the operation performed. */
8330 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8331
8332 /* If a background append only file rewriting is in progress we want to
8333 * accumulate the differences between the child DB and the current one
8334 * in a buffer, so that when the child process will do its work we
8335 * can append the differences to the new append only file. */
8336 if (server.bgrewritechildpid != -1)
8337 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8338
8339 sdsfree(buf);
8340 }
8341
8342 /* In Redis commands are always executed in the context of a client, so in
8343 * order to load the append only file we need to create a fake client. */
8344 static struct redisClient *createFakeClient(void) {
8345 struct redisClient *c = zmalloc(sizeof(*c));
8346
8347 selectDb(c,0);
8348 c->fd = -1;
8349 c->querybuf = sdsempty();
8350 c->argc = 0;
8351 c->argv = NULL;
8352 c->flags = 0;
8353 /* We set the fake client as a slave waiting for the synchronization
8354 * so that Redis will not try to send replies to this client. */
8355 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8356 c->reply = listCreate();
8357 listSetFreeMethod(c->reply,decrRefCount);
8358 listSetDupMethod(c->reply,dupClientReplyValue);
8359 initClientMultiState(c);
8360 return c;
8361 }
8362
/* Release a client obtained from createFakeClient(): the query buffer and
 * the reply list are freed, freeClientMultiState() is called on the client,
 * and finally the client structure itself is freed. */
8363 static void freeFakeClient(struct redisClient *c) {
8364 sdsfree(c->querybuf);
8365 listRelease(c->reply);
8366 freeClientMultiState(c);
8367 zfree(c);
8368 }
8369
8370 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8371 * error (the append only file is zero-length) REDIS_ERR is returned. On
8372 * fatal error an error message is logged and the program exists. */
8373 int loadAppendOnlyFile(char *filename) {
8374 struct redisClient *fakeClient;
8375 FILE *fp = fopen(filename,"r");
8376 struct redis_stat sb;
8377 unsigned long long loadedkeys = 0;
8378 int appendonly = server.appendonly;
8379
8380 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8381 return REDIS_ERR;
8382
8383 if (fp == NULL) {
8384 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8385 exit(1);
8386 }
8387
8388 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8389 * to the same file we're about to read. */
8390 server.appendonly = 0;
8391
8392 fakeClient = createFakeClient();
8393 while(1) {
8394 int argc, j;
8395 unsigned long len;
8396 robj **argv;
8397 char buf[128];
8398 sds argsds;
8399 struct redisCommand *cmd;
8400
8401 if (fgets(buf,sizeof(buf),fp) == NULL) {
8402 if (feof(fp))
8403 break;
8404 else
8405 goto readerr;
8406 }
8407 if (buf[0] != '*') goto fmterr;
8408 argc = atoi(buf+1);
8409 argv = zmalloc(sizeof(robj*)*argc);
8410 for (j = 0; j < argc; j++) {
8411 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8412 if (buf[0] != '$') goto fmterr;
8413 len = strtol(buf+1,NULL,10);
8414 argsds = sdsnewlen(NULL,len);
8415 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8416 argv[j] = createObject(REDIS_STRING,argsds);
8417 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8418 }
8419
8420 /* Command lookup */
8421 cmd = lookupCommand(argv[0]->ptr);
8422 if (!cmd) {
8423 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8424 exit(1);
8425 }
8426 /* Try object encoding */
8427 if (cmd->flags & REDIS_CMD_BULK)
8428 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8429 /* Run the command in the context of a fake client */
8430 fakeClient->argc = argc;
8431 fakeClient->argv = argv;
8432 cmd->proc(fakeClient);
8433 /* Discard the reply objects list from the fake client */
8434 while(listLength(fakeClient->reply))
8435 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8436 /* Clean up, ready for the next command */
8437 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8438 zfree(argv);
8439 /* Handle swapping while loading big datasets when VM is on */
8440 loadedkeys++;
8441 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8442 while (zmalloc_used_memory() > server.vm_max_memory) {
8443 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8444 }
8445 }
8446 }
8447
8448 /* This point can only be reached when EOF is reached without errors.
8449 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8450 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8451
8452 fclose(fp);
8453 freeFakeClient(fakeClient);
8454 server.appendonly = appendonly;
8455 return REDIS_OK;
8456
8457 readerr:
8458 if (feof(fp)) {
8459 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8460 } else {
8461 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8462 }
8463 exit(1);
8464 fmterr:
8465 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8466 exit(1);
8467 }
8468
8469 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8470 static int fwriteBulkObject(FILE *fp, robj *obj) {
8471 char buf[128];
8472 int decrrc = 0;
8473
8474 /* Avoid the incr/decr ref count business if possible to help
8475 * copy-on-write (we are often in a child process when this function
8476 * is called).
8477 * Also makes sure that key objects don't get incrRefCount-ed when VM
8478 * is enabled */
8479 if (obj->encoding != REDIS_ENCODING_RAW) {
8480 obj = getDecodedObject(obj);
8481 decrrc = 1;
8482 }
8483 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8484 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8485 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8486 goto err;
8487 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8488 if (decrrc) decrRefCount(obj);
8489 return 1;
8490 err:
8491 if (decrrc) decrRefCount(obj);
8492 return 0;
8493 }
8494
/* Write the binary-safe string 's' of 'len' bytes to 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * An empty string (len == 0) is written as "$0\r\n\r\n".
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long, so the matching conversion is %lu */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8506
/* Write the double 'd', formatted with "%.17g", to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t payloadlen;

    /* The payload buffer already includes the final CRLF, which is not
     * part of the count announced in the header. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payloadlen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payloadlen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8517
/* Write the long 'l', formatted as a decimal number, to 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t payloadlen;

    /* The payload buffer already includes the final CRLF, which is not
     * part of the count announced in the header. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payloadlen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payloadlen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8528
8529 /* Write a sequence of commands able to fully rebuild the dataset into
8530 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8531 static int rewriteAppendOnlyFile(char *filename) {
8532 dictIterator *di = NULL;
8533 dictEntry *de;
8534 FILE *fp;
8535 char tmpfile[256];
8536 int j;
8537 time_t now = time(NULL);
8538
8539 /* Note that we have to use a different temp name here compared to the
8540 * one used by rewriteAppendOnlyFileBackground() function. */
8541 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8542 fp = fopen(tmpfile,"w");
8543 if (!fp) {
8544 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8545 return REDIS_ERR;
8546 }
8547 for (j = 0; j < server.dbnum; j++) {
8548 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8549 redisDb *db = server.db+j;
8550 dict *d = db->dict;
8551 if (dictSize(d) == 0) continue;
8552 di = dictGetIterator(d);
8553 if (!di) {
8554 fclose(fp);
8555 return REDIS_ERR;
8556 }
8557
8558 /* SELECT the new DB */
8559 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8560 if (fwriteBulkLong(fp,j) == 0) goto werr;
8561
8562 /* Iterate this DB writing every entry */
8563 while((de = dictNext(di)) != NULL) {
8564 robj *key, *o;
8565 time_t expiretime;
8566 int swapped;
8567
8568 key = dictGetEntryKey(de);
8569 /* If the value for this key is swapped, load a preview in memory.
8570 * We use a "swapped" flag to remember if we need to free the
8571 * value object instead to just increment the ref count anyway
8572 * in order to avoid copy-on-write of pages if we are forked() */
8573 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8574 key->storage == REDIS_VM_SWAPPING) {
8575 o = dictGetEntryVal(de);
8576 swapped = 0;
8577 } else {
8578 o = vmPreviewObject(key);
8579 swapped = 1;
8580 }
8581 expiretime = getExpire(db,key);
8582
8583 /* Save the key and associated value */
8584 if (o->type == REDIS_STRING) {
8585 /* Emit a SET command */
8586 char cmd[]="*3\r\n$3\r\nSET\r\n";
8587 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8588 /* Key and value */
8589 if (fwriteBulkObject(fp,key) == 0) goto werr;
8590 if (fwriteBulkObject(fp,o) == 0) goto werr;
8591 } else if (o->type == REDIS_LIST) {
8592 /* Emit the RPUSHes needed to rebuild the list */
8593 list *list = o->ptr;
8594 listNode *ln;
8595 listIter li;
8596
8597 listRewind(list,&li);
8598 while((ln = listNext(&li))) {
8599 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8600 robj *eleobj = listNodeValue(ln);
8601
8602 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8603 if (fwriteBulkObject(fp,key) == 0) goto werr;
8604 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8605 }
8606 } else if (o->type == REDIS_SET) {
8607 /* Emit the SADDs needed to rebuild the set */
8608 dict *set = o->ptr;
8609 dictIterator *di = dictGetIterator(set);
8610 dictEntry *de;
8611
8612 while((de = dictNext(di)) != NULL) {
8613 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8614 robj *eleobj = dictGetEntryKey(de);
8615
8616 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8617 if (fwriteBulkObject(fp,key) == 0) goto werr;
8618 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8619 }
8620 dictReleaseIterator(di);
8621 } else if (o->type == REDIS_ZSET) {
8622 /* Emit the ZADDs needed to rebuild the sorted set */
8623 zset *zs = o->ptr;
8624 dictIterator *di = dictGetIterator(zs->dict);
8625 dictEntry *de;
8626
8627 while((de = dictNext(di)) != NULL) {
8628 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8629 robj *eleobj = dictGetEntryKey(de);
8630 double *score = dictGetEntryVal(de);
8631
8632 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8633 if (fwriteBulkObject(fp,key) == 0) goto werr;
8634 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8635 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8636 }
8637 dictReleaseIterator(di);
8638 } else if (o->type == REDIS_HASH) {
8639 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8640
8641 /* Emit the HSETs needed to rebuild the hash */
8642 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8643 unsigned char *p = zipmapRewind(o->ptr);
8644 unsigned char *field, *val;
8645 unsigned int flen, vlen;
8646
8647 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8648 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8649 if (fwriteBulkObject(fp,key) == 0) goto werr;
8650 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8651 return -1;
8652 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8653 return -1;
8654 }
8655 } else {
8656 dictIterator *di = dictGetIterator(o->ptr);
8657 dictEntry *de;
8658
8659 while((de = dictNext(di)) != NULL) {
8660 robj *field = dictGetEntryKey(de);
8661 robj *val = dictGetEntryVal(de);
8662
8663 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8664 if (fwriteBulkObject(fp,key) == 0) goto werr;
8665 if (fwriteBulkObject(fp,field) == -1) return -1;
8666 if (fwriteBulkObject(fp,val) == -1) return -1;
8667 }
8668 dictReleaseIterator(di);
8669 }
8670 } else {
8671 redisPanic("Unknown object type");
8672 }
8673 /* Save the expire time */
8674 if (expiretime != -1) {
8675 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8676 /* If this key is already expired skip it */
8677 if (expiretime < now) continue;
8678 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8679 if (fwriteBulkObject(fp,key) == 0) goto werr;
8680 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8681 }
8682 if (swapped) decrRefCount(o);
8683 }
8684 dictReleaseIterator(di);
8685 }
8686
8687 /* Make sure data will not remain on the OS's output buffers */
8688 fflush(fp);
8689 aof_fsync(fileno(fp));
8690 fclose(fp);
8691
8692 /* Use RENAME to make sure the DB file is changed atomically only
8693 * if the generate DB file is ok. */
8694 if (rename(tmpfile,filename) == -1) {
8695 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8696 unlink(tmpfile);
8697 return REDIS_ERR;
8698 }
8699 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8700 return REDIS_OK;
8701
8702 werr:
8703 fclose(fp);
8704 unlink(tmpfile);
8705 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8706 if (di) dictReleaseIterator(di);
8707 return REDIS_ERR;
8708 }
8709
8710 /* This is how rewriting of the append only file in background works:
8711 *
8712 * 1) The user calls BGREWRITEAOF
8713 * 2) Redis calls this function, that forks():
8714 * 2a) the child rewrite the append only file in a temp file.
8715 * 2b) the parent accumulates differences in server.bgrewritebuf.
8716 * 3) When the child finished '2a' exists.
8717 * 4) The parent will trap the exit code, if it's OK, will append the
8718 * data accumulated into server.bgrewritebuf into the temp file, and
8719 * finally will rename(2) the temp file in the actual file name.
8720 * The the new file is reopened as the new append only file. Profit!
8721 */
8722 static int rewriteAppendOnlyFileBackground(void) {
8723 pid_t childpid;
8724
8725 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8726 if (server.vm_enabled) waitEmptyIOJobsQueue();
8727 if ((childpid = fork()) == 0) {
8728 /* Child */
8729 char tmpfile[256];
8730
8731 if (server.vm_enabled) vmReopenSwapFile();
8732 close(server.fd);
8733 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8734 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8735 _exit(0);
8736 } else {
8737 _exit(1);
8738 }
8739 } else {
8740 /* Parent */
8741 if (childpid == -1) {
8742 redisLog(REDIS_WARNING,
8743 "Can't rewrite append only file in background: fork: %s",
8744 strerror(errno));
8745 return REDIS_ERR;
8746 }
8747 redisLog(REDIS_NOTICE,
8748 "Background append only file rewriting started by pid %d",childpid);
8749 server.bgrewritechildpid = childpid;
8750 updateDictResizePolicy();
8751 /* We set appendseldb to -1 in order to force the next call to the
8752 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8753 * accumulated by the parent into server.bgrewritebuf will start
8754 * with a SELECT statement and it will be safe to merge. */
8755 server.appendseldb = -1;
8756 return REDIS_OK;
8757 }
8758 return REDIS_OK; /* unreached */
8759 }
8760
8761 static void bgrewriteaofCommand(redisClient *c) {
8762 if (server.bgrewritechildpid != -1) {
8763 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8764 return;
8765 }
8766 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8767 char *status = "+Background append only file rewriting started\r\n";
8768 addReplySds(c,sdsnew(status));
8769 } else {
8770 addReply(c,shared.err);
8771 }
8772 }
8773
/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof" from the current
 * working directory. The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8780
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8801
8802 /* Called when the user switches from "appendonly yes" to "appendonly no"
8803 * at runtime using the CONFIG command. */
8804 static void stopAppendOnly(void) {
8805 flushAppendOnlyFile();
8806 aof_fsync(server.appendfd);
8807 close(server.appendfd);
8808
8809 server.appendfd = -1;
8810 server.appendseldb = -1;
8811 server.appendonly = 0;
8812 /* rewrite operation in progress? kill it, wait child exit */
8813 if (server.bgsavechildpid != -1) {
8814 int statloc;
8815
8816 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8817 wait3(&statloc,0,NULL);
8818 /* reset the buffer accumulating changes while the child saves */
8819 sdsfree(server.bgrewritebuf);
8820 server.bgrewritebuf = sdsempty();
8821 server.bgsavechildpid = -1;
8822 }
8823 }
8824
8825 /* Called when the user switches from "appendonly no" to "appendonly yes"
8826 * at runtime using the CONFIG command. */
8827 static int startAppendOnly(void) {
8828 server.appendonly = 1;
8829 server.lastfsync = time(NULL);
8830 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8831 if (server.appendfd == -1) {
8832 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8833 return REDIS_ERR;
8834 }
8835 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8836 server.appendonly = 0;
8837 close(server.appendfd);
8838 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8839 return REDIS_ERR;
8840 }
8841 return REDIS_OK;
8842 }
8843
8844 /* =================== Virtual Memory - Blocking Side ====================== */
8845
8846 static void vmInit(void) {
8847 off_t totsize;
8848 int pipefds[2];
8849 size_t stacksize;
8850 struct flock fl;
8851
8852 if (server.vm_max_threads != 0)
8853 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8854
8855 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8856 /* Try to open the old swap file, otherwise create it */
8857 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8858 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8859 }
8860 if (server.vm_fp == NULL) {
8861 redisLog(REDIS_WARNING,
8862 "Can't open the swap file: %s. Exiting.",
8863 strerror(errno));
8864 exit(1);
8865 }
8866 server.vm_fd = fileno(server.vm_fp);
8867 /* Lock the swap file for writing, this is useful in order to avoid
8868 * another instance to use the same swap file for a config error. */
8869 fl.l_type = F_WRLCK;
8870 fl.l_whence = SEEK_SET;
8871 fl.l_start = fl.l_len = 0;
8872 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8873 redisLog(REDIS_WARNING,
8874 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8875 exit(1);
8876 }
8877 /* Initialize */
8878 server.vm_next_page = 0;
8879 server.vm_near_pages = 0;
8880 server.vm_stats_used_pages = 0;
8881 server.vm_stats_swapped_objects = 0;
8882 server.vm_stats_swapouts = 0;
8883 server.vm_stats_swapins = 0;
8884 totsize = server.vm_pages*server.vm_page_size;
8885 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8886 if (ftruncate(server.vm_fd,totsize) == -1) {
8887 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8888 strerror(errno));
8889 exit(1);
8890 } else {
8891 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8892 }
8893 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8894 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8895 (long long) (server.vm_pages+7)/8, server.vm_pages);
8896 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8897
8898 /* Initialize threaded I/O (used by Virtual Memory) */
8899 server.io_newjobs = listCreate();
8900 server.io_processing = listCreate();
8901 server.io_processed = listCreate();
8902 server.io_ready_clients = listCreate();
8903 pthread_mutex_init(&server.io_mutex,NULL);
8904 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8905 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8906 server.io_active_threads = 0;
8907 if (pipe(pipefds) == -1) {
8908 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8909 ,strerror(errno));
8910 exit(1);
8911 }
8912 server.io_ready_pipe_read = pipefds[0];
8913 server.io_ready_pipe_write = pipefds[1];
8914 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8915 /* LZF requires a lot of stack */
8916 pthread_attr_init(&server.io_threads_attr);
8917 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8918 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8919 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8920 /* Listen for events in the threaded I/O pipe */
8921 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8922 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8923 oom("creating file event");
8924 }
8925
8926 /* Mark the page as used */
8927 static void vmMarkPageUsed(off_t page) {
8928 off_t byte = page/8;
8929 int bit = page&7;
8930 redisAssert(vmFreePage(page) == 1);
8931 server.vm_bitmap[byte] |= 1<<bit;
8932 }
8933
8934 /* Mark N contiguous pages as used, with 'page' being the first. */
8935 static void vmMarkPagesUsed(off_t page, off_t count) {
8936 off_t j;
8937
8938 for (j = 0; j < count; j++)
8939 vmMarkPageUsed(page+j);
8940 server.vm_stats_used_pages += count;
8941 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8942 (long long)count, (long long)page);
8943 }
8944
8945 /* Mark the page as free */
8946 static void vmMarkPageFree(off_t page) {
8947 off_t byte = page/8;
8948 int bit = page&7;
8949 redisAssert(vmFreePage(page) == 0);
8950 server.vm_bitmap[byte] &= ~(1<<bit);
8951 }
8952
8953 /* Mark N contiguous pages as free, with 'page' being the first. */
8954 static void vmMarkPagesFree(off_t page, off_t count) {
8955 off_t j;
8956
8957 for (j = 0; j < count; j++)
8958 vmMarkPageFree(page+j);
8959 server.vm_stats_used_pages -= count;
8960 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8961 (long long)count, (long long)page);
8962 }
8963
8964 /* Test if the page is free */
8965 static int vmFreePage(off_t page) {
8966 off_t byte = page/8;
8967 int bit = page&7;
8968 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8969 }
8970
8971 /* Find N contiguous free pages storing the first page of the cluster in *first.
8972 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8973 * REDIS_ERR is returned.
8974 *
8975 * This function uses a simple algorithm: we try to allocate
8976 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8977 * again from the start of the swap file searching for free spaces.
8978 *
8979 * If it looks pretty clear that there are no free pages near our offset
8980 * we try to find less populated places doing a forward jump of
8981 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8982 * without hurry, and then we jump again and so forth...
8983 *
8984 * This function can be improved using a free list to avoid to guess
8985 * too much, since we could collect data about freed pages.
8986 *
8987 * note: I implemented this function just after watching an episode of
8988 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8989 */
8990 static int vmFindContiguousPages(off_t *first, off_t n) {
8991 off_t base, offset = 0, since_jump = 0, numfree = 0;
8992
8993 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8994 server.vm_near_pages = 0;
8995 server.vm_next_page = 0;
8996 }
8997 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8998 base = server.vm_next_page;
8999
9000 while(offset < server.vm_pages) {
9001 off_t this = base+offset;
9002
9003 /* If we overflow, restart from page zero */
9004 if (this >= server.vm_pages) {
9005 this -= server.vm_pages;
9006 if (this == 0) {
9007 /* Just overflowed, what we found on tail is no longer
9008 * interesting, as it's no longer contiguous. */
9009 numfree = 0;
9010 }
9011 }
9012 if (vmFreePage(this)) {
9013 /* This is a free page */
9014 numfree++;
9015 /* Already got N free pages? Return to the caller, with success */
9016 if (numfree == n) {
9017 *first = this-(n-1);
9018 server.vm_next_page = this+1;
9019 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9020 return REDIS_OK;
9021 }
9022 } else {
9023 /* The current one is not a free page */
9024 numfree = 0;
9025 }
9026
9027 /* Fast-forward if the current page is not free and we already
9028 * searched enough near this place. */
9029 since_jump++;
9030 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9031 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9032 since_jump = 0;
9033 /* Note that even if we rewind after the jump, we are don't need
9034 * to make sure numfree is set to zero as we only jump *if* it
9035 * is set to zero. */
9036 } else {
9037 /* Otherwise just check the next page */
9038 offset++;
9039 }
9040 }
9041 return REDIS_ERR;
9042 }
9043
9044 /* Write the specified object at the specified page of the swap file */
9045 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9046 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9047 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9048 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9049 redisLog(REDIS_WARNING,
9050 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9051 strerror(errno));
9052 return REDIS_ERR;
9053 }
9054 rdbSaveObject(server.vm_fp,o);
9055 fflush(server.vm_fp);
9056 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9057 return REDIS_OK;
9058 }
9059
9060 /* Swap the 'val' object relative to 'key' into disk. Store all the information
9061 * needed to later retrieve the object into the key object.
9062 * If we can't find enough contiguous empty pages to swap the object on disk
9063 * REDIS_ERR is returned. */
9064 static int vmSwapObjectBlocking(robj *key, robj *val) {
9065 off_t pages = rdbSavedObjectPages(val,NULL);
9066 off_t page;
9067
9068 assert(key->storage == REDIS_VM_MEMORY);
9069 assert(key->refcount == 1);
9070 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
9071 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
9072 key->vm.page = page;
9073 key->vm.usedpages = pages;
9074 key->storage = REDIS_VM_SWAPPED;
9075 key->vtype = val->type;
9076 decrRefCount(val); /* Deallocate the object from memory. */
9077 vmMarkPagesUsed(page,pages);
9078 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9079 (unsigned char*) key->ptr,
9080 (unsigned long long) page, (unsigned long long) pages);
9081 server.vm_stats_swapped_objects++;
9082 server.vm_stats_swapouts++;
9083 return REDIS_OK;
9084 }
9085
9086 static robj *vmReadObjectFromSwap(off_t page, int type) {
9087 robj *o;
9088
9089 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9090 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9091 redisLog(REDIS_WARNING,
9092 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9093 strerror(errno));
9094 _exit(1);
9095 }
9096 o = rdbLoadObject(type,server.vm_fp);
9097 if (o == NULL) {
9098 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9099 _exit(1);
9100 }
9101 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9102 return o;
9103 }
9104
9105 /* Load the value object relative to the 'key' object from swap to memory.
9106 * The newly allocated object is returned.
9107 *
9108 * If preview is true the unserialized object is returned to the caller but
9109 * no changes are made to the key object, nor the pages are marked as freed */
9110 static robj *vmGenericLoadObject(robj *key, int preview) {
9111 robj *val;
9112
9113 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
9114 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
9115 if (!preview) {
9116 key->storage = REDIS_VM_MEMORY;
9117 key->vm.atime = server.unixtime;
9118 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9119 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9120 (unsigned char*) key->ptr);
9121 server.vm_stats_swapped_objects--;
9122 } else {
9123 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9124 (unsigned char*) key->ptr);
9125 }
9126 server.vm_stats_swapins++;
9127 return val;
9128 }
9129
9130 /* Plain object loading, from swap to memory */
9131 static robj *vmLoadObject(robj *key) {
9132 /* If we are loading the object in background, stop it, we
9133 * need to load this object synchronously ASAP. */
9134 if (key->storage == REDIS_VM_LOADING)
9135 vmCancelThreadedIOJob(key);
9136 return vmGenericLoadObject(key,0);
9137 }
9138
9139 /* Just load the value on disk, without to modify the key.
9140 * This is useful when we want to perform some operation on the value
9141 * without to really bring it from swap to memory, like while saving the
9142 * dataset or rewriting the append only log. */
9143 static robj *vmPreviewObject(robj *key) {
9144 return vmGenericLoadObject(key,1);
9145 }
9146
9147 /* How a good candidate is this object for swapping?
9148 * The better candidate it is, the greater the returned value.
9149 *
9150 * Currently we try to perform a fast estimation of the object size in
9151 * memory, and combine it with aging informations.
9152 *
9153 * Basically swappability = idle-time * log(estimated size)
9154 *
9155 * Bigger objects are preferred over smaller objects, but not
9156 * proportionally, this is why we use the logarithm. This algorithm is
9157 * just a first try and will probably be tuned later. */
9158 static double computeObjectSwappability(robj *o) {
9159 time_t age = server.unixtime - o->vm.atime;
9160 long asize = 0;
9161 list *l;
9162 dict *d;
9163 struct dictEntry *de;
9164 int z;
9165
9166 if (age <= 0) return 0;
9167 switch(o->type) {
9168 case REDIS_STRING:
9169 if (o->encoding != REDIS_ENCODING_RAW) {
9170 asize = sizeof(*o);
9171 } else {
9172 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9173 }
9174 break;
9175 case REDIS_LIST:
9176 l = o->ptr;
9177 listNode *ln = listFirst(l);
9178
9179 asize = sizeof(list);
9180 if (ln) {
9181 robj *ele = ln->value;
9182 long elesize;
9183
9184 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9185 (sizeof(*o)+sdslen(ele->ptr)) :
9186 sizeof(*o);
9187 asize += (sizeof(listNode)+elesize)*listLength(l);
9188 }
9189 break;
9190 case REDIS_SET:
9191 case REDIS_ZSET:
9192 z = (o->type == REDIS_ZSET);
9193 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9194
9195 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9196 if (z) asize += sizeof(zset)-sizeof(dict);
9197 if (dictSize(d)) {
9198 long elesize;
9199 robj *ele;
9200
9201 de = dictGetRandomKey(d);
9202 ele = dictGetEntryKey(de);
9203 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9204 (sizeof(*o)+sdslen(ele->ptr)) :
9205 sizeof(*o);
9206 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9207 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9208 }
9209 break;
9210 case REDIS_HASH:
9211 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9212 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9213 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9214 unsigned int klen, vlen;
9215 unsigned char *key, *val;
9216
9217 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9218 klen = 0;
9219 vlen = 0;
9220 }
9221 asize = len*(klen+vlen+3);
9222 } else if (o->encoding == REDIS_ENCODING_HT) {
9223 d = o->ptr;
9224 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9225 if (dictSize(d)) {
9226 long elesize;
9227 robj *ele;
9228
9229 de = dictGetRandomKey(d);
9230 ele = dictGetEntryKey(de);
9231 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9232 (sizeof(*o)+sdslen(ele->ptr)) :
9233 sizeof(*o);
9234 ele = dictGetEntryVal(de);
9235 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9236 (sizeof(*o)+sdslen(ele->ptr)) :
9237 sizeof(*o);
9238 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9239 }
9240 }
9241 break;
9242 }
9243 return (double)age*log(1+asize);
9244 }
9245
9246 /* Try to swap an object that's a good candidate for swapping.
9247 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9248 * to swap any object at all.
9249 *
9250 * If 'usethreaded' is true, Redis will try to swap the object in background
9251 * using I/O threads. */
9252 static int vmSwapOneObject(int usethreads) {
9253 int j, i;
9254 struct dictEntry *best = NULL;
9255 double best_swappability = 0;
9256 redisDb *best_db = NULL;
9257 robj *key, *val;
9258
9259 for (j = 0; j < server.dbnum; j++) {
9260 redisDb *db = server.db+j;
9261 /* Why maxtries is set to 100?
9262 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9263 * are swappable objects */
9264 int maxtries = 100;
9265
9266 if (dictSize(db->dict) == 0) continue;
9267 for (i = 0; i < 5; i++) {
9268 dictEntry *de;
9269 double swappability;
9270
9271 if (maxtries) maxtries--;
9272 de = dictGetRandomKey(db->dict);
9273 key = dictGetEntryKey(de);
9274 val = dictGetEntryVal(de);
9275 /* Only swap objects that are currently in memory.
9276 *
9277 * Also don't swap shared objects if threaded VM is on, as we
9278 * try to ensure that the main thread does not touch the
9279 * object while the I/O thread is using it, but we can't
9280 * control other keys without adding additional mutex. */
9281 if (key->storage != REDIS_VM_MEMORY ||
9282 (server.vm_max_threads != 0 && val->refcount != 1)) {
9283 if (maxtries) i--; /* don't count this try */
9284 continue;
9285 }
9286 swappability = computeObjectSwappability(val);
9287 if (!best || swappability > best_swappability) {
9288 best = de;
9289 best_swappability = swappability;
9290 best_db = db;
9291 }
9292 }
9293 }
9294 if (best == NULL) return REDIS_ERR;
9295 key = dictGetEntryKey(best);
9296 val = dictGetEntryVal(best);
9297
9298 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9299 key->ptr, best_swappability);
9300
9301 /* Unshare the key if needed */
9302 if (key->refcount > 1) {
9303 robj *newkey = dupStringObject(key);
9304 decrRefCount(key);
9305 key = dictGetEntryKey(best) = newkey;
9306 }
9307 /* Swap it */
9308 if (usethreads) {
9309 vmSwapObjectThreaded(key,val,best_db);
9310 return REDIS_OK;
9311 } else {
9312 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9313 dictGetEntryVal(best) = NULL;
9314 return REDIS_OK;
9315 } else {
9316 return REDIS_ERR;
9317 }
9318 }
9319 }
9320
/* Swap one object out synchronously. See vmSwapOneObject(). */
static int vmSwapOneObjectBlocking() {
    int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9324
/* Swap one object out using the I/O threads. See vmSwapOneObject(). */
static int vmSwapOneObjectThreaded() {
    int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9328
9329 /* Return true if it's safe to swap out objects in a given moment.
9330 * Basically we don't want to swap objects out while there is a BGSAVE
9331 * or a BGAEOREWRITE running in backgroud. */
9332 static int vmCanSwapOut(void) {
9333 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9334 }
9335
9336 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9337 * and was deleted. Otherwise 0 is returned. */
9338 static int deleteIfSwapped(redisDb *db, robj *key) {
9339 dictEntry *de;
9340 robj *foundkey;
9341
9342 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9343 foundkey = dictGetEntryKey(de);
9344 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9345 deleteKey(db,key);
9346 return 1;
9347 }
9348
9349 /* =================== Virtual Memory - Threaded I/O ======================= */
9350
9351 static void freeIOJob(iojob *j) {
9352 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9353 j->type == REDIS_IOJOB_DO_SWAP ||
9354 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9355 decrRefCount(j->val);
9356 /* We don't decrRefCount the j->key field as we did't incremented
9357 * the count creating IO Jobs. This is because the key field here is
9358 * just used as an indentifier and if a key is removed the Job should
9359 * never be touched again. */
9360 zfree(j);
9361 }
9362
9363 /* Every time a thread finished a Job, it writes a byte into the write side
9364 * of an unix pipe in order to "awake" the main thread, and this function
9365 * is called. */
9366 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9367 int mask)
9368 {
9369 char buf[1];
9370 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9371 REDIS_NOTUSED(el);
9372 REDIS_NOTUSED(mask);
9373 REDIS_NOTUSED(privdata);
9374
9375 /* For every byte we read in the read side of the pipe, there is one
9376 * I/O job completed to process. */
9377 while((retval = read(fd,buf,1)) == 1) {
9378 iojob *j;
9379 listNode *ln;
9380 robj *key;
9381 struct dictEntry *de;
9382
9383 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9384
9385 /* Get the processed element (the oldest one) */
9386 lockThreadedIO();
9387 assert(listLength(server.io_processed) != 0);
9388 if (toprocess == -1) {
9389 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9390 if (toprocess <= 0) toprocess = 1;
9391 }
9392 ln = listFirst(server.io_processed);
9393 j = ln->value;
9394 listDelNode(server.io_processed,ln);
9395 unlockThreadedIO();
9396 /* If this job is marked as canceled, just ignore it */
9397 if (j->canceled) {
9398 freeIOJob(j);
9399 continue;
9400 }
9401 /* Post process it in the main thread, as there are things we
9402 * can do just here to avoid race conditions and/or invasive locks */
9403 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9404 de = dictFind(j->db->dict,j->key);
9405 assert(de != NULL);
9406 key = dictGetEntryKey(de);
9407 if (j->type == REDIS_IOJOB_LOAD) {
9408 redisDb *db;
9409
9410 /* Key loaded, bring it at home */
9411 key->storage = REDIS_VM_MEMORY;
9412 key->vm.atime = server.unixtime;
9413 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9414 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9415 (unsigned char*) key->ptr);
9416 server.vm_stats_swapped_objects--;
9417 server.vm_stats_swapins++;
9418 dictGetEntryVal(de) = j->val;
9419 incrRefCount(j->val);
9420 db = j->db;
9421 freeIOJob(j);
9422 /* Handle clients waiting for this key to be loaded. */
9423 handleClientsBlockedOnSwappedKey(db,key);
9424 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9425 /* Now we know the amount of pages required to swap this object.
9426 * Let's find some space for it, and queue this task again
9427 * rebranded as REDIS_IOJOB_DO_SWAP. */
9428 if (!vmCanSwapOut() ||
9429 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9430 {
9431 /* Ooops... no space or we can't swap as there is
9432 * a fork()ed Redis trying to save stuff on disk. */
9433 freeIOJob(j);
9434 key->storage = REDIS_VM_MEMORY; /* undo operation */
9435 } else {
9436 /* Note that we need to mark this pages as used now,
9437 * if the job will be canceled, we'll mark them as freed
9438 * again. */
9439 vmMarkPagesUsed(j->page,j->pages);
9440 j->type = REDIS_IOJOB_DO_SWAP;
9441 lockThreadedIO();
9442 queueIOJob(j);
9443 unlockThreadedIO();
9444 }
9445 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9446 robj *val;
9447
9448 /* Key swapped. We can finally free some memory. */
9449 if (key->storage != REDIS_VM_SWAPPING) {
9450 printf("key->storage: %d\n",key->storage);
9451 printf("key->name: %s\n",(char*)key->ptr);
9452 printf("key->refcount: %d\n",key->refcount);
9453 printf("val: %p\n",(void*)j->val);
9454 printf("val->type: %d\n",j->val->type);
9455 printf("val->ptr: %s\n",(char*)j->val->ptr);
9456 }
9457 redisAssert(key->storage == REDIS_VM_SWAPPING);
9458 val = dictGetEntryVal(de);
9459 key->vm.page = j->page;
9460 key->vm.usedpages = j->pages;
9461 key->storage = REDIS_VM_SWAPPED;
9462 key->vtype = j->val->type;
9463 decrRefCount(val); /* Deallocate the object from memory. */
9464 dictGetEntryVal(de) = NULL;
9465 redisLog(REDIS_DEBUG,
9466 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9467 (unsigned char*) key->ptr,
9468 (unsigned long long) j->page, (unsigned long long) j->pages);
9469 server.vm_stats_swapped_objects++;
9470 server.vm_stats_swapouts++;
9471 freeIOJob(j);
9472 /* Put a few more swap requests in queue if we are still
9473 * out of memory */
9474 if (trytoswap && vmCanSwapOut() &&
9475 zmalloc_used_memory() > server.vm_max_memory)
9476 {
9477 int more = 1;
9478 while(more) {
9479 lockThreadedIO();
9480 more = listLength(server.io_newjobs) <
9481 (unsigned) server.vm_max_threads;
9482 unlockThreadedIO();
9483 /* Don't waste CPU time if swappable objects are rare. */
9484 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9485 trytoswap = 0;
9486 break;
9487 }
9488 }
9489 }
9490 }
9491 processed++;
9492 if (processed == toprocess) return;
9493 }
9494 if (retval < 0 && errno != EAGAIN) {
9495 redisLog(REDIS_WARNING,
9496 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9497 strerror(errno));
9498 }
9499 }
9500
9501 static void lockThreadedIO(void) {
9502 pthread_mutex_lock(&server.io_mutex);
9503 }
9504
9505 static void unlockThreadedIO(void) {
9506 pthread_mutex_unlock(&server.io_mutex);
9507 }
9508
9509 /* Remove the specified object from the threaded I/O queue if still not
9510 * processed, otherwise make sure to flag it as canceled. */
9511 static void vmCancelThreadedIOJob(robj *o) {
9512 list *lists[3] = {
9513 server.io_newjobs, /* 0 */
9514 server.io_processing, /* 1 */
9515 server.io_processed /* 2 */
9516 };
9517 int i;
9518
9519 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9520 again:
9521 lockThreadedIO();
9522 /* Search for a matching key in one of the queues */
9523 for (i = 0; i < 3; i++) {
9524 listNode *ln;
9525 listIter li;
9526
9527 listRewind(lists[i],&li);
9528 while ((ln = listNext(&li)) != NULL) {
9529 iojob *job = ln->value;
9530
9531 if (job->canceled) continue; /* Skip this, already canceled. */
9532 if (job->key == o) {
9533 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9534 (void*)job, (char*)o->ptr, job->type, i);
9535 /* Mark the pages as free since the swap didn't happened
9536 * or happened but is now discarded. */
9537 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9538 vmMarkPagesFree(job->page,job->pages);
9539 /* Cancel the job. It depends on the list the job is
9540 * living in. */
9541 switch(i) {
9542 case 0: /* io_newjobs */
9543 /* If the job was yet not processed the best thing to do
9544 * is to remove it from the queue at all */
9545 freeIOJob(job);
9546 listDelNode(lists[i],ln);
9547 break;
9548 case 1: /* io_processing */
9549 /* Oh Shi- the thread is messing with the Job:
9550 *
9551 * Probably it's accessing the object if this is a
9552 * PREPARE_SWAP or DO_SWAP job.
9553 * If it's a LOAD job it may be reading from disk and
9554 * if we don't wait for the job to terminate before to
9555 * cancel it, maybe in a few microseconds data can be
9556 * corrupted in this pages. So the short story is:
9557 *
9558 * Better to wait for the job to move into the
9559 * next queue (processed)... */
9560
9561 /* We try again and again until the job is completed. */
9562 unlockThreadedIO();
9563 /* But let's wait some time for the I/O thread
9564 * to finish with this job. After all this condition
9565 * should be very rare. */
9566 usleep(1);
9567 goto again;
9568 case 2: /* io_processed */
9569 /* The job was already processed, that's easy...
9570 * just mark it as canceled so that we'll ignore it
9571 * when processing completed jobs. */
9572 job->canceled = 1;
9573 break;
9574 }
9575 /* Finally we have to adjust the storage type of the object
9576 * in order to "UNDO" the operaiton. */
9577 if (o->storage == REDIS_VM_LOADING)
9578 o->storage = REDIS_VM_SWAPPED;
9579 else if (o->storage == REDIS_VM_SWAPPING)
9580 o->storage = REDIS_VM_MEMORY;
9581 unlockThreadedIO();
9582 return;
9583 }
9584 }
9585 }
9586 unlockThreadedIO();
9587 assert(1 != 1); /* We should never reach this */
9588 }
9589
9590 static void *IOThreadEntryPoint(void *arg) {
9591 iojob *j;
9592 listNode *ln;
9593 REDIS_NOTUSED(arg);
9594
9595 pthread_detach(pthread_self());
9596 while(1) {
9597 /* Get a new job to process */
9598 lockThreadedIO();
9599 if (listLength(server.io_newjobs) == 0) {
9600 /* No new jobs in queue, exit. */
9601 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9602 (long) pthread_self());
9603 server.io_active_threads--;
9604 unlockThreadedIO();
9605 return NULL;
9606 }
9607 ln = listFirst(server.io_newjobs);
9608 j = ln->value;
9609 listDelNode(server.io_newjobs,ln);
9610 /* Add the job in the processing queue */
9611 j->thread = pthread_self();
9612 listAddNodeTail(server.io_processing,j);
9613 ln = listLast(server.io_processing); /* We use ln later to remove it */
9614 unlockThreadedIO();
9615 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9616 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9617
9618 /* Process the Job */
9619 if (j->type == REDIS_IOJOB_LOAD) {
9620 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9621 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9622 FILE *fp = fopen("/dev/null","w+");
9623 j->pages = rdbSavedObjectPages(j->val,fp);
9624 fclose(fp);
9625 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9626 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9627 j->canceled = 1;
9628 }
9629
9630 /* Done: insert the job into the processed queue */
9631 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9632 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9633 lockThreadedIO();
9634 listDelNode(server.io_processing,ln);
9635 listAddNodeTail(server.io_processed,j);
9636 unlockThreadedIO();
9637
9638 /* Signal the main thread there is new stuff to process */
9639 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9640 }
9641 return NULL; /* never reached */
9642 }
9643
9644 static void spawnIOThread(void) {
9645 pthread_t thread;
9646 sigset_t mask, omask;
9647 int err;
9648
9649 sigemptyset(&mask);
9650 sigaddset(&mask,SIGCHLD);
9651 sigaddset(&mask,SIGHUP);
9652 sigaddset(&mask,SIGPIPE);
9653 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9654 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9655 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9656 strerror(err));
9657 usleep(1000000);
9658 }
9659 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9660 server.io_active_threads++;
9661 }
9662
9663 /* We need to wait for the last thread to exit before we are able to
9664 * fork() in order to BGSAVE or BGREWRITEAOF. */
9665 static void waitEmptyIOJobsQueue(void) {
9666 while(1) {
9667 int io_processed_len;
9668
9669 lockThreadedIO();
9670 if (listLength(server.io_newjobs) == 0 &&
9671 listLength(server.io_processing) == 0 &&
9672 server.io_active_threads == 0)
9673 {
9674 unlockThreadedIO();
9675 return;
9676 }
9677 /* While waiting for empty jobs queue condition we post-process some
9678 * finshed job, as I/O threads may be hanging trying to write against
9679 * the io_ready_pipe_write FD but there are so much pending jobs that
9680 * it's blocking. */
9681 io_processed_len = listLength(server.io_processed);
9682 unlockThreadedIO();
9683 if (io_processed_len) {
9684 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9685 usleep(1000); /* 1 millisecond */
9686 } else {
9687 usleep(10000); /* 10 milliseconds */
9688 }
9689 }
9690 }
9691
9692 static void vmReopenSwapFile(void) {
9693 /* Note: we don't close the old one as we are in the child process
9694 * and don't want to mess at all with the original file object. */
9695 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9696 if (server.vm_fp == NULL) {
9697 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9698 server.vm_swap_file);
9699 _exit(1);
9700 }
9701 server.vm_fd = fileno(server.vm_fp);
9702 }
9703
9704 /* This function must be called while with threaded IO locked */
9705 static void queueIOJob(iojob *j) {
9706 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9707 (void*)j, j->type, (char*)j->key->ptr);
9708 listAddNodeTail(server.io_newjobs,j);
9709 if (server.io_active_threads < server.vm_max_threads)
9710 spawnIOThread();
9711 }
9712
9713 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9714 iojob *j;
9715
9716 assert(key->storage == REDIS_VM_MEMORY);
9717 assert(key->refcount == 1);
9718
9719 j = zmalloc(sizeof(*j));
9720 j->type = REDIS_IOJOB_PREPARE_SWAP;
9721 j->db = db;
9722 j->key = key;
9723 j->val = val;
9724 incrRefCount(val);
9725 j->canceled = 0;
9726 j->thread = (pthread_t) -1;
9727 key->storage = REDIS_VM_SWAPPING;
9728
9729 lockThreadedIO();
9730 queueIOJob(j);
9731 unlockThreadedIO();
9732 return REDIS_OK;
9733 }
9734
9735 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9736
9737 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9738 * If there is not already a job loading the key, it is craeted.
9739 * The key is added to the io_keys list in the client structure, and also
9740 * in the hash table mapping swapped keys to waiting clients, that is,
9741 * server.io_waited_keys. */
9742 static int waitForSwappedKey(redisClient *c, robj *key) {
9743 struct dictEntry *de;
9744 robj *o;
9745 list *l;
9746
9747 /* If the key does not exist or is already in RAM we don't need to
9748 * block the client at all. */
9749 de = dictFind(c->db->dict,key);
9750 if (de == NULL) return 0;
9751 o = dictGetEntryKey(de);
9752 if (o->storage == REDIS_VM_MEMORY) {
9753 return 0;
9754 } else if (o->storage == REDIS_VM_SWAPPING) {
9755 /* We were swapping the key, undo it! */
9756 vmCancelThreadedIOJob(o);
9757 return 0;
9758 }
9759
9760 /* OK: the key is either swapped, or being loaded just now. */
9761
9762 /* Add the key to the list of keys this client is waiting for.
9763 * This maps clients to keys they are waiting for. */
9764 listAddNodeTail(c->io_keys,key);
9765 incrRefCount(key);
9766
9767 /* Add the client to the swapped keys => clients waiting map. */
9768 de = dictFind(c->db->io_keys,key);
9769 if (de == NULL) {
9770 int retval;
9771
9772 /* For every key we take a list of clients blocked for it */
9773 l = listCreate();
9774 retval = dictAdd(c->db->io_keys,key,l);
9775 incrRefCount(key);
9776 assert(retval == DICT_OK);
9777 } else {
9778 l = dictGetEntryVal(de);
9779 }
9780 listAddNodeTail(l,c);
9781
9782 /* Are we already loading the key from disk? If not create a job */
9783 if (o->storage == REDIS_VM_SWAPPED) {
9784 iojob *j;
9785
9786 o->storage = REDIS_VM_LOADING;
9787 j = zmalloc(sizeof(*j));
9788 j->type = REDIS_IOJOB_LOAD;
9789 j->db = c->db;
9790 j->key = o;
9791 j->key->vtype = o->vtype;
9792 j->page = o->vm.page;
9793 j->val = NULL;
9794 j->canceled = 0;
9795 j->thread = (pthread_t) -1;
9796 lockThreadedIO();
9797 queueIOJob(j);
9798 unlockThreadedIO();
9799 }
9800 return 1;
9801 }
9802
9803 /* Preload keys for any command with first, last and step values for
9804 * the command keys prototype, as defined in the command table. */
9805 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9806 int j, last;
9807 if (cmd->vm_firstkey == 0) return;
9808 last = cmd->vm_lastkey;
9809 if (last < 0) last = argc+last;
9810 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9811 redisAssert(j < argc);
9812 waitForSwappedKey(c,argv[j]);
9813 }
9814 }
9815
9816 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9817 * Note that the number of keys to preload is user-defined, so we need to
9818 * apply a sanity check against argc. */
9819 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9820 int i, num;
9821 REDIS_NOTUSED(cmd);
9822
9823 num = atoi(argv[2]->ptr);
9824 if (num > (argc-3)) return;
9825 for (i = 0; i < num; i++) {
9826 waitForSwappedKey(c,argv[3+i]);
9827 }
9828 }
9829
9830 /* Preload keys needed to execute the entire MULTI/EXEC block.
9831 *
9832 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9833 * and will block the client when any command requires a swapped out value. */
9834 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9835 int i, margc;
9836 struct redisCommand *mcmd;
9837 robj **margv;
9838 REDIS_NOTUSED(cmd);
9839 REDIS_NOTUSED(argc);
9840 REDIS_NOTUSED(argv);
9841
9842 if (!(c->flags & REDIS_MULTI)) return;
9843 for (i = 0; i < c->mstate.count; i++) {
9844 mcmd = c->mstate.commands[i].cmd;
9845 margc = c->mstate.commands[i].argc;
9846 margv = c->mstate.commands[i].argv;
9847
9848 if (mcmd->vm_preload_proc != NULL) {
9849 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9850 } else {
9851 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9852 }
9853 }
9854 }
9855
9856 /* Is this client attempting to run a command against swapped keys?
9857 * If so, block it ASAP, load the keys in background, then resume it.
9858 *
9859 * The important idea about this function is that it can fail! If keys will
9860 * still be swapped when the client is resumed, this key lookups will
9861 * just block loading keys from disk. In practical terms this should only
9862 * happen with SORT BY command or if there is a bug in this function.
9863 *
9864 * Return 1 if the client is marked as blocked, 0 if the client can
9865 * continue as the keys it is going to access appear to be in memory. */
9866 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9867 if (cmd->vm_preload_proc != NULL) {
9868 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9869 } else {
9870 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9871 }
9872
9873 /* If the client was blocked for at least one key, mark it as blocked. */
9874 if (listLength(c->io_keys)) {
9875 c->flags |= REDIS_IO_WAIT;
9876 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9877 server.vm_blocked_clients++;
9878 return 1;
9879 } else {
9880 return 0;
9881 }
9882 }
9883
9884 /* Remove the 'key' from the list of blocked keys for a given client.
9885 *
9886 * The function returns 1 when there are no longer blocking keys after
9887 * the current one was removed (and the client can be unblocked). */
9888 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9889 list *l;
9890 listNode *ln;
9891 listIter li;
9892 struct dictEntry *de;
9893
9894 /* Remove the key from the list of keys this client is waiting for. */
9895 listRewind(c->io_keys,&li);
9896 while ((ln = listNext(&li)) != NULL) {
9897 if (equalStringObjects(ln->value,key)) {
9898 listDelNode(c->io_keys,ln);
9899 break;
9900 }
9901 }
9902 assert(ln != NULL);
9903
9904 /* Remove the client form the key => waiting clients map. */
9905 de = dictFind(c->db->io_keys,key);
9906 assert(de != NULL);
9907 l = dictGetEntryVal(de);
9908 ln = listSearchKey(l,c);
9909 assert(ln != NULL);
9910 listDelNode(l,ln);
9911 if (listLength(l) == 0)
9912 dictDelete(c->db->io_keys,key);
9913
9914 return listLength(c->io_keys) == 0;
9915 }
9916
9917 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9918 struct dictEntry *de;
9919 list *l;
9920 listNode *ln;
9921 int len;
9922
9923 de = dictFind(db->io_keys,key);
9924 if (!de) return;
9925
9926 l = dictGetEntryVal(de);
9927 len = listLength(l);
9928 /* Note: we can't use something like while(listLength(l)) as the list
9929 * can be freed by the calling function when we remove the last element. */
9930 while (len--) {
9931 ln = listFirst(l);
9932 redisClient *c = ln->value;
9933
9934 if (dontWaitForSwappedKey(c,key)) {
9935 /* Put the client in the list of clients ready to go as we
9936 * loaded all the keys about it. */
9937 listAddNodeTail(server.io_ready_clients,c);
9938 }
9939 }
9940 }
9941
9942 /* =========================== Remote Configuration ========================= */
9943
9944 static void configSetCommand(redisClient *c) {
9945 robj *o = getDecodedObject(c->argv[3]);
9946 long long ll;
9947
9948 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9949 zfree(server.dbfilename);
9950 server.dbfilename = zstrdup(o->ptr);
9951 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9952 zfree(server.requirepass);
9953 server.requirepass = zstrdup(o->ptr);
9954 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9955 zfree(server.masterauth);
9956 server.masterauth = zstrdup(o->ptr);
9957 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9958 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9959 ll < 0) goto badfmt;
9960 server.maxmemory = ll;
9961 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9962 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9963 ll < 0 || ll > LONG_MAX) goto badfmt;
9964 server.maxidletime = ll;
9965 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9966 if (!strcasecmp(o->ptr,"no")) {
9967 server.appendfsync = APPENDFSYNC_NO;
9968 } else if (!strcasecmp(o->ptr,"everysec")) {
9969 server.appendfsync = APPENDFSYNC_EVERYSEC;
9970 } else if (!strcasecmp(o->ptr,"always")) {
9971 server.appendfsync = APPENDFSYNC_ALWAYS;
9972 } else {
9973 goto badfmt;
9974 }
9975 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
9976 int yn = yesnotoi(o->ptr);
9977
9978 if (yn == -1) goto badfmt;
9979 server.no_appendfsync_on_rewrite = yn;
9980 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
9981 int old = server.appendonly;
9982 int new = yesnotoi(o->ptr);
9983
9984 if (new == -1) goto badfmt;
9985 if (old != new) {
9986 if (new == 0) {
9987 stopAppendOnly();
9988 } else {
9989 if (startAppendOnly() == REDIS_ERR) {
9990 addReplySds(c,sdscatprintf(sdsempty(),
9991 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
9992 decrRefCount(o);
9993 return;
9994 }
9995 }
9996 }
9997 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9998 int vlen, j;
9999 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10000
10001 /* Perform sanity check before setting the new config:
10002 * - Even number of args
10003 * - Seconds >= 1, changes >= 0 */
10004 if (vlen & 1) {
10005 sdsfreesplitres(v,vlen);
10006 goto badfmt;
10007 }
10008 for (j = 0; j < vlen; j++) {
10009 char *eptr;
10010 long val;
10011
10012 val = strtoll(v[j], &eptr, 10);
10013 if (eptr[0] != '\0' ||
10014 ((j & 1) == 0 && val < 1) ||
10015 ((j & 1) == 1 && val < 0)) {
10016 sdsfreesplitres(v,vlen);
10017 goto badfmt;
10018 }
10019 }
10020 /* Finally set the new config */
10021 resetServerSaveParams();
10022 for (j = 0; j < vlen; j += 2) {
10023 time_t seconds;
10024 int changes;
10025
10026 seconds = strtoll(v[j],NULL,10);
10027 changes = strtoll(v[j+1],NULL,10);
10028 appendServerSaveParams(seconds, changes);
10029 }
10030 sdsfreesplitres(v,vlen);
10031 } else {
10032 addReplySds(c,sdscatprintf(sdsempty(),
10033 "-ERR not supported CONFIG parameter %s\r\n",
10034 (char*)c->argv[2]->ptr));
10035 decrRefCount(o);
10036 return;
10037 }
10038 decrRefCount(o);
10039 addReply(c,shared.ok);
10040 return;
10041
10042 badfmt: /* Bad format errors */
10043 addReplySds(c,sdscatprintf(sdsempty(),
10044 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10045 (char*)o->ptr,
10046 (char*)c->argv[2]->ptr));
10047 decrRefCount(o);
10048 }
10049
10050 static void configGetCommand(redisClient *c) {
10051 robj *o = getDecodedObject(c->argv[2]);
10052 robj *lenobj = createObject(REDIS_STRING,NULL);
10053 char *pattern = o->ptr;
10054 int matches = 0;
10055
10056 addReply(c,lenobj);
10057 decrRefCount(lenobj);
10058
10059 if (stringmatch(pattern,"dbfilename",0)) {
10060 addReplyBulkCString(c,"dbfilename");
10061 addReplyBulkCString(c,server.dbfilename);
10062 matches++;
10063 }
10064 if (stringmatch(pattern,"requirepass",0)) {
10065 addReplyBulkCString(c,"requirepass");
10066 addReplyBulkCString(c,server.requirepass);
10067 matches++;
10068 }
10069 if (stringmatch(pattern,"masterauth",0)) {
10070 addReplyBulkCString(c,"masterauth");
10071 addReplyBulkCString(c,server.masterauth);
10072 matches++;
10073 }
10074 if (stringmatch(pattern,"maxmemory",0)) {
10075 char buf[128];
10076
10077 ll2string(buf,128,server.maxmemory);
10078 addReplyBulkCString(c,"maxmemory");
10079 addReplyBulkCString(c,buf);
10080 matches++;
10081 }
10082 if (stringmatch(pattern,"timeout",0)) {
10083 char buf[128];
10084
10085 ll2string(buf,128,server.maxidletime);
10086 addReplyBulkCString(c,"timeout");
10087 addReplyBulkCString(c,buf);
10088 matches++;
10089 }
10090 if (stringmatch(pattern,"appendonly",0)) {
10091 addReplyBulkCString(c,"appendonly");
10092 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10093 matches++;
10094 }
10095 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10096 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10097 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10098 matches++;
10099 }
10100 if (stringmatch(pattern,"appendfsync",0)) {
10101 char *policy;
10102
10103 switch(server.appendfsync) {
10104 case APPENDFSYNC_NO: policy = "no"; break;
10105 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10106 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10107 default: policy = "unknown"; break; /* too harmless to panic */
10108 }
10109 addReplyBulkCString(c,"appendfsync");
10110 addReplyBulkCString(c,policy);
10111 matches++;
10112 }
10113 if (stringmatch(pattern,"save",0)) {
10114 sds buf = sdsempty();
10115 int j;
10116
10117 for (j = 0; j < server.saveparamslen; j++) {
10118 buf = sdscatprintf(buf,"%ld %d",
10119 server.saveparams[j].seconds,
10120 server.saveparams[j].changes);
10121 if (j != server.saveparamslen-1)
10122 buf = sdscatlen(buf," ",1);
10123 }
10124 addReplyBulkCString(c,"save");
10125 addReplyBulkCString(c,buf);
10126 sdsfree(buf);
10127 matches++;
10128 }
10129 decrRefCount(o);
10130 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10131 }
10132
10133 static void configCommand(redisClient *c) {
10134 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10135 if (c->argc != 4) goto badarity;
10136 configSetCommand(c);
10137 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10138 if (c->argc != 3) goto badarity;
10139 configGetCommand(c);
10140 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10141 if (c->argc != 2) goto badarity;
10142 server.stat_numcommands = 0;
10143 server.stat_numconnections = 0;
10144 server.stat_expiredkeys = 0;
10145 server.stat_starttime = time(NULL);
10146 addReply(c,shared.ok);
10147 } else {
10148 addReplySds(c,sdscatprintf(sdsempty(),
10149 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10150 }
10151 return;
10152
10153 badarity:
10154 addReplySds(c,sdscatprintf(sdsempty(),
10155 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10156 (char*) c->argv[1]->ptr));
10157 }
10158
10159 /* =========================== Pubsub implementation ======================== */
10160
10161 static void freePubsubPattern(void *p) {
10162 pubsubPattern *pat = p;
10163
10164 decrRefCount(pat->pattern);
10165 zfree(pat);
10166 }
10167
10168 static int listMatchPubsubPattern(void *a, void *b) {
10169 pubsubPattern *pa = a, *pb = b;
10170
10171 return (pa->client == pb->client) &&
10172 (equalStringObjects(pa->pattern,pb->pattern));
10173 }
10174
10175 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10176 * 0 if the client was already subscribed to that channel. */
10177 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10178 struct dictEntry *de;
10179 list *clients = NULL;
10180 int retval = 0;
10181
10182 /* Add the channel to the client -> channels hash table */
10183 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10184 retval = 1;
10185 incrRefCount(channel);
10186 /* Add the client to the channel -> list of clients hash table */
10187 de = dictFind(server.pubsub_channels,channel);
10188 if (de == NULL) {
10189 clients = listCreate();
10190 dictAdd(server.pubsub_channels,channel,clients);
10191 incrRefCount(channel);
10192 } else {
10193 clients = dictGetEntryVal(de);
10194 }
10195 listAddNodeTail(clients,c);
10196 }
10197 /* Notify the client */
10198 addReply(c,shared.mbulk3);
10199 addReply(c,shared.subscribebulk);
10200 addReplyBulk(c,channel);
10201 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10202 return retval;
10203 }
10204
10205 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10206 * 0 if the client was not subscribed to the specified channel. */
10207 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10208 struct dictEntry *de;
10209 list *clients;
10210 listNode *ln;
10211 int retval = 0;
10212
10213 /* Remove the channel from the client -> channels hash table */
10214 incrRefCount(channel); /* channel may be just a pointer to the same object
10215 we have in the hash tables. Protect it... */
10216 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10217 retval = 1;
10218 /* Remove the client from the channel -> clients list hash table */
10219 de = dictFind(server.pubsub_channels,channel);
10220 assert(de != NULL);
10221 clients = dictGetEntryVal(de);
10222 ln = listSearchKey(clients,c);
10223 assert(ln != NULL);
10224 listDelNode(clients,ln);
10225 if (listLength(clients) == 0) {
10226 /* Free the list and associated hash entry at all if this was
10227 * the latest client, so that it will be possible to abuse
10228 * Redis PUBSUB creating millions of channels. */
10229 dictDelete(server.pubsub_channels,channel);
10230 }
10231 }
10232 /* Notify the client */
10233 if (notify) {
10234 addReply(c,shared.mbulk3);
10235 addReply(c,shared.unsubscribebulk);
10236 addReplyBulk(c,channel);
10237 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10238 listLength(c->pubsub_patterns));
10239
10240 }
10241 decrRefCount(channel); /* it is finally safe to release it */
10242 return retval;
10243 }
10244
10245 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10246 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10247 int retval = 0;
10248
10249 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10250 retval = 1;
10251 pubsubPattern *pat;
10252 listAddNodeTail(c->pubsub_patterns,pattern);
10253 incrRefCount(pattern);
10254 pat = zmalloc(sizeof(*pat));
10255 pat->pattern = getDecodedObject(pattern);
10256 pat->client = c;
10257 listAddNodeTail(server.pubsub_patterns,pat);
10258 }
10259 /* Notify the client */
10260 addReply(c,shared.mbulk3);
10261 addReply(c,shared.psubscribebulk);
10262 addReplyBulk(c,pattern);
10263 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10264 return retval;
10265 }
10266
10267 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10268 * 0 if the client was not subscribed to the specified channel. */
10269 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10270 listNode *ln;
10271 pubsubPattern pat;
10272 int retval = 0;
10273
10274 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10275 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10276 retval = 1;
10277 listDelNode(c->pubsub_patterns,ln);
10278 pat.client = c;
10279 pat.pattern = pattern;
10280 ln = listSearchKey(server.pubsub_patterns,&pat);
10281 listDelNode(server.pubsub_patterns,ln);
10282 }
10283 /* Notify the client */
10284 if (notify) {
10285 addReply(c,shared.mbulk3);
10286 addReply(c,shared.punsubscribebulk);
10287 addReplyBulk(c,pattern);
10288 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10289 listLength(c->pubsub_patterns));
10290 }
10291 decrRefCount(pattern);
10292 return retval;
10293 }
10294
10295 /* Unsubscribe from all the channels. Return the number of channels the
10296 * client was subscribed from. */
10297 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10298 dictIterator *di = dictGetIterator(c->pubsub_channels);
10299 dictEntry *de;
10300 int count = 0;
10301
10302 while((de = dictNext(di)) != NULL) {
10303 robj *channel = dictGetEntryKey(de);
10304
10305 count += pubsubUnsubscribeChannel(c,channel,notify);
10306 }
10307 dictReleaseIterator(di);
10308 return count;
10309 }
10310
10311 /* Unsubscribe from all the patterns. Return the number of patterns the
10312 * client was subscribed from. */
10313 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10314 listNode *ln;
10315 listIter li;
10316 int count = 0;
10317
10318 listRewind(c->pubsub_patterns,&li);
10319 while ((ln = listNext(&li)) != NULL) {
10320 robj *pattern = ln->value;
10321
10322 count += pubsubUnsubscribePattern(c,pattern,notify);
10323 }
10324 return count;
10325 }
10326
10327 /* Publish a message */
10328 static int pubsubPublishMessage(robj *channel, robj *message) {
10329 int receivers = 0;
10330 struct dictEntry *de;
10331 listNode *ln;
10332 listIter li;
10333
10334 /* Send to clients listening for that channel */
10335 de = dictFind(server.pubsub_channels,channel);
10336 if (de) {
10337 list *list = dictGetEntryVal(de);
10338 listNode *ln;
10339 listIter li;
10340
10341 listRewind(list,&li);
10342 while ((ln = listNext(&li)) != NULL) {
10343 redisClient *c = ln->value;
10344
10345 addReply(c,shared.mbulk3);
10346 addReply(c,shared.messagebulk);
10347 addReplyBulk(c,channel);
10348 addReplyBulk(c,message);
10349 receivers++;
10350 }
10351 }
10352 /* Send to clients listening to matching channels */
10353 if (listLength(server.pubsub_patterns)) {
10354 listRewind(server.pubsub_patterns,&li);
10355 channel = getDecodedObject(channel);
10356 while ((ln = listNext(&li)) != NULL) {
10357 pubsubPattern *pat = ln->value;
10358
10359 if (stringmatchlen((char*)pat->pattern->ptr,
10360 sdslen(pat->pattern->ptr),
10361 (char*)channel->ptr,
10362 sdslen(channel->ptr),0)) {
10363 addReply(pat->client,shared.mbulk4);
10364 addReply(pat->client,shared.pmessagebulk);
10365 addReplyBulk(pat->client,pat->pattern);
10366 addReplyBulk(pat->client,channel);
10367 addReplyBulk(pat->client,message);
10368 receivers++;
10369 }
10370 }
10371 decrRefCount(channel);
10372 }
10373 return receivers;
10374 }
10375
10376 static void subscribeCommand(redisClient *c) {
10377 int j;
10378
10379 for (j = 1; j < c->argc; j++)
10380 pubsubSubscribeChannel(c,c->argv[j]);
10381 }
10382
10383 static void unsubscribeCommand(redisClient *c) {
10384 if (c->argc == 1) {
10385 pubsubUnsubscribeAllChannels(c,1);
10386 return;
10387 } else {
10388 int j;
10389
10390 for (j = 1; j < c->argc; j++)
10391 pubsubUnsubscribeChannel(c,c->argv[j],1);
10392 }
10393 }
10394
10395 static void psubscribeCommand(redisClient *c) {
10396 int j;
10397
10398 for (j = 1; j < c->argc; j++)
10399 pubsubSubscribePattern(c,c->argv[j]);
10400 }
10401
10402 static void punsubscribeCommand(redisClient *c) {
10403 if (c->argc == 1) {
10404 pubsubUnsubscribeAllPatterns(c,1);
10405 return;
10406 } else {
10407 int j;
10408
10409 for (j = 1; j < c->argc; j++)
10410 pubsubUnsubscribePattern(c,c->argv[j],1);
10411 }
10412 }
10413
10414 static void publishCommand(redisClient *c) {
10415 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10416 addReplyLongLong(c,receivers);
10417 }
10418
/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
 *
 * The implementation uses a per-DB hash table mapping keys to the list of
 * clients WATCHing those keys, so that given a key that is going to be
 * modified we can mark all the associated clients as dirty.
 *
 * Also every client contains a list of WATCHed keys so that it's possible to
 * un-watch such keys when the client is freed or when UNWATCH is called. */
10427
/* Element type of the client->watched_keys list. A key name alone is not
 * enough to identify a key in Redis: the DB it lives in is needed too, so
 * both are stored together. */
typedef struct watchedKey {
    robj *key;      /* Name of the watched key */
    redisDb *db;    /* DB in which the key is being watched */
} watchedKey;
10435
10436 /* Watch for the specified key */
10437 static void watchForKey(redisClient *c, robj *key) {
10438 list *clients = NULL;
10439 listIter li;
10440 listNode *ln;
10441 watchedKey *wk;
10442
10443 /* Check if we are already watching for this key */
10444 listRewind(c->watched_keys,&li);
10445 while((ln = listNext(&li))) {
10446 wk = listNodeValue(ln);
10447 if (wk->db == c->db && equalStringObjects(key,wk->key))
10448 return; /* Key already watched */
10449 }
10450 /* This key is not already watched in this DB. Let's add it */
10451 clients = dictFetchValue(c->db->watched_keys,key);
10452 if (!clients) {
10453 clients = listCreate();
10454 dictAdd(c->db->watched_keys,key,clients);
10455 incrRefCount(key);
10456 }
10457 listAddNodeTail(clients,c);
10458 /* Add the new key to the lits of keys watched by this client */
10459 wk = zmalloc(sizeof(*wk));
10460 wk->key = key;
10461 wk->db = c->db;
10462 incrRefCount(key);
10463 listAddNodeTail(c->watched_keys,wk);
10464 }
10465
10466 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10467 * flag is up to the caller. */
10468 static void unwatchAllKeys(redisClient *c) {
10469 listIter li;
10470 listNode *ln;
10471
10472 if (listLength(c->watched_keys) == 0) return;
10473 listRewind(c->watched_keys,&li);
10474 while((ln = listNext(&li))) {
10475 list *clients;
10476 watchedKey *wk;
10477
10478 /* Lookup the watched key -> clients list and remove the client
10479 * from the list */
10480 wk = listNodeValue(ln);
10481 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10482 assert(clients != NULL);
10483 listDelNode(clients,listSearchKey(clients,c));
10484 /* Kill the entry at all if this was the only client */
10485 if (listLength(clients) == 0)
10486 dictDelete(wk->db->watched_keys, wk->key);
10487 /* Remove this watched key from the client->watched list */
10488 listDelNode(c->watched_keys,ln);
10489 decrRefCount(wk->key);
10490 zfree(wk);
10491 }
10492 }
10493
10494 /* "Touch" a key, so that if this key is being WATCHed by some client the
10495 * next EXEC will fail. */
10496 static void touchWatchedKey(redisDb *db, robj *key) {
10497 list *clients;
10498 listIter li;
10499 listNode *ln;
10500
10501 if (dictSize(db->watched_keys) == 0) return;
10502 clients = dictFetchValue(db->watched_keys, key);
10503 if (!clients) return;
10504
10505 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10506 /* Check if we are already watching for this key */
10507 listRewind(clients,&li);
10508 while((ln = listNext(&li))) {
10509 redisClient *c = listNodeValue(ln);
10510
10511 c->flags |= REDIS_DIRTY_CAS;
10512 }
10513 }
10514
10515 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10516 * flush but will be deleted as effect of the flushing operation should
10517 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10518 * a FLUSHALL operation (all the DBs flushed). */
10519 static void touchWatchedKeysOnFlush(int dbid) {
10520 listIter li1, li2;
10521 listNode *ln;
10522
10523 /* For every client, check all the waited keys */
10524 listRewind(server.clients,&li1);
10525 while((ln = listNext(&li1))) {
10526 redisClient *c = listNodeValue(ln);
10527 listRewind(c->watched_keys,&li2);
10528 while((ln = listNext(&li2))) {
10529 watchedKey *wk = listNodeValue(ln);
10530
10531 /* For every watched key matching the specified DB, if the
10532 * key exists, mark the client as dirty, as the key will be
10533 * removed. */
10534 if (dbid == -1 || wk->db->id == dbid) {
10535 if (dictFind(wk->db->dict, wk->key) != NULL)
10536 c->flags |= REDIS_DIRTY_CAS;
10537 }
10538 }
10539 }
10540 }
10541
10542 static void watchCommand(redisClient *c) {
10543 int j;
10544
10545 if (c->flags & REDIS_MULTI) {
10546 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10547 return;
10548 }
10549 for (j = 1; j < c->argc; j++)
10550 watchForKey(c,c->argv[j]);
10551 addReply(c,shared.ok);
10552 }
10553
10554 static void unwatchCommand(redisClient *c) {
10555 unwatchAllKeys(c);
10556 c->flags &= (~REDIS_DIRTY_CAS);
10557 addReply(c,shared.ok);
10558 }
10559
10560 /* ================================= Debugging ============================== */
10561
10562 /* Compute the sha1 of string at 's' with 'len' bytes long.
10563 * The SHA1 is then xored againt the string pointed by digest.
10564 * Since xor is commutative, this operation is used in order to
10565 * "add" digests relative to unordered elements.
10566 *
10567 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10568 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10569 SHA1_CTX ctx;
10570 unsigned char hash[20], *s = ptr;
10571 int j;
10572
10573 SHA1Init(&ctx);
10574 SHA1Update(&ctx,s,len);
10575 SHA1Final(hash,&ctx);
10576
10577 for (j = 0; j < 20; j++)
10578 digest[j] ^= hash[j];
10579 }
10580
10581 static void xorObjectDigest(unsigned char *digest, robj *o) {
10582 o = getDecodedObject(o);
10583 xorDigest(digest,o->ptr,sdslen(o->ptr));
10584 decrRefCount(o);
10585 }
10586
10587 /* This function instead of just computing the SHA1 and xoring it
10588 * against diget, also perform the digest of "digest" itself and
10589 * replace the old value with the new one.
10590 *
10591 * So the final digest will be:
10592 *
10593 * digest = SHA1(digest xor SHA1(data))
10594 *
10595 * This function is used every time we want to preserve the order so
10596 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10597 *
10598 * Also note that mixdigest("foo") followed by mixdigest("bar")
10599 * will lead to a different digest compared to "fo", "obar".
10600 */
10601 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10602 SHA1_CTX ctx;
10603 char *s = ptr;
10604
10605 xorDigest(digest,s,len);
10606 SHA1Init(&ctx);
10607 SHA1Update(&ctx,digest,20);
10608 SHA1Final(digest,&ctx);
10609 }
10610
10611 static void mixObjectDigest(unsigned char *digest, robj *o) {
10612 o = getDecodedObject(o);
10613 mixDigest(digest,o->ptr,sdslen(o->ptr));
10614 decrRefCount(o);
10615 }
10616
/* Compute the dataset digest and store the 20 byte result in 'final'.
 *
 * Since keys, sets elements, hashes elements are not ordered, we use a
 * trick: every aggregate digest is the xor of the digests of their
 * elements. This way the order will not change the result. For list
 * instead we use a feedback entering the output digest as input in order
 * to ensure that a different ordered list will result in a different
 * digest.
 *
 * Empty DBs are skipped entirely, so they do not contribute to the
 * result. */
static void computeDatasetDigest(unsigned char *final) {
    unsigned char digest[20];
    char buf[128];
    dictIterator *di = NULL;
    dictEntry *de;
    int j;
    uint32_t aux;

    memset(final,0,20); /* Start with a clean result */

    for (j = 0; j < server.dbnum; j++) {
        redisDb *db = server.db+j;

        if (dictSize(db->dict) == 0) continue;
        di = dictGetIterator(db->dict);

        /* hash the DB id, so the same dataset moved in a different
         * DB will lead to a different digest */
        aux = htonl(j);
        mixDigest(final,&aux,sizeof(aux));

        /* Iterate this DB writing every entry */
        while((de = dictNext(di)) != NULL) {
            robj *key, *o, *kcopy;
            time_t expiretime;

            memset(digest,0,20); /* This key-val digest */
            key = dictGetEntryKey(de);

            if (!server.vm_enabled) {
                mixObjectDigest(digest,key);
                o = dictGetEntryVal(de);
            } else {
                /* Don't work with the key directly as when VM is active
                 * this is unsafe: TODO: fix decrRefCount to check if the
                 * count really reached 0 to avoid this mess */
                kcopy = dupStringObject(key);
                mixObjectDigest(digest,kcopy);
                o = lookupKeyRead(db,kcopy);
                decrRefCount(kcopy);
            }
            /* The value type is part of the key-val digest too. */
            aux = htonl(o->type);
            mixDigest(digest,&aux,sizeof(aux));
            expiretime = getExpire(db,key);

            /* Save the key and associated value */
            if (o->type == REDIS_STRING) {
                mixObjectDigest(digest,o);
            } else if (o->type == REDIS_LIST) {
                /* Lists are ordered: elements are mixed in sequence. */
                list *list = o->ptr;
                listNode *ln;
                listIter li;

                listRewind(list,&li);
                while((ln = listNext(&li))) {
                    robj *eleobj = listNodeValue(ln);

                    mixObjectDigest(digest,eleobj);
                }
            } else if (o->type == REDIS_SET) {
                /* Sets are unordered: elements are xored together. Note
                 * that 'di' and 'de' below shadow the outer iterator and
                 * entry for the duration of this branch. */
                dict *set = o->ptr;
                dictIterator *di = dictGetIterator(set);
                dictEntry *de;

                while((de = dictNext(di)) != NULL) {
                    robj *eleobj = dictGetEntryKey(de);

                    xorObjectDigest(digest,eleobj);
                }
                dictReleaseIterator(di);
            } else if (o->type == REDIS_ZSET) {
                /* Every (element, score) pair gets its own ordered digest,
                 * then the per-pair digests are xored together. 'di' and
                 * 'de' shadow the outer ones here as well. */
                zset *zs = o->ptr;
                dictIterator *di = dictGetIterator(zs->dict);
                dictEntry *de;

                while((de = dictNext(di)) != NULL) {
                    robj *eleobj = dictGetEntryKey(de);
                    double *score = dictGetEntryVal(de);
                    unsigned char eledigest[20];

                    /* The score enters the digest in its "%.17g" textual
                     * form. */
                    snprintf(buf,sizeof(buf),"%.17g",*score);
                    memset(eledigest,0,20);
                    mixObjectDigest(eledigest,eleobj);
                    mixDigest(eledigest,buf,strlen(buf));
                    xorDigest(digest,eledigest,20);
                }
                dictReleaseIterator(di);
            } else if (o->type == REDIS_HASH) {
                /* Every (field, value) pair gets its own ordered digest,
                 * then the per-pair digests are xored together. */
                hashIterator *hi;
                robj *obj;

                hi = hashInitIterator(o);
                while (hashNext(hi) != REDIS_ERR) {
                    unsigned char eledigest[20];

                    memset(eledigest,0,20);
                    obj = hashCurrent(hi,REDIS_HASH_KEY);
                    mixObjectDigest(eledigest,obj);
                    decrRefCount(obj);
                    obj = hashCurrent(hi,REDIS_HASH_VALUE);
                    mixObjectDigest(eledigest,obj);
                    decrRefCount(obj);
                    xorDigest(digest,eledigest,20);
                }
                hashReleaseIterator(hi);
            } else {
                redisPanic("Unknown object type");
            }
            /* If the key has an expire, add it to the mix. Only the
             * presence of an expire is hashed, not the expire time. */
            if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
            /* We can finally xor the key-val digest to the final digest */
            xorDigest(final,digest,20);
        }
        dictReleaseIterator(di);
    }
}
10739
10740 static void debugCommand(redisClient *c) {
10741 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10742 *((char*)-1) = 'x';
10743 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10744 if (rdbSave(server.dbfilename) != REDIS_OK) {
10745 addReply(c,shared.err);
10746 return;
10747 }
10748 emptyDb();
10749 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10750 addReply(c,shared.err);
10751 return;
10752 }
10753 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10754 addReply(c,shared.ok);
10755 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10756 emptyDb();
10757 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10758 addReply(c,shared.err);
10759 return;
10760 }
10761 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10762 addReply(c,shared.ok);
10763 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10764 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10765 robj *key, *val;
10766
10767 if (!de) {
10768 addReply(c,shared.nokeyerr);
10769 return;
10770 }
10771 key = dictGetEntryKey(de);
10772 val = dictGetEntryVal(de);
10773 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10774 key->storage == REDIS_VM_SWAPPING)) {
10775 char *strenc;
10776 char buf[128];
10777
10778 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10779 strenc = strencoding[val->encoding];
10780 } else {
10781 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10782 strenc = buf;
10783 }
10784 addReplySds(c,sdscatprintf(sdsempty(),
10785 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10786 "encoding:%s serializedlength:%lld\r\n",
10787 (void*)key, key->refcount, (void*)val, val->refcount,
10788 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10789 } else {
10790 addReplySds(c,sdscatprintf(sdsempty(),
10791 "+Key at:%p refcount:%d, value swapped at: page %llu "
10792 "using %llu pages\r\n",
10793 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10794 (unsigned long long) key->vm.usedpages));
10795 }
10796 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10797 lookupKeyRead(c->db,c->argv[2]);
10798 addReply(c,shared.ok);
10799 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10800 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10801 robj *key, *val;
10802
10803 if (!server.vm_enabled) {
10804 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10805 return;
10806 }
10807 if (!de) {
10808 addReply(c,shared.nokeyerr);
10809 return;
10810 }
10811 key = dictGetEntryKey(de);
10812 val = dictGetEntryVal(de);
10813 /* If the key is shared we want to create a copy */
10814 if (key->refcount > 1) {
10815 robj *newkey = dupStringObject(key);
10816 decrRefCount(key);
10817 key = dictGetEntryKey(de) = newkey;
10818 }
10819 /* Swap it */
10820 if (key->storage != REDIS_VM_MEMORY) {
10821 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10822 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10823 dictGetEntryVal(de) = NULL;
10824 addReply(c,shared.ok);
10825 } else {
10826 addReply(c,shared.err);
10827 }
10828 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10829 long keys, j;
10830 robj *key, *val;
10831 char buf[128];
10832
10833 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10834 return;
10835 for (j = 0; j < keys; j++) {
10836 snprintf(buf,sizeof(buf),"key:%lu",j);
10837 key = createStringObject(buf,strlen(buf));
10838 if (lookupKeyRead(c->db,key) != NULL) {
10839 decrRefCount(key);
10840 continue;
10841 }
10842 snprintf(buf,sizeof(buf),"value:%lu",j);
10843 val = createStringObject(buf,strlen(buf));
10844 dictAdd(c->db->dict,key,val);
10845 }
10846 addReply(c,shared.ok);
10847 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10848 unsigned char digest[20];
10849 sds d = sdsnew("+");
10850 int j;
10851
10852 computeDatasetDigest(digest);
10853 for (j = 0; j < 20; j++)
10854 d = sdscatprintf(d, "%02x",digest[j]);
10855
10856 d = sdscatlen(d,"\r\n",2);
10857 addReplySds(c,d);
10858 } else {
10859 addReplySds(c,sdsnew(
10860 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10861 }
10862 }
10863
/* Log a failed assertion: 'estr' is the text of the expression that was not
 * true, 'file' and 'line' the place where it was checked.
 *
 * When HAVE_BACKTRACE is defined an invalid address is written on purpose,
 * forcing a SIGSEGV so that the stack trace gets printed.
 *
 * NOTE(review): without HAVE_BACKTRACE this function returns normally;
 * presumably the calling macro terminates the process -- confirm. */
static void _redisAssert(char *estr, char *file, int line) {
    redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
    redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
#ifdef HAVE_BACKTRACE
    redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
    *((char*)-1) = 'x';
#endif
}
10872
/* Log an unrecoverable software failure: 'msg' describes it, 'file' and
 * 'line' tell where it was raised.
 *
 * When HAVE_BACKTRACE is defined an invalid address is written on purpose,
 * forcing a SIGSEGV so that the stack trace gets printed.
 *
 * NOTE(review): without HAVE_BACKTRACE this function returns normally;
 * presumably the calling macro terminates the process -- confirm. */
static void _redisPanic(char *msg, char *file, int line) {
    redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
    redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
#ifdef HAVE_BACKTRACE
    redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
    *((char*)-1) = 'x';
#endif
}
10881
10882 /* =================================== Main! ================================ */
10883
10884 #ifdef __linux__
/* Return the integer stored in /proc/sys/vm/overcommit_memory, or -1 when
 * the file cannot be opened, cannot be read, or does not start with a
 * number (so that unparsable content is never mistaken for the value 0). */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64], *end;
    long value;
    int gotline;

    if (!fp) return -1;
    gotline = fgets(buf,sizeof(buf),fp) != NULL;
    fclose(fp);
    if (!gotline) return -1;

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* No digits were found */
    return (int)value;
}
10898
10899 void linuxOvercommitMemoryWarning(void) {
10900 if (linuxOvercommitMemoryValue() == 0) {
10901 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10902 }
10903 }
10904 #endif /* __linux__ */
10905
10906 static void daemonize(void) {
10907 int fd;
10908 FILE *fp;
10909
10910 if (fork() != 0) exit(0); /* parent exits */
10911 setsid(); /* create a new session */
10912
10913 /* Every output goes to /dev/null. If Redis is daemonized but
10914 * the 'logfile' is set to 'stdout' in the configuration file
10915 * it will not log at all. */
10916 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10917 dup2(fd, STDIN_FILENO);
10918 dup2(fd, STDOUT_FILENO);
10919 dup2(fd, STDERR_FILENO);
10920 if (fd > STDERR_FILENO) close(fd);
10921 }
10922 /* Try to write the pid file */
10923 fp = fopen(server.pidfile,"w");
10924 if (fp) {
10925 fprintf(fp,"%d\n",getpid());
10926 fclose(fp);
10927 }
10928 }
10929
10930 static void version() {
10931 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
10932 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
10933 exit(0);
10934 }
10935
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10941
10942 int main(int argc, char **argv) {
10943 time_t start;
10944
10945 initServerConfig();
10946 if (argc == 2) {
10947 if (strcmp(argv[1], "-v") == 0 ||
10948 strcmp(argv[1], "--version") == 0) version();
10949 if (strcmp(argv[1], "--help") == 0) usage();
10950 resetServerSaveParams();
10951 loadServerConfig(argv[1]);
10952 } else if ((argc > 2)) {
10953 usage();
10954 } else {
10955 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10956 }
10957 if (server.daemonize) daemonize();
10958 initServer();
10959 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10960 #ifdef __linux__
10961 linuxOvercommitMemoryWarning();
10962 #endif
10963 start = time(NULL);
10964 if (server.appendonly) {
10965 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10966 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10967 } else {
10968 if (rdbLoad(server.dbfilename) == REDIS_OK)
10969 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10970 }
10971 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10972 aeSetBeforeSleepProc(server.el,beforeSleep);
10973 aeMain(server.el);
10974 aeDeleteEventLoop(server.el);
10975 return 0;
10976 }
10977
10978 /* ============================= Backtrace support ========================= */
10979
10980 #ifdef HAVE_BACKTRACE
10981 static char *findFuncName(void *pointer, unsigned long *offset);
10982
/* Return the instruction pointer saved in the machine context of 'uc', as
 * found by the platform specific branch selected at compile time below.
 * NULL is returned on platforms that none of the branches covers.
 *
 * NOTE(review): the generic x86 branch reads gregs[REG_EIP] for 64 bit
 * builds too; confirm REG_EIP is defined by the x86_64 toolchains in use
 * (glibc names the 64 bit register REG_RIP). */
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__FreeBSD__)
    return (void*) uc->uc_mcontext.mc_eip;
#elif defined(__dietlibc__)
    return (void*) uc->uc_mcontext.eip;
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
  #if __x86_64__
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
  #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
    return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
#elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
#else
    return NULL;
#endif
}
11008
11009 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11010 void *trace[100];
11011 char **messages = NULL;
11012 int i, trace_size = 0;
11013 unsigned long offset=0;
11014 ucontext_t *uc = (ucontext_t*) secret;
11015 sds infostring;
11016 REDIS_NOTUSED(info);
11017
11018 redisLog(REDIS_WARNING,
11019 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11020 infostring = genRedisInfoString();
11021 redisLog(REDIS_WARNING, "%s",infostring);
11022 /* It's not safe to sdsfree() the returned string under memory
11023 * corruption conditions. Let it leak as we are going to abort */
11024
11025 trace_size = backtrace(trace, 100);
11026 /* overwrite sigaction with caller's address */
11027 if (getMcontextEip(uc) != NULL) {
11028 trace[1] = getMcontextEip(uc);
11029 }
11030 messages = backtrace_symbols(trace, trace_size);
11031
11032 for (i=1; i<trace_size; ++i) {
11033 char *fn = findFuncName(trace[i], &offset), *p;
11034
11035 p = strchr(messages[i],'+');
11036 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11037 redisLog(REDIS_WARNING,"%s", messages[i]);
11038 } else {
11039 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11040 }
11041 }
11042 /* free(messages); Don't call free() with possibly corrupted memory. */
11043 _exit(0);
11044 }
11045
/* SIGTERM handler: the shutdown is not performed here. The handler only
 * logs the event and sets server.shutdown_asap, so that the shutdown is
 * scheduled rather than run inside the signal handler. */
static void sigtermHandler(int sig) {
    REDIS_NOTUSED(sig);

    redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
    server.shutdown_asap = 1;
}
11052
11053 static void setupSigSegvAction(void) {
11054 struct sigaction act;
11055
11056 sigemptyset (&act.sa_mask);
11057 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11058 * is used. Otherwise, sa_handler is used */
11059 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11060 act.sa_sigaction = segvHandler;
11061 sigaction (SIGSEGV, &act, NULL);
11062 sigaction (SIGBUS, &act, NULL);
11063 sigaction (SIGFPE, &act, NULL);
11064 sigaction (SIGILL, &act, NULL);
11065 sigaction (SIGBUS, &act, NULL);
11066
11067 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11068 act.sa_handler = sigtermHandler;
11069 sigaction (SIGTERM, &act, NULL);
11070 return;
11071 }
11072
11073 #include "staticsymbols.h"
11074 /* This function try to convert a pointer into a function name. It's used in
11075 * oreder to provide a backtrace under segmentation fault that's able to
11076 * display functions declared as static (otherwise the backtrace is useless). */
11077 static char *findFuncName(void *pointer, unsigned long *offset){
11078 int i, ret = -1;
11079 unsigned long off, minoff = 0;
11080
11081 /* Try to match against the Symbol with the smallest offset */
11082 for (i=0; symsTable[i].pointer; i++) {
11083 unsigned long lp = (unsigned long) pointer;
11084
11085 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11086 off=lp-symsTable[i].pointer;
11087 if (ret < 0 || off < minoff) {
11088 minoff=off;
11089 ret=i;
11090 }
11091 }
11092 }
11093 if (ret == -1) return NULL;
11094 *offset = minoff;
11095 return symsTable[ret].name;
11096 }
11097 #else /* HAVE_BACKTRACE */
/* Variant compiled when HAVE_BACKTRACE is not defined: it installs no
 * signal handler at all and is a no-op. */
static void setupSigSegvAction(void) {
}
11100 #endif /* HAVE_BACKTRACE */
11101
11102
11103
11104 /* The End */
11105
11106
11107